From 7dc45842f268c82fd86f24e2d5f7236ee6f0a4af Mon Sep 17 00:00:00 2001 From: mauryaland Date: Thu, 6 Apr 2023 14:01:05 +0200 Subject: [PATCH 001/124] Fix inconsistency between best path and scores --- flair/models/sequence_tagger_utils/viterbi.py | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/flair/models/sequence_tagger_utils/viterbi.py b/flair/models/sequence_tagger_utils/viterbi.py index 607a39cf6..7a57388b5 100644 --- a/flair/models/sequence_tagger_utils/viterbi.py +++ b/flair/models/sequence_tagger_utils/viterbi.py @@ -219,16 +219,32 @@ def decode( ) if probabilities_for_all_classes: - all_tags = self._all_scores_for_token(scores.cpu(), lengths, sentences) + all_tags = self._all_scores_for_token(scores.cpu(), tag_seq, lengths, sentences) return tags, all_tags - def _all_scores_for_token(self, scores: torch.Tensor, lengths: torch.IntTensor, sentences: List[Sentence]): + def _all_scores_for_token( + self, scores: torch.Tensor, tag_seq: torch.IntTensor, lengths: torch.IntTensor, sentences: List[Sentence] + ): """ Returns all scores for each tag in tag dictionary. :param scores: Scores for current sentence. 
""" scores = scores.numpy() + for i_batch, batch in enumerate(scores): + for i, (tag_id, tag_scores) in enumerate(zip(tag_seq, batch)): + if type(tag_id) != int and tag_id.item() != np.argmax(tag_scores): + swap_index_score = np.argmax(tag_scores) + scores[i_batch][i][tag_id.item()], scores[i_batch][i][swap_index_score] = ( + scores[i_batch][i][swap_index_score], + scores[i_batch][i][tag_id.item()], + ) + elif type(tag_id) == int and tag_id != np.argmax(tag_scores): + swap_index_score = np.argmax(tag_scores) + scores[i_batch][i][tag_id], scores[i_batch][i][swap_index_score] = ( + scores[i_batch][i][swap_index_score], + scores[i_batch][i][tag_id], + ) prob_tags_per_sentence = [] for scores_sentence, length, sentence in zip(scores, lengths, sentences): scores_sentence = scores_sentence[:length] From 386d153fca185681e50fa32fefb7d6123353b97e Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 28 Apr 2023 15:17:56 +0200 Subject: [PATCH 002/124] Rename EntityLinker to SpanClassifier --- flair/models/entity_linker_model.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/flair/models/entity_linker_model.py b/flair/models/entity_linker_model.py index a13619d23..ae4f2ecb0 100644 --- a/flair/models/entity_linker_model.py +++ b/flair/models/entity_linker_model.py @@ -75,7 +75,7 @@ def get_candidates(self, mention: str) -> Set[str]: return set(self.mention_to_candidates_map[mention]) if mention in self.mention_to_candidates_map else set() -class EntityLinker(flair.nn.DefaultClassifier[Sentence, Span]): +class SpanClassifier(flair.nn.DefaultClassifier[Sentence, Span]): """Entity Linking Model. The model expects text/sentences with annotated entity mentions and predicts entities to these mentions. 
@@ -222,7 +222,14 @@ def _mask_scores(self, scores: torch.Tensor, data_points: List[Span]): return masked_scores @classmethod - def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "EntityLinker": + def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "SpanClassifier": from typing import cast - return cast("EntityLinker", super().load(model_path=model_path)) + return cast("SpanClassifier", super().load(model_path=model_path)) + + +def EntityLinker(**classifierargs): + from warnings import warn + + warn("The EntityLinker class is deprecated and will be removed in Flair 1.0. Use SpanClassifier instead!") + return SpanClassifier(**classifierargs) From 816d1f8d5aad257d83b8cd8b7d1b9d131ea16aa5 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 28 Apr 2023 15:18:38 +0200 Subject: [PATCH 003/124] Update deprecation note for WordTagger --- flair/models/word_tagger_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/models/word_tagger_model.py b/flair/models/word_tagger_model.py index 2ba81fe25..32f58e17b 100644 --- a/flair/models/word_tagger_model.py +++ b/flair/models/word_tagger_model.py @@ -14,7 +14,7 @@ def WordTagger(embeddings, tag_dictionary, tag_type, **classifierargs): from warnings import warn - warn("The WordTagger class is deprecated after Flair version 0.12.2. Use TokenClassifier instead!") + warn("The WordTagger class is deprecated and will be removed in Flair 1.0. 
Use TokenClassifier instead!") return TokenClassifier( embeddings=embeddings, label_dictionary=tag_dictionary, label_type=tag_type, **classifierargs ) From 67dcf3bc2ef2d478396914575b0bf46a1694f614 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 28 Apr 2023 15:20:08 +0200 Subject: [PATCH 004/124] Rename EntityLinker to SpanClassifier --- flair/models/__init__.py | 4 ++-- tests/models/test_entity_linker.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/flair/models/__init__.py b/flair/models/__init__.py index c75e280fa..118ff0794 100644 --- a/flair/models/__init__.py +++ b/flair/models/__init__.py @@ -1,5 +1,5 @@ from .clustering import ClusteringModel -from .entity_linker_model import EntityLinker +from .entity_linker_model import SpanClassifier from .language_model import LanguageModel from .lemmatizer_model import Lemmatizer from .multitask_model import MultitaskModel @@ -15,7 +15,7 @@ from .word_tagger_model import TokenClassifier, WordTagger __all__ = [ - "EntityLinker", + "SpanClassifier", "LanguageModel", "Lemmatizer", "TextPairClassifier", diff --git a/tests/models/test_entity_linker.py b/tests/models/test_entity_linker.py index c6680eb5a..e867e272c 100644 --- a/tests/models/test_entity_linker.py +++ b/tests/models/test_entity_linker.py @@ -3,12 +3,12 @@ from flair.data import Sentence from flair.datasets import NEL_ENGLISH_AIDA from flair.embeddings import TransformerWordEmbeddings -from flair.models import EntityLinker +from flair.models import SpanClassifier from tests.model_test_utils import BaseModelTest class TestEntityLinker(BaseModelTest): - model_cls = EntityLinker + model_cls = SpanClassifier train_label_type = "nel" training_args = {"max_epochs": 2} From 0fb6cb2751fc644d3a9706a8d3b864b8bd2ff2f3 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Fri, 28 Apr 2023 15:22:53 +0200 Subject: [PATCH 005/124] Remove legacy embeddings to fix pytest flair --- flair/embeddings/__init__.py | 18 - flair/embeddings/legacy.py | 922 
+---------------------------------- 2 files changed, 1 insertion(+), 939 deletions(-) diff --git a/flair/embeddings/__init__.py b/flair/embeddings/__init__.py index d03cb8706..04e1d1376 100644 --- a/flair/embeddings/__init__.py +++ b/flair/embeddings/__init__.py @@ -32,19 +32,10 @@ # Expose legacy embedding classes from .legacy import ( - BertEmbeddings, - CamembertEmbeddings, CharLMEmbeddings, DocumentLSTMEmbeddings, DocumentMeanEmbeddings, ELMoEmbeddings, - ELMoTransformerEmbeddings, - OpenAIGPT2Embeddings, - OpenAIGPTEmbeddings, - RoBERTaEmbeddings, - XLMEmbeddings, - XLMRobertaEmbeddings, - XLNetEmbeddings, ) # Expose token embedding classes @@ -81,18 +72,9 @@ "ImageEmbeddings", "NetworkImageEmbeddings", "PrecomputedImageEmbeddings", - "BertEmbeddings", - "CamembertEmbeddings", "CharLMEmbeddings", "DocumentLSTMEmbeddings", "DocumentMeanEmbeddings", - "ELMoTransformerEmbeddings", - "OpenAIGPT2Embeddings", - "OpenAIGPTEmbeddings", - "RoBERTaEmbeddings", - "XLMEmbeddings", - "XLMRobertaEmbeddings", - "XLNetEmbeddings", "BPEmbSerializable", "BytePairEmbeddings", "CharacterEmbeddings", diff --git a/flair/embeddings/legacy.py b/flair/embeddings/legacy.py index 39275a16a..75983da8a 100644 --- a/flair/embeddings/legacy.py +++ b/flair/embeddings/legacy.py @@ -1,37 +1,13 @@ import logging import re -from abc import abstractmethod from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union +from typing import List, Optional, Union import torch from deprecated import deprecated -from transformers import ( - AlbertModel, - AlbertTokenizer, - BertModel, - BertTokenizer, - CamembertModel, - CamembertTokenizer, - GPT2Model, - GPT2Tokenizer, - OpenAIGPTModel, - OpenAIGPTTokenizer, - PreTrainedModel, - PreTrainedTokenizer, - RobertaModel, - RobertaTokenizer, - XLMModel, - XLMRobertaModel, - XLMRobertaTokenizer, - XLMTokenizer, - XLNetModel, - XLNetTokenizer, -) import flair from flair.data import Sentence, Token -from flair.embeddings.base import ScalarMix 
from flair.embeddings.document import DocumentEmbeddings from flair.embeddings.token import StackedEmbeddings, TokenEmbeddings from flair.file_utils import cached_path @@ -481,824 +457,6 @@ def __str__(self) -> str: return self.name -class XLNetEmbeddings(TokenEmbeddings): - @deprecated( - version="0.4.5", - reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings", - ) - def __init__( - self, - pretrained_model_name_or_path: str = "xlnet-large-cased", - layers: str = "1", - pooling_operation: str = "first_last", - use_scalar_mix: bool = False, - ) -> None: - """XLNet embeddings, as proposed in Yang et al., 2019. - :param pretrained_model_name_or_path: name or path of XLNet model - :param layers: comma-separated list of layers - :param pooling_operation: defines pooling operation for subwords - :param use_scalar_mix: defines the usage of scalar mix for specified layer(s). - """ - super().__init__() - - self.tokenizer = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path) - self.model = XLNetModel.from_pretrained( - pretrained_model_name_or_path=pretrained_model_name_or_path, - output_hidden_states=True, - ) - self.name = pretrained_model_name_or_path - self.layers: List[int] = [int(layer) for layer in layers.split(",")] - self.pooling_operation = pooling_operation - self.use_scalar_mix = use_scalar_mix - self.static_embeddings = True - - dummy_sentence: Sentence = Sentence(["hello"]) - embedded_dummy = self.embed(dummy_sentence) - self.__embedding_length: int = len(embedded_dummy[0].get_token(1).get_embedding()) - - @property - def embedding_length(self) -> int: - return self.__embedding_length - - def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: - self.model.to(flair.device) - self.model.eval() - - sentences = _get_transformer_sentence_embeddings( - sentences=sentences, - tokenizer=self.tokenizer, - model=self.model, - name=self.name, - layers=self.layers, - pooling_operation=self.pooling_operation, 
- use_scalar_mix=self.use_scalar_mix, - bos_token="", - eos_token="", - ) - - return sentences - - def extra_repr(self): - return f"model={self.name}" - - def __str__(self) -> str: - return self.name - - -class XLMEmbeddings(TokenEmbeddings): - @deprecated( - version="0.4.5", - reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings", - ) - def __init__( - self, - pretrained_model_name_or_path: str = "xlm-mlm-en-2048", - layers: str = "1", - pooling_operation: str = "first_last", - use_scalar_mix: bool = False, - ) -> None: - """XLM embeddings, as proposed in Guillaume et al., 2019. - :param pretrained_model_name_or_path: name or path of XLM model - :param layers: comma-separated list of layers - :param pooling_operation: defines pooling operation for subwords - :param use_scalar_mix: defines the usage of scalar mix for specified layer(s). - """ - super().__init__() - - self.tokenizer = XLMTokenizer.from_pretrained(pretrained_model_name_or_path) - self.model = XLMModel.from_pretrained( - pretrained_model_name_or_path=pretrained_model_name_or_path, - output_hidden_states=True, - ) - self.name = pretrained_model_name_or_path - self.layers: List[int] = [int(layer) for layer in layers.split(",")] - self.pooling_operation = pooling_operation - self.use_scalar_mix = use_scalar_mix - self.static_embeddings = True - - dummy_sentence: Sentence = Sentence(["hello"]) - embedded_dummy = self.embed(dummy_sentence) - self.__embedding_length: int = len(embedded_dummy[0].get_token(1).get_embedding()) - - @property - def embedding_length(self) -> int: - return self.__embedding_length - - def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: - self.model.to(flair.device) - self.model.eval() - - sentences = _get_transformer_sentence_embeddings( - sentences=sentences, - tokenizer=self.tokenizer, - model=self.model, - name=self.name, - layers=self.layers, - pooling_operation=self.pooling_operation, - 
use_scalar_mix=self.use_scalar_mix, - bos_token="", - eos_token="", - ) - - return sentences - - def extra_repr(self): - return f"model={self.name}" - - def __str__(self) -> str: - return self.name - - -class OpenAIGPTEmbeddings(TokenEmbeddings): - @deprecated( - version="0.4.5", - reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings", - ) - def __init__( - self, - pretrained_model_name_or_path: str = "openai-gpt", - layers: str = "1", - pooling_operation: str = "first_last", - use_scalar_mix: bool = False, - ) -> None: - """OpenAI GPT embeddings, as proposed in Radford et al. 2018. - :param pretrained_model_name_or_path: name or path of OpenAI GPT model - :param layers: comma-separated list of layers - :param pooling_operation: defines pooling operation for subwords - :param use_scalar_mix: defines the usage of scalar mix for specified layer(s). - """ - super().__init__() - - self.tokenizer = OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path) - self.model = OpenAIGPTModel.from_pretrained( - pretrained_model_name_or_path=pretrained_model_name_or_path, - output_hidden_states=True, - ) - self.name = pretrained_model_name_or_path - self.layers: List[int] = [int(layer) for layer in layers.split(",")] - self.pooling_operation = pooling_operation - self.use_scalar_mix = use_scalar_mix - self.static_embeddings = True - - dummy_sentence: Sentence = Sentence(["hello"]) - embedded_dummy = self.embed(dummy_sentence) - self.__embedding_length: int = len(embedded_dummy[0].get_token(1).get_embedding()) - - @property - def embedding_length(self) -> int: - return self.__embedding_length - - def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: - self.model.to(flair.device) - self.model.eval() - - sentences = _get_transformer_sentence_embeddings( - sentences=sentences, - tokenizer=self.tokenizer, - model=self.model, - name=self.name, - layers=self.layers, - pooling_operation=self.pooling_operation, - 
use_scalar_mix=self.use_scalar_mix, - ) - - return sentences - - def extra_repr(self): - return f"model={self.name}" - - def __str__(self) -> str: - return self.name - - -class OpenAIGPT2Embeddings(TokenEmbeddings): - @deprecated( - version="0.4.5", - reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings", - ) - def __init__( - self, - pretrained_model_name_or_path: str = "gpt2-medium", - layers: str = "1", - pooling_operation: str = "first_last", - use_scalar_mix: bool = False, - ) -> None: - """OpenAI GPT-2 embeddings, as proposed in Radford et al. 2019. - :param pretrained_model_name_or_path: name or path of OpenAI GPT-2 model - :param layers: comma-separated list of layers - :param pooling_operation: defines pooling operation for subwords - :param use_scalar_mix: defines the usage of scalar mix for specified layer(s). - """ - super().__init__() - - self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path) - self.model = GPT2Model.from_pretrained( - pretrained_model_name_or_path=pretrained_model_name_or_path, - output_hidden_states=True, - ) - self.name = pretrained_model_name_or_path - self.layers: List[int] = [int(layer) for layer in layers.split(",")] - self.pooling_operation = pooling_operation - self.use_scalar_mix = use_scalar_mix - self.static_embeddings = True - - dummy_sentence: Sentence = Sentence(["hello"]) - embedded_dummy = self.embed(dummy_sentence) - self.__embedding_length: int = len(embedded_dummy[0].get_token(1).get_embedding()) - - @property - def embedding_length(self) -> int: - return self.__embedding_length - - def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: - self.model.to(flair.device) - self.model.eval() - - sentences = _get_transformer_sentence_embeddings( - sentences=sentences, - tokenizer=self.tokenizer, - model=self.model, - name=self.name, - layers=self.layers, - pooling_operation=self.pooling_operation, - use_scalar_mix=self.use_scalar_mix, - 
bos_token="<|endoftext|>", - eos_token="<|endoftext|>", - ) - - return sentences - - -class RoBERTaEmbeddings(TokenEmbeddings): - @deprecated( - version="0.4.5", - reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings", - ) - def __init__( - self, - pretrained_model_name_or_path: str = "roberta-base", - layers: str = "-1", - pooling_operation: str = "first", - use_scalar_mix: bool = False, - ) -> None: - """RoBERTa, as proposed by Liu et al. 2019. - :param pretrained_model_name_or_path: name or path of RoBERTa model - :param layers: comma-separated list of layers - :param pooling_operation: defines pooling operation for subwords - :param use_scalar_mix: defines the usage of scalar mix for specified layer(s). - """ - super().__init__() - - self.tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path) - self.model = RobertaModel.from_pretrained( - pretrained_model_name_or_path=pretrained_model_name_or_path, - output_hidden_states=True, - ) - self.name = pretrained_model_name_or_path - self.layers: List[int] = [int(layer) for layer in layers.split(",")] - self.pooling_operation = pooling_operation - self.use_scalar_mix = use_scalar_mix - self.static_embeddings = True - - dummy_sentence: Sentence = Sentence(["hello"]) - embedded_dummy = self.embed(dummy_sentence) - self.__embedding_length: int = len(embedded_dummy[0].get_token(1).get_embedding()) - - @property - def embedding_length(self) -> int: - return self.__embedding_length - - def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: - self.model.to(flair.device) - self.model.eval() - - sentences = _get_transformer_sentence_embeddings( - sentences=sentences, - tokenizer=self.tokenizer, - model=self.model, - name=self.name, - layers=self.layers, - pooling_operation=self.pooling_operation, - use_scalar_mix=self.use_scalar_mix, - bos_token="", - eos_token="", - ) - - return sentences - - -class CamembertEmbeddings(TokenEmbeddings): - @deprecated( 
- version="0.4.5", - reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings", - ) - def __init__( - self, - pretrained_model_name_or_path: str = "camembert-base", - layers: str = "-1", - pooling_operation: str = "first", - use_scalar_mix: bool = False, - ) -> None: - """CamemBERT, a Tasty French Language Model, as proposed by Martin et al. 2019. - :param pretrained_model_name_or_path: name or path of RoBERTa model - :param layers: comma-separated list of layers - :param pooling_operation: defines pooling operation for subwords - :param use_scalar_mix: defines the usage of scalar mix for specified layer(s). - """ - super().__init__() - - self.tokenizer = CamembertTokenizer.from_pretrained(pretrained_model_name_or_path) - self.model = CamembertModel.from_pretrained( - pretrained_model_name_or_path=pretrained_model_name_or_path, - output_hidden_states=True, - ) - self.name = pretrained_model_name_or_path - self.layers: List[int] = [int(layer) for layer in layers.split(",")] - self.pooling_operation = pooling_operation - self.use_scalar_mix = use_scalar_mix - self.static_embeddings = True - - dummy_sentence: Sentence = Sentence(["hello"]) - embedded_dummy = self.embed(dummy_sentence) - self.__embedding_length: int = len(embedded_dummy[0].get_token(1).get_embedding()) - - def __getstate__(self): - state = self.__dict__.copy() - state["tokenizer"] = None - return state - - def __setstate__(self, d): - super().__setstate__(d) - - # 1-camembert-base -> camembert-base - if any(char.isdigit() for char in self.name): - self.tokenizer = CamembertTokenizer.from_pretrained("-".join(self.name.split("-")[1:])) - else: - self.tokenizer = CamembertTokenizer.from_pretrained(self.name) - - @property - def embedding_length(self) -> int: - return self.__embedding_length - - def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: - self.model.to(flair.device) - self.model.eval() - - sentences = _get_transformer_sentence_embeddings( - 
sentences=sentences, - tokenizer=self.tokenizer, - model=self.model, - name=self.name, - layers=self.layers, - pooling_operation=self.pooling_operation, - use_scalar_mix=self.use_scalar_mix, - bos_token="", - eos_token="", - ) - - return sentences - - -class XLMRobertaEmbeddings(TokenEmbeddings): - @deprecated( - version="0.4.5", - reason="Use 'TransformerWordEmbeddings' for all transformer-based word embeddings", - ) - def __init__( - self, - pretrained_model_name_or_path: str = "xlm-roberta-large", - layers: str = "-1", - pooling_operation: str = "first", - use_scalar_mix: bool = False, - ) -> None: - """XLM-RoBERTa as proposed by Conneau et al. 2019. - :param pretrained_model_name_or_path: name or path of XLM-R model - :param layers: comma-separated list of layers - :param pooling_operation: defines pooling operation for subwords - :param use_scalar_mix: defines the usage of scalar mix for specified layer(s). - """ - super().__init__() - - self.tokenizer = XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path) - self.model = XLMRobertaModel.from_pretrained( - pretrained_model_name_or_path=pretrained_model_name_or_path, - output_hidden_states=True, - ) - self.name = pretrained_model_name_or_path - self.layers: List[int] = [int(layer) for layer in layers.split(",")] - self.pooling_operation = pooling_operation - self.use_scalar_mix = use_scalar_mix - self.static_embeddings = True - - dummy_sentence: Sentence = Sentence(["hello"]) - embedded_dummy = self.embed(dummy_sentence) - self.__embedding_length: int = len(embedded_dummy[0].get_token(1).get_embedding()) - - def __getstate__(self): - state = self.__dict__.copy() - state["tokenizer"] = None - return state - - def __setstate__(self, d): - super().__setstate__(d) - - # 1-xlm-roberta-large -> xlm-roberta-large - self.tokenizer = self.tokenizer = XLMRobertaTokenizer.from_pretrained("-".join(self.name.split("-")[1:])) - - @property - def embedding_length(self) -> int: - return self.__embedding_length - - 
def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: - self.model.to(flair.device) - self.model.eval() - - sentences = _get_transformer_sentence_embeddings( - sentences=sentences, - tokenizer=self.tokenizer, - model=self.model, - name=self.name, - layers=self.layers, - pooling_operation=self.pooling_operation, - use_scalar_mix=self.use_scalar_mix, - bos_token="", - eos_token="", - ) - - return sentences - - -def _extract_embeddings( - hidden_states: List[torch.FloatTensor], - layers: List[int], - pooling_operation: str, - subword_start_idx: int, - subword_end_idx: int, - use_scalar_mix: bool = False, -) -> List[torch.FloatTensor]: - """Extracts subword embeddings from specified layers from hidden states. - :param hidden_states: list of hidden states from model - :param layers: list of layers - :param pooling_operation: pooling operation for subword embeddings (supported: first, last, first_last and mean) - :param subword_start_idx: defines start index for subword - :param subword_end_idx: defines end index for subword - :param use_scalar_mix: determines, if scalar mix should be used - :return: list of extracted subword embeddings. 
- """ - subtoken_embeddings: List[torch.FloatTensor] = [] - - for layer in layers: - current_embeddings = hidden_states[layer][0][subword_start_idx:subword_end_idx] - - first_embedding: torch.FloatTensor = current_embeddings[0] - if pooling_operation == "first_last": - last_embedding: torch.FloatTensor = current_embeddings[-1] - final_embedding: torch.FloatTensor = torch.cat([first_embedding, last_embedding]) - elif pooling_operation == "last": - final_embedding: torch.FloatTensor = current_embeddings[-1] - elif pooling_operation == "mean": - all_embeddings: List[torch.FloatTensor] = [embedding.unsqueeze(0) for embedding in current_embeddings] - final_embedding: torch.FloatTensor = torch.mean(torch.cat(all_embeddings, dim=0), dim=0) - else: - final_embedding: torch.FloatTensor = first_embedding - - subtoken_embeddings.append(final_embedding) - - if use_scalar_mix: - sm = ScalarMix(mixture_size=len(subtoken_embeddings)) - sm_embeddings = sm(subtoken_embeddings) - - subtoken_embeddings = [sm_embeddings] - - return subtoken_embeddings - - -def _build_token_subwords_mapping(sentence: Sentence, tokenizer: PreTrainedTokenizer) -> Tuple[Dict[int, int], str]: - """Builds a dictionary that stores the following information: - Token index (key) and number of corresponding subwords (value) for a sentence. 
- - :param sentence: input sentence - :param tokenizer: Transformers tokenization object - :return: dictionary of token index to corresponding number of subwords, tokenized string - """ - token_subwords_mapping: Dict[int, int] = {} - - tokens = [] - - for token in sentence.tokens: - token_text = token.text - - subwords = tokenizer.tokenize(token_text) - - tokens.append(token.text if subwords else tokenizer.unk_token) - - token_subwords_mapping[token.idx] = len(subwords) if subwords else 1 - - return token_subwords_mapping, " ".join(tokens) - - -def _build_token_subwords_mapping_gpt2( - sentence: Sentence, tokenizer: PreTrainedTokenizer -) -> Tuple[Dict[int, int], str]: - """Builds a dictionary that stores the following information: - Token index (key) and number of corresponding subwords (value) for a sentence. - - :param sentence: input sentence - :param tokenizer: Transformers tokenization object - :return: dictionary of token index to corresponding number of subwords, tokenized string - """ - token_subwords_mapping: Dict[int, int] = {} - - tokens = [] - - for token in sentence.tokens: - # Dummy token is needed to get the actually token tokenized correctly with special ``Ġ`` symbol - - if token.idx == 1: - token_text = token.text - subwords = tokenizer.tokenize(token_text) - else: - token_text = "X " + token.text - subwords = tokenizer.tokenize(token_text)[1:] - - tokens.append(token.text if subwords else tokenizer.unk_token) - - token_subwords_mapping[token.idx] = len(subwords) if subwords else 1 - - return token_subwords_mapping, " ".join(tokens) - - -def _get_transformer_sentence_embeddings( - sentences: List[Sentence], - tokenizer: PreTrainedTokenizer, - model: PreTrainedModel, - name: str, - layers: List[int], - pooling_operation: str, - use_scalar_mix: bool, - bos_token: Optional[str] = None, - eos_token: Optional[str] = None, -) -> List[Sentence]: - """Builds sentence embeddings for Transformer-based architectures. 
- :param sentences: input sentences - :param tokenizer: tokenization object - :param model: model object - :param name: name of the Transformer-based model - :param layers: list of layers - :param pooling_operation: defines pooling operation for subword extraction - :param use_scalar_mix: defines the usage of scalar mix for specified layer(s) - :param bos_token: defines begin of sentence token (used for left padding) - :param eos_token: defines end of sentence token (used for right padding) - :return: list of sentences (each token of a sentence is now embedded). - """ - with torch.no_grad(): - for sentence in sentences: - token_subwords_mapping: Dict[int, int] = {} - - if ("gpt2" in name or "roberta" in name) and "xlm" not in name: - ( - token_subwords_mapping, - tokenized_string, - ) = _build_token_subwords_mapping_gpt2(sentence=sentence, tokenizer=tokenizer) - else: - ( - token_subwords_mapping, - tokenized_string, - ) = _build_token_subwords_mapping(sentence=sentence, tokenizer=tokenizer) - - subwords = tokenizer.tokenize(tokenized_string) - - offset = 0 - - if bos_token: - subwords = [bos_token, *subwords] - offset = 1 - - if eos_token: - subwords = [*subwords, eos_token] - - indexed_tokens = tokenizer.convert_tokens_to_ids(subwords) - tokens_tensor = torch.tensor([indexed_tokens]) - tokens_tensor = tokens_tensor.to(flair.device) - - hidden_states = model(tokens_tensor)[-1] - - for token in sentence.tokens: - len_subwords = token_subwords_mapping[token.idx] - - subtoken_embeddings = _extract_embeddings( - hidden_states=hidden_states, - layers=layers, - pooling_operation=pooling_operation, - subword_start_idx=offset, - subword_end_idx=offset + len_subwords, - use_scalar_mix=use_scalar_mix, - ) - - offset += len_subwords - - final_subtoken_embedding = torch.cat(subtoken_embeddings) - token.set_embedding(name, final_subtoken_embedding) - - return sentences - - -class BertEmbeddings(TokenEmbeddings): - @deprecated( - version="0.4.5", - reason="Use 
'TransformerWordEmbeddings' for all transformer-based word embeddings", - ) - def __init__( - self, - bert_model_or_path: str = "bert-base-uncased", - layers: str = "-1,-2,-3,-4", - pooling_operation: str = "first", - use_scalar_mix: bool = False, - ) -> None: - """Bidirectional transformer embeddings of words, as proposed in Devlin et al., 2018. - :param bert_model_or_path: name of BERT model ('') or directory path containing custom model, configuration file - and vocab file (names of three files should be - config.json, pytorch_model.bin/model.chkpt, vocab.txt) - :param layers: string indicating which layers to take for embedding - :param pooling_operation: how to get from token piece embeddings to token embedding. Either pool them and take - the average ('mean') or use first word piece embedding as token embedding ('first). - """ - super().__init__() - - if "distilbert" in bert_model_or_path: - try: - from transformers import DistilBertModel, DistilBertTokenizer - except ImportError: - log.warning("-" * 100) - log.warning("ATTENTION! 
To use DistilBert, please first install a recent version of transformers!") - log.warning("-" * 100) - pass - - self.tokenizer = DistilBertTokenizer.from_pretrained(bert_model_or_path) - self.model = DistilBertModel.from_pretrained( - pretrained_model_name_or_path=bert_model_or_path, - output_hidden_states=True, - ) - elif "albert" in bert_model_or_path: - self.tokenizer = AlbertTokenizer.from_pretrained(bert_model_or_path) - self.model = AlbertModel.from_pretrained( - pretrained_model_name_or_path=bert_model_or_path, - output_hidden_states=True, - ) - else: - self.tokenizer = BertTokenizer.from_pretrained(bert_model_or_path) - self.model = BertModel.from_pretrained( - pretrained_model_name_or_path=bert_model_or_path, - output_hidden_states=True, - ) - self.layer_indexes = [int(x) for x in layers.split(",")] - self.pooling_operation = pooling_operation - self.use_scalar_mix = use_scalar_mix - self.name = str(bert_model_or_path) - self.static_embeddings = True - - class BertInputFeatures: - """Private helper class for holding BERT-formatted features.""" - - def __init__( - self, - unique_id, - tokens, - input_ids, - input_mask, - input_type_ids, - token_subtoken_count, - ) -> None: - self.unique_id = unique_id - self.tokens = tokens - self.input_ids = input_ids - self.input_mask = input_mask - self.input_type_ids = input_type_ids - self.token_subtoken_count = token_subtoken_count - - def _convert_sentences_to_features(self, sentences, max_sequence_length: int) -> [BertInputFeatures]: - max_sequence_length = max_sequence_length + 2 - - features: List[BertEmbeddings.BertInputFeatures] = [] - for sentence_index, sentence in enumerate(sentences): - bert_tokenization: List[str] = [] - token_subtoken_count: Dict[int, int] = {} - - for token in sentence: - subtokens = self.tokenizer.tokenize(token.text) - bert_tokenization.extend(subtokens) - token_subtoken_count[token.idx] = len(subtokens) - - if len(bert_tokenization) > max_sequence_length - 2: - bert_tokenization = 
bert_tokenization[0 : (max_sequence_length - 2)] - - tokens = [] - input_type_ids = [] - tokens.append("[CLS]") - input_type_ids.append(0) - for token in bert_tokenization: - tokens.append(token) - input_type_ids.append(0) - tokens.append("[SEP]") - input_type_ids.append(0) - - input_ids = self.tokenizer.convert_tokens_to_ids(tokens) - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. - while len(input_ids) < max_sequence_length: - input_ids.append(0) - input_mask.append(0) - input_type_ids.append(0) - - features.append( - BertEmbeddings.BertInputFeatures( - unique_id=sentence_index, - tokens=tokens, - input_ids=input_ids, - input_mask=input_mask, - input_type_ids=input_type_ids, - token_subtoken_count=token_subtoken_count, - ) - ) - - return features - - def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: - """Add embeddings to all words in a list of sentences. If embeddings are already added, - updates only if embeddings are non-static. 
- """ - # first, find longest sentence in batch - longest_sentence_in_batch: int = len( - max( - [self.tokenizer.tokenize(sentence.to_tokenized_string()) for sentence in sentences], - key=len, - ) - ) - - # prepare id maps for BERT model - features = self._convert_sentences_to_features(sentences, longest_sentence_in_batch) - all_input_ids = torch.LongTensor([f.input_ids for f in features]).to(flair.device) - all_input_masks = torch.LongTensor([f.input_mask for f in features]).to(flair.device) - - # put encoded batch through BERT model to get all hidden states of all encoder layers - self.model.to(flair.device) - self.model.eval() - all_encoder_layers = self.model(all_input_ids, attention_mask=all_input_masks)[-1] - - with torch.no_grad(): - for sentence_index, sentence in enumerate(sentences): - feature = features[sentence_index] - - # get aggregated embeddings for each BERT-subtoken in sentence - subtoken_embeddings = [] - for token_index, _ in enumerate(feature.tokens): - all_layers = [] - for layer_index in self.layer_indexes: - layer_output = all_encoder_layers[int(layer_index)][sentence_index] - all_layers.append(layer_output[token_index]) - - if self.use_scalar_mix: - sm = ScalarMix(mixture_size=len(all_layers)) - sm_embeddings = sm(all_layers) - all_layers = [sm_embeddings] - - subtoken_embeddings.append(torch.cat(all_layers)) - - # get the current sentence object - token_idx = 0 - for token in sentence: - # add concatenated embedding to sentence - token_idx += 1 - - if self.pooling_operation == "first": - # use first subword embedding if pooling operation is 'first' - token.set_embedding(self.name, subtoken_embeddings[token_idx]) - else: - # otherwise, do a mean over all subwords in token - embeddings = subtoken_embeddings[ - token_idx : token_idx + feature.token_subtoken_count[token.idx] - ] - embeddings = [embedding.unsqueeze(0) for embedding in embeddings] - mean = torch.mean(torch.cat(embeddings, dim=0), dim=0) - token.set_embedding(self.name, mean) - - 
token_idx += feature.token_subtoken_count[token.idx] - 1 - - return sentences - - @property - @abstractmethod - def embedding_length(self) -> int: - """Returns the length of the embedding vector.""" - return ( - len(self.layer_indexes) * self.model.config.hidden_size - if not self.use_scalar_mix - else self.model.config.hidden_size - ) - - class DocumentMeanEmbeddings(DocumentEmbeddings): @deprecated( version="0.3.1", @@ -1512,81 +670,3 @@ def embed(self, sentences: Union[List[Sentence], Sentence]): def _add_embeddings_internal(self, sentences: List[Sentence]): pass - - -class ELMoTransformerEmbeddings(TokenEmbeddings): - """Contextual word embeddings using word-level Transformer-based LM, as proposed in Peters et al., 2018.""" - - @deprecated( - version="0.4.2", - reason="Not possible to load or save ELMo Transformer models. @stefan-it is working on it.", - ) - def __init__(self, model_file: str) -> None: - super().__init__() - - try: - from allennlp.data.token_indexers.elmo_indexer import ( - ELMoTokenCharactersIndexer, - ) - from allennlp.modules.token_embedders.bidirectional_language_model_token_embedder import ( - BidirectionalLanguageModelTokenEmbedder, - ) - except ModuleNotFoundError: - log.warning("-" * 100) - log.warning('ATTENTION! 
The library "allennlp" is not installed!') - log.warning( - "To use ELMoTransformerEmbeddings, please first install a recent version from https://github.com/allenai/allennlp" - ) - log.warning("-" * 100) - pass - - self.name = "elmo-transformer" - self.static_embeddings = True - self.lm_embedder = BidirectionalLanguageModelTokenEmbedder( - archive_file=model_file, - dropout=0.2, - bos_eos_tokens=("", ""), - remove_bos_eos=True, - requires_grad=False, - ) - self.lm_embedder = self.lm_embedder.to(device=flair.device) - self.vocab = self.lm_embedder._lm.vocab - self.indexer = ELMoTokenCharactersIndexer() - - # embed a dummy sentence to determine embedding_length - dummy_sentence: Sentence = Sentence(["hello"]) - embedded_dummy = self.embed(dummy_sentence) - self.__embedding_length: int = len(embedded_dummy[0].get_token(1).get_embedding()) - - @property - def embedding_length(self) -> int: - return self.__embedding_length - - def _add_embeddings_internal(self, sentences: List[Sentence]) -> List[Sentence]: - # Avoid conflicts with flair's Token class - import allennlp.data.tokenizers.token as allen_nlp_token - - indexer = self.indexer - vocab = self.vocab - - for sentence in sentences: - character_indices = indexer.tokens_to_indices( - [allen_nlp_token.Token(token.text) for token in sentence], vocab, "elmo" - )["elmo"] - - indices_tensor = torch.LongTensor([character_indices]) - indices_tensor = indices_tensor.to(device=flair.device) - embeddings = self.lm_embedder(indices_tensor)[0].detach().cpu().numpy() - - for token, token_idx in zip(sentence.tokens, range(len(sentence.tokens))): - embedding = embeddings[token_idx] - word_embedding = torch.FloatTensor(embedding) - token.set_embedding(self.name, word_embedding) - - return sentences - - def extra_repr(self): - return f"model={self.name}" - - def __str__(self) -> str: - return self.name From 7f3b0f1731de5be03bf4ae1e2b257d1b909a5136 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Sat, 29 Apr 2023 22:00:41 +0200 Subject: 
[PATCH 006/124] Adapt LabelVerbalizer so that it also works for non-BIOES span labes --- flair/nn/decoder.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/flair/nn/decoder.py b/flair/nn/decoder.py index c44076f02..65f802148 100644 --- a/flair/nn/decoder.py +++ b/flair/nn/decoder.py @@ -174,6 +174,7 @@ def verbalize_labels(label_dictionary: Dictionary) -> List[Sentence]: for byte_label, idx in label_dictionary.item2idx.items(): str_label = byte_label.decode("utf-8") if label_dictionary.span_labels: + # verbalize BIOES labels if str_label == "O": verbalized_labels.append("outside") elif str_label.startswith("B-"): @@ -184,6 +185,9 @@ def verbalize_labels(label_dictionary: Dictionary) -> List[Sentence]: verbalized_labels.append("ending " + str_label.split("-")[1]) elif str_label.startswith("S-"): verbalized_labels.append("single " + str_label.split("-")[1]) + # if label is not BIOES, use label itself + else: + verbalized_labels.append(str_label) else: verbalized_labels.append(str_label) return list(map(Sentence, verbalized_labels)) From 00d9d49c472e1fabba8b692f4e8542198e9c3dff Mon Sep 17 00:00:00 2001 From: Max Ploner Date: Tue, 16 May 2023 16:30:28 +0200 Subject: [PATCH 007/124] Made gradient clipping optional & max gradient norm variable --- flair/trainers/trainer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index c8ba7f09d..47ff6bd51 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -294,6 +294,7 @@ def train_custom( optimizer: Type[torch.optim.Optimizer] = SGD, train_with_dev: bool = False, train_with_test: bool = False, + max_grad_norm: Optional[float] = 5.0, # evaluation and monitoring main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"), monitor_test: bool = False, @@ -335,6 +336,8 @@ def train_custom( optimizer: The optimizer to use (typically SGD or Adam) train_with_dev (bool): If True, the data from dev split is added to the 
training data train_with_test (bool): If True, the data from test split is added to the training data + max_grad_norm (Optional[float]): If not None, gradients are clipped to this value before an optimizer.step is + called. main_evaluation_metric: The metric to optimize (often micro-average or macro-average F1-score, or accuracy) monitor_test (bool): If True, test data is evaluated at end of each epoch monitor_train_sample: Set this to evaluate on a sample of the train data at the end of each epoch. @@ -584,7 +587,8 @@ def train_custom( self.dispatch("before_training_optimizer_step", **batch_kw) # do the optimizer step - torch.nn.utils.clip_grad_norm_(self.model.parameters(), 5.0) + if max_grad_norm is not None: + torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm) self.optimizer.step() if batch_train_samples > 0: From 0baf5470d51257962551e5d3d1514d170cfd435e Mon Sep 17 00:00:00 2001 From: Ikko Eltociear Ashimine Date: Fri, 19 May 2023 16:34:51 +0900 Subject: [PATCH 008/124] Update README.md HuggingFace -> Hugging Face --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8831cad77..6aea09151 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Flair ships with state-of-the-art models for a range of NLP tasks. For instance, | Spanish | Conll-03 (4-class) | **90.54** | *90.3 [(Yu et al., 2020)](https://www.aclweb.org/anthology/2020.acl-main.577.pdf)* | [Flair Spanish 4-class NER demo](https://huggingface.co/flair/ner-spanish-large) | Many Flair sequence tagging models (named entity recognition, part-of-speech tagging etc.) are also hosted -on the [__🤗 HuggingFace model hub__](https://huggingface.co/models?library=flair&sort=downloads)! You can browse models, check detailed information on how they were trained, and even try each model out online! +on the [__🤗 Hugging Face model hub__](https://huggingface.co/models?library=flair&sort=downloads)! 
You can browse models, check detailed information on how they were trained, and even try each model out online! ## Quick Start From 3fcf4474815a362fcd35eb6b3e065fb895daae88 Mon Sep 17 00:00:00 2001 From: Max Ploner Date: Thu, 25 May 2023 15:37:51 +0200 Subject: [PATCH 009/124] GH-3250: Save final model only if `save_final_model` is True (even when the training is interrupted). --- flair/trainers/trainer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index c8ba7f09d..180d9b477 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -703,8 +703,9 @@ def train_custom( self.dispatch("training_interrupt") # TODO: no plugin calls this event - log.info("Saving model ...") - self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) + if save_final_model: + log.info("Saving model ...") + self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) log.info("Done.") except TrainingInterrupt as exc: @@ -713,8 +714,9 @@ def train_custom( log_line(log) self.dispatch("training_interrupt") # TODO: no plugin calls this event - log.info("Saving model ...") - self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) + if save_final_model: + log.info("Saving model ...") + self.model.save(base_path / "final-model.pt", checkpoint=save_optimizer_state) log.info("Done.") except Exception: From 6dcb645adbea1f927ab11a912cf094b4bdb1c29b Mon Sep 17 00:00:00 2001 From: Max Ploner Date: Wed, 28 Jun 2023 12:11:14 +0200 Subject: [PATCH 010/124] GH3275: Log warning if test or dev splits are sampled from the train data --- flair/data.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/flair/data.py b/flair/data.py index 04e10a8dd..1c390eeff 100644 --- a/flair/data.py +++ b/flair/data.py @@ -1212,15 +1212,27 @@ def __init__( # sample test data from train if none is provided if test is None and 
sample_missing_splits and train and sample_missing_splits != "only_dev": + test_portion = 0.1 train_length = _len_dataset(train) - test_size: int = round(train_length / 10) + test_size: int = round(train_length * test_portion) test, train = randomly_split_into_two_datasets(train, test_size) + log.warning( + "No test split found. Using %.0f%% (i.e. %d samples) of the train split as test data", + test_portion, + test_size, + ) # sample dev data from train if none is provided if dev is None and sample_missing_splits and train and sample_missing_splits != "only_test": + dev_portion = 0.1 train_length = _len_dataset(train) - dev_size: int = round(train_length / 10) + dev_size: int = round(train_length * dev_portion) dev, train = randomly_split_into_two_datasets(train, dev_size) + log.warning( + "No dev split found. Using %.0f%% (i.e. %d samples) of the train split as dev data", + dev_portion, + dev_size, + ) # set train dev and test data self._train: Optional[Dataset[T_co]] = train From ce57a2f58903969e31f4093d77e5e4303b0a736a Mon Sep 17 00:00:00 2001 From: Max Ploner Date: Wed, 28 Jun 2023 12:17:09 +0200 Subject: [PATCH 011/124] GH3275: Add sample_missing_splits to SST-2 --- flair/datasets/document_classification.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py index 0f4c7cd62..58ee3ecf6 100644 --- a/flair/datasets/document_classification.py +++ b/flair/datasets/document_classification.py @@ -324,6 +324,7 @@ def __init__( skip_header: bool = False, encoding: str = "utf-8", no_class_label=None, + sample_missing_splits: Union[bool, str] = True, **fmtparams, ) -> None: """Instantiates a Corpus for text classification from CSV column formatted data. 
@@ -396,7 +397,7 @@ def __init__( else None ) - super().__init__(train, dev, test, name=name) + super().__init__(train, dev, test, name=name, sample_missing_splits=sample_missing_splits) class CSVClassificationDataset(FlairDataset): @@ -1488,6 +1489,7 @@ def __init__( tokenizer: Tokenizer = SegtokTokenizer(), in_memory: bool = False, encoding: str = "utf-8", + sample_missing_splits: bool = True, **datasetargs, ) -> None: base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -1514,6 +1516,7 @@ def __init__( tokenizer=tokenizer, in_memory=in_memory, encoding=encoding, + sample_missing_splits=sample_missing_splits, skip_header=True, **datasetargs, ) From b24313ade216bb2db17d08656f03f7f391331243 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Fri, 28 Jul 2023 13:38:15 +0200 Subject: [PATCH 012/124] datasets: add support for recently introduced dataset split for German LER dataset --- flair/datasets/sequence_labeling.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 6ed657d9c..bc5b51315 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -2577,7 +2577,6 @@ def __init__( :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. 
- :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -2591,13 +2590,17 @@ def __init__( # download data if necessary ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" - cached_path(f"{ler_path}ler.conll", Path("datasets") / dataset_name) + + for split in ["train", "dev", "test"]: + cached_path(f"{ler_path}ler_{split}.conll", Path("datasets") / dataset_name) super().__init__( data_folder, columns, in_memory=in_memory, - train_file="ler.conll", + train_file="ler_train.conll", + dev_file="ler_dev.conll", + test_file="ler_test.conll", **corpusargs, ) From f9aa527380f16003b757ea193a618502b36779f9 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Fri, 28 Jul 2023 13:38:44 +0200 Subject: [PATCH 013/124] tests: add checks for German LER dataset --- tests/test_datasets.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 91a1742f8..3837f3bf8 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -799,6 +799,15 @@ def check_number_sentences(reference: int, actual: int, split_name: str): check_number_sentences(len(corpus.dev), stats["dev"], "dev") +def test_german_ler_corpus(tasks_base_path): + corpus = flair.datasets.NER_GERMAN_LEGAL() + + # Number of instances per dataset split are taken from https://huggingface.co/datasets/elenanereiss/german-ler + assert len(corpus.train) == 53384, "Mismatch in number of sentences for train split" + assert len(corpus.dev) == 6666, "Mismatch in number of sentences for train split" + assert len(corpus.test) == 6673, "Mismatch in number of sentences for train split" + + def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path): corpus = MultiFileJsonlCorpus( train_files=[tasks_base_path / "jsonl/train.jsonl"], From 08424191e14be0fabc15fab688638743f9f06c5c Mon Sep 
17 00:00:00 2001 From: Stefan Schweter Date: Fri, 28 Jul 2023 13:46:20 +0200 Subject: [PATCH 014/124] test: fix assertion message --- tests/test_datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 3837f3bf8..8ff4b0539 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -804,8 +804,8 @@ def test_german_ler_corpus(tasks_base_path): # Number of instances per dataset split are taken from https://huggingface.co/datasets/elenanereiss/german-ler assert len(corpus.train) == 53384, "Mismatch in number of sentences for train split" - assert len(corpus.dev) == 6666, "Mismatch in number of sentences for train split" - assert len(corpus.test) == 6673, "Mismatch in number of sentences for train split" + assert len(corpus.dev) == 6666, "Mismatch in number of sentences for dev split" + assert len(corpus.test) == 6673, "Mismatch in number of sentences for test split" def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path): From f1a51ab1b92ef5e6ac75756a88fff0f7fcb662e9 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Fri, 28 Jul 2023 14:45:41 +0200 Subject: [PATCH 015/124] datasets: fix code style --- flair/datasets/sequence_labeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index bc5b51315..8c34b991d 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -2592,7 +2592,7 @@ def __init__( ler_path = "https://raw.githubusercontent.com/elenanereiss/Legal-Entity-Recognition/master/data/" for split in ["train", "dev", "test"]: - cached_path(f"{ler_path}ler_{split}.conll", Path("datasets") / dataset_name) + cached_path(f"{ler_path}ler_{split}.conll", Path("datasets") / dataset_name) super().__init__( data_folder, From 72da117830a7dcf2de4c252eaf5353b1e65103c3 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 
09:16:44 +0200 Subject: [PATCH 016/124] recreate `to_dict` and add relations --- flair/data.py | 41 +++++++++++++++-------- flair/models/relation_classifier_model.py | 2 +- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/flair/data.py b/flair/data.py index ecbcb55ba..d73195e4f 100644 --- a/flair/data.py +++ b/flair/data.py @@ -604,6 +604,14 @@ def __len__(self) -> int: def embedding(self): return self.get_embedding() + def to_dict(self, tag_type: Optional[str] = None): + return { + "text": self.text, + "start_pos": self.start_position, + "end_pos": self.end_position, + "labels": [label.to_dict() for label in self.get_labels(tag_type)], + } + class Relation(_PartOfSentence): def __new__(self, first: Span, second: Span): @@ -664,6 +672,15 @@ def end_position(self) -> int: def embedding(self): pass + def to_dict(self, tag_type: Optional[str] = None): + return { + "from_text": self.first.text, + "to_text": self.second.text, + "from_idx": self.first.tokens[0].idx - 1, + "to_idx": self.second.tokens[0].idx - 1, + "labels": [label.to_dict() for label in self.get_labels(tag_type)], + } + class Sentence(DataPoint): """A Sentence is a list of tokens and is used to represent a sentence or text fragment.""" @@ -760,17 +777,17 @@ def __init__( def unlabeled_identifier(self): return f'Sentence[{len(self)}]: "{self.text}"' - def get_relations(self, type: str) -> List[Relation]: + def get_relations(self, label_type: Optional[str] = None) -> List[Relation]: relations: List[Relation] = [] - for label in self.get_labels(type): + for label in self.get_labels(label_type): if isinstance(label.data_point, Relation): relations.append(label.data_point) return relations - def get_spans(self, type: str) -> List[Span]: + def get_spans(self, label_type: Optional[str] = None) -> List[Span]: spans: List[Span] = [] for potential_span in self._known_spans.values(): - if isinstance(potential_span, Span) and potential_span.has_label(type): + if isinstance(potential_span, Span) and 
(label_type is None or potential_span.has_label(label_type)): spans.append(potential_span) return sorted(spans) @@ -937,16 +954,12 @@ def to_original_text(self) -> str: ).strip() def to_dict(self, tag_type: Optional[str] = None): - labels = [] - - if tag_type: - labels = [label.to_dict() for label in self.get_labels(tag_type)] - return {"text": self.to_original_text(), tag_type: labels} - - if self.labels: - labels = [label.to_dict() for label in self.labels] - - return {"text": self.to_original_text(), "all labels": labels} + return { + "text": self.to_original_text(), + "labels": [label.to_dict() for label in self.get_labels(tag_type) if label.data_point is self], + "entities": [span.to_dict() for span in self.get_spans(tag_type)], + "relations": [relation.to_dict() for relation in self.get_relations(tag_type)], + } def get_span(self, start: int, stop: int): span_slice = slice(start, stop) diff --git a/flair/models/relation_classifier_model.py b/flair/models/relation_classifier_model.py index 82fafa71f..43b7dc203 100644 --- a/flair/models/relation_classifier_model.py +++ b/flair/models/relation_classifier_model.py @@ -347,7 +347,7 @@ def _valid_entities(self, sentence: Sentence) -> Iterator[_Entity]: :return: Valid entities as `_Entity` """ for label_type, valid_labels in self.entity_label_types.items(): - for entity_span in sentence.get_spans(type=label_type): + for entity_span in sentence.get_spans(label_type=label_type): entity_label: Label = entity_span.get_label(label_type=label_type) # Only use entities labelled with the specified labels for each label type From 0e6c0b614afbb44501e8fa5837e02354804cb302 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 17 Jul 2023 13:29:23 +0200 Subject: [PATCH 017/124] add tokens --- flair/data.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/flair/data.py b/flair/data.py index d73195e4f..8779b48b6 100644 --- a/flair/data.py +++ b/flair/data.py @@ -543,6 +543,14 @@ def 
set_label(self, typename: str, value: str, score: float = 1.0): else: DataPoint.set_label(self, typename=typename, value=value, score=score) + def to_dict(self, tag_type: Optional[str] = None): + return { + "text": self.text, + "start_pos": self.start_position, + "end_pos": self.end_position, + "labels": [label.to_dict() for label in self.get_labels(tag_type)], + } + class Span(_PartOfSentence): """This class represents one textual span consisting of Tokens.""" @@ -957,8 +965,9 @@ def to_dict(self, tag_type: Optional[str] = None): return { "text": self.to_original_text(), "labels": [label.to_dict() for label in self.get_labels(tag_type) if label.data_point is self], - "entities": [span.to_dict() for span in self.get_spans(tag_type)], - "relations": [relation.to_dict() for relation in self.get_relations(tag_type)], + "entities": [span.to_dict(tag_type) for span in self.get_spans(tag_type)], + "relations": [relation.to_dict(tag_type) for relation in self.get_relations(tag_type)], + "tokens": [token.to_dict(tag_type) for token in self.tokens] } def get_span(self, start: int, stop: int): From 14d5a073ec68471c7b12bf213754d3284770f608 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 17 Jul 2023 16:34:37 +0200 Subject: [PATCH 018/124] black formatting and ruff fixes --- flair/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/data.py b/flair/data.py index 8779b48b6..182b12836 100644 --- a/flair/data.py +++ b/flair/data.py @@ -967,7 +967,7 @@ def to_dict(self, tag_type: Optional[str] = None): "labels": [label.to_dict() for label in self.get_labels(tag_type) if label.data_point is self], "entities": [span.to_dict(tag_type) for span in self.get_spans(tag_type)], "relations": [relation.to_dict(tag_type) for relation in self.get_relations(tag_type)], - "tokens": [token.to_dict(tag_type) for token in self.tokens] + "tokens": [token.to_dict(tag_type) for token in self.tokens], } def get_span(self, start: int, stop: int): From 
21e6ade1143fe6563d3a89ca83c9de0779946542 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Tue, 8 Aug 2023 17:10:23 +0200 Subject: [PATCH 019/124] Ruff --- flair/trainers/trainer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index ac866fc48..44b7ef865 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -345,9 +345,9 @@ def train_custom( monitor_test (bool): If True, test data is evaluated at end of each epoch monitor_train_sample: Set this to evaluate on a sample of the train data at the end of each epoch. If you set an int, it will sample this many sentences to evaluate on. If you set a float, it will sample - a percentage of data points from train. + a percentage of data points from train. max_grad_norm (Optional[float]): If not None, gradients are clipped to this value before an optimizer.step is - called. + called. use_final_model_for_eval (bool): If True, the final model is used for the final evaluation. If False, the model from the best epoch as determined by main_evaluation_metric is used for the final evaluation. 
gold_label_dictionary_for_eval: Set to force evaluation to use a particular label dictionary @@ -596,7 +596,7 @@ def train_custom( self.dispatch("before_training_optimizer_step", **batch_kw) # do the optimizer step - scaler.unscale_(self.optimizer) + scaler.unscale_(self.optimizer) if max_grad_norm is not None: torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_grad_norm) scale_before = scaler.get_scale() From d52bb3fda429b2ee5d6a86fda10d1da2586612e6 Mon Sep 17 00:00:00 2001 From: Max Ploner Date: Wed, 9 Aug 2023 09:37:50 +0200 Subject: [PATCH 020/124] GH3275: Fixed error in previous commit --- flair/datasets/document_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py index 58ee3ecf6..32fb5e64c 100644 --- a/flair/datasets/document_classification.py +++ b/flair/datasets/document_classification.py @@ -1516,7 +1516,6 @@ def __init__( tokenizer=tokenizer, in_memory=in_memory, encoding=encoding, - sample_missing_splits=sample_missing_splits, skip_header=True, **datasetargs, ) @@ -1528,6 +1527,7 @@ def __init__( column_name_map={0: "text", 1: "label"}, train_file=train_file, dev_file=data_folder / "dev.tsv", + sample_missing_splits=sample_missing_splits, **kwargs, ) From 909f1c2d073c14dfc3272e700609926b9fcbe898 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 9 Aug 2023 11:47:10 +0200 Subject: [PATCH 021/124] Ruff --- flair/data.py | 4 ++-- flair/embeddings/base.py | 2 +- flair/embeddings/image.py | 2 +- flair/embeddings/legacy.py | 2 +- flair/inference_utils.py | 4 ++-- flair/tokenization.py | 6 +++--- flair/training_utils.py | 9 ++------- 7 files changed, 12 insertions(+), 17 deletions(-) diff --git a/flair/data.py b/flair/data.py index 1c390eeff..7e3a49785 100644 --- a/flair/data.py +++ b/flair/data.py @@ -143,7 +143,7 @@ def save(self, savefile): def __setstate__(self, d): self.__dict__ = d # set 'add_unk' if the dictionary was created 
with a version of Flair older than 0.9 - if "add_unk" not in self.__dict__.keys(): + if "add_unk" not in self.__dict__: self.__dict__["add_unk"] = b"" in self.__dict__["idx2item"] @classmethod @@ -1687,7 +1687,7 @@ def __str__(self) -> str: f"{len(self.dev) if self.dev else 0} dev + " f"{len(self.test) if self.test else 0} test sentences\n - " ) - output += "\n - ".join([f"{type(corpus).__name__} {str(corpus)} - {corpus.name}" for corpus in self.corpora]) + output += "\n - ".join([f"{type(corpus).__name__} {corpus!s} - {corpus.name}" for corpus in self.corpora]) return output diff --git a/flair/embeddings/base.py b/flair/embeddings/base.py index a9521142b..bf3e7645a 100644 --- a/flair/embeddings/base.py +++ b/flair/embeddings/base.py @@ -194,7 +194,7 @@ def embedding_type(self) -> str: def _everything_embedded(self, data_points: Sequence[Sentence]) -> bool: for sentence in data_points: for token in sentence.tokens: - if self.name not in token._embeddings.keys(): + if self.name not in token._embeddings: return False return True diff --git a/flair/embeddings/image.py b/flair/embeddings/image.py index 6a14a0e00..faf3a78b1 100644 --- a/flair/embeddings/image.py +++ b/flair/embeddings/image.py @@ -73,7 +73,7 @@ class PrecomputedImageEmbeddings(ImageEmbeddings): def __init__(self, url2tensor_dict, name) -> None: self.url2tensor_dict = url2tensor_dict self.name = name - self.__embedding_length = len(list(self.url2tensor_dict.values())[0]) + self.__embedding_length = len(next(iter(self.url2tensor_dict.values()))) self.static_embeddings = True super().__init__() diff --git a/flair/embeddings/legacy.py b/flair/embeddings/legacy.py index 39275a16a..237bf6aff 100644 --- a/flair/embeddings/legacy.py +++ b/flair/embeddings/legacy.py @@ -1330,7 +1330,7 @@ def embed(self, sentences: Union[List[Sentence], Sentence]): sentences = [sentences] for sentence in sentences: - if self.name not in sentence._embeddings.keys(): + if self.name not in sentence._embeddings: everything_embedded = 
False if not everything_embedded: diff --git a/flair/inference_utils.py b/flair/inference_utils.py index e96e16da1..035025c00 100644 --- a/flair/inference_utils.py +++ b/flair/inference_utils.py @@ -177,7 +177,7 @@ def __init__(self, embedding, verbose) -> None: self.k = len(result[0]) - 1 return except sqlite3.Error as err: - logger.exception(f"Fail to open sqlite database {self.store_path!s}: {str(err)}") + logger.exception(f"Fail to open sqlite database {self.store_path!s}: {err!s}") # otherwise, push embedding to database if hasattr(embedding, "precomputed_word_embeddings"): self.db = sqlite3.connect(str(self.store_path)) @@ -239,7 +239,7 @@ def __init__(self, embedding, verbose) -> None: cursor.close() return except lmdb.Error as err: - logger.exception(f"Fail to open lmdb database {self.store_path!s}: {str(err)}") + logger.exception(f"Fail to open lmdb database {self.store_path!s}: {err!s}") # create and load the database in write mode if hasattr(embedding, "precomputed_word_embeddings"): pwe = embedding.precomputed_word_embeddings diff --git a/flair/tokenization.py b/flair/tokenization.py index fe2ac33eb..af0601bad 100644 --- a/flair/tokenization.py +++ b/flair/tokenization.py @@ -256,9 +256,9 @@ def combined_rule_prefixes() -> List[str]: r"/", # want to split at every slash r"(?<=[0-9])[+\-\*^](?=[0-9-])", rf"(?<=[{char_classes.ALPHA_LOWER}])\.(?=[{char_classes.ALPHA_UPPER}])", - r"(?<=[{a}]),(?=[{a}])".format(a=char_classes.ALPHA), - r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=char_classes.ALPHA, h=char_classes.HYPHENS), - r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=char_classes.ALPHA), + fr"(?<=[{char_classes.ALPHA}]),(?=[{char_classes.ALPHA}])", + fr'(?<=[{char_classes.ALPHA}])[?";:=,.]*(?:{char_classes.HYPHENS})(?=[{char_classes.ALPHA}])', + fr"(?<=[{char_classes.ALPHA}0-9])[:<>=/](?=[{char_classes.ALPHA}])", ] ) diff --git a/flair/training_utils.py b/flair/training_utils.py index 51894386d..5c10ea63e 100644 --- a/flair/training_utils.py +++ 
b/flair/training_utils.py @@ -66,17 +66,12 @@ def micro_avg_f_score(self): return self.mean_squared_error() def to_tsv(self): - return "{}\t{}\t{}\t{}".format( - self.mean_squared_error(), - self.mean_absolute_error(), - self.pearsonr(), - self.spearmanr(), - ) + return f"{self.mean_squared_error()}\t{self.mean_absolute_error()}\t{self.pearsonr()}\t{self.spearmanr()}" @staticmethod def tsv_header(prefix=None): if prefix: - return "{0}_MEAN_SQUARED_ERROR\t{0}_MEAN_ABSOLUTE_ERROR\t{0}_PEARSON\t{0}_SPEARMAN".format(prefix) + return f"{prefix}_MEAN_SQUARED_ERROR\t{prefix}_MEAN_ABSOLUTE_ERROR\t{prefix}_PEARSON\t{prefix}_SPEARMAN" return "MEAN_SQUARED_ERROR\tMEAN_ABSOLUTE_ERROR\tPEARSON\tSPEARMAN" From 4ea71a0a6ac584d8f464166e153f5153c32ab8b9 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 9 Aug 2023 12:56:33 +0200 Subject: [PATCH 022/124] Ruff --- flair/tokenization.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flair/tokenization.py b/flair/tokenization.py index af0601bad..ab4c0d239 100644 --- a/flair/tokenization.py +++ b/flair/tokenization.py @@ -256,9 +256,9 @@ def combined_rule_prefixes() -> List[str]: r"/", # want to split at every slash r"(?<=[0-9])[+\-\*^](?=[0-9-])", rf"(?<=[{char_classes.ALPHA_LOWER}])\.(?=[{char_classes.ALPHA_UPPER}])", - fr"(?<=[{char_classes.ALPHA}]),(?=[{char_classes.ALPHA}])", - fr'(?<=[{char_classes.ALPHA}])[?";:=,.]*(?:{char_classes.HYPHENS})(?=[{char_classes.ALPHA}])', - fr"(?<=[{char_classes.ALPHA}0-9])[:<>=/](?=[{char_classes.ALPHA}])", + rf"(?<=[{char_classes.ALPHA}]),(?=[{char_classes.ALPHA}])", + rf'(?<=[{char_classes.ALPHA}])[?";:=,.]*(?:{char_classes.HYPHENS})(?=[{char_classes.ALPHA}])', + rf"(?<=[{char_classes.ALPHA}0-9])[:<>=/](?=[{char_classes.ALPHA}])", ] ) From a57020a470711053d3cc35a77cc44ffe310408c9 Mon Sep 17 00:00:00 2001 From: Alan Akbik Date: Wed, 9 Aug 2023 16:31:26 +0200 Subject: [PATCH 023/124] fix Ruff issues --- flair/data.py | 14 +++++++++----- flair/datasets/base.py | 2 +- 
flair/datasets/sequence_labeling.py | 12 ++++++------ flair/models/tars_model.py | 2 +- flair/tokenization.py | 6 +++--- flair/trainers/language_model_trainer.py | 4 ++-- flair/trainers/trainer.py | 6 +++--- flair/training_utils.py | 4 ++-- flair/visual/training_curves.py | 4 ++-- tests/model_test_utils.py | 2 +- tests/test_multitask.py | 2 +- 11 files changed, 31 insertions(+), 27 deletions(-) diff --git a/flair/data.py b/flair/data.py index 182b12836..4c8f4ace7 100644 --- a/flair/data.py +++ b/flair/data.py @@ -3,10 +3,10 @@ import re import typing from abc import ABC, abstractmethod -from collections import Counter, defaultdict, namedtuple +from collections import Counter, defaultdict from operator import itemgetter from pathlib import Path -from typing import Dict, Iterable, List, Optional, Union, cast +from typing import Dict, Iterable, List, NamedTuple, Optional, Union, cast import torch from deprecated import deprecated @@ -39,7 +39,11 @@ def _len_dataset(dataset: Optional[Dataset]) -> int: return len(loader) -BoundingBox = namedtuple("BoundingBox", ["left", "top", "right", "bottom"]) +class BoundingBox(NamedTuple): + left: str + top: int + right: int + bottom: int class Dictionary: @@ -727,7 +731,7 @@ def __init__( if isinstance(use_tokenizer, Tokenizer): tokenizer = use_tokenizer - elif type(use_tokenizer) == bool: + elif isinstance(use_tokenizer, bool): tokenizer = SegtokTokenizer() if use_tokenizer else SpaceTokenizer() else: @@ -809,7 +813,7 @@ def _add_token(self, token: Union[Token, str]): if isinstance(token, Token): assert token.sentence is None - if type(token) is str: + if isinstance(token, str): token = Token(token) token = cast(Token, token) diff --git a/flair/datasets/base.py b/flair/datasets/base.py index 98f625e14..f5550b5bc 100644 --- a/flair/datasets/base.py +++ b/flair/datasets/base.py @@ -229,7 +229,7 @@ def __getitem__(self, index: int = 0) -> Sentence: def find_train_dev_test_files(data_folder, dev_file, test_file, train_file, 
autofind_splits=True): - if type(data_folder) == str: + if isinstance(data_folder, str): data_folder: Path = Path(data_folder) if train_file is not None: diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 8c34b991d..d214873b5 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -2976,7 +2976,7 @@ def __init__( base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) # if only one language is given - if type(languages) == str: + if isinstance(languages, str): languages = [languages] # column format @@ -3249,7 +3249,7 @@ def __init__( in_memory : bool, optional Specify that the dataset should be loaded in memory, which speeds up the training process but takes increases the RAM usage significantly. """ - if type(languages) == str: + if isinstance(languages, str): languages = [languages] base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -3710,7 +3710,7 @@ def __init__( ] # if only one language is given - if type(languages) == str: + if isinstance(languages, str): languages = [languages] base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -3802,7 +3802,7 @@ def __init__( base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) # if only one language is given - if type(languages) == str: + if isinstance(languages, str): languages = [languages] # column format @@ -4748,10 +4748,10 @@ def __init__( """ supported_domains = ["WN", "FIC", "ADG"] - if type(domains) == str and domains == "all": + if isinstance(domains, str) and domains == "all": domains = supported_domains - if type(domains) == str: + if isinstance(domains, str): domains = [domains] base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) diff --git a/flair/models/tars_model.py b/flair/models/tars_model.py index de431c468..6bee5aee1 100644 --- a/flair/models/tars_model.py +++ 
b/flair/models/tars_model.py @@ -195,7 +195,7 @@ def add_and_switch_to_new_task( # make label dictionary if no Dictionary object is passed if isinstance(label_dictionary, Dictionary): label_dictionary = label_dictionary.get_items() - if type(label_dictionary) == str: + if isinstance(label_dictionary, str): label_dictionary = [label_dictionary] # prepare dictionary of tags (without B- I- prefixes and without UNK) diff --git a/flair/tokenization.py b/flair/tokenization.py index fe2ac33eb..ab4c0d239 100644 --- a/flair/tokenization.py +++ b/flair/tokenization.py @@ -256,9 +256,9 @@ def combined_rule_prefixes() -> List[str]: r"/", # want to split at every slash r"(?<=[0-9])[+\-\*^](?=[0-9-])", rf"(?<=[{char_classes.ALPHA_LOWER}])\.(?=[{char_classes.ALPHA_UPPER}])", - r"(?<=[{a}]),(?=[{a}])".format(a=char_classes.ALPHA), - r'(?<=[{a}])[?";:=,.]*(?:{h})(?=[{a}])'.format(a=char_classes.ALPHA, h=char_classes.HYPHENS), - r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=char_classes.ALPHA), + rf"(?<=[{char_classes.ALPHA}]),(?=[{char_classes.ALPHA}])", + rf'(?<=[{char_classes.ALPHA}])[?";:=,.]*(?:{char_classes.HYPHENS})(?=[{char_classes.ALPHA}])', + rf"(?<=[{char_classes.ALPHA}0-9])[:<>=/](?=[{char_classes.ALPHA}])", ] ) diff --git a/flair/trainers/language_model_trainer.py b/flair/trainers/language_model_trainer.py index 97915f07f..a596a7979 100644 --- a/flair/trainers/language_model_trainer.py +++ b/flair/trainers/language_model_trainer.py @@ -56,7 +56,7 @@ def __len__(self) -> int: def __getitem__(self, index=0) -> torch.Tensor: """Tokenizes a text file on character basis.""" - if type(self.files[index]) is str: + if isinstance(self.files[index], str): self.files[index] = Path(self.files[index]) assert self.files[index].exists() @@ -444,7 +444,7 @@ def load_checkpoint( corpus: TextCorpus, optimizer: Type[Optimizer] = SGD, ): - if type(checkpoint_file) is str: + if isinstance(checkpoint_file, str): checkpoint_file = Path(checkpoint_file) checkpoint = 
LanguageModel.load_checkpoint(checkpoint_file) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 8e25fb067..3cf96f559 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -139,7 +139,7 @@ def train( # evaluation and monitoring main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"), monitor_test: bool = False, - monitor_train_sample: Union[float, int] = 0.0, + monitor_train_sample: float = 0.0, use_final_model_for_eval: bool = False, gold_label_dictionary_for_eval: Optional[Dictionary] = None, exclude_labels: List[str] = [], @@ -211,7 +211,7 @@ def fine_tune( # evaluation and monitoring main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"), monitor_test: bool = False, - monitor_train_sample: Union[float, int] = 0.0, + monitor_train_sample: float = 0.0, use_final_model_for_eval: bool = True, gold_label_dictionary_for_eval: Optional[Dictionary] = None, exclude_labels: List[str] = [], @@ -302,7 +302,7 @@ def train_custom( # evaluation and monitoring main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"), monitor_test: bool = False, - monitor_train_sample: Union[float, int] = 0.0, + monitor_train_sample: float = 0.0, use_final_model_for_eval: bool = False, gold_label_dictionary_for_eval: Optional[Dictionary] = None, exclude_labels: List[str] = [], diff --git a/flair/training_utils.py b/flair/training_utils.py index f36f95a91..e465f86c1 100644 --- a/flair/training_utils.py +++ b/flair/training_utils.py @@ -71,7 +71,7 @@ def to_tsv(self): @staticmethod def tsv_header(prefix=None): if prefix: - return "{0}_MEAN_SQUARED_ERROR\t{0}_MEAN_ABSOLUTE_ERROR\t{0}_PEARSON\t{0}_SPEARMAN".format(prefix) + return f"{prefix}_MEAN_SQUARED_ERROR\t{prefix}_MEAN_ABSOLUTE_ERROR\t{prefix}_PEARSON\t{prefix}_SPEARMAN" return "MEAN_SQUARED_ERROR\tMEAN_ABSOLUTE_ERROR\tPEARSON\tSPEARMAN" @@ -99,7 +99,7 @@ class EvaluationMetric(Enum): class WeightExtractor: def __init__(self, directory: Union[str, Path], 
number_of_weights: int = 10) -> None: - if type(directory) is str: + if isinstance(directory, str): directory = Path(directory) self.weights_file = init_output_file(directory, "weights.txt") self.weights_dict: Dict[str, Dict[int, List[float]]] = defaultdict(lambda: defaultdict(list)) diff --git a/flair/visual/training_curves.py b/flair/visual/training_curves.py index f9a3c224b..1fd856b66 100644 --- a/flair/visual/training_curves.py +++ b/flair/visual/training_curves.py @@ -67,7 +67,7 @@ def _extract_evaluation_data(file_name: Union[str, Path], score: str = "F1") -> @staticmethod def _extract_weight_data(file_name: Union[str, Path]) -> dict: - if type(file_name) is str: + if isinstance(file_name, str): file_name = Path(file_name) weights: Dict[str, Dict[str, List[float]]] = defaultdict(lambda: defaultdict(list)) @@ -86,7 +86,7 @@ def _extract_weight_data(file_name: Union[str, Path]) -> dict: @staticmethod def _extract_learning_rate(file_name: Union[str, Path]): - if type(file_name) is str: + if isinstance(file_name, str): file_name = Path(file_name) lrs = [] diff --git a/tests/model_test_utils.py b/tests/model_test_utils.py index 3b936d862..10aab0831 100644 --- a/tests/model_test_utils.py +++ b/tests/model_test_utils.py @@ -204,7 +204,7 @@ def test_train_load_use_model_multi_label( print(label) assert label.value is not None assert 0.0 <= label.score <= 1.0 - assert type(label.score) is float + assert isinstance(label.score, float) del trainer, model, multi_class_corpus loaded_model = self.model_cls.load(results_base_path / "final-model.pt") diff --git a/tests/test_multitask.py b/tests/test_multitask.py index 02dc42c1b..7a43e45a8 100644 --- a/tests/test_multitask.py +++ b/tests/test_multitask.py @@ -63,5 +63,5 @@ def test_train_load_use_classifier(results_base_path, tasks_base_path): for label in sentence.labels: assert label.value is not None assert 0.0 <= label.score <= 1.0 - assert type(label.score) is float + assert isinstance(label.score, float) del 
loaded_model From d8fe0b55d07504bcd067ea71df8b05de517ab8ac Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 23 May 2023 23:44:30 +0200 Subject: [PATCH 024/124] datasets: include AFRICA_POS implementation --- flair/datasets/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index e549810d1..092338e4a 100644 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -163,6 +163,7 @@ # standard NER datasets # Expose all sequence labeling datasets from .sequence_labeling import ( + AFRICA_POS, BIOSCOPE, CONLL_03, CONLL_03_DUTCH, @@ -312,6 +313,7 @@ "SentenceDataset", "MongoDataset", "StringDataset", + "AFRICA_POS", "ANAT_EM", "AZDZ", "BC2GM", From 2bc445b9e3053db8f5fe9207e46401ae73cf3dcb Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 23 May 2023 23:44:55 +0200 Subject: [PATCH 025/124] datasets: add support for AfricaPOS dataset --- flair/datasets/sequence_labeling.py | 103 ++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index d214873b5..3020f15c1 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4795,3 +4795,106 @@ def __init__( sample_missing_splits=False, name="nermud", ) + + +class AFRICA_POS(MultiCorpus): + def __init__( + self, + languages: Union[str, List[str]] = "bam", + version: str = "v1", + base_path: Optional[Union[str, Path]] = None, + in_memory: bool = True, + **corpusargs, + ) -> None: + """Initialize the AfricaPOS corpus available on https://github.com/masakhane-io/masakhane-pos. + + It consists of ten African languages. Pass a language code or a list of language codes to initialize the corpus + with the languages you require. If you pass "all", all languages will be initialized. + :version: Specifies version of the dataset. Currently, only "v1" is supported. 
+ :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this + to point to a different folder but typically this should not be necessary. + :param in_memory: If True, keeps dataset in memory giving speedups in training. + """ + base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) + + # if only one language is given + if type(languages) == str: + languages = [languages] + + # column format + columns = {0: "text", 1: "pos"} + + # this dataset name + dataset_name = self.__class__.__name__.lower() + + supported_versions = ["v1"] + + if version not in supported_versions: + log.error(f"The specified version '{version}' is not in the list of supported version!") + log.error(f"Supported versions are '{supported_versions}'!") + raise Exception + + data_folder = base_path / dataset_name / version + + supported_languages = [ + "bam", + "bbj", + "ewe", + "fon", + "hau", + "ibo", + "kin", + "lug", + "luo", + "mos", + "pcm", + "nya", + "sna", + "swa", + "tsn", + "twi", + "wol", + "xho", + "yor", + "zul", + ] + + data_paths = { + "v1": "https://raw.githubusercontent.com/masakhane-io/masakhane-pos/main/AfricaPOS", + } + + # use all languages if explicitly set to "all" + if languages == ["all"]: + languages = supported_languages + + corpora: List[Corpus] = [] + for language in languages: + if language not in supported_languages: + log.error(f"Language '{language}' is not in list of supported languages!") + log.error(f"Supported are '{supported_languages}'!") + log.error("Instantiate this Corpus for instance like so 'corpus = AFRICA_POS(languages='luo')'") + raise Exception + + language_folder = data_folder / language + + # download data if necessary + data_path = f"{data_paths[version]}/{language}" + cached_path(f"{data_path}/dev.txt", language_folder) + cached_path(f"{data_path}/test.txt", language_folder) + cached_path(f"{data_path}/train.txt", language_folder) + + # initialize comlumncorpus and add it 
to list + log.info(f"Reading data for language {language}@{version}") + corp = ColumnCorpus( + data_folder=language_folder, + column_format=columns, + encoding="utf-8", + in_memory=in_memory, + name=language, + **corpusargs, + ) + corpora.append(corp) + super().__init__( + corpora, + name="africa-pos-" + "-".join(languages), + ) From 9e1e26e18a6f11b87fef708c74bd43ba9dc5bc37 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 8 Aug 2023 21:47:29 +0200 Subject: [PATCH 026/124] tests: adjust test cases for MasakhaPOS dataset --- tests/test_datasets.py | 68 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index d642be251..36b97391c 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -808,6 +808,74 @@ def test_german_ler_corpus(tasks_base_path): assert len(corpus.test) == 6673, "Mismatch in number of sentences for test split" +def test_afri_pos_corpus(tasks_base_path): + # This test covers the complete AfricaPOS dataset. 
+ supported_versions = ["v1"] + + supported_languages = { + "v1": [ + "bam", + "bbj", + "ewe", + "fon", + "hau", + "ibo", + "kin", + "lug", + "luo", + "mos", + "pcm", + "nya", + "sna", + "swa", + "tsn", + "twi", + "wol", + "xho", + "yor", + "zul", + ], + } + + africa_pos_stats = { + "v1": { + "bam": {"train": 793, "dev": 158, "test": 634}, + "bbj": {"train": 750, "dev": 149, "test": 599}, + "ewe": {"train": 728, "dev": 145, "test": 582}, + "fon": {"train": 798, "dev": 159, "test": 637}, + "hau": {"train": 753, "dev": 150, "test": 601}, + "ibo": {"train": 803, "dev": 160, "test": 642}, + "kin": {"train": 757, "dev": 151, "test": 604}, + "lug": {"train": 733, "dev": 146, "test": 586}, + "luo": {"train": 757, "dev": 151, "test": 604}, + "mos": {"train": 757, "dev": 151, "test": 604}, + "pcm": {"train": 752, "dev": 150, "test": 600}, + "nya": {"train": 728, "dev": 145, "test": 582}, + "sna": {"train": 747, "dev": 149, "test": 596}, + "swa": {"train": 675, "dev": 134, "test": 539}, + "tsn": {"train": 753, "dev": 150, "test": 602}, + "twi": {"train": 775, "dev": 154, "test": 618}, + "wol": {"train": 770, "dev": 154, "test": 616}, + "xho": {"train": 752, "dev": 150, "test": 601}, + "yor": {"train": 875, "dev": 174, "test": 698}, + "zul": {"train": 753, "dev": 150, "test": 601}, + }, + } + + def check_number_sentences(reference: int, actual: int, split_name: str, language: str, version: str): + assert actual == reference, f"Mismatch in number of sentences for {language}@{version}/{split_name}" + + for version in supported_versions: + for language in supported_languages[version]: + corpus = flair.datasets.AFRICA_POS(languages=language, version=version) + + gold_stats = africa_pos_stats[version][language] + + check_number_sentences(len(corpus.train), gold_stats["train"], "train", language, version) + check_number_sentences(len(corpus.dev), gold_stats["dev"], "dev", language, version) + check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version) + 
+ def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path): corpus = MultiFileJsonlCorpus( train_files=[tasks_base_path / "jsonl/train.jsonl"], From da36e0fded6bd8a08778fd7289d706e4f98ab148 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 24 May 2023 00:03:08 +0200 Subject: [PATCH 027/124] datasets: fix MASAKHA_POS name --- flair/datasets/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py index 092338e4a..8100e4821 100644 --- a/flair/datasets/__init__.py +++ b/flair/datasets/__init__.py @@ -163,7 +163,6 @@ # standard NER datasets # Expose all sequence labeling datasets from .sequence_labeling import ( - AFRICA_POS, BIOSCOPE, CONLL_03, CONLL_03_DUTCH, @@ -174,6 +173,7 @@ KEYPHRASE_INSPEC, KEYPHRASE_SEMEVAL2010, KEYPHRASE_SEMEVAL2017, + MASAKHA_POS, NER_ARABIC_ANER, NER_ARABIC_AQMAR, NER_BASQUE, @@ -313,7 +313,6 @@ "SentenceDataset", "MongoDataset", "StringDataset", - "AFRICA_POS", "ANAT_EM", "AZDZ", "BC2GM", @@ -449,6 +448,7 @@ "KEYPHRASE_INSPEC", "KEYPHRASE_SEMEVAL2010", "KEYPHRASE_SEMEVAL2017", + "MASAKHA_POS", "NER_ARABIC_ANER", "NER_ARABIC_AQMAR", "NER_BASQUE", From d077266ea60ae2a6cb137186cdf90679782271b6 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 24 May 2023 00:03:31 +0200 Subject: [PATCH 028/124] datasets: add support for MasakhaPOS dataset --- flair/datasets/sequence_labeling.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 3020f15c1..9a69f9aa8 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4797,7 +4797,7 @@ def __init__( ) -class AFRICA_POS(MultiCorpus): +class MASAKHA_POS(MultiCorpus): def __init__( self, languages: Union[str, List[str]] = "bam", @@ -4806,9 +4806,9 @@ def __init__( in_memory: bool = True, **corpusargs, ) -> None: - """Initialize the AfricaPOS corpus available on 
https://github.com/masakhane-io/masakhane-pos. + """Initialize the MasakhaPOS corpus available on https://github.com/masakhane-io/masakhane-pos. - It consists of ten African languages. Pass a language code or a list of language codes to initialize the corpus + It consists of 20 African languages. Pass a language code or a list of language codes to initialize the corpus with the languages you require. If you pass "all", all languages will be initialized. :version: Specifies version of the dataset. Currently, only "v1" is supported. :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this @@ -4872,7 +4872,7 @@ def __init__( if language not in supported_languages: log.error(f"Language '{language}' is not in list of supported languages!") log.error(f"Supported are '{supported_languages}'!") - log.error("Instantiate this Corpus for instance like so 'corpus = AFRICA_POS(languages='luo')'") + log.error("Instantiate this Corpus for instance like so 'corpus = MASAKHA_POS(languages='luo')'") raise Exception language_folder = data_folder / language From 5c53910f4b6ecfcd2e61d69891b93bb69142f4cb Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 8 Aug 2023 21:50:21 +0200 Subject: [PATCH 029/124] tests: adjust test cases for MasakhaPOS dataset --- tests/test_datasets.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 36b97391c..9aa7f9c80 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -808,8 +808,8 @@ def test_german_ler_corpus(tasks_base_path): assert len(corpus.test) == 6673, "Mismatch in number of sentences for test split" -def test_afri_pos_corpus(tasks_base_path): - # This test covers the complete AfricaPOS dataset. +def test_masakha_pos_corpus(tasks_base_path): + # This test covers the complete MasakhaPOS dataset. 
supported_versions = ["v1"] supported_languages = { @@ -867,7 +867,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag for version in supported_versions: for language in supported_languages[version]: - corpus = flair.datasets.AFRICA_POS(languages=language, version=version) + corpus = flair.datasets.MASAKHA_POS(languages=language, version=version) gold_stats = africa_pos_stats[version][language] From d84092c5bb03d4ace01c80164e28f11b00be6bc1 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Sun, 11 Jun 2023 11:05:03 +0200 Subject: [PATCH 030/124] datasets: sync with latest MasakhaPOS GitHub version: test splits are currently missing and luo + tsn are missing --- flair/datasets/sequence_labeling.py | 9 +++++---- tests/test_datasets.py | 10 +++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 9a69f9aa8..22fe85b47 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4845,13 +4845,13 @@ def __init__( "ibo", "kin", "lug", - "luo", + #"luo", "mos", "pcm", "nya", "sna", "swa", - "tsn", + #"tsn", "twi", "wol", "xho", @@ -4860,7 +4860,7 @@ def __init__( ] data_paths = { - "v1": "https://raw.githubusercontent.com/masakhane-io/masakhane-pos/main/AfricaPOS", + "v1": "https://raw.githubusercontent.com/masakhane-io/masakhane-pos/main/data", } # use all languages if explicitly set to "all" @@ -4880,7 +4880,7 @@ def __init__( # download data if necessary data_path = f"{data_paths[version]}/{language}" cached_path(f"{data_path}/dev.txt", language_folder) - cached_path(f"{data_path}/test.txt", language_folder) + #cached_path(f"{data_path}/test.txt", language_folder) cached_path(f"{data_path}/train.txt", language_folder) # initialize comlumncorpus and add it to list @@ -4891,6 +4891,7 @@ def __init__( encoding="utf-8", in_memory=in_memory, name=language, + test_file=None, **corpusargs, ) corpora.append(corp) 
diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 9aa7f9c80..ce2224aa0 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -822,13 +822,13 @@ def test_masakha_pos_corpus(tasks_base_path): "ibo", "kin", "lug", - "luo", + #"luo", "mos", "pcm", "nya", "sna", "swa", - "tsn", + #"tsn", "twi", "wol", "xho", @@ -847,13 +847,13 @@ def test_masakha_pos_corpus(tasks_base_path): "ibo": {"train": 803, "dev": 160, "test": 642}, "kin": {"train": 757, "dev": 151, "test": 604}, "lug": {"train": 733, "dev": 146, "test": 586}, - "luo": {"train": 757, "dev": 151, "test": 604}, + #"luo": {"train": 757, "dev": 151, "test": 604}, "mos": {"train": 757, "dev": 151, "test": 604}, "pcm": {"train": 752, "dev": 150, "test": 600}, "nya": {"train": 728, "dev": 145, "test": 582}, "sna": {"train": 747, "dev": 149, "test": 596}, "swa": {"train": 675, "dev": 134, "test": 539}, - "tsn": {"train": 753, "dev": 150, "test": 602}, + #"tsn": {"train": 753, "dev": 150, "test": 602}, "twi": {"train": 775, "dev": 154, "test": 618}, "wol": {"train": 770, "dev": 154, "test": 616}, "xho": {"train": 752, "dev": 150, "test": 601}, @@ -873,7 +873,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag check_number_sentences(len(corpus.train), gold_stats["train"], "train", language, version) check_number_sentences(len(corpus.dev), gold_stats["dev"], "dev", language, version) - check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version) + #check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version) def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path): From fccf83bb1c6c0023dc11876cbb250b3a7e473001 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Thu, 13 Jul 2023 23:42:20 +0200 Subject: [PATCH 031/124] datasets: some minor work on MasakhaPOS dataset parsing --- flair/datasets/sequence_labeling.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git 
a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 22fe85b47..8735a81d9 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4845,13 +4845,11 @@ def __init__( "ibo", "kin", "lug", - #"luo", "mos", "pcm", "nya", "sna", "swa", - #"tsn", "twi", "wol", "xho", @@ -4872,7 +4870,7 @@ def __init__( if language not in supported_languages: log.error(f"Language '{language}' is not in list of supported languages!") log.error(f"Supported are '{supported_languages}'!") - log.error("Instantiate this Corpus for instance like so 'corpus = MASAKHA_POS(languages='luo')'") + log.error("Instantiate this Corpus for instance like so 'corpus = MASAKHA_POS(languages='bam')'") raise Exception language_folder = data_folder / language @@ -4880,7 +4878,7 @@ def __init__( # download data if necessary data_path = f"{data_paths[version]}/{language}" cached_path(f"{data_path}/dev.txt", language_folder) - #cached_path(f"{data_path}/test.txt", language_folder) + cached_path(f"{data_path}/test.txt", language_folder) cached_path(f"{data_path}/train.txt", language_folder) # initialize comlumncorpus and add it to list @@ -4891,7 +4889,6 @@ def __init__( encoding="utf-8", in_memory=in_memory, name=language, - test_file=None, **corpusargs, ) corpora.append(corp) From 5bd45264f2198a013e12e869e8e4da2c67dd5857 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Thu, 13 Jul 2023 23:42:46 +0200 Subject: [PATCH 032/124] tests: sync MasakhaPOS tests with upstream repo --- tests/test_datasets.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/tests/test_datasets.py b/tests/test_datasets.py index ce2224aa0..56d524d04 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -822,13 +822,11 @@ def test_masakha_pos_corpus(tasks_base_path): "ibo", "kin", "lug", - #"luo", "mos", "pcm", "nya", "sna", "swa", - #"tsn", "twi", "wol", "xho", @@ -839,25 +837,23 @@ def 
test_masakha_pos_corpus(tasks_base_path): africa_pos_stats = { "v1": { - "bam": {"train": 793, "dev": 158, "test": 634}, + "bam": {"train": 775, "dev": 154, "test": 619}, "bbj": {"train": 750, "dev": 149, "test": 599}, "ewe": {"train": 728, "dev": 145, "test": 582}, - "fon": {"train": 798, "dev": 159, "test": 637}, + "fon": {"train": 810, "dev": 161, "test": 646}, "hau": {"train": 753, "dev": 150, "test": 601}, "ibo": {"train": 803, "dev": 160, "test": 642}, "kin": {"train": 757, "dev": 151, "test": 604}, "lug": {"train": 733, "dev": 146, "test": 586}, - #"luo": {"train": 757, "dev": 151, "test": 604}, "mos": {"train": 757, "dev": 151, "test": 604}, "pcm": {"train": 752, "dev": 150, "test": 600}, "nya": {"train": 728, "dev": 145, "test": 582}, "sna": {"train": 747, "dev": 149, "test": 596}, - "swa": {"train": 675, "dev": 134, "test": 539}, - #"tsn": {"train": 753, "dev": 150, "test": 602}, - "twi": {"train": 775, "dev": 154, "test": 618}, - "wol": {"train": 770, "dev": 154, "test": 616}, + "swa": {"train": 693, "dev": 138, "test": 553}, + "twi": {"train": 785, "dev": 157, "test": 628}, + "wol": {"train": 782, "dev": 156, "test": 625}, "xho": {"train": 752, "dev": 150, "test": 601}, - "yor": {"train": 875, "dev": 174, "test": 698}, + "yor": {"train": 893, "dev": 178, "test": 713}, "zul": {"train": 753, "dev": 150, "test": 601}, }, } @@ -873,7 +869,7 @@ def check_number_sentences(reference: int, actual: int, split_name: str, languag check_number_sentences(len(corpus.train), gold_stats["train"], "train", language, version) check_number_sentences(len(corpus.dev), gold_stats["dev"], "dev", language, version) - #check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version) + check_number_sentences(len(corpus.test), gold_stats["test"], "test", language, version) def test_multi_file_jsonl_corpus_should_use_label_type(tasks_base_path): From 2ddae63624cc5519d00c7375a18a51148c9e80d3 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Fri, 11 Aug 
2023 12:49:57 +0200 Subject: [PATCH 033/124] datasets: type -> isinstance fix --- flair/datasets/sequence_labeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 8735a81d9..0a5bf1b58 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -4818,7 +4818,7 @@ def __init__( base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) # if only one language is given - if type(languages) == str: + if isinstance(languages, str): languages = [languages] # column format From 0ca6d6b3f6e13da8f7c0726106ad4e621911a5c2 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 14 Aug 2023 14:27:08 +0200 Subject: [PATCH 034/124] bump min version to 3.8 --- .github/workflows/ci.yml | 4 ++-- .travis.yml | 13 ------------- CONTRIBUTING.md | 4 ++-- README.md | 2 +- setup.py | 2 +- 5 files changed, 6 insertions(+), 19 deletions(-) delete mode 100644 .travis.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 020426175..62eddad8a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -12,11 +12,11 @@ jobs: FLAIR_CACHE_ROOT: ./cache/flair steps: - uses: actions/checkout@v3 - - name: Set up Python 3.7 + - name: Set up Python 3.8 id: setup-python uses: actions/setup-python@v4 with: - python-version: 3.7 + python-version: 3.8 - name: Install Flair dependencies run: pip install -e . 
- name: Install unittest dependencies diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index ee783e777..000000000 --- a/.travis.yml +++ /dev/null @@ -1,13 +0,0 @@ -language: python -sudo: false -env: - - BOTO_CONFIG=/dev/null -python: - - "3.7" -install: - - python setup.py develop -q -before_script: cd tests -script: - - pip freeze - - 'if [ "$TRAVIS_PULL_REQUEST" != "false" ]; then pytest --runintegration; fi' - - 'if [ "$TRAVIS_PULL_REQUEST" = "false" ]; then pytest; fi' diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d89055a21..b40ddfe77 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -24,8 +24,8 @@ the code should hopefully be easy. ### Setup -Flair requires python-3.7 or higher. To make sure your code also runs on the oldest supported -python version, it is recommended to use python-3.7.x for flair development. +Flair requires python-3.8 or higher. To make sure your code also runs on the oldest supported +python version, it is recommended to use python-3.8.x for flair development. Create a python environment of your preference and run: ```bash diff --git a/README.md b/README.md index 6aea09151..79b831a94 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ In your favorite virtual environment, simply do: pip install flair ``` -Flair requires Python 3.7+. +Flair requires Python 3.8+. 
### Example 1: Tag Entities in Text diff --git a/setup.py b/setup.py index fb62c0116..368fc5fff 100644 --- a/setup.py +++ b/setup.py @@ -17,5 +17,5 @@ license="MIT", install_requires=required, include_package_data=True, - python_requires=">=3.7", + python_requires=">=3.8", ) From c133f9ae6dabb03d952c248d9f789f8bad78fb9c Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 21 Aug 2023 10:49:57 +0200 Subject: [PATCH 035/124] add action to remove Awaiting Response label when an response was made --- .github/workflows/issues.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .github/workflows/issues.yml diff --git a/.github/workflows/issues.yml b/.github/workflows/issues.yml new file mode 100644 index 000000000..772773c25 --- /dev/null +++ b/.github/workflows/issues.yml @@ -0,0 +1,11 @@ +on: issue_comment + +jobs: + issue_commented: + name: Issue comment + if: ${{ !github.event.issue.pull_request && github.event.issue.author == github.even.issue_comment.author }} + runs-on: ubuntu-latest + steps: + - uses: actions-ecosystem/action-remove-labels@v1 + with: + labels: "Awaiting Response" \ No newline at end of file From 731973f6a5b647f73926812fd3c9559cfe912df8 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 21 Aug 2023 11:38:00 +0200 Subject: [PATCH 036/124] fix training lm embeddings tutorial --- resources/docs/TUTORIAL_9_TRAINING_LM_EMBEDDINGS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/docs/TUTORIAL_9_TRAINING_LM_EMBEDDINGS.md b/resources/docs/TUTORIAL_9_TRAINING_LM_EMBEDDINGS.md index 7640afe96..f81fbe452 100644 --- a/resources/docs/TUTORIAL_9_TRAINING_LM_EMBEDDINGS.md +++ b/resources/docs/TUTORIAL_9_TRAINING_LM_EMBEDDINGS.md @@ -197,7 +197,7 @@ from flair.trainers.language_model_trainer import LanguageModelTrainer, TextCorp # instantiate an existing LM, such as one from the FlairEmbeddings -language_model = FlairEmbeddings('news-forward').lm +language_model = FlairEmbeddings('news-forward', 
decoder=True).lm # are you fine-tuning a forward or backward LM? is_forward_lm = language_model.is_forward_lm From 8517e69f2cf7790cdff1a9d3a9ddd2a69de1735a Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 21 Aug 2023 11:39:52 +0200 Subject: [PATCH 037/124] fix ruff --- flair/nn/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/nn/model.py b/flair/nn/model.py index 2fa90714c..2e77d67d3 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -660,7 +660,7 @@ def multi_label_threshold(self): @multi_label_threshold.setter def multi_label_threshold(self, x): # setter method - if type(x) is dict: + if isinstance(x, dict): if "default" in x: self._multi_label_threshold = x else: From d288688587294fa9934a9ce090be096f6171e9cf Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 21 Aug 2023 11:45:16 +0200 Subject: [PATCH 038/124] fix ruff --- flair/nn/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/nn/model.py b/flair/nn/model.py index 2fa90714c..2e77d67d3 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -660,7 +660,7 @@ def multi_label_threshold(self): @multi_label_threshold.setter def multi_label_threshold(self, x): # setter method - if type(x) is dict: + if isinstance(x, dict): if "default" in x: self._multi_label_threshold = x else: From 7bd1a335dc738c2a3dc9ae69916c02bad096a172 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 21 Aug 2023 16:51:27 +0200 Subject: [PATCH 039/124] fix ruff --- flair/nn/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/nn/model.py b/flair/nn/model.py index 2fa90714c..2e77d67d3 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -660,7 +660,7 @@ def multi_label_threshold(self): @multi_label_threshold.setter def multi_label_threshold(self, x): # setter method - if type(x) is dict: + if isinstance(x, dict): if "default" in x: self._multi_label_threshold = x else: From 64feada23ed333fb8b871e30bc121517ab67ef7b Mon Sep 
17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 21 Aug 2023 17:21:09 +0200 Subject: [PATCH 040/124] fix onnx export of DocumentTransformers --- flair/embeddings/transformer.py | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/flair/embeddings/transformer.py b/flair/embeddings/transformer.py index 9d2a8b5ab..f9df654b9 100644 --- a/flair/embeddings/transformer.py +++ b/flair/embeddings/transformer.py @@ -323,6 +323,7 @@ def __init__( fine_tune: bool, truncate: bool, use_lang_emb: bool, + cls_pooling: str, is_document_embedding: bool = False, is_token_embedding: bool = False, force_device: Optional[torch.device] = None, @@ -349,9 +350,11 @@ def __init__( self.force_max_length = force_max_length self.feature_extractor = feature_extractor self.use_context_separator = use_context_separator + self.cls_pooling = cls_pooling tokenizer_params = list(inspect.signature(self.tokenizer.__call__).parameters.keys()) self.tokenizer_needs_ocr_boxes = "boxes" in tokenizer_params + self.initial_cls_token = self._has_initial_cls_token() # The layoutlm tokenizer doesn't handle ocr themselves self.needs_manual_ocr = isinstance(self.tokenizer, (LayoutLMTokenizer, LayoutLMTokenizerFast)) @@ -364,6 +367,14 @@ def __init__( if not self.token_embedding and not self.document_embedding: raise ValueError("either 'is_token_embedding' or 'is_document_embedding' needs to be set.") + def _has_initial_cls_token(self) -> bool: + # most models have CLS token as last token (GPT-1, GPT-2, TransfoXL, XLNet, XLM), but BERT is initial + if self.tokenizer_needs_ocr_boxes: + # cannot run `.encode` if ocr boxes are required, assume + return True + tokens = self.tokenizer.encode("a") + return tokens[0] == self.tokenizer.cls_token_id + def to_args(self): args = { "is_token_embedding": self.token_embedding, @@ -382,6 +393,7 @@ def to_args(self): "force_max_length": self.force_max_length, "feature_extractor": self.feature_extractor, "use_context_separator": 
self.use_context_separator, + "cls_pooling": self.cls_pooling, } if hasattr(self, "needs_manual_ocr"): args["needs_manual_ocr"] = self.needs_manual_ocr @@ -396,6 +408,7 @@ def __setstate__(self, state): def from_params(cls, params): tokenizer = cls._tokenizer_from_bytes(params.pop("tokenizer_data")) feature_extractor = cls._feature_extractor_from_bytes(params.pop("feature_extractor_data", None)) + params.setdefault("cls_pooling", "cls") embedding = cls.create_from_state(tokenizer=tokenizer, feature_extractor=feature_extractor, **params) return embedding @@ -1121,14 +1134,6 @@ def _load_from_state_dict( state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs ) - def _has_initial_cls_token(self) -> bool: - # most models have CLS token as last token (GPT-1, GPT-2, TransfoXL, XLNet, XLM), but BERT is initial - if self.tokenizer_needs_ocr_boxes: - # cannot run `.encode` if ocr boxes are required, assume - return True - tokens = self.tokenizer.encode("a") - return tokens[0] == self.tokenizer.cls_token_id - def _calculate_embedding_length(self, model) -> int: length = len(self.layer_indexes) * model.config.hidden_size if not self.layer_mean else model.config.hidden_size From c1afffd7f1aaf07d26f1e03885535a9ac09fe009 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 21 Aug 2023 18:20:57 +0200 Subject: [PATCH 041/124] add option to set session_options fro onnx transformer models --- flair/embeddings/transformer.py | 16 +++++++++++++--- .../TUTORIAL_PRODUCTION_FASTER_TRANSFORMERS.md | 3 ++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/flair/embeddings/transformer.py b/flair/embeddings/transformer.py index f9df654b9..7ef1fc860 100644 --- a/flair/embeddings/transformer.py +++ b/flair/embeddings/transformer.py @@ -715,11 +715,12 @@ def _add_embeddings_internal(self, sentences: List[Sentence]): @register_embeddings class TransformerOnnxEmbeddings(TransformerBaseEmbeddings): - def __init__(self, onnx_model: str, providers: 
List = [], **kwargs) -> None: + def __init__(self, onnx_model: str, providers: List = [], session_options: Optional[Dict] = None, **kwargs) -> None: # onnx prepares numpy arrays, no mather if it runs on gpu or cpu, the input is on cpu first. super().__init__(**kwargs, force_device=torch.device("cpu")) self.onnx_model = onnx_model self.providers = providers + self.session_options = session_options self.create_session() self.eval() @@ -727,6 +728,7 @@ def to_params(self): params = super().to_params() params["providers"] = self.providers params["onnx_model"] = self.onnx_model + params["session_options"] = self.session_options return params @classmethod @@ -745,7 +747,14 @@ def create_session(self): ) raise if os.path.isfile(self.onnx_model): - self.session = onnxruntime.InferenceSession(self.onnx_model, providers=self.providers) + session_options = onnxruntime.SessionOptions() + if self.session_options is not None: + for k, v in self.session_options.items(): + setattr(session_options, k, v) + + self.session = onnxruntime.InferenceSession( + self.onnx_model, providers=self.providers, sess_options=session_options + ) else: log.warning( f"Could not find file '{self.onnx_model}' used in {self.__class__.name}({self.name})." 
@@ -824,6 +833,7 @@ def export_from_embedding( example_sentences: List[Sentence], opset_version: int = 14, providers: Optional[List] = None, + session_options: Optional[dict] = None, ): path = str(path) example_tensors = embedding.prepare_tensors(example_sentences) @@ -864,7 +874,7 @@ def export_from_embedding( dynamic_axes=dynamic_axes, opset_version=opset_version, ) - return cls(onnx_model=path, providers=providers, **embedding.to_args()) + return cls(onnx_model=path, providers=providers, session_options=session_options, **embedding.to_args()) @register_embeddings diff --git a/resources/docs/TUTORIAL_PRODUCTION_FASTER_TRANSFORMERS.md b/resources/docs/TUTORIAL_PRODUCTION_FASTER_TRANSFORMERS.md index a1dfc1c0b..16bdf608b 100644 --- a/resources/docs/TUTORIAL_PRODUCTION_FASTER_TRANSFORMERS.md +++ b/resources/docs/TUTORIAL_PRODUCTION_FASTER_TRANSFORMERS.md @@ -28,10 +28,11 @@ which can be done via `pip install onnxruntime` To export the OnnxEmbeddings there is only one line to run: ```python -model.embeddings = model.embeddings.export_onnx("flert-embeddings.onnx", sentences, providers=["CUDAExecutionProvider", "CPUExecutionProvider"]) +model.embeddings = model.embeddings.export_onnx("flert-embeddings.onnx", sentences, providers=["CUDAExecutionProvider", "CPUExecutionProvider"], session_options={}) ``` This creates a file `flert-embeddings.onnx` which stores the exported Onnx Model. Besides that, the embeddings are replaced by `TransformerOnnxEmbeddings` which ensure, that the created Onnx model is used for predictions. 
The providers referenced are part of your production environment and are documented [here](https://onnxruntime.ai/docs/execution-providers/) +You can provide SessionOptions documented [here](https://onnxruntime.ai/docs/api/python/api_summary.html#sessionoptions), by passing each property you want to set as key in the `session_options` dictionary The usage for predictions is the same as before: ```python From 66cc15c339cd6d26b3f15f68cd70121050b11be3 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 10:03:14 +0200 Subject: [PATCH 042/124] add basic doc setup --- .github/workflows/publish-docs.yml | 45 +++++++++++++ assets/README.md | 0 assets/redirect.html | 9 +++ docs/_templates/page.html | 19 ++++++ docs/_templates/versioning.html | 20 ++++++ docs/api/flair.rst | 7 ++ docs/conf.py | 103 +++++++++++++++++++++++++++++ docs/index.rst | 16 +++++ docs/modules.rst | 7 ++ docs/requirements.txt | 7 ++ 10 files changed, 233 insertions(+) create mode 100644 .github/workflows/publish-docs.yml create mode 100644 assets/README.md create mode 100644 assets/redirect.html create mode 100644 docs/_templates/page.html create mode 100644 docs/_templates/versioning.html create mode 100644 docs/api/flair.rst create mode 100644 docs/conf.py create mode 100644 docs/index.rst create mode 100644 docs/modules.rst create mode 100644 docs/requirements.txt diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml new file mode 100644 index 000000000..babe90f08 --- /dev/null +++ b/.github/workflows/publish-docs.yml @@ -0,0 +1,45 @@ +name: 'Run tests for ci cd' +on: + push: + branches: [ main ] + tags: + - "*" + +jobs: + publish_docs: + name: Build the docs using Sphinx and push to gh-pages + runs-on: ubuntu-latest + env: + python-version: 3.7 + steps: + - name: Checkout code + uses: actions/checkout@v3 + - name: Install poetry + run: pipx install poetry + - name: setup python ${{ env.python-version }} + uses: actions/setup-python@v4 + with: + 
python-version: ${{ env.python-version }} + - name: Install Flair dependencies + run: pip install -e . + - name: Install unittest dependencies + run: pip install -r requirements-dev.txt + - name: Install doc dependencies + run: pip install -r docs/requirements.txt + - name: Fetch git tags + run: git fetch --tags origin + - name: Install Dependencies + run: poetry install + - name: Build docs + run: | + sphinx-multiversion docs doc_build/ + - name: Add redirect to stable doc + run: | + cp assets/redirect.html doc_build/index.html + cp assets/README.md doc_build/README.md + sed -i "s/\[VERSION\]/$(python -c 'import flair;print(flair.__version__)')/g" doc_build/index.html + - name: Deploy + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./doc_build \ No newline at end of file diff --git a/assets/README.md b/assets/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/assets/redirect.html b/assets/redirect.html new file mode 100644 index 000000000..e3b5ad9ba --- /dev/null +++ b/assets/redirect.html @@ -0,0 +1,9 @@ + + + + Redirecting to https://flairnlp.github.io/ + + + + + \ No newline at end of file diff --git a/docs/_templates/page.html b/docs/_templates/page.html new file mode 100644 index 000000000..d7b48c82d --- /dev/null +++ b/docs/_templates/page.html @@ -0,0 +1,19 @@ +{% extends "!page.html" %} +{% block body %} +{% if current_version and latest_version and current_version != latest_version and current_version != release and current_version.name != latest_version.release %} +

+ + {% if current_version.is_released %} + {% if latest_version.release.replace('v', '').split('.') | map('int') | list > current_version.name.replace('v', '').split('.') | map('int') | list %} + You're reading an old version of this documentation. + If you want up-to-date information, please have a look at {{latest_version.name}}. + {% endif %} + {% else %} + You're reading the documentation for a development version. + For the latest stable version, please have a look at {{latest_version.name}}. + {% endif %} + +

+{% endif %} +{{ super() }} +{% endblock %}% \ No newline at end of file diff --git a/docs/_templates/versioning.html b/docs/_templates/versioning.html new file mode 100644 index 000000000..a6f92873c --- /dev/null +++ b/docs/_templates/versioning.html @@ -0,0 +1,20 @@ +{% if versions %} +

{{ _('Versions') }}

+ +{% endif %} \ No newline at end of file diff --git a/docs/api/flair.rst b/docs/api/flair.rst new file mode 100644 index 000000000..056e6182b --- /dev/null +++ b/docs/api/flair.rst @@ -0,0 +1,7 @@ +flair +===== + +.. automodule:: flair + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 000000000..ad2ab9a3a --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,103 @@ +# noqa: INP001 + +import importlib_metadata + +# -- Project information ----------------------------------------------------- +from sphinx_github_style import get_linkcode_resolve + +version = "0.12.2" +release = "0.12.2" +project = "flair" +author = importlib_metadata.metadata(project)["Author"] +copyright = "2018 Zalando SE" # TODO: verify if this is right. + +# The full version, including alpha/beta/rc tags +top_level = project.replace("-", "_") + +linkcode_url = importlib_metadata.metadata(project)["Home-page"] + + +smv_current_version = "" # will by overwritten by sphinx-multi-version to the name of the tag or branch. +html_context = {"github_version": ""} # dummy value that sphinx-github-style won't crash when run in temp folder. + + +def linkcode_resolve(*args): + # use smv_current_version as the git url + real_linkcode_url = linkcode_url + f"/blob/{smv_current_version}/" + "{filepath}#L{linestart}-L{linestop}" + return get_linkcode_resolve(real_linkcode_url)(*args) + + +# -- General configuration --------------------------------------------------- +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + "sphinx.ext.autodoc", + "sphinx.ext.doctest", + "sphinx.ext.intersphinx", + "sphinx.ext.ifconfig", + "sphinx.ext.napoleon", # to render Google format docstrings + "sphinx.ext.githubpages", + "myst_parser", + "sphinx_github_style", + "sphinx_autodoc_typehints", + "sphinx_multiversion", +] + + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "pydata_sphinx_theme" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+# html_static_path = ["_static"] + +# Napoleon settings +napoleon_include_init_with_doc = True +napoleon_include_private_with_doc = True + +source_suffix = { + ".rst": "restructuredtext", + ".md": "markdown", +} + +html_sidebars = { + "**": [ + "localtoc.html", + "searchbox.html", + "versioning.html", + ] +} + +smv_latest_version = importlib_metadata.version(project) + +# Whitelist pattern for tags (set to None to ignore all tags) +smv_tag_whitelist = r"^\d+\.\d+\.\d+$" + +# Whitelist pattern for branches (set to None to ignore all branches) +smv_branch_whitelist = r"^main|master|doc-page$" + +# Whitelist pattern for remotes (set to None to use local branches only) +smv_remote_whitelist = r"^origin$" + +# Pattern for released versions +smv_released_pattern = r"^refs/tags/\d+\.\d+\.\d+$" + +# Format for versioned output directories inside the build directory +smv_outputdir_format = "{ref.name}" + +# Determines whether remote or local git branches/tags are preferred if their output dirs conflict +smv_prefer_remote_refs = False \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 000000000..b7c33bd06 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,16 @@ +flair +===== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + modules + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` \ No newline at end of file diff --git a/docs/modules.rst b/docs/modules.rst new file mode 100644 index 000000000..eaf72506e --- /dev/null +++ b/docs/modules.rst @@ -0,0 +1,7 @@ +Api docs +======== + +.. 
toctree:: + :maxdepth: 4 + + api/flair \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 000000000..bc1d8f027 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,7 @@ +sphinx-github-style +sphinx-autodoc-typehints +myst-parser +sphinx +importlib-metadata +sphinx-multiversion +pydata-sphinx-theme \ No newline at end of file From 50925f2261aea550aa9e95961459c49c864b115b Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 10:13:56 +0200 Subject: [PATCH 043/124] document __init__.py --- flair/__init__.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/flair/__init__.py b/flair/__init__.py index 2eb42075f..0358e3ec9 100644 --- a/flair/__init__.py +++ b/flair/__init__.py @@ -9,9 +9,18 @@ from .file_utils import set_proxies cache_root = Path(os.getenv("FLAIR_CACHE_ROOT", Path(Path.home(), ".flair"))) +"""The path to the cache folder Flair is using. + +This value defaults to `/.flair`. +You can choose the path by setting the `FLAIR_CACHE_ROOT` environment variable. +""" device: torch.device -"""Flair is using a single device for everything. You can set this device by overwriting this variable.""" +"""Flair is using a single device for everything. You can set this device by overwriting this variable. + +This value will be automatically set to the first found GPU if available and to CPU otherwise. +You can choose a specific GPU, by setting the `FLAIR_DEVICE` environment variable to its index. +""" # global variable: device @@ -25,6 +34,7 @@ # global variable: version __version__ = "0.12.2" +"""The current version of the flair library installed.""" # global variable: arrow symbol _arrow = " → " @@ -55,9 +65,21 @@ ) logger = logging.getLogger("flair") +"""The logger used by Flair. + +You can reconfigure it to change the log output to your likings. +""" def set_seed(seed: int): + """Set the seed for all random generators used in training. 
+ + Use this method to guarantee reproducibility of experiments. + + Args: + seed: any value you want + + """ hf_set_seed(seed) From f302effaf23588bff4112de8d925304dbc40eef4 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 10:15:37 +0200 Subject: [PATCH 044/124] make set_proxies follow google docs format --- flair/file_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/flair/file_utils.py b/flair/file_utils.py index edad5c71b..752ee98ea 100644 --- a/flair/file_utils.py +++ b/flair/file_utils.py @@ -33,8 +33,9 @@ def set_proxies(proxies: typing.Dict[str, str]) -> None: """Allows for data downloaded from urls to be forwarded to a proxy. see https://requests.readthedocs.io/en/latest/user/advanced/#proxies - :param proxies: A dictionary of proxies according to the requests documentation. - :return: None + + Args: + proxies: A dictionary of proxies according to the requests documentation. """ global url_proxies url_proxies = proxies From ea8fe2a5eba8281adfe4ad46478bcc79ada7edaa Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 10:21:09 +0200 Subject: [PATCH 045/124] code formatting --- docs/conf.py | 2 +- flair/__init__.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index ad2ab9a3a..fcce3f960 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -100,4 +100,4 @@ def linkcode_resolve(*args): smv_outputdir_format = "{ref.name}" # Determines whether remote or local git branches/tags are preferred if their output dirs conflict -smv_prefer_remote_refs = False \ No newline at end of file +smv_prefer_remote_refs = False diff --git a/flair/__init__.py b/flair/__init__.py index 0358e3ec9..46550af6b 100644 --- a/flair/__init__.py +++ b/flair/__init__.py @@ -11,7 +11,7 @@ cache_root = Path(os.getenv("FLAIR_CACHE_ROOT", Path(Path.home(), ".flair"))) """The path to the cache folder Flair is using. -This value defaults to `/.flair`. +This value defaults to `/.flair`. 
You can choose the path by setting the `FLAIR_CACHE_ROOT` environment variable. """ @@ -19,7 +19,7 @@ """Flair is using a single device for everything. You can set this device by overwriting this variable. This value will be automatically set to the first found GPU if available and to CPU otherwise. -You can choose a specific GPU, by setting the `FLAIR_DEVICE` environment variable to its index. +You can choose a specific GPU, by setting the `FLAIR_DEVICE` environment variable to its index. """ From 431b782a691cc23e10bf8f6c5e4d3810025d59fc Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 11:59:27 +0200 Subject: [PATCH 046/124] reconfigure index page --- docs/_static/api.svg | 2 + docs/_static/contributing.svg | 1 + docs/_static/tutorial.svg | 36 ++++++++++++++ docs/conf.py | 2 +- docs/contributing/index.rst | 0 docs/index.rst | 71 ++++++++++++++++++++++++++-- docs/tutorial/intro.md | 89 +++++++++++++++++++++++++++++++++++ 7 files changed, 195 insertions(+), 6 deletions(-) create mode 100644 docs/_static/api.svg create mode 100644 docs/_static/contributing.svg create mode 100644 docs/_static/tutorial.svg create mode 100644 docs/contributing/index.rst create mode 100644 docs/tutorial/intro.md diff --git a/docs/_static/api.svg b/docs/_static/api.svg new file mode 100644 index 000000000..21e58f2f7 --- /dev/null +++ b/docs/_static/api.svg @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/docs/_static/contributing.svg b/docs/_static/contributing.svg new file mode 100644 index 000000000..c9f5e2780 --- /dev/null +++ b/docs/_static/contributing.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/_static/tutorial.svg b/docs/_static/tutorial.svg new file mode 100644 index 000000000..97559bd8a --- /dev/null +++ b/docs/_static/tutorial.svg @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index fcce3f960..fe9440dd1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ 
-9,7 +9,7 @@ release = "0.12.2" project = "flair" author = importlib_metadata.metadata(project)["Author"] -copyright = "2018 Zalando SE" # TODO: verify if this is right. +copyright = f"2023 {author}" # The full version, including alpha/beta/rc tags top_level = project.replace("-", "_") diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst new file mode 100644 index 000000000..e69de29bb diff --git a/docs/index.rst b/docs/index.rst index b7c33bd06..1ded8b6ef 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,6 +1,19 @@ flair ===== +.. _flair_docs_mainpage: + + +**Version**: |version| + +**Useful links**: +`Getting started .. _getting_started`_ | +`Source Repository `_ | +`Issue Tracker `_ | + +Flair is a very simple framework for state-of-the-art Natural Language Processing (NLP) + + .. toctree:: :maxdepth: 2 :caption: Contents: @@ -8,9 +21,57 @@ flair modules -Indices and tables -================== +.. grid:: 2 + + .. grid-item-card:: + :img-top: ./_static/tutorial.svg + + Tutorial + ^^^^^^^^ + + New to Flair? Check out the Tutorials. It contains an introduction to Flair's main concepts. + + +++ + + .. button-ref:: tutorial/intro + :expand: + :color: secondary + :click-parent: + + To the tutorial + + .. grid-item-card:: + :img-top: ./_static/api.svg + + API-docs + ^^^^^^^^ + + The API-docs provides in-depth information on the classes and functions designed for public use. + + +++ + + .. button-ref:: api/flair + :expand: + :color: secondary + :click-parent: + + To the API docs + + .. grid-item-card:: + :img-top: ./_static/contributing.svg + + Contributor's Guide + ^^^^^^^^^^^^^^^^^^^ + + Want to add to the codebase? Can help add translation or a flowchart to the + documentation? The contributing guidelines will guide you through the + process of improving NumPy. + + +++ + + .. 
button-ref:: contributing/index + :expand: + :color: secondary + :click-parent: -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` \ No newline at end of file + To the contributor's guide \ No newline at end of file diff --git a/docs/tutorial/intro.md b/docs/tutorial/intro.md new file mode 100644 index 000000000..a0e0ce2fc --- /dev/null +++ b/docs/tutorial/intro.md @@ -0,0 +1,89 @@ +--- +sidebar_position: 1 +--- + +# Quick Start + +Let's discover **Flair in less than 5 minutes**. + +### Requirements and Installation + +In your favorite virtual environment, simply do: + +``` +pip install flair +``` + +Flair requires Python 3.7+. + +### Example 1: Tag Entities in Text + +Let's run **named entity recognition** (NER) over the following example sentence: "_I love Berlin and New York._" + +Our goal is to identify names in this sentence, and their types. + +To do this, all you need is to make a `Sentence` for this text, load a pre-trained model and use it to predict tags for the sentence: + +```python +from flair.data import Sentence +from flair.nn import Classifier + +# make a sentence +sentence = Sentence('I love Berlin and New York.') + +# load the NER tagger +tagger = Classifier.load('ner') + +# run NER over sentence +tagger.predict(sentence) + +# print the sentence with all annotations +print(sentence) +``` + +This should print: + +```console +Sentence[7]: "I love Berlin and New York." → ["Berlin"/LOC, "New York"/LOC] +``` + +The output shows that both "Berlin" and "New York" were tagged as **location entities** (LOC) in this sentence. + + +### Example 2: Detect Sentiment + +Let's run **sentiment analysis** over the same sentence to determine whether it is POSITIVE or NEGATIVE. + +You can do this with essentially the same code as above. 
Just instead of loading the 'ner' model, you now load the 'sentiment' model: + + +```python +from flair.data import Sentence +from flair.nn import Classifier + +# make a sentence +sentence = Sentence('I love Berlin and New York.') + +# load the sentiment tagger +tagger = Classifier.load('sentiment') + +# run sentiment analysis over sentence +tagger.predict(sentence) + +# print the sentence with all annotations +print(sentence) + +``` + +This should print: + +```console +Sentence[7]: "I love Berlin and New York." → POSITIVE (0.9982) +``` + +The output shows that the sentence "_I love Berlin and New York._" was tagged as having **POSITIVE** sentiment. + + +### Summary + +Congrats, you now know how to use Flair to find entities and detect sentiment! \ No newline at end of file From 0422c0e2a70cc6a66c9e81f3278c0d6384459e65 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 12:12:25 +0200 Subject: [PATCH 047/124] add grid and getting started section --- docs/conf.py | 1 + docs/index.rst | 1 - docs/requirements.txt | 3 ++- docs/tutorial/intro.md | 8 ++++---- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index fe9440dd1..bc9af1eec 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -42,6 +42,7 @@ def linkcode_resolve(*args): "sphinx_github_style", "sphinx_autodoc_typehints", "sphinx_multiversion", + "sphinx_design", ] diff --git a/docs/index.rst b/docs/index.rst index 1ded8b6ef..9ae16c893 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,7 +7,6 @@ flair **Version**: |version| **Useful links**: -`Getting started .. 
_getting_started`_ | `Source Repository `_ | `Issue Tracker `_ | diff --git a/docs/requirements.txt b/docs/requirements.txt index bc1d8f027..cfb2a9a5a 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,4 +4,5 @@ myst-parser sphinx importlib-metadata sphinx-multiversion -pydata-sphinx-theme \ No newline at end of file +pydata-sphinx-theme +sphinx_design \ No newline at end of file diff --git a/docs/tutorial/intro.md b/docs/tutorial/intro.md index a0e0ce2fc..373e5a2d4 100644 --- a/docs/tutorial/intro.md +++ b/docs/tutorial/intro.md @@ -6,7 +6,7 @@ sidebar_position: 1 Let's discover **Flair in less than 5 minutes**. -### Requirements and Installation +## Requirements and Installation In your favorite virtual environment, simply do: @@ -16,7 +16,7 @@ pip install flair Flair requires Python 3.7+. -### Example 1: Tag Entities in Text +## Example 1: Tag Entities in Text Let's run **named entity recognition** (NER) over the following example sentence: "_I love Berlin and New York._" @@ -50,7 +50,7 @@ Sentence[7]: "I love Berlin and New York." → ["Berlin"/LOC, "New York"/LOC] The output shows that both "Berlin" and "New York" were tagged as **location entities** (LOC) in this sentence. -### Example 2: Detect Sentiment +## Example 2: Detect Sentiment Let's run **sentiment analysis** over the same sentence to determine whether it is POSITIVE or NEGATIVE. @@ -84,6 +84,6 @@ Sentence[7]: "I love Berlin and New York." → POSITIVE (0.9982) The output shows that the sentence "_I love Berlin and New York._" was tagged as having **POSITIVE** sentiment. -### Summary +## Summary Congrats, you now know how to use Flair to find entities and detect sentiment! 
\ No newline at end of file From d551b1e2f86f23ddf1c95ca53a3f8f123b8975a6 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 12:14:08 +0200 Subject: [PATCH 048/124] remove doctree --- docs/index.rst | 8 -------- 1 file changed, 8 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 9ae16c893..2d162371f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,14 +12,6 @@ flair Flair is a very simple framework for state-of-the-art Natural Language Processing (NLP) - -.. toctree:: - :maxdepth: 2 - :caption: Contents: - - modules - - .. grid:: 2 .. grid-item-card:: From cbe5322bf56d1b9714b4364079c11d1d7cb77505 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 13:08:17 +0200 Subject: [PATCH 049/124] add toctree for tutorials api and contributing --- docs/api/index.rst | 5 +++++ docs/index.rst | 11 ++++++++++- docs/tutorial/index.rst | 5 +++++ 3 files changed, 20 insertions(+), 1 deletion(-) create mode 100644 docs/api/index.rst create mode 100644 docs/tutorial/index.rst diff --git a/docs/api/index.rst b/docs/api/index.rst new file mode 100644 index 000000000..6affbb2bc --- /dev/null +++ b/docs/api/index.rst @@ -0,0 +1,5 @@ +.. toctree:: + :maxdepth: 4 + :caption: API: + + flair \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 2d162371f..136393777 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,6 +7,7 @@ flair **Version**: |version| **Useful links**: +`Getting started .. _getting_started`_ | `Source Repository `_ | `Issue Tracker `_ | @@ -65,4 +66,12 @@ Flair is a very simple framework for state-of-the-art Natural Language Processin :color: secondary :click-parent: - To the contributor's guide \ No newline at end of file + To the contributor's guide + +.. 
toctree:: + :maxdepth: 1 + :hidden: + + Tutorials + API reference + Contributing \ No newline at end of file diff --git a/docs/tutorial/index.rst b/docs/tutorial/index.rst new file mode 100644 index 000000000..601add6c8 --- /dev/null +++ b/docs/tutorial/index.rst @@ -0,0 +1,5 @@ +.. toctree:: + :maxdepth: 2 + :caption: Tutorials: + + intro From a047fdb4c6d8609c996cebb85a573ef96e50662d Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 13:12:30 +0200 Subject: [PATCH 050/124] add headlines to sections --- docs/api/index.rst | 3 +++ docs/contributing/index.rst | 2 ++ docs/index.rst | 8 ++++---- docs/tutorial/index.rst | 6 ++++++ 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/docs/api/index.rst b/docs/api/index.rst index 6affbb2bc..5382f0c72 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -1,3 +1,6 @@ +API Docs +======== + .. toctree:: :maxdepth: 4 :caption: API: diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst index e69de29bb..0b29ebea0 100644 --- a/docs/contributing/index.rst +++ b/docs/contributing/index.rst @@ -0,0 +1,2 @@ +Contributing +============ \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 136393777..2c910907f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,7 +7,7 @@ flair **Version**: |version| **Useful links**: -`Getting started .. _getting_started`_ | +`Getting started .. _flair_tutorials`_ | `Source Repository `_ | `Issue Tracker `_ | @@ -25,7 +25,7 @@ Flair is a very simple framework for state-of-the-art Natural Language Processin +++ - .. button-ref:: tutorial/intro + .. button-ref:: tutorial/index :expand: :color: secondary :click-parent: @@ -42,7 +42,7 @@ Flair is a very simple framework for state-of-the-art Natural Language Processin +++ - .. button-ref:: api/flair + .. 
button-ref:: api/index :expand: :color: secondary :click-parent: @@ -73,5 +73,5 @@ Flair is a very simple framework for state-of-the-art Natural Language Processin :hidden: Tutorials - API reference + API reference Contributing \ No newline at end of file diff --git a/docs/tutorial/index.rst b/docs/tutorial/index.rst index 601add6c8..afbb47182 100644 --- a/docs/tutorial/index.rst +++ b/docs/tutorial/index.rst @@ -1,3 +1,9 @@ +Tutorials +========= + + +.. _flair_tutorials: + .. toctree:: :maxdepth: 2 :caption: Tutorials: From c67340cb9e62ca085f5a7b8e1ee10dc008f88271 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 13:16:56 +0200 Subject: [PATCH 051/124] remove redundant titles --- docs/api/index.rst | 1 - docs/tutorial/index.rst | 1 - 2 files changed, 2 deletions(-) diff --git a/docs/api/index.rst b/docs/api/index.rst index 5382f0c72..4625b412f 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -3,6 +3,5 @@ API Docs .. toctree:: :maxdepth: 4 - :caption: API: flair \ No newline at end of file diff --git a/docs/tutorial/index.rst b/docs/tutorial/index.rst index afbb47182..38230a8cb 100644 --- a/docs/tutorial/index.rst +++ b/docs/tutorial/index.rst @@ -6,6 +6,5 @@ Tutorials .. toctree:: :maxdepth: 2 - :caption: Tutorials: intro From c08adbed7775883f1b1372de7266d77064d90240 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 13:34:23 +0200 Subject: [PATCH 052/124] add readme for gh page --- assets/README.md | 6 ++++++ flair/file_utils.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/assets/README.md b/assets/README.md index e69de29bb..55b6e8da7 100644 --- a/assets/README.md +++ b/assets/README.md @@ -0,0 +1,6 @@ +# Docs For Flair NLP + +This branch is currently under construction. + +It will contain the docs for Flair NLP. +Don't change files, as this branch will be autogenerated using github actions. 
\ No newline at end of file diff --git a/flair/file_utils.py b/flair/file_utils.py index 752ee98ea..7968fefc2 100644 --- a/flair/file_utils.py +++ b/flair/file_utils.py @@ -30,7 +30,7 @@ def set_proxies(proxies: typing.Dict[str, str]) -> None: - """Allows for data downloaded from urls to be forwarded to a proxy. + r"""Allows for data downloaded from urls to be forwarded to a proxy. see https://requests.readthedocs.io/en/latest/user/advanced/#proxies From 3a5e55d19bdbcd7bf813237de9884e7af6b2b1f2 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 15:33:51 +0200 Subject: [PATCH 053/124] add contributing docs --- .github/ISSUE_TEMPLATE/config.yaml | 1 + docs/conf.py | 2 +- docs/contributing/index.rst | 10 +++- docs/contributing/local_development.md | 47 ++++++++++++++++ docs/contributing/making_a_pull_request.md | 17 ++++++ docs/contributing/updating_documentation.md | 47 ++++++++++++++++ docs/contributing/writing_a_good_issue.md | 59 +++++++++++++++++++++ 7 files changed, 181 insertions(+), 2 deletions(-) create mode 100644 .github/ISSUE_TEMPLATE/config.yaml create mode 100644 docs/contributing/local_development.md create mode 100644 docs/contributing/making_a_pull_request.md create mode 100644 docs/contributing/updating_documentation.md create mode 100644 docs/contributing/writing_a_good_issue.md diff --git a/.github/ISSUE_TEMPLATE/config.yaml b/.github/ISSUE_TEMPLATE/config.yaml new file mode 100644 index 000000000..ec4bb386b --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yaml @@ -0,0 +1 @@ +blank_issues_enabled: false \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index bc9af1eec..b0e4b8e35 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -89,7 +89,7 @@ def linkcode_resolve(*args): smv_tag_whitelist = r"^\d+\.\d+\.\d+$" # Whitelist pattern for branches (set to None to ignore all branches) -smv_branch_whitelist = r"^main|master|doc-page$" +smv_branch_whitelist = r"^master|doc-page$" # Whitelist pattern for remotes (set to 
None to use local branches only) smv_remote_whitelist = r"^origin$" diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst index 0b29ebea0..ef72362f3 100644 --- a/docs/contributing/index.rst +++ b/docs/contributing/index.rst @@ -1,2 +1,10 @@ Contributing -============ \ No newline at end of file +============ + +.. toctree:: + :maxdepth: 1 + + writing_a_good_issue + local_development + making_a_pull_request + updating_documentation diff --git a/docs/contributing/local_development.md b/docs/contributing/local_development.md new file mode 100644 index 000000000..bd9acc455 --- /dev/null +++ b/docs/contributing/local_development.md @@ -0,0 +1,47 @@ +# Local Development + +For contributors looking to get deeper into the API we suggest cloning the repository and checking out the unit +tests for examples of how to call methods. Most classes and methods are documented, so finding your way around +the code should hopefully be easy. + +## Setup + +Flair requires python-3.7 or higher. To make sure our code also runs on the oldest supported +python version, it is recommended to use python-3.7.x for flair development. + +Create a python environment of your preference and run: +```bash +pip install -r requirements-dev.txt +pip install -e . +``` + +## Tests + +To only run typechecks and check the code formatting execute: + +```bash +pytest flair +``` + +To run all basic tests execute: + +```bash +pytest +``` + +To run integration tests execute: + +```bash +pytest --runintegration +``` + +The integration tests will train small models and therefore take more time. +In general, it is recommended to ensure all basic tests are running through before testing the integration tests + +## Code Formatting + +To ensure a standardized code style we use the formatter [black](https://github.com/ambv/black) and for standardizing imports we use [ruff](https://github.com/charliermarsh/ruff). +If your code is not formatted properly, the tests will fail. 
+
+We recommend configuring your IDE to run these formatters for you, but you can also always run them manually via
+`black . && ruff --fix .` in the flair root folder.
\ No newline at end of file
diff --git a/docs/contributing/making_a_pull_request.md b/docs/contributing/making_a_pull_request.md
new file mode 100644
index 000000000..ae795ec68
--- /dev/null
+++ b/docs/contributing/making_a_pull_request.md
@@ -0,0 +1,17 @@
+# Making a pull request
+
+We are happy to accept your contributions to make `flair` better and more awesome! To avoid unnecessary work on either
+side, please stick to the following process:
+
+1. Check if there is already [an issue](https://github.com/flairNLP/flair/issues) for your concern.
+2. If there is not, open a new one to start a discussion. We hate to close finished PRs!
+3. If we decide your concern needs code changes, we would be happy to accept a pull request. Please consider the
+ commit guidelines below.
+
+
+## Git Commit Guidelines
+
+If there is already a ticket, use this number at the start of your commit message.
+Use meaningful commit messages that describe what you did.
+
+**Example:** `GH-42: Added new type of embeddings: DocumentEmbedding.`
\ No newline at end of file
diff --git a/docs/contributing/updating_documentation.md b/docs/contributing/updating_documentation.md
new file mode 100644
index 000000000..92553b372
--- /dev/null
+++ b/docs/contributing/updating_documentation.md
@@ -0,0 +1,47 @@
+# Updating documentation
+
+
+## What is good documentation?
+
+Good Documentation
+* Always refers to the end user. Do not document *why* something is the way it is, but rather *how* to use it.
+* Doesn't lie and is always up-to-date. Whenever code is updated, consider if the documentation needs to change accordingly to reflect reality.
+* Provides useful links whenever usable. Do not reference another object without linking it.
+
+
+## Tutorials
+
+All tutorials are markdown files stored at [the tutorial folder](https://github.com/flairNLP/flair/tree/master/docs/tutorial).
+When adding a new tutorial, you must add its name to the `index.rst` file in the respective folder.
+We are using the [MyST parser](https://myst-parser.readthedocs.io/en/latest/syntax/typography.html) which adds
+some additional syntax over markdown.
+
+A tutorial should always be easy to understand, and reference api documentation for further reading.
+
+:::{note}
+You can reference symbols by defining links
+`[flair.set_seed](#flair.set_seed)`
+[flair.set_seed](#flair.set_seed)`
+:::
+
+
+## Docstrings
+
+For docstrings we follow the [Google docstring](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) format.
+We do not need to specify types or default values, as those will be extracted from the function signature.
+
+Docstrings usually have a one-liner giving a simple explanation of the object. Then a more detailed explanation follows **if required**.
+Ensure that you always use cross-references instead of just mentioning another object,
+e.g. ``:class:`flair.models.SequenceTagger` `` can be used to reference the SequenceTagger.
+
+
+## Building the local docs
+
+For building the docs,
+
+* Ensure that you have everything committed. Local changes won't be used for building.
+* Install the build dependencies via `pip install -r docs/requirements.txt`.
+* In `docs/conf.py` temporarily add your local branch name to the `smv_branch_whitelist` pattern.
+  E.g. if your branch is called `doc-page` `smv_branch_whitelist` needs to have the value `r"^master|doc-page$"`
+* run `sphinx-multiversion docs doc_build/` to generate the docs.
+* open `doc_build//index.html` to view the docs.
diff --git a/docs/contributing/writing_a_good_issue.md b/docs/contributing/writing_a_good_issue.md
new file mode 100644
index 000000000..da89567cc
--- /dev/null
+++ b/docs/contributing/writing_a_good_issue.md
@@ -0,0 +1,59 @@
+# Writing a good issue
+
+You are likely reading this because you want to create an issue.
+This is great; issues are a great way to provide feedback which then can be used to enhance the library.
+Here are some guidelines to make the issue as insightful as possible.
+
+## Issue types
+
+Before you start with the issue, you need to choose the type.
+There are 3 issue types:
+
+* **Bug Report** -> You have noticed something that doesn't work the way it is expected.
+* **Feature/Enhancement request** -> You have an idea for something that would make flair better.
+* **Question** -> You have a question that is unrelated to a potential bug or feature request.
+
+### Bug Report
+
+A bug report is one of the most common issues. It is simple: you tried something, but it didn't work as expected.
+It is important to provide as much context as possible, so ensure that you ran the [collect_env.py](https://github.com/flairNLP/flair/blob/master/collect_env.py) and, if required, created a minimal reproducible example.
+The minimal reproducible example has, like the name says, two properties:
+* it is reproducible
+* it is as small as possible
+
+#### Reproducibility
+
+Please ensure that we can really reproduce your issue.
+
+You might have encountered the issue while training on your custom dataset and don't want to share it. That is ok,
+but maybe you can test if you can recreate the same bug by using one of the many public datasets instead and if not,
+maybe filter the problem down to a single sentence and report what property it has.
+
+It is also possible that you have encountered the issue while predicting some sentences.
Maybe you don't want to share
+your trained model, but maybe you can recreate the issue by creating a model without training it?
+
+Please be sure not to add local paths or load any data that others cannot access.
+
+#### Minimal
+
+After ensuring reproducibility, please also take some time to make it minimal. That way, we can understand more quickly
+what the issue is and won't need to spend time debugging code that is unrelated to the issue.
+
+For example, you might get an error where the stack trace shows that it occurred while saving the model. In that case,
+you can verify whether the model really needs to be trained on the full dataset for 100 epochs and test if it instead would be enough
+to just create a model and save it with no training involved.
+
+### Feature/Enhancement request
+
+For a Feature/Enhancement request, please provide not only the *what* but also the *why*; it is easier to judge how important a feature is
+when you know why it is wanted and what it could provide to the users.
+
+### Question
+
+Questions are the most generic types of issues, but also those that usually lack most of the context.
+Please ensure that you are not creating a Question that should actually be a bug report.
+
+For example, issues like: `[Question]: Something is wrong with ...`, `[Question]: sentence.to_dict(tag_type='ner') no longer have ...`
+or `[Question]: MultiTagger cannot be loaded...` are examples of issues that clearly should be bug reports instead and
+could have been resolved more quickly if enough context and a minimal reproducible example were provided.
+ From ebfc829bffc0dae3b941d7e693c6fc1f3f33df69 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 15:39:43 +0200 Subject: [PATCH 054/124] fix note parsing --- docs/contributing/updating_documentation.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/docs/contributing/updating_documentation.md b/docs/contributing/updating_documentation.md index 92553b372..112b85868 100644 --- a/docs/contributing/updating_documentation.md +++ b/docs/contributing/updating_documentation.md @@ -18,12 +18,10 @@ some additional syntax over markdown. A tutorial should always be easy to understand, and reference api documentation for future readings. -:::{note} -You can reference symbols by defining links -`[flair.set_seed](#flair.set_seed)` -[flair.set_seed](#flair.set_seed)` -::: - +```{note} + You can reference symbols by defining links + `[flair.set_seed](#flair.set_seed)` +``` ## Docstrings From 1d21ddf09f76c32ab0eda54a76982b8894400733 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 15:53:26 +0200 Subject: [PATCH 055/124] add more autodoc modules --- docs/api/flair.data.rst | 7 +++++++ docs/api/flair.datasets.rst | 7 +++++++ docs/api/flair.embeddings.rst | 7 +++++++ docs/api/flair.models.rst | 7 +++++++ docs/api/flair.splitter.rst | 7 +++++++ docs/api/flair.tokenziation.rst | 7 +++++++ docs/api/flair.trainers.rst | 7 +++++++ docs/api/index.rst | 9 ++++++++- docs/modules.rst | 7 ------- 9 files changed, 57 insertions(+), 8 deletions(-) create mode 100644 docs/api/flair.data.rst create mode 100644 docs/api/flair.datasets.rst create mode 100644 docs/api/flair.embeddings.rst create mode 100644 docs/api/flair.models.rst create mode 100644 docs/api/flair.splitter.rst create mode 100644 docs/api/flair.tokenziation.rst create mode 100644 docs/api/flair.trainers.rst delete mode 100644 docs/modules.rst diff --git a/docs/api/flair.data.rst b/docs/api/flair.data.rst new file mode 100644 index 000000000..9d93885dd --- /dev/null 
+++ b/docs/api/flair.data.rst @@ -0,0 +1,7 @@ +flair.data +========== + +.. automodule:: flair.data + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/flair.datasets.rst b/docs/api/flair.datasets.rst new file mode 100644 index 000000000..0dcf3900d --- /dev/null +++ b/docs/api/flair.datasets.rst @@ -0,0 +1,7 @@ +flair.datasets +============== + +.. automodule:: flair.datasets + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/flair.embeddings.rst b/docs/api/flair.embeddings.rst new file mode 100644 index 000000000..107a82890 --- /dev/null +++ b/docs/api/flair.embeddings.rst @@ -0,0 +1,7 @@ +flair.embeddings +================ + +.. automodule:: flair.embeddings + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/flair.models.rst b/docs/api/flair.models.rst new file mode 100644 index 000000000..5fe0cf440 --- /dev/null +++ b/docs/api/flair.models.rst @@ -0,0 +1,7 @@ +flair.models +============ + +.. automodule:: flair.models + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/flair.splitter.rst b/docs/api/flair.splitter.rst new file mode 100644 index 000000000..c577b3b94 --- /dev/null +++ b/docs/api/flair.splitter.rst @@ -0,0 +1,7 @@ +flair.splitter +============== + +.. automodule:: flair.splitter + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/flair.tokenziation.rst b/docs/api/flair.tokenziation.rst new file mode 100644 index 000000000..92baa459d --- /dev/null +++ b/docs/api/flair.tokenziation.rst @@ -0,0 +1,7 @@ +flair.tokenization +================== + +.. 
automodule:: flair.tokenization + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/flair.trainers.rst b/docs/api/flair.trainers.rst new file mode 100644 index 000000000..dabe55cb5 --- /dev/null +++ b/docs/api/flair.trainers.rst @@ -0,0 +1,7 @@ +flair.trainers +============== + +.. automodule:: flair.trainers + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/index.rst b/docs/api/index.rst index 4625b412f..64408152a 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -4,4 +4,11 @@ API Docs .. toctree:: :maxdepth: 4 - flair \ No newline at end of file + flair + flair.data + flair.datasets + flair.embeddings + flair.models + flair.splitter + flair.tokenization + flair.trainers \ No newline at end of file diff --git a/docs/modules.rst b/docs/modules.rst deleted file mode 100644 index eaf72506e..000000000 --- a/docs/modules.rst +++ /dev/null @@ -1,7 +0,0 @@ -Api docs -======== - -.. toctree:: - :maxdepth: 4 - - api/flair \ No newline at end of file From 5261e07bd45a1a30e8c7234d9142b7c499afcdf6 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 15:54:39 +0200 Subject: [PATCH 056/124] limit depth in toctree --- docs/api/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api/index.rst b/docs/api/index.rst index 64408152a..ae6d8d517 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -2,7 +2,7 @@ API Docs ======== .. 
toctree:: - :maxdepth: 4 + :maxdepth: 1 flair flair.data From f35439b236b1982849d41d0ea4faf7c426bb1875 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 15:56:31 +0200 Subject: [PATCH 057/124] fix typo --- docs/api/{flair.tokenziation.rst => flair.tokenization.rst} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename docs/api/{flair.tokenziation.rst => flair.tokenization.rst} (100%) diff --git a/docs/api/flair.tokenziation.rst b/docs/api/flair.tokenization.rst similarity index 100% rename from docs/api/flair.tokenziation.rst rename to docs/api/flair.tokenization.rst From 5b3fac6f998c269c6850ed7c0ae3b864bc674f25 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 16:02:11 +0200 Subject: [PATCH 058/124] improve docstrings of trainer.train_custom --- docs/index.rst | 2 +- flair/trainers/trainer.py | 79 ++++++++++++++++++++------------------- 2 files changed, 41 insertions(+), 40 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 2c910907f..a16e874ee 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,7 +7,7 @@ flair **Version**: |version| **Useful links**: -`Getting started .. _flair_tutorials`_ | +`Getting started .. tutorial/index/intro`_ | `Source Repository `_ | `Issue Tracker `_ | diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 3cf96f559..f52cea1d6 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -329,49 +329,50 @@ def train_custom( ) -> dict: """Trains any class that implements the flair.nn.Model interface. 
- Args: - base_path: Main path to which all output during training is logged and models are saved - learning_rate (float): The learning rate of the optimizer - decoder_learning_rate (Optional[float]): Optional, if set, the decoder is trained with a separate learning rate - mini_batch_size (int): Size of mini-batches during training - eval_batch_size (int): Size of mini-batches during evaluation - mini_batch_chunk_size (int): If mini-batches are larger than this number, they get broken down into chunks of - this size for processing purposes - max_epochs (int): Maximum number of epochs to train. Terminates training if this number is surpassed. - optimizer: The optimizer to use (typically SGD or Adam) - train_with_dev (bool): If True, the data from dev split is added to the training data - train_with_test (bool): If True, the data from test split is added to the training data - main_evaluation_metric: The metric to optimize (often micro-average or macro-average F1-score, or accuracy) - monitor_test (bool): If True, test data is evaluated at end of each epoch - monitor_train_sample: Set this to evaluate on a sample of the train data at the end of each epoch. - If you set an int, it will sample this many sentences to evaluate on. If you set a float, it will sample - a percentage of data points from train. 
- max_grad_norm (Optional[float]): If not None, gradients are clipped to this value before an optimizer.step is + Parameters + ---------- + base_path: Main path to which all output during training is logged and models are saved + learning_rate: The learning rate of the optimizer + decoder_learning_rate: Optional, if set, the decoder is trained with a separate learning rate + mini_batch_size: Size of mini-batches during training + eval_batch_size: Size of mini-batches during evaluation + mini_batch_chunk_size: If mini-batches are larger than this number, they get broken down into chunks of + this size for processing purposes + max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed. + optimizer: The optimizer to use (typically SGD or Adam) + train_with_dev: If True, the data from dev split is added to the training data + train_with_test: If True, the data from test split is added to the training data + main_evaluation_metric: The metric to optimize (often micro-average or macro-average F1-score, or accuracy) + monitor_test: If True, test data is evaluated at end of each epoch + monitor_train_sample: Set this to evaluate on a sample of the train data at the end of each epoch. + If you set an int, it will sample this many sentences to evaluate on. If you set a float, it will sample + a percentage of data points from train. + max_grad_norm: If not None, gradients are clipped to this value before an optimizer.step is called. - use_final_model_for_eval (bool): If True, the final model is used for the final evaluation. If False, the - model from the best epoch as determined by main_evaluation_metric is used for the final evaluation. - gold_label_dictionary_for_eval: Set to force evaluation to use a particular label dictionary - exclude_labels: Optionally define a list of labels to exclude from the evaluation - sampler: You can pass a data sampler here for special sampling of data. 
- shuffle: If True, data is shuffled during training - shuffle_first_epoch: If True, data is shuffled during the first epoch of training - embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed), - 'cpu' (embeddings stored on CPU) or 'gpu' (embeddings stored on GPU) - epoch: The starting epoch (normally 0 but could be higher if you continue training model) - save_final_model: If True, the final model is saved at the end of training. - save_optimizer_state (bool): If True, the optimizer state is saved alongside the model - save_model_each_k_epochs: Each k epochs, a model state will be written out. If set to '5', a model will - be saved each 5 epochs. Default is 0 which means no model saving. - create_file_logs (bool): If True, logging output is written to a file - create_loss_file (bool): If True, a loss file logging output is created - use_amp (bool): If True, uses the torch automatic mixed precision - write_weights (bool): If True, write weights to weights.txt on each batch logging event. - plugins: Any additional plugins you want to pass to the trainer - **kwargs: Additional arguments, for instance for the optimizer + use_final_model_for_eval: If True, the final model is used for the final evaluation. If False, the + model from the best epoch as determined by main_evaluation_metric is used for the final evaluation. + gold_label_dictionary_for_eval: Set to force evaluation to use a particular label dictionary + exclude_labels: Optionally define a list of labels to exclude from the evaluation + sampler: You can pass a data sampler here for special sampling of data. 
+ shuffle: If True, data is shuffled during training + shuffle_first_epoch: If True, data is shuffled during the first epoch of training + embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed), + 'cpu' (embeddings stored on CPU) or 'gpu' (embeddings stored on GPU) + epoch: The starting epoch (normally 0 but could be higher if you continue training model) + save_final_model: If True, the final model is saved at the end of training. + save_optimizer_state: If True, the optimizer state is saved alongside the model + save_model_each_k_epochs: Each k epochs, a model state will be written out. If set to '5', a model will + be saved each 5 epochs. Default is 0 which means no model saving. + create_file_logs: If True, logging output is written to a file + create_loss_file: If True, a loss file logging output is created + use_amp: If True, uses the torch automatic mixed precision + write_weights: If True, write weights to weights.txt on each batch logging event. + plugins: Any additional plugins you want to pass to the trainer + **kwargs: Additional arguments, for instance for the optimizer Returns: ------- - dict: A dictionary with at least the key "test_score" containing the final evaluation score. Some plugins + training_results: A dictionary with at least the key "test_score" containing the final evaluation score. 
Some plugins add additional information to this dictionary, such as the :class:`MetricHistoryPlugin` """ # Create output folder From 82a1e0a49a8c2329e4fd802554920679551ce744 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 16:05:49 +0200 Subject: [PATCH 059/124] improve docstrings of trainer.train_custom --- flair/trainers/trainer.py | 84 +++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 44 deletions(-) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index f52cea1d6..b0249cc1f 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -16,6 +16,7 @@ import flair.nn from flair.data import Corpus, Dictionary, _len_dataset from flair.datasets import DataLoader +from flair.samplers import FlairSampler from flair.trainers.plugins import ( AnnealingPlugin, CheckpointPlugin, @@ -307,7 +308,7 @@ def train_custom( gold_label_dictionary_for_eval: Optional[Dictionary] = None, exclude_labels: List[str] = [], # sampling and shuffling - sampler=None, + sampler: Optional[FlairSampler] = None, shuffle: bool = True, shuffle_first_epoch: bool = True, # evaluation and monitoring @@ -329,51 +330,46 @@ def train_custom( ) -> dict: """Trains any class that implements the flair.nn.Model interface. - Parameters - ---------- - base_path: Main path to which all output during training is logged and models are saved - learning_rate: The learning rate of the optimizer - decoder_learning_rate: Optional, if set, the decoder is trained with a separate learning rate - mini_batch_size: Size of mini-batches during training - eval_batch_size: Size of mini-batches during evaluation - mini_batch_chunk_size: If mini-batches are larger than this number, they get broken down into chunks of - this size for processing purposes - max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed. 
- optimizer: The optimizer to use (typically SGD or Adam) - train_with_dev: If True, the data from dev split is added to the training data - train_with_test: If True, the data from test split is added to the training data - main_evaluation_metric: The metric to optimize (often micro-average or macro-average F1-score, or accuracy) - monitor_test: If True, test data is evaluated at end of each epoch - monitor_train_sample: Set this to evaluate on a sample of the train data at the end of each epoch. - If you set an int, it will sample this many sentences to evaluate on. If you set a float, it will sample - a percentage of data points from train. - max_grad_norm: If not None, gradients are clipped to this value before an optimizer.step is - called. - use_final_model_for_eval: If True, the final model is used for the final evaluation. If False, the - model from the best epoch as determined by main_evaluation_metric is used for the final evaluation. - gold_label_dictionary_for_eval: Set to force evaluation to use a particular label dictionary - exclude_labels: Optionally define a list of labels to exclude from the evaluation - sampler: You can pass a data sampler here for special sampling of data. - shuffle: If True, data is shuffled during training - shuffle_first_epoch: If True, data is shuffled during the first epoch of training - embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed), - 'cpu' (embeddings stored on CPU) or 'gpu' (embeddings stored on GPU) - epoch: The starting epoch (normally 0 but could be higher if you continue training model) - save_final_model: If True, the final model is saved at the end of training. - save_optimizer_state: If True, the optimizer state is saved alongside the model - save_model_each_k_epochs: Each k epochs, a model state will be written out. If set to '5', a model will - be saved each 5 epochs. Default is 0 which means no model saving. 
- create_file_logs: If True, logging output is written to a file - create_loss_file: If True, a loss file logging output is created - use_amp: If True, uses the torch automatic mixed precision - write_weights: If True, write weights to weights.txt on each batch logging event. - plugins: Any additional plugins you want to pass to the trainer - **kwargs: Additional arguments, for instance for the optimizer + Args: + base_path: Main path to which all output during training is logged and models are saved + learning_rate: The learning rate of the optimizer + decoder_learning_rate: Optional, if set, the decoder is trained with a separate learning rate + mini_batch_size: Size of mini-batches during training + eval_batch_size: Size of mini-batches during evaluation + mini_batch_chunk_size: If mini-batches are larger than this number, they get broken down into chunks of + this size for processing purposes + max_epochs: Maximum number of epochs to train. Terminates training if this number is surpassed. + optimizer: The optimizer to use (typically SGD or Adam) + train_with_dev: If True, the data from dev split is added to the training data + train_with_test: If True, the data from test split is added to the training data + main_evaluation_metric: The metric to optimize (often micro-average or macro-average F1-score, or accuracy) + monitor_test: If True, test data is evaluated at end of each epoch + monitor_train_sample: Set this to evaluate on a sample of the train data at the end of each epoch. + If you set an int, it will sample this many sentences to evaluate on. If you set a float, it will sample + a percentage of data points from train. + use_final_model_for_eval: If True, the final model is used for the final evaluation. If False, the + model from the best epoch as determined by main_evaluation_metric is used for the final evaluation. 
+ gold_label_dictionary_for_eval: Set to force evaluation to use a particular label dictionary + exclude_labels: Optionally define a list of labels to exclude from the evaluation + sampler: You can pass a data sampler here for special sampling of data. + shuffle: If True, data is shuffled during training + shuffle_first_epoch: If True, data is shuffled during the first epoch of training + embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed), + 'cpu' (embeddings stored on CPU) or 'gpu' (embeddings stored on GPU) + epoch: The starting epoch (normally 0 but could be higher if you continue training model) + save_final_model: If True, the final model is saved at the end of training. + save_optimizer_state: If True, the optimizer state is saved alongside the model + save_model_each_k_epochs: Each k epochs, a model state will be written out. If set to '5', a model will + be saved each 5 epochs. Default is 0 which means no model saving. + create_file_logs: If True, logging output is written to a file + create_loss_file: If True, a loss file logging output is created + write_weights: If True, write weights to weights.txt on each batch logging event. + plugins: Any additional plugins you want to pass to the trainer + **kwargs: Additional arguments, for instance for the optimizer Returns: - ------- - training_results: A dictionary with at least the key "test_score" containing the final evaluation score. Some plugins - add additional information to this dictionary, such as the :class:`MetricHistoryPlugin` + training_results: A dictionary with at least the key "test_score" containing the final evaluation score. 
Some plugins + add additional information to this dictionary, such as the :class:`MetricHistoryPlugin` """ # Create output folder base_path = Path(base_path) From 25619518da6ea46e5f73b9a342859c438da01b38 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 16:08:53 +0200 Subject: [PATCH 060/124] improve docstrings of trainer.train_custom --- docs/api/flair.trainers.plugins.rst | 7 +++++++ flair/trainers/trainer.py | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 docs/api/flair.trainers.plugins.rst diff --git a/docs/api/flair.trainers.plugins.rst b/docs/api/flair.trainers.plugins.rst new file mode 100644 index 000000000..6a4c6cd2c --- /dev/null +++ b/docs/api/flair.trainers.plugins.rst @@ -0,0 +1,7 @@ +flair.trainers.plugins +====================== + +.. automodule:: flair.trainers.plugins + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index b0249cc1f..e16bbbb7e 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -368,8 +368,8 @@ def train_custom( **kwargs: Additional arguments, for instance for the optimizer Returns: - training_results: A dictionary with at least the key "test_score" containing the final evaluation score. Some plugins - add additional information to this dictionary, such as the :class:`MetricHistoryPlugin` + A dictionary with at least the key "test_score" containing the final evaluation score. 
Some plugins add + additional information to this dictionary, such as the :class:`MetricHistoryPlugin` """ # Create output folder base_path = Path(base_path) From 26f769a8636e9e847290524166e9f0ef6b181a1a Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 16:10:18 +0200 Subject: [PATCH 061/124] improve docstrings of trainer.train_custom --- flair/trainers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index e16bbbb7e..efd17ea61 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -369,7 +369,7 @@ def train_custom( Returns: A dictionary with at least the key "test_score" containing the final evaluation score. Some plugins add - additional information to this dictionary, such as the :class:`MetricHistoryPlugin` + additional information to this dictionary, such as the :class:`flair.trainers.plugins.MetricHistoryPlugin` """ # Create output folder base_path = Path(base_path) From 495f4a81d22a868c0c2baeaa99626291820874c2 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 16:24:02 +0200 Subject: [PATCH 062/124] make index more applicable --- docs/api/index.rst | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/docs/api/index.rst b/docs/api/index.rst index ae6d8d517..9b663700b 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -3,12 +3,8 @@ API Docs .. 
toctree:: :maxdepth: 1 + :glob: + :genindex: flair - flair.data - flair.datasets - flair.embeddings - flair.models - flair.splitter - flair.tokenization - flair.trainers \ No newline at end of file + flair.* \ No newline at end of file From c2044a7cd6a914b3f638efd27f578d65335a8c2a Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 16:25:22 +0200 Subject: [PATCH 063/124] revert glob change --- docs/api/index.rst | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/docs/api/index.rst b/docs/api/index.rst index 9b663700b..3aa6b551e 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -3,8 +3,13 @@ API Docs .. toctree:: :maxdepth: 1 - :glob: :genindex: flair - flair.* \ No newline at end of file + flair.data + flair.datasets + flair.embeddings + flair.models + flair.splitter + flair.tokenization + flair.trainers \ No newline at end of file From 1650c81b0d9b8a01b8be5ddc57a751dc74f24ed9 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 16:26:16 +0200 Subject: [PATCH 064/124] remove genindex --- docs/api/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/api/index.rst b/docs/api/index.rst index 3aa6b551e..ae6d8d517 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -3,7 +3,6 @@ API Docs .. toctree:: :maxdepth: 1 - :genindex: flair flair.data From 2da55496e1f71d45a0e5fd5e2d463f45a08617d0 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 19 Jun 2023 16:27:18 +0200 Subject: [PATCH 065/124] glob without genindex --- docs/api/index.rst | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/docs/api/index.rst b/docs/api/index.rst index ae6d8d517..1c3512520 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -3,12 +3,7 @@ API Docs .. 
toctree:: :maxdepth: 1 + :glob: flair - flair.data - flair.datasets - flair.embeddings - flair.models - flair.splitter - flair.tokenization - flair.trainers \ No newline at end of file + flair.* \ No newline at end of file From 4fdce2c91be72327bc7f3a5d6c04585380b09835 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 26 Jun 2023 11:23:25 +0200 Subject: [PATCH 066/124] fix getting started link --- docs/index.rst | 8 ++++---- docs/tutorial/intro.md | 2 ++ 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index a16e874ee..6abf8b9ba 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,9 +7,9 @@ flair **Version**: |version| **Useful links**: -`Getting started .. tutorial/index/intro`_ | -`Source Repository `_ | -`Issue Tracker `_ | +`Getting started `_ | +`Source Repository `_ | +`Issue Tracker `_ | Flair is a very simple framework for state-of-the-art Natural Language Processing (NLP) @@ -30,7 +30,7 @@ Flair is a very simple framework for state-of-the-art Natural Language Processin :color: secondary :click-parent: - To the tutorial + To the tutorials .. grid-item-card:: :img-top: ./_static/api.svg diff --git a/docs/tutorial/intro.md b/docs/tutorial/intro.md index 373e5a2d4..4a0e5b890 100644 --- a/docs/tutorial/intro.md +++ b/docs/tutorial/intro.md @@ -2,6 +2,8 @@ sidebar_position: 1 --- +(getting_started)= + # Quick Start Let's discover **Flair in less than 5 minutes**. 
From 3a4610fe08f0f20ccf11e03134e89b86e41a584b Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 26 Jun 2023 11:41:07 +0200 Subject: [PATCH 067/124] fix table of content show content for respective sites (api/tutorial/contributors) --- docs/api/index.rst | 1 - docs/conf.py | 2 +- docs/contributing/index.rst | 1 - docs/contributing/writing_a_good_issue.md | 4 ++-- docs/index.rst | 2 -- docs/tutorial/index.rst | 2 -- 6 files changed, 3 insertions(+), 9 deletions(-) diff --git a/docs/api/index.rst b/docs/api/index.rst index 1c3512520..4788ffb59 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -2,7 +2,6 @@ API Docs ======== .. toctree:: - :maxdepth: 1 :glob: flair diff --git a/docs/conf.py b/docs/conf.py index b0e4b8e35..b490d9273 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -77,7 +77,7 @@ def linkcode_resolve(*args): html_sidebars = { "**": [ - "localtoc.html", + "globaltoc.html", "searchbox.html", "versioning.html", ] diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst index ef72362f3..d1d37e845 100644 --- a/docs/contributing/index.rst +++ b/docs/contributing/index.rst @@ -2,7 +2,6 @@ Contributing ============ .. toctree:: - :maxdepth: 1 writing_a_good_issue local_development diff --git a/docs/contributing/writing_a_good_issue.md b/docs/contributing/writing_a_good_issue.md index da89567cc..d3b5d4ff4 100644 --- a/docs/contributing/writing_a_good_issue.md +++ b/docs/contributing/writing_a_good_issue.md @@ -21,7 +21,7 @@ The minimal reproducible example has, like the name says two properties: * it is reproducible * it is as small as possible -#### Reproducibility +**Reproducibility** Please ensure that we can really reproduce your issue. @@ -34,7 +34,7 @@ your trained model, but maybe you can recreate the issue by creating a model wit Please, be sure to not add local paths, or load any data that others have no access. -#### Minimal +**Minimal** After ensuring reproducibility, please also take some time to make it minimal. 
That way, we can quicker understand what the issue is and won't need to spend time debugging code that is unrelated to the issue. diff --git a/docs/index.rst b/docs/index.rst index 6abf8b9ba..9e7c889a5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -69,8 +69,6 @@ Flair is a very simple framework for state-of-the-art Natural Language Processin To the contributor's guide .. toctree:: - :maxdepth: 1 - :hidden: Tutorials API reference diff --git a/docs/tutorial/index.rst b/docs/tutorial/index.rst index 38230a8cb..f7fc4d593 100644 --- a/docs/tutorial/index.rst +++ b/docs/tutorial/index.rst @@ -5,6 +5,4 @@ Tutorials .. _flair_tutorials: .. toctree:: - :maxdepth: 2 - intro From b954939901c05d69b96574ab78d40e4d4d9825e7 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 26 Jun 2023 14:30:39 +0200 Subject: [PATCH 068/124] better structure for embeddings --- docs/api/embeddings/document.rst | 7 +++++++ docs/api/embeddings/image.rst | 7 +++++++ docs/api/embeddings/legacy.rst | 11 +++++++++++ docs/api/embeddings/token.rst | 7 +++++++ docs/api/embeddings/transformer.rst | 7 +++++++ docs/api/flair.embeddings.rst | 8 ++++---- flair/embeddings/token.py | 25 +++++++++++++++++++++++++ 7 files changed, 68 insertions(+), 4 deletions(-) create mode 100644 docs/api/embeddings/document.rst create mode 100644 docs/api/embeddings/image.rst create mode 100644 docs/api/embeddings/legacy.rst create mode 100644 docs/api/embeddings/token.rst create mode 100644 docs/api/embeddings/transformer.rst diff --git a/docs/api/embeddings/document.rst b/docs/api/embeddings/document.rst new file mode 100644 index 000000000..d289cd016 --- /dev/null +++ b/docs/api/embeddings/document.rst @@ -0,0 +1,7 @@ +flair.embeddings.document +========================= + +.. 
automodule:: flair.embeddings.document + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/embeddings/image.rst b/docs/api/embeddings/image.rst new file mode 100644 index 000000000..594ca9f3c --- /dev/null +++ b/docs/api/embeddings/image.rst @@ -0,0 +1,7 @@ +flair.embeddings.image +====================== + +.. automodule:: flair.embeddings.image + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/embeddings/legacy.rst b/docs/api/embeddings/legacy.rst new file mode 100644 index 000000000..51865b4e7 --- /dev/null +++ b/docs/api/embeddings/legacy.rst @@ -0,0 +1,11 @@ +flair.embeddings.legacy +============================ + +.. warning:: + All embeddings in `flair.embeddings.legacy` are considered deprecated. + there is no guarantee that they are still working and we recommend using different embeddings instead. + +.. automodule:: flair.embeddings.legacy + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/embeddings/token.rst b/docs/api/embeddings/token.rst new file mode 100644 index 000000000..8d475bd65 --- /dev/null +++ b/docs/api/embeddings/token.rst @@ -0,0 +1,7 @@ +flair.embeddings.token +====================== + +.. automodule:: flair.embeddings.token + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/embeddings/transformer.rst b/docs/api/embeddings/transformer.rst new file mode 100644 index 000000000..4d4b678ec --- /dev/null +++ b/docs/api/embeddings/transformer.rst @@ -0,0 +1,7 @@ +flair.embeddings.transformer +============================ + +.. 
automodule:: flair.embeddings.transformer + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/flair.embeddings.rst b/docs/api/flair.embeddings.rst index 107a82890..81241a742 100644 --- a/docs/api/flair.embeddings.rst +++ b/docs/api/flair.embeddings.rst @@ -1,7 +1,7 @@ flair.embeddings ================ -.. automodule:: flair.embeddings - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. toctree:: + :glob: + + embeddings/* \ No newline at end of file diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index 6dd221933..fa9ff1bc4 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -11,6 +11,7 @@ import numpy as np import torch from bpemb import BPEmb +from deprecated.sphinx import deprecated from gensim.models import KeyedVectors from gensim.models.fasttext import FastTextKeyedVectors, load_facebook_vectors from torch import nn @@ -1352,7 +1353,15 @@ def to_params(self): # TODO: keep for backwards compatibility, but remove in future +@deprecated( + reason="""'BPEmbSerializable' is only used in the legacy pickle-embeddings format. + Please save your model again to save it in the serializable json format. 
+ """, + version="0.13.0", +) class BPEmbSerializable(BPEmb): + """Helper class to allow pickle-seralizable BPE embeddings.""" + def __getstate__(self): state = self.__dict__.copy() # save the sentence piece model as binary file (not as path which may change) @@ -1549,3 +1558,19 @@ def replace_with_language_code(string: str): string = string.replace("spanish-", "es-") string = string.replace("swedish-", "sv-") return string + + +__all__ = [ + "TransformerWordEmbeddings", + "StackedEmbeddings", + "WordEmbeddings", + "CharacterEmbeddings", + "FlairEmbeddings", + "PooledFlairEmbeddings", + "FastTextEmbeddings", + "OneHotEmbeddings", + "HashEmbeddings", + "MuseCrosslingualEmbeddings", + "BytePairEmbeddings", + "NILCEmbeddings", +] \ No newline at end of file From ac900c51398bac0965b729cba01463535f44bb75 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 26 Jun 2023 17:24:26 +0200 Subject: [PATCH 069/124] use sphinx deprecation --- flair/data.py | 2 +- flair/datasets/base.py | 2 +- flair/datasets/biomedical.py | 2 +- flair/embeddings/legacy.py | 2 +- flair/embeddings/token.py | 2 +- flair/models/word_tagger_model.py | 15 ++++++--------- 6 files changed, 11 insertions(+), 14 deletions(-) diff --git a/flair/data.py b/flair/data.py index 9ca6da2cc..9ef97048b 100644 --- a/flair/data.py +++ b/flair/data.py @@ -9,7 +9,7 @@ from typing import Dict, Iterable, List, NamedTuple, Optional, Union, cast import torch -from deprecated import deprecated +from deprecated.sphinx import deprecated from torch.utils.data import Dataset, IterableDataset from torch.utils.data.dataset import ConcatDataset, Subset diff --git a/flair/datasets/base.py b/flair/datasets/base.py index f5550b5bc..1ec7a0bbf 100644 --- a/flair/datasets/base.py +++ b/flair/datasets/base.py @@ -4,7 +4,7 @@ from typing import Generic, List, Optional, Union import torch.utils.data.dataloader -from deprecated import deprecated +from deprecated.sphinx import deprecated from flair.data import DT, FlairDataset, Sentence, 
Tokenizer from flair.tokenization import SegtokTokenizer, SpaceTokenizer diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py index 0a5141942..abb06cecb 100644 --- a/flair/datasets/biomedical.py +++ b/flair/datasets/biomedical.py @@ -22,7 +22,7 @@ from zipfile import BadZipFile, LargeZipFile import ftfy -from deprecated import deprecated +from deprecated.sphinx import deprecated from lxml import etree from lxml.etree import XMLSyntaxError diff --git a/flair/embeddings/legacy.py b/flair/embeddings/legacy.py index 8ef43d239..8e1f29a25 100644 --- a/flair/embeddings/legacy.py +++ b/flair/embeddings/legacy.py @@ -4,7 +4,7 @@ from typing import List, Optional, Union import torch -from deprecated import deprecated +from deprecated.sphinx import deprecated import flair from flair.data import Sentence, Token diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index fa9ff1bc4..82facf130 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -1573,4 +1573,4 @@ def replace_with_language_code(string: str): "MuseCrosslingualEmbeddings", "BytePairEmbeddings", "NILCEmbeddings", -] \ No newline at end of file +] diff --git a/flair/models/word_tagger_model.py b/flair/models/word_tagger_model.py index 32f58e17b..a84b73e50 100644 --- a/flair/models/word_tagger_model.py +++ b/flair/models/word_tagger_model.py @@ -3,6 +3,7 @@ from typing import Any, Dict, List, Union import torch +from deprecated.sphinx import deprecated import flair.nn from flair.data import Dictionary, Sentence, Span, Token @@ -11,15 +12,6 @@ log = logging.getLogger("flair") -def WordTagger(embeddings, tag_dictionary, tag_type, **classifierargs): - from warnings import warn - - warn("The WordTagger class is deprecated and will be removed in Flair 1.0. 
Use TokenClassifier instead!") - return TokenClassifier( - embeddings=embeddings, label_dictionary=tag_dictionary, label_type=tag_type, **classifierargs - ) - - class TokenClassifier(flair.nn.DefaultClassifier[Sentence, Token]): """This is a simple class of models that tags individual words in text.""" @@ -231,3 +223,8 @@ def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "TokenClassifier" from typing import cast return cast("TokenClassifier", super().load(model_path=model_path)) + + +@deprecated(reason="The WordTagger is deprecated in favour of :class:`flair.models.TokenClassifier`.", version="0.12.2") +class WordTagger(TokenClassifier): + pass From a0670cde9702b62c14ab7ff79983f647334933d0 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 26 Jun 2023 17:43:26 +0200 Subject: [PATCH 070/124] fine grained datasets documentation --- docs/api/datasets/base.rst | 7 +++++++ docs/api/datasets/biomedical.rst | 7 +++++++ docs/api/datasets/document_classification.rst | 7 +++++++ docs/api/datasets/entity_linking.rst | 7 +++++++ docs/api/datasets/ocr.rst | 7 +++++++ docs/api/datasets/relation_extraction.rst | 7 +++++++ docs/api/datasets/sequence_labeling.rst | 7 +++++++ docs/api/datasets/text_image.rst | 7 +++++++ docs/api/datasets/text_text.rst | 7 +++++++ docs/api/datasets/treebanks.rst | 7 +++++++ docs/api/embeddings/base.rst | 7 +++++++ docs/api/flair.datasets.rst | 8 ++++---- flair/models/word_tagger_model.py | 2 +- 13 files changed, 82 insertions(+), 5 deletions(-) create mode 100644 docs/api/datasets/base.rst create mode 100644 docs/api/datasets/biomedical.rst create mode 100644 docs/api/datasets/document_classification.rst create mode 100644 docs/api/datasets/entity_linking.rst create mode 100644 docs/api/datasets/ocr.rst create mode 100644 docs/api/datasets/relation_extraction.rst create mode 100644 docs/api/datasets/sequence_labeling.rst create mode 100644 docs/api/datasets/text_image.rst create mode 100644 docs/api/datasets/text_text.rst create 
mode 100644 docs/api/datasets/treebanks.rst create mode 100644 docs/api/embeddings/base.rst diff --git a/docs/api/datasets/base.rst b/docs/api/datasets/base.rst new file mode 100644 index 000000000..b6d0a7a70 --- /dev/null +++ b/docs/api/datasets/base.rst @@ -0,0 +1,7 @@ +flair.datasets.base +=================== + +.. automodule:: flair.datasets.base + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/datasets/biomedical.rst b/docs/api/datasets/biomedical.rst new file mode 100644 index 000000000..5ffcfa6b0 --- /dev/null +++ b/docs/api/datasets/biomedical.rst @@ -0,0 +1,7 @@ +flair.datasets.biomedical +========================= + +.. automodule:: flair.datasets.biomedical + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/datasets/document_classification.rst b/docs/api/datasets/document_classification.rst new file mode 100644 index 000000000..06ce075df --- /dev/null +++ b/docs/api/datasets/document_classification.rst @@ -0,0 +1,7 @@ +flair.datasets.document_classification +====================================== + +.. automodule:: flair.datasets.document_classification + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/datasets/entity_linking.rst b/docs/api/datasets/entity_linking.rst new file mode 100644 index 000000000..d3066675b --- /dev/null +++ b/docs/api/datasets/entity_linking.rst @@ -0,0 +1,7 @@ +flair.datasets.entity_linking +============================= + +.. automodule:: flair.datasets.entity_linking + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/datasets/ocr.rst b/docs/api/datasets/ocr.rst new file mode 100644 index 000000000..0d33b55b4 --- /dev/null +++ b/docs/api/datasets/ocr.rst @@ -0,0 +1,7 @@ +flair.datasets.ocr +================== + +.. 
automodule:: flair.datasets.ocr + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/datasets/relation_extraction.rst b/docs/api/datasets/relation_extraction.rst new file mode 100644 index 000000000..c6df41c7d --- /dev/null +++ b/docs/api/datasets/relation_extraction.rst @@ -0,0 +1,7 @@ +flair.datasets.relation_extraction +================================== + +.. automodule:: flair.datasets.relation_extraction + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/datasets/sequence_labeling.rst b/docs/api/datasets/sequence_labeling.rst new file mode 100644 index 000000000..cfa52a3ed --- /dev/null +++ b/docs/api/datasets/sequence_labeling.rst @@ -0,0 +1,7 @@ +flair.datasets.sequence_labeling +================================ + +.. automodule:: flair.datasets.sequence_labeling + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/datasets/text_image.rst b/docs/api/datasets/text_image.rst new file mode 100644 index 000000000..dbdf8f86d --- /dev/null +++ b/docs/api/datasets/text_image.rst @@ -0,0 +1,7 @@ +flair.datasets.text_image +========================= + +.. automodule:: flair.datasets.text_image + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/datasets/text_text.rst b/docs/api/datasets/text_text.rst new file mode 100644 index 000000000..33e4bd6c4 --- /dev/null +++ b/docs/api/datasets/text_text.rst @@ -0,0 +1,7 @@ +flair.datasets.text_text +========================= + +.. automodule:: flair.datasets.text_text + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/datasets/treebanks.rst b/docs/api/datasets/treebanks.rst new file mode 100644 index 000000000..3f294fa3a --- /dev/null +++ b/docs/api/datasets/treebanks.rst @@ -0,0 +1,7 @@ +flair.datasets.treebanks +======================== + +.. 
automodule:: flair.datasets.treebanks + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/embeddings/base.rst b/docs/api/embeddings/base.rst new file mode 100644 index 000000000..1d17b64de --- /dev/null +++ b/docs/api/embeddings/base.rst @@ -0,0 +1,7 @@ +flair.embeddings.base +===================== + +.. automodule:: flair.embeddings.base + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/flair.datasets.rst b/docs/api/flair.datasets.rst index 0dcf3900d..aebd70c2d 100644 --- a/docs/api/flair.datasets.rst +++ b/docs/api/flair.datasets.rst @@ -1,7 +1,7 @@ flair.datasets ============== -.. automodule:: flair.datasets - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. toctree:: + :glob: + + datasets/* diff --git a/flair/models/word_tagger_model.py b/flair/models/word_tagger_model.py index a84b73e50..d00d12b6c 100644 --- a/flair/models/word_tagger_model.py +++ b/flair/models/word_tagger_model.py @@ -225,6 +225,6 @@ def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "TokenClassifier" return cast("TokenClassifier", super().load(model_path=model_path)) -@deprecated(reason="The WordTagger is deprecated in favour of :class:`flair.models.TokenClassifier`.", version="0.12.2") +@deprecated(reason="The WordTagger was renamed to :class:`flair.models.TokenClassifier`.", version="0.12.2") class WordTagger(TokenClassifier): pass From 09e7ceb91b072ddf09e321876a60f698b73b9587 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 26 Jun 2023 17:54:12 +0200 Subject: [PATCH 071/124] add tutorials --- docs/index.rst | 1 + docs/tutorial/index.rst | 5 + docs/tutorial/tutorial-basics/basic-types.md | 270 +++++++++++++ .../tutorial-basics/entity-linking.md | 85 +++++ .../tutorial-basics/how-predictions-work.md | 78 ++++ .../tutorial-basics/how-to-tag-corpus.md | 32 ++ docs/tutorial/tutorial-basics/index.rst | 17 + 
docs/tutorial/tutorial-basics/other-models.md | 152 ++++++++ .../tutorial-basics/part-of-speech-tagging.md | 173 +++++++++ .../tutorial-basics/tagging-entities.md | 202 ++++++++++ .../tutorial-basics/tagging-sentiment.md | 79 ++++ .../classic-word-embeddings.md | 115 ++++++ .../tutorial-embeddings/embeddings.md | 138 +++++++ .../tutorial-embeddings/flair-embeddings.md | 143 +++++++ docs/tutorial/tutorial-embeddings/index.rst | 16 + .../tutorial-embeddings/other-embeddings.md | 321 ++++++++++++++++ .../transformer-embeddings.md | 181 +++++++++ .../how-model-training-works.md | 295 +++++++++++++++ .../how-to-load-custom-dataset.md | 160 ++++++++ .../how-to-load-prepared-dataset.md | 356 ++++++++++++++++++ .../how-to-train-sequence-tagger.md | 225 +++++++++++ .../how-to-train-text-classifier.md | 61 +++ docs/tutorial/tutorial-training/index.rst | 14 + .../tutorial-training/train-vs-fine-tune.md | 11 + 24 files changed, 3130 insertions(+) create mode 100644 docs/tutorial/tutorial-basics/basic-types.md create mode 100644 docs/tutorial/tutorial-basics/entity-linking.md create mode 100644 docs/tutorial/tutorial-basics/how-predictions-work.md create mode 100644 docs/tutorial/tutorial-basics/how-to-tag-corpus.md create mode 100644 docs/tutorial/tutorial-basics/index.rst create mode 100644 docs/tutorial/tutorial-basics/other-models.md create mode 100644 docs/tutorial/tutorial-basics/part-of-speech-tagging.md create mode 100644 docs/tutorial/tutorial-basics/tagging-entities.md create mode 100644 docs/tutorial/tutorial-basics/tagging-sentiment.md create mode 100644 docs/tutorial/tutorial-embeddings/classic-word-embeddings.md create mode 100644 docs/tutorial/tutorial-embeddings/embeddings.md create mode 100644 docs/tutorial/tutorial-embeddings/flair-embeddings.md create mode 100644 docs/tutorial/tutorial-embeddings/index.rst create mode 100644 docs/tutorial/tutorial-embeddings/other-embeddings.md create mode 100644 docs/tutorial/tutorial-embeddings/transformer-embeddings.md 
create mode 100644 docs/tutorial/tutorial-training/how-model-training-works.md create mode 100644 docs/tutorial/tutorial-training/how-to-load-custom-dataset.md create mode 100644 docs/tutorial/tutorial-training/how-to-load-prepared-dataset.md create mode 100644 docs/tutorial/tutorial-training/how-to-train-sequence-tagger.md create mode 100644 docs/tutorial/tutorial-training/how-to-train-text-classifier.md create mode 100644 docs/tutorial/tutorial-training/index.rst create mode 100644 docs/tutorial/tutorial-training/train-vs-fine-tune.md diff --git a/docs/index.rst b/docs/index.rst index 9e7c889a5..b7beeaa07 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -69,6 +69,7 @@ Flair is a very simple framework for state-of-the-art Natural Language Processin To the contributor's guide .. toctree:: + :maxdepth: 3 Tutorials API reference diff --git a/docs/tutorial/index.rst b/docs/tutorial/index.rst index f7fc4d593..9636c33c5 100644 --- a/docs/tutorial/index.rst +++ b/docs/tutorial/index.rst @@ -5,4 +5,9 @@ Tutorials .. _flair_tutorials: .. toctree:: + :maxdepth: 2 + intro + tutorial-basics/index + tutorial-training/index + tutorial-embeddings/index \ No newline at end of file diff --git a/docs/tutorial/tutorial-basics/basic-types.md b/docs/tutorial/tutorial-basics/basic-types.md new file mode 100644 index 000000000..0f84674c3 --- /dev/null +++ b/docs/tutorial/tutorial-basics/basic-types.md @@ -0,0 +1,270 @@ +# Basics + +This tutorial explains the basic concepts used in Flair: + +- what is a `Sentence` +- what is a `Label` + +You should be familiar with these two concepts in order to get the most out of Flair. + +## What is a Sentence + +If you want to tag a sentence, you need to first make a `Sentence` object for it. + +For example, say you want to tag the text "_The grass is green._". + +Let's start by making a `Sentence` object for this sentence. 
+ + +```python +# The sentence objects holds a sentence that we may want to embed or tag +from flair.data import Sentence + +# Make a sentence object by passing a string +sentence = Sentence('The grass is green.') + +# Print the object to see what's in there +print(sentence) +``` + +This should print: + +```console +Sentence[5]: "The grass is green." +``` + +The print-out tells us that the sentence consists of 5 tokens. + +:::info +A token is an atomic unit of the text, often a word or punctuation. The printout is therefore telling us that the sentence "_The grass is green._" consists of 5 such atomic units. +::: + +### Iterating over the tokens in a Sentence + +So what are the 5 tokens in this example sentence? + +You can iterate over all tokens in a sentence like this: + + +```python +for token in sentence: + print(token) +``` + +This should print: + +```console +Token[0]: "The" +Token[1]: "grass" +Token[2]: "is" +Token[3]: "green" +Token[4]: "." +``` + +This printout is telling us that the 5 tokens in the text are the words "_The_", "_grass_", "_is_", "_green_", with a separate token for the full stop at the end. The tokens therefore correspond to the words and the punctuation of the text. + +### Directly accessing a token + +You can access the tokens of a sentence via their token id or with their index: + +```python +# using the token id +print(sentence.get_token(4)) +# using the index itself +print(sentence[3]) +``` + +which should print in both cases + +```console +Token[3]: "green" +``` + +This print-out includes the token index (3) and the lexical value of the token ("green"). + +### Tokenization + +When you create a `Sentence` as above, the text is automatically tokenized (segmented into words) using the [segtok](https://pypi.org/project/segtok/) library. + + +:::info +You can also use a different tokenizer if you like. To learn more about this, check out our tokenization tutorial. +::: + + +## What is a Label + +All Flair models predict labels. 
For instance, our sentiment analysis models will predict labels for a sentence. Our NER models will predict labels for tokens in a sentence. + +### Example 1: Labeling a token in a sentence + +To illustrate how labels work, let's use the same example sentence as above: "_The grass is green._". + +Let us label all "color words" in this sentence. Since the sentence contains only one color word (namely "green"), we only need to add a label to one of the tokens. + +We access token 3 in the sentence, and set a label for it: + +```python +# Make a sentence object by passing a string +sentence = Sentence('The grass is green.') + +# add an NER tag to token 3 in the sentence +sentence[3].add_label('ner', 'color') + +# print the sentence (now with this annotation) +print(sentence) +``` + +This should print: + +```console +Sentence: "The grass is green ." → ["green"/color] +``` + +The output indicates that the word "green" in this sentence is labeled as a "color". You can also +iterate through each token and print it to see if it has labels: + +```python +for token in sentence: + print(token) +``` + +This should print: + +```console +Token[0]: "The" +Token[1]: "grass" +Token[2]: "is" +Token[3]: "green" → color (1.0) +Token[4]: "." +``` + +This shows that there are 5 tokens in the sentence, one of which has a label. + +:::info +The `add_label` method used here has two mandatory parameters. +::: + +### Example 2: Labeling a whole sentence + +Sometimes you want to label an entire sentence instead of only a token. Do this by calling `add_label` for the whole sentence. + +For example, say we want to add a sentiment label to the sentence "_The grass is green._": + +```python +sentence = Sentence('The grass is green.') + +# add a label to a sentence +sentence.add_label('sentiment', 'POSITIVE') + +print(sentence) +``` + +This should print: + +``` +Sentence[5]: "The grass is green." → POSITIVE (1.0) +``` + +Indicating that this sentence is now labeled as having a positive sentiment. 
+ +### Multiple labels + +Importantly, in Flair you can add as many labels to a sentence as you like. + +Let's bring the two examples above together: We will label the sentence "_The grass is green._" with an overall positive sentiment, and also add a "color" tag to the token "grass": + +```python +sentence = Sentence('The grass is green.') + +# add a sentiment label to the sentence +sentence.add_label('sentiment', 'POSITIVE') + +# add an NER tag to token 3 in the sentence +sentence[3].add_label('ner', 'color') + +# print the sentence with all annotations +print(sentence) +``` + +This will print: + +``` +Sentence[5]: "The grass is green." → POSITIVE (1.0) → ["green"/color] +``` + +Indicating that the sentence is now labeled with two different types of information. + +### Accessing labels + +You can iterate through all labels of a sentence using the `.get_labels()` method: + +```python +# iterate over all labels and print +for label in sentence.get_labels(): + print(label) +``` + +This will get each label and print it. For instance, let's re-use the previous example in which we add two different labels to the same sentence: + +```python +sentence = Sentence('The grass is green.') + +# add a sentiment label to the sentence +sentence.add_label('sentiment', 'POSITIVE') + +# add an NER tag to token 3 in the sentence +sentence[3].add_label('ner', 'color') + +# iterate over all labels and print +for label in sentence.get_labels(): + print(label) +``` + +This will now print the following two lines: + +``` +Sentence[5]: "The grass is green." → POSITIVE (1.0) +Token[3]: "green" → color (1.0) +``` + +This printout tells us that there are two labels: The first is for the whole sentence, tagged as POSITIVE. The second is only for the token "green", tagged as "color". + +:::info + +If you only want to iterate over labels of a specific type, add the label name as parameter to get_labels(). 
For instance, to only iterate over all NER labels, do: + +```python +# iterate over all NER labels only +for label in sentence.get_labels('ner'): + print(label) +``` +::: + +### Information for each label + +Each label is of class `Label` which next to the value has a score indicating confidence. It also has a pointer back to the data point to which it attaches. + +This means that you can print the value, the confidence and the labeled text of each label: + +```python +sentence = Sentence('The grass is green.') + +# add an NER tag to token 3 in the sentence +sentence[3].add_label('ner', 'color') + +# iterate over all labels and print +for label in sentence.get_labels(): + + # Print the text, the label value and the label score + print(f'"{label.data_point.text}" is classified as "{label.value}" with score {label.score}') +``` + +This should print: + +``` +"green" is classified as "color" with score 1.0 +``` + +Our color tag has a score of 1.0 since we manually added it. If a tag is predicted by our sequence labeler, the score value will indicate classifier confidence. + diff --git a/docs/tutorial/tutorial-basics/entity-linking.md b/docs/tutorial/tutorial-basics/entity-linking.md new file mode 100644 index 000000000..639b5d1e4 --- /dev/null +++ b/docs/tutorial/tutorial-basics/entity-linking.md @@ -0,0 +1,85 @@ +# Tagging and linking entities + +As of Flair 0.12 we ship an **experimental entity linker** trained on the [Zelda dataset](https://github.com/flairNLP/zelda). The linker does not only +tag entities, but also attempts to link each entity to the corresponding Wikipedia URL if one exists. 
+
+## Example 1: Entity linking on a single sentence
+
+To illustrate, let's use the example sentence "_Kirk and Spock met on the Enterprise._":
+
+```python
+from flair.nn import Classifier
+from flair.data import Sentence
+
+# load the model
+tagger = Classifier.load('linker')
+
+# make a sentence
+sentence = Sentence('Kirk and Spock met on the Enterprise.')
+
+# predict entity links
+tagger.predict(sentence)
+
+# iterate over predicted entities and print
+for label in sentence.get_labels():
+    print(label)
+```
+
+This should print:
+```console
+Span[0:1]: "Kirk" → James_T._Kirk (0.9969)
+Span[2:3]: "Spock" → Spock (0.9971)
+Span[6:7]: "Enterprise" → USS_Enterprise_(NCC-1701-D) (0.975)
+```
+
+As we can see, the linker can resolve what these three entity mentions refer to:
+- "Kirk" refers to the entity "[James_T._Kirk](https://en.wikipedia.org/wiki/James_T._Kirk)"
+- "Spock" refers to "[Spock](https://en.wikipedia.org/wiki/Spock)" (ok, that one was easy)
+- "Enterprise" refers to the "[USS_Enterprise_(NCC-1701-D)](https://en.wikipedia.org/wiki/USS_Enterprise_(NCC-1701-D))"
+
+ Not bad, eh? However, that last prediction is not quite correct as Star Trek fans will know. Entity linking is a hard task and we are working to improve the accuracy of our model.
+
+
+
+## Example 2: Entity linking on a text document (multiple sentences)
+
+Entity linking typically works best when applied to a whole document instead of only a single sentence.
+
+To illustrate how this works, let's use the following short text: "_Bayern played against Barcelona. The match took place in Barcelona._"
+
+In this case, split the text into sentences and pass a list of Sentence objects to the .predict() method:
+
+```python
+from flair.nn import Classifier
+from flair.splitter import SegtokSentenceSplitter
+
+# example text with many sentences
+text = "Bayern played against Barcelona. The match took place in Barcelona." 
+ +# initialize sentence splitter +splitter = SegtokSentenceSplitter() + +# use splitter to split text into list of sentences +sentences = splitter.split(text) + +# predict tags for sentences +tagger = Classifier.load('linker') +tagger.predict(sentences) + +# iterate through sentences and print predicted labels +for sentence in sentences: + print(sentence) +``` + +This should print: +```console +Sentence[5]: "Bayern played against Barcelona." → ["Bayern"/FC_Bayern_Munich, "Barcelona"/FC_Barcelona] +Sentence[7]: "The match took place in Barcelona." → ["Barcelona"/Barcelona] +``` + +As we can see, the linker can resolve that: + +- "Bayern" refers to the soccer club "[FC Bayern Munich](https://en.wikipedia.org/wiki/FC_Bayern_Munich)" +- the first mention of "Barcelona" refers to the soccer club "[FC Barcelona](https://en.wikipedia.org/wiki/FC_Barcelona)" +- the second mention of "Barcelona" refers to the city of "[Barcelona](https://en.wikipedia.org/wiki/Barcelona)" + diff --git a/docs/tutorial/tutorial-basics/how-predictions-work.md b/docs/tutorial/tutorial-basics/how-predictions-work.md new file mode 100644 index 000000000..abc5973c8 --- /dev/null +++ b/docs/tutorial/tutorial-basics/how-predictions-work.md @@ -0,0 +1,78 @@ +# How predictions work + +All taggers in Flair make predictions. This tutorial helps you understand what information you can get out of each prediction. + +## Running example + +Let's use our standard NER example to illustrate how annotations work: + +```python +from flair.nn import Classifier +from flair.data import Sentence + +# load the model +tagger = Classifier.load('ner') + +# make a sentence +sentence = Sentence('George Washington went to Washington.') + +# predict NER tags +tagger.predict(sentence) + +# print the sentence with the tags +print(sentence) +``` + +This should print: +```console +Sentence: "George Washington went to Washington ." 
→ ["George Washington"/PER, "Washington"/LOC] +``` + +Showing us that two entities are labeled in this sentence: "George Washington" as PER (person) and "Washington" +as LOC (location.) + +## Getting the predictions + +A common question that gets asked is **how to access these predictions directly**. You can do this by using +the `get_labels()` method to iterate over all predictions: + +```python +for label in sentence.get_labels(): + print(label) +``` +This should print the two NER predictions: + +```console +Span[0:2]: "George Washington" → PER (0.9989) +Span[4:5]: "Washington" → LOC (0.9942) +``` + +As you can see, each entity is printed, together with the predicted class. +The confidence of the prediction is indicated as a score in brackets. + +## Values for each prediction + +For each prediction, you can even **directly access** the label value, it's score and the entity text: + +```python +# iterate over all labels in the sentence +for label in sentence.get_labels(): + # print label value and score + print(f'label.value is: "{label.value}"') + print(f'label.score is: "{label.score}"') + # access the data point to which label attaches and print its text + print(f'the text of label.data_point is: "{label.data_point.text}"\n') +``` + +This should print: +```console +label.value is: "PER" +label.score is: "0.998886227607727" +the text of label.data_point is: "George Washington" + +label.value is: "LOC" +label.score is: "0.9942097663879395" +the text of label.data_point is: "Washington" +``` + + diff --git a/docs/tutorial/tutorial-basics/how-to-tag-corpus.md b/docs/tutorial/tutorial-basics/how-to-tag-corpus.md new file mode 100644 index 000000000..537b10294 --- /dev/null +++ b/docs/tutorial/tutorial-basics/how-to-tag-corpus.md @@ -0,0 +1,32 @@ +# How to tag a whole corpus + +Often, you may want to tag an entire text corpus. In this case, you need to split the corpus into sentences and pass a +list of `Sentence` objects to the `.predict()` method. 
+ +For instance, you can use the sentence splitter of segtok to split your text: + +```python +from flair.nn import Classifier +from flair.splitter import SegtokSentenceSplitter + +# example text with many sentences +text = "This is a sentence. This is another sentence. I love Berlin." + +# initialize sentence splitter +splitter = SegtokSentenceSplitter() + +# use splitter to split text into list of sentences +sentences = splitter.split(text) + +# predict tags for sentences +tagger = Classifier.load('ner') +tagger.predict(sentences) + +# iterate through sentences and print predicted labels +for sentence in sentences: + print(sentence) +``` + +Using the `mini_batch_size` parameter of the `.predict()` method, you can set the size of mini batches passed to the +tagger. Depending on your resources, you might want to play around with this parameter to optimize speed. + diff --git a/docs/tutorial/tutorial-basics/index.rst b/docs/tutorial/tutorial-basics/index.rst new file mode 100644 index 000000000..dcbd8cd18 --- /dev/null +++ b/docs/tutorial/tutorial-basics/index.rst @@ -0,0 +1,17 @@ +Tutorial 1: Basic Tagging +========================= + +This tutorial shows you in more detail how to tag your text and access predictions, +and showcases various models we ship with Flair. + +.. toctree:: + :glob: + + basic-types + how-predictions-work + tagging-entities + tagging-sentiment + entity-linking + part-of-speech-tagging + other-models + how-to-tag-corpus diff --git a/docs/tutorial/tutorial-basics/other-models.md b/docs/tutorial/tutorial-basics/other-models.md new file mode 100644 index 000000000..adb011d4d --- /dev/null +++ b/docs/tutorial/tutorial-basics/other-models.md @@ -0,0 +1,152 @@ +# Tagging other things + +This tutorial gives you a tour of **other crazy models** shipped with Flair. These include: +* tagging semantic frames +* chunking text +* relation extraction +* others + +Let's get started! 
+ +## Semantic Frame Detection + +For English, we provide a pre-trained model that detects semantic frames in text, trained using Propbank 3.0 frames. +This provides a sort of word sense disambiguation for frame evoking words, and we are curious what researchers might +do with this. + +Here's an example: + +```python +from flair.nn import Classifier +from flair.data import Sentence + +# load model +tagger = Classifier.load('frame') + +# make English sentence +sentence = Sentence('George returned to Berlin to return his hat.') + +# predict NER tags +tagger.predict(sentence) + +# go through tokens and print predicted frame (if one is predicted) +for token in sentence: + print(token) +``` +This should print: + +```console +Token[0]: "George" +Token[1]: "returned" → return.01 (0.9951) +Token[2]: "to" +Token[3]: "Berlin" +Token[4]: "to" +Token[5]: "return" → return.02 (0.6361) +Token[6]: "his" +Token[7]: "hat" +Token[8]: "." +``` + +As we can see, the frame detector makes a distinction in the sentence between two different meanings of the word 'return'. 'return.01' means returning to a location, while 'return.02' means giving something back. + +## Syntactic Chunking + +For English, we provide a model for chunking verb and noun phrases, trained using CoNLL 2000. +```python +from flair.nn import Classifier +from flair.data import Sentence + +# load model +tagger = Classifier.load('chunk') + +# make English sentence +sentence = Sentence('The quick brown fox jumps over the lazy dog.') + +# predict NER tags +tagger.predict(sentence) + +# print the chunks +for chunk in sentence.get_labels(): + print(chunk) +``` + +This should print: + +```console +Span[0:4]: "The quick brown fox" → NP (0.9914) +Span[4:5]: "jumps" → VP (1.0) +Span[5:6]: "over" → PP (0.9967) +Span[6:9]: "the lazy dog" → NP (0.9991) +``` +This tells us for instance that "the quick brown fox" and "the lazy dog" form syntactic units in this sentence. 
+ + +## Tagging Relations + +Relations hold between two entities. For instance, a text like "_George was born in Washington_" +names two entities and also expresses that there is a born_in relationship between +both. + +We added an experimental relation extraction model trained over a modified version of TACRED. +You must use this model together with an entity tagger. Here is an example: + +```python +from flair.data import Sentence +from flair.nn import Classifier + +# 1. make example sentence +sentence = Sentence("George was born in Washington") + +# 2. load entity tagger and predict entities +tagger = Classifier.load('ner-fast') +tagger.predict(sentence) + +# check which named entities have been found in the sentence +entities = sentence.get_labels('ner') +for entity in entities: + print(entity) + +# 3. load relation extractor +extractor = Classifier.load('relations') + +# predict relations +extractor.predict(sentence) + +# check which relations have been found +relations = sentence.get_labels('relation') +for relation in relations: + print(relation) + +# Use the `get_labels()` method with parameter 'relation' to iterate over all relation predictions. +for label in sentence.get_labels('relation'): + print(label) +``` + +This should print: + +```console +Span[0:1]: "George" → PER (0.9971) +Span[4:5]: "Washington" → LOC (0.9847) + +Relation[0:1][4:5]: "George -> Washington" → born_in (1.0) +``` + +Indicating that a **born_in** relationship holds between "George" and "Washington"! 
+ +## List of Other Models + +We end this section with a list of all other models we currently ship with Flair: + +| ID | Task | Language | Training Dataset | Accuracy | Contributor / Notes | +| ------------- | ------------- |------------- |------------- | ------------- | ------------- | +| '[chunk](https://huggingface.co/flair/chunk-english)' | Chunking | English | Conll-2000 | **96.47** (F1) | +| '[chunk-fast](https://huggingface.co/flair/chunk-english-fast)' | Chunking | English | Conll-2000 | **96.22** (F1) |(fast model) +| '[frame](https://huggingface.co/flair/frame-english)' | Frame Detection | English | Propbank 3.0 | **97.54** (F1) | +| '[frame-fast](https://huggingface.co/flair/frame-english-fast)' | Frame Detection | English | Propbank 3.0 | **97.31** (F1) | (fast model) +| 'negation-speculation' | Negation / speculation |English | Bioscope | **80.2** (F1) | +| 'communicative-functions' | detecting function of sentence in research paper (BETA) | English| scholarly papers | | +| 'de-historic-indirect' | historical indirect speech | German | @redewiedergabe project | **87.94** (F1) | [redewiedergabe](https://github.com/redewiedergabe/tagger) | | +| 'de-historic-direct' | historical direct speech | German | @redewiedergabe project | **87.94** (F1) | [redewiedergabe](https://github.com/redewiedergabe/tagger) | | +| 'de-historic-reported' | historical reported speech | German | @redewiedergabe project | **87.94** (F1) | [redewiedergabe](https://github.com/redewiedergabe/tagger) | | +| 'de-historic-free-indirect' | historical free-indirect speech | German | @redewiedergabe project | **87.94** (F1) | [redewiedergabe](https://github.com/redewiedergabe/tagger) | | + diff --git a/docs/tutorial/tutorial-basics/part-of-speech-tagging.md b/docs/tutorial/tutorial-basics/part-of-speech-tagging.md new file mode 100644 index 000000000..16d5acb28 --- /dev/null +++ b/docs/tutorial/tutorial-basics/part-of-speech-tagging.md @@ -0,0 +1,173 @@ +# Tagging parts-of-speech + +This 
tutorial shows you how to do part-of-speech tagging in Flair, showcases universal and language-specific models, and gives a list of all PoS models in Flair.
+
+## Language-specific parts-of-speech (PoS)
+
+
+Syntax is fundamentally language-specific, so each language has different fine-grained parts-of-speech. Flair offers models for many languages:
+
+### ... in English
+
+For English, we offer several models trained over Ontonotes.
+
+Use like this:
+
+```python
+from flair.nn import Classifier
+from flair.data import Sentence
+
+# load the model
+tagger = Classifier.load('pos')
+
+# make a sentence
+sentence = Sentence('Dirk went to the store.')
+
+# predict NER tags
+tagger.predict(sentence)
+
+# print sentence with predicted tags
+print(sentence)
+```
+
+This should print:
+```console
+Sentence[6]: "Dirk went to the store." → ["Dirk"/NNP, "went"/VBD, "to"/IN, "the"/DT, "store"/NN, "."/.]
+```
+
+This printout tells us for instance that "_Dirk_" is a proper noun (tag: NNP), and "_went_" is a past tense verb (tag: VBD).
+
+:::info
+To better understand what each tag means, consult the [tag specification](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) of the Penn Treebank.
+:::
+
+### ... in German
+
+```python
+from flair.nn import Classifier
+from flair.data import Sentence
+
+# load the model
+tagger = Classifier.load('de-pos')
+
+# make a sentence
+sentence = Sentence('Dort hatte er einen Hut gekauft.')
+
+# predict NER tags
+tagger.predict(sentence)
+
+# print sentence with predicted tags
+print(sentence)
+```
+
+This should print:
+```console
+Sentence[7]: "Dort hatte er einen Hut gekauft." → ["Dort"/ADV, "hatte"/VAFIN, "er"/PPER, "einen"/ART, "Hut"/NN, "gekauft"/VVPP, "."/$.]
+```
+
+
+### ... 
in Ukrainian
+
+```python
+from flair.nn import Classifier
+from flair.data import Sentence
+
+# load the model
+tagger = Classifier.load('pos-ukrainian')
+
+# make a sentence
+sentence = Sentence("Сьогодні в Знам’янці проживають нащадки поета — родина Шкоди.")
+
+# predict NER tags
+tagger.predict(sentence)
+
+# print sentence with predicted tags
+print(sentence)
+```
+
+
+### ... in Arabic
+
+```python
+from flair.nn import Classifier
+from flair.data import Sentence
+
+# load the model
+tagger = Classifier.load('ar-pos')
+
+# make a sentence
+sentence = Sentence('عمرو عادلي أستاذ للاقتصاد السياسي المساعد في الجامعة الأمريكية بالقاهرة .')
+
+# predict NER tags
+tagger.predict(sentence)
+
+# print sentence with predicted tags
+print(sentence)
+```
+
+## Tagging universal parts-of-speech (uPoS)
+
+Universal parts-of-speech are a set of minimal syntactic units that exist across languages. For instance, most languages
+will have VERBs or NOUNs.
+
+
+We ship models trained over 14 languages to tag upos in **multilingual text**. Use like this:
+
+```python
+from flair.nn import Classifier
+from flair.data import Sentence
+
+# load model
+tagger = Classifier.load('pos-multi')
+
+# text with English and German sentences
+sentence = Sentence('George Washington went to Washington. Dort kaufte er einen Hut.')
+
+# predict PoS tags
+tagger.predict(sentence)
+
+# print sentence with predicted tags
+print(sentence)
+```
+
+This should print (line breaks added for readability):
+```console
+Sentence: "George Washington went to Washington . Dort kaufte er einen Hut ."
+
+→ ["George"/PROPN, "Washington"/PROPN, "went"/VERB, "to"/ADP, "Washington"/PROPN, "."/PUNCT]
+
+→ ["Dort"/ADV, "kaufte"/VERB, "er"/PRON, "einen"/DET, "Hut"/NOUN, "."/PUNCT]
+```
+
+However note that they were trained for a mix of European languages and therefore will not work for other languages. 
+ +## Tagging Language-Specific Parts-of-Speech (POS) in English + + +## List of POS Models + +We end this section with a list of all models we currently ship with Flair. + +| ID | Task | Language | Training Dataset | Accuracy | Contributor / Notes | +| ------------- | ------------- |------------- |------------- | ------------- | ------------- | +| '[pos](https://huggingface.co/flair/pos-english)' | POS-tagging | English | Ontonotes |**98.19** (Accuracy) | +| '[pos-fast](https://huggingface.co/flair/pos-english-fast)' | POS-tagging | English | Ontonotes | **98.1** (Accuracy) |(fast model) +| '[upos](https://huggingface.co/flair/upos-english)' | POS-tagging (universal) | English | Ontonotes | **98.6** (Accuracy) | +| '[upos-fast](https://huggingface.co/flair/upos-english-fast)' | POS-tagging (universal) | English | Ontonotes | **98.47** (Accuracy) | (fast model) +| '[pos-multi](https://huggingface.co/flair/upos-multi)' | POS-tagging | Multilingual | UD Treebanks | **96.41** (average acc.) | (12 languages) +| '[pos-multi-fast](https://huggingface.co/flair/upos-multi-fast)' | POS-tagging | Multilingual | UD Treebanks | **92.88** (average acc.) 
| (12 languages) +| '[ar-pos](https://huggingface.co/megantosh/flair-arabic-dialects-codeswitch-egy-lev)' | POS-tagging | Arabic (+dialects)| combination of corpora | | | +| 'de-pos' | POS-tagging | German | UD German - HDT | **98.50** (Accuracy) | | +| 'de-pos-tweets' | POS-tagging | German | German Tweets | **93.06** (Accuracy) | [stefan-it](https://github.com/stefan-it/flair-experiments/tree/master/pos-twitter-german) | +| 'da-pos' | POS-tagging | Danish | [Danish Dependency Treebank](https://github.com/UniversalDependencies/UD_Danish-DDT/blob/master/README.md) | | [AmaliePauli](https://github.com/AmaliePauli) | +| 'ml-pos' | POS-tagging | Malayalam | 30000 Malayalam sentences | **83** | [sabiqueqb](https://github.com/sabiqueqb) | +| 'ml-upos' | POS-tagging | Malayalam | 30000 Malayalam sentences | **87** | [sabiqueqb](https://github.com/sabiqueqb) | +| 'pt-pos-clinical' | POS-tagging | Portuguese | [PUCPR](https://github.com/HAILab-PUCPR/portuguese-clinical-pos-tagger) | **92.39** | [LucasFerroHAILab](https://github.com/LucasFerroHAILab) for clinical texts | +| '[pos-ukrainian](https://huggingface.co/dchaplinsky/flair-uk-pos)' | POS-tagging | Ukrainian | [Ukrainian UD](https://universaldependencies.org/treebanks/uk_iu/index.html) | **97.93** (F1) | [dchaplinsky](https://github.com/dchaplinsky) | + +You choose which pre-trained model you load by passing the appropriate string to the `load()` method of the `Classifier` class. + +A full list of our current and community-contributed models can be browsed on the [__model hub__](https://huggingface.co/models?library=flair&sort=downloads). 
+ + + diff --git a/docs/tutorial/tutorial-basics/tagging-entities.md b/docs/tutorial/tutorial-basics/tagging-entities.md new file mode 100644 index 000000000..77d3d1efc --- /dev/null +++ b/docs/tutorial/tutorial-basics/tagging-entities.md @@ -0,0 +1,202 @@ +# Tagging entities + +This tutorials shows you how to do named entity recognition, showcases various NER models, and provides a full list of all NER models in Flair. + +## Tagging entities with our standard model​ + +Our standard model uses Flair embeddings and was trained over the English CoNLL-03 task and can recognize 4 different entity types. It offers a good tradeoff between accuracy and speed. + +As example, let's use the sentence "_George Washington went to Washington._": + +```python +from flair.nn import Classifier +from flair.data import Sentence + +# load the model +tagger = Classifier.load('ner') + +# make a sentence +sentence = Sentence('George Washington went to Washington.') + +# predict NER tags +tagger.predict(sentence) + +# print sentence with predicted tags +print(sentence) +``` + +This should print: +```console +Sentence: "George Washington went to Washington ." → ["George Washington"/PER, "Washington"/LOC] +``` + +The printout tells us that two entities are labeled in this sentence: "George Washington" as PER (person) and "Washington" as LOC (location). + +## Tagging entities with our best model​ + +Our best 4-class model is trained using a very large transformer. Use it if accuracy is the most important to you, and speed/memory not so much. + +```python +from flair.data import Sentence +from flair.nn import Classifier + +# make a sentence +sentence = Sentence('George Washington went to Washington.') + +# load the NER tagger +tagger = Classifier.load('ner-large') + +# run NER over sentence +tagger.predict(sentence) + +# print the sentence with all annotations +print(sentence) +``` + +As you can see, it's the same code, just with '**ner-large**' as model instead of '**ner**'. 
+This model also works with most languages. + +:::hint +If you want the fastest model we ship, you can also try 'ner-fast'. +::: + +## Tagging entities in non-English text + +We also have NER models for text in other languages. + +### Tagging a German sentence + +To tag a German sentence, just load the appropriate model: + +```python + +# load model +tagger = Classifier.load('de-ner-large') + +# make German sentence +sentence = Sentence('George Washington ging nach Washington.') + +# predict NER tags +tagger.predict(sentence) + +# print sentence with predicted tags +print(sentence) +``` + +This should print: +```console +Sentence: "George Washington ging nach Washington ." → ["George Washington"/PER, "Washington"/LOC] +``` + +### Tagging an Arabic sentence + +Flair also works for languages that write from right to left. To tag an Arabic sentence, just load the appropriate model: + +```python + +# load model +tagger = Classifier.load('ar-ner') + +# make Arabic sentence +sentence = Sentence("احب برلين") + +# predict NER tags +tagger.predict(sentence) + +# print sentence with predicted tags +print(sentence) +``` + +This should print: +```console +Sentence[2]: "احب برلين" → ["برلين"/LOC] +``` + +## Tagging Entities with 18 Classes + +We also ship models that distinguish between more than just 4 classes. For instance, use our ontonotes models +to classify 18 different types of entities. + +```python +from flair.data import Sentence +from flair.nn import Classifier + +# make a sentence +sentence = Sentence('On September 1st George won 1 dollar while watching Game of Thrones.') + +# load the NER tagger +tagger = Classifier.load('ner-ontonotes-large') + +# run NER over sentence +tagger.predict(sentence) + +# print the sentence with all annotations +print(sentence) +``` + +This should print: +```console +Sentence[13]: "On September 1st George won 1 dollar while watching Game of Thrones." 
→ ["September 1st"/DATE, "George"/PERSON, "1 dollar"/MONEY, "Game of Thrones"/WORK_OF_ART] +``` + +Finding for instance that "Game of Thrones" is a work of art and that "September 1st" is a date. + +## Biomedical Data + +For biomedical data, we offer the hunflair models that detect 5 different types of biomedical entities. + +```python +from flair.data import Sentence +from flair.nn import Classifier + +# make a sentence +sentence = Sentence('Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome.') + +# load the NER tagger +tagger = Classifier.load('bioner') + +# run NER over sentence +tagger.predict(sentence) + +# print the sentence with all annotations +print(sentence) +``` + +This should print: +```console +Sentence[13]: "Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome." → ["Behavioral abnormalities"/Disease, "Fmr1"/Gene, "Mouse"/Species, "Fragile X Syndrome"/Disease] +``` + +Thus finding entities of classes "Species", "Disease" and "Gene" in this text. + +## List of NER Models + +We end this section with a list of all models we currently ship with Flair. 
+ +| ID | Task | Language | Training Dataset | Accuracy | Contributor / Notes | +| ------------- | ------------- |------------- |------------- | ------------- | ------------- | +| '[ner](https://huggingface.co/flair/ner-english)' | NER (4-class) | English | Conll-03 | **93.03** (F1) | +| '[ner-fast](https://huggingface.co/flair/ner-english-fast)' | NER (4-class) | English | Conll-03 | **92.75** (F1) | (fast model) +| '[ner-large](https://huggingface.co/flair/ner-english-large)' | NER (4-class) | English / Multilingual | Conll-03 | **94.09** (F1) | (large model) +| 'ner-pooled' | NER (4-class) | English | Conll-03 | **93.24** (F1) | (memory inefficient) +| '[ner-ontonotes](https://huggingface.co/flair/ner-english-ontonotes)' | NER (18-class) | English | Ontonotes | **89.06** (F1) | +| '[ner-ontonotes-fast](https://huggingface.co/flair/ner-english-ontonotes-fast)' | NER (18-class) | English | Ontonotes | **89.27** (F1) | (fast model) +| '[ner-ontonotes-large](https://huggingface.co/flair/ner-english-ontonotes-large)' | NER (18-class) | English / Multilingual | Ontonotes | **90.93** (F1) | (large model) +| '[ar-ner](https://huggingface.co/megantosh/flair-arabic-multi-ner)' | NER (4-class) | Arabic | AQMAR & ANERcorp (curated) | **86.66** (F1) | | +| '[da-ner](https://huggingface.co/flair/ner-danish)' | NER (4-class) | Danish | [Danish NER dataset](https://github.com/alexandrainst/danlp) | | [AmaliePauli](https://github.com/AmaliePauli) | +| '[de-ner](https://huggingface.co/flair/ner-german)' | NER (4-class) | German | Conll-03 | **87.94** (F1) | | +| '[de-ner-large](https://huggingface.co/flair/ner-german-large)' | NER (4-class) | German / Multilingual | Conll-03 | **92.31** (F1) | | +| 'de-ner-germeval' | NER (4-class) | German | Germeval | **84.90** (F1) | | +| '[de-ner-legal](https://huggingface.co/flair/ner-german-legal)' | NER (legal text) | German | [LER](https://github.com/elenanereiss/Legal-Entity-Recognition) dataset | **96.35** (F1) | | +| 
'[fr-ner](https://huggingface.co/flair/ner-french)' | NER (4-class) | French | [WikiNER (aij-wikiner-fr-wp3)](https://github.com/dice-group/FOX/tree/master/input/Wikiner) | **95.57** (F1) | [mhham](https://github.com/mhham) | +| '[es-ner-large](https://huggingface.co/flair/ner-spanish-large)' | NER (4-class) | Spanish | CoNLL-03 | **90.54** (F1) | [mhham](https://github.com/mhham) | +| '[nl-ner](https://huggingface.co/flair/ner-dutch)' | NER (4-class) | Dutch | [CoNLL 2002](https://www.clips.uantwerpen.be/conll2002/ner/) | **92.58** (F1) | | +| '[nl-ner-large](https://huggingface.co/flair/ner-dutch-large)' | NER (4-class) | Dutch | Conll-03 | **95.25** (F1) | | +| 'nl-ner-rnn' | NER (4-class) | Dutch | [CoNLL 2002](https://www.clips.uantwerpen.be/conll2002/ner/) | **90.79** (F1) | | +| '[ner-ukrainian](https://huggingface.co/dchaplinsky/flair-uk-ner)' | NER (4-class) | Ukrainian | [NER-UK dataset](https://github.com/lang-uk/ner-uk) | **86.05** (F1) | [dchaplinsky](https://github.com/dchaplinsky) | + + +You choose which pre-trained model you load by passing the appropriate string to the `load()` method of the `Classifier` class. + +A full list of our current and community-contributed models can be browsed on the [__model hub__](https://huggingface.co/models?library=flair&sort=downloads). + diff --git a/docs/tutorial/tutorial-basics/tagging-sentiment.md b/docs/tutorial/tutorial-basics/tagging-sentiment.md new file mode 100644 index 000000000..1e4dd688a --- /dev/null +++ b/docs/tutorial/tutorial-basics/tagging-sentiment.md @@ -0,0 +1,79 @@ +# Tagging sentiment + +This tutorials shows you how to do sentiment analysis in Flair. + +## Tagging sentiment with our standard model​ + +Our standard sentiment analysis model uses distilBERT embeddings and was trained over a mix of corpora, notably +the Amazon review corpus, and can thus handle a variety of domains and language. 
+
+Let's use an example sentence:
+
+```python
+from flair.nn import Classifier
+from flair.data import Sentence
+
+# load the model
+tagger = Classifier.load('sentiment')
+
+# make a sentence
+sentence = Sentence('This movie is not at all bad.')
+
+# predict sentiment
+tagger.predict(sentence)
+
+# print sentence with predicted tags
+print(sentence)
+```
+
+This should print:
+```console
+Sentence[8]: "This movie is not at all bad." → POSITIVE (0.9929)
+```
+
+Showing us that the sentence overall is tagged to be of POSITIVE sentiment.
+
+## Tagging sentiment with our fast model
+
+We also offer an RNN-based variant which is faster but less accurate. Use it like this:
+
+
+```python
+from flair.nn import Classifier
+from flair.data import Sentence
+
+# load the model
+tagger = Classifier.load('sentiment-fast')
+
+# make a sentence
+sentence = Sentence('This movie is very bad.')
+
+# predict sentiment
+tagger.predict(sentence)
+
+# print sentence with predicted tags
+print(sentence)
+```
+
+This should print:
+```console
+Sentence[6]: "This movie is very bad." → NEGATIVE (0.9999)
+```
+
+This indicates that the sentence is of NEGATIVE sentiment. As you can see, it's the same code as above, just loading the
+'**sentiment-fast**' model instead of '**sentiment**'.
+ + +### List of Sentiment Models + +We end this section with a list of all models we currently ship with Flair: + +| ID | Language | Task | Training Dataset | Accuracy | +| ------------- | ---- | ------------- |------------- |------------- | +| 'sentiment' | English | detecting positive and negative sentiment (transformer-based) | movie and product reviews | **98.87** | +| 'sentiment-fast' | English | detecting positive and negative sentiment (RNN-based) | movie and product reviews | **96.83**| +| 'de-offensive-language' | German | detecting offensive language | [GermEval 2018 Task 1](https://projects.fzai.h-da.de/iggsa/projekt/) | **75.71** (Macro F1) | + + + + diff --git a/docs/tutorial/tutorial-embeddings/classic-word-embeddings.md b/docs/tutorial/tutorial-embeddings/classic-word-embeddings.md new file mode 100644 index 000000000..73e70838b --- /dev/null +++ b/docs/tutorial/tutorial-embeddings/classic-word-embeddings.md @@ -0,0 +1,115 @@ +# Classic Word Embeddings + +Classic word embeddings are static and word-level, meaning that each distinct word gets exactly one pre-computed +embedding. Most embeddings fall under this class, including the popular GloVe or Komninos embeddings. + +Simply instantiate the `WordEmbeddings` class and pass a string identifier of the embedding you wish to load. So, if +you want to use GloVe embeddings, pass the string 'glove' to the constructor: + +```python +from flair.embeddings import WordEmbeddings + +# init embedding +glove_embedding = WordEmbeddings('glove') +``` +Now, create an example sentence and call the embedding's `embed()` method. You can also pass a list of sentences to +this method since some embedding types make use of batching to increase speed. + +```python +# create sentence. +sentence = Sentence('The grass is green .') + +# embed a sentence using glove. +glove_embedding.embed(sentence) + +# now check out the embedded tokens. 
+for token in sentence: + print(token) + print(token.embedding) +``` + +This prints out the tokens and their embeddings. GloVe embeddings are Pytorch vectors of dimensionality 100. + +You choose which pre-trained embeddings you load by passing the appropriate +id string to the constructor of the `WordEmbeddings` class. Typically, you use +the **two-letter language code** to init an embedding, so 'en' for English and +'de' for German and so on. By default, this will initialize FastText embeddings trained over Wikipedia. +You can also always use FastText embeddings over Web crawls, by instantiating with '-crawl'. So 'de-crawl' +to use embeddings trained over German web crawls. + +For English, we provide a few more options, so +here you can choose between instantiating 'en-glove', 'en-extvec' and so on. + +The following embeddings are currently supported: + +| ID | Language | Embedding | +| ------------- | ------------- | ------------- | +| 'en-glove' (or 'glove') | English | GloVe embeddings | +| 'en-extvec' (or 'extvec') | English |Komninos embeddings | +| 'en-crawl' (or 'crawl') | English | FastText embeddings over Web crawls | +| 'en-twitter' (or 'twitter') | English | Twitter embeddings | +| 'en-turian' (or 'turian') | English | Turian embeddings (small) | +| 'en' (or 'en-news' or 'news') |English | FastText embeddings over news and wikipedia data | +| 'de' | German |German FastText embeddings | +| 'nl' | Dutch | Dutch FastText embeddings | +| 'fr' | French | French FastText embeddings | +| 'it' | Italian | Italian FastText embeddings | +| 'es' | Spanish | Spanish FastText embeddings | +| 'pt' | Portuguese | Portuguese FastText embeddings | +| 'ro' | Romanian | Romanian FastText embeddings | +| 'ca' | Catalan | Catalan FastText embeddings | +| 'sv' | Swedish | Swedish FastText embeddings | +| 'da' | Danish | Danish FastText embeddings | +| 'no' | Norwegian | Norwegian FastText embeddings | +| 'fi' | Finnish | Finnish FastText embeddings | +| 'pl' | Polish | 
Polish FastText embeddings | +| 'cz' | Czech | Czech FastText embeddings | +| 'sk' | Slovak | Slovak FastText embeddings | +| 'sl' | Slovenian | Slovenian FastText embeddings | +| 'sr' | Serbian | Serbian FastText embeddings | +| 'hr' | Croatian | Croatian FastText embeddings | +| 'bg' | Bulgarian | Bulgarian FastText embeddings | +| 'ru' | Russian | Russian FastText embeddings | +| 'ar' | Arabic | Arabic FastText embeddings | +| 'he' | Hebrew | Hebrew FastText embeddings | +| 'tr' | Turkish | Turkish FastText embeddings | +| 'fa' | Persian | Persian FastText embeddings | +| 'ja' | Japanese | Japanese FastText embeddings | +| 'ko' | Korean | Korean FastText embeddings | +| 'zh' | Chinese | Chinese FastText embeddings | +| 'hi' | Hindi | Hindi FastText embeddings | +| 'id' | Indonesian | Indonesian FastText embeddings | +| 'eu' | Basque | Basque FastText embeddings | + +So, if you want to load German FastText embeddings, instantiate as follows: + +```python +german_embedding = WordEmbeddings('de') +``` + +Alternatively, if you want to load German FastText embeddings trained over crawls, instantiate as follows: + +```python +german_embedding = WordEmbeddings('de-crawl') +``` + +We generally recommend the FastText embeddings, or GloVe if you want a smaller model. + +If you want to use any other embeddings (not listed in the list above), you can load those by calling +```python +custom_embedding = WordEmbeddings('path/to/your/custom/embeddings.gensim') +``` +If you want to load custom embeddings you need to make sure that the custom embeddings are correctly formatted to +[gensim](https://radimrehurek.com/gensim/models/word2vec.html). 
+
+You can, for example, convert [FastText embeddings](https://fasttext.cc/docs/en/crawl-vectors.html) to gensim using the
+following code snippet:
+```python
+import gensim
+
+word_vectors = gensim.models.KeyedVectors.load_word2vec_format('/path/to/fasttext/embeddings.txt', binary=False)
+word_vectors.save('/path/to/converted')
+```
+
+However, FastText embeddings have the functionality of returning vectors for out-of-vocabulary words using the sub-word information. If you want to use this then try the `FastTextEmbeddings` class.
+
diff --git a/docs/tutorial/tutorial-embeddings/embeddings.md b/docs/tutorial/tutorial-embeddings/embeddings.md
new file mode 100644
index 000000000..0df422cf5
--- /dev/null
+++ b/docs/tutorial/tutorial-embeddings/embeddings.md
@@ -0,0 +1,138 @@
+# Embeddings
+
+This tutorial shows you how to use Flair to produce **embeddings** for words and documents. Embeddings
+are vector representations that are useful for a variety of reasons. All Flair models are trained on
+top of embeddings, so if you want to train your own models, you should understand how embeddings work.
+
+## Example 1: Embedding Words with Transformers
+
+Let's use a standard BERT model (bert-base-uncased) to embed the sentence "the grass is green".
+
+Simply instantiate `TransformerWordEmbeddings` and call `embed()` over an example sentence:
+
+```python
+from flair.embeddings import TransformerWordEmbeddings
+from flair.data import Sentence
+
+# init embedding
+embedding = TransformerWordEmbeddings('bert-base-uncased')
+
+# create a sentence
+sentence = Sentence('The grass is green .')
+
+# embed words in sentence
+embedding.embed(sentence)
+```
+
+This will cause **each word in the sentence** to be embedded. You can iterate through the words and get
+each embedding like this:
+
+```python
+# now check out the embedded tokens.
+
+for token in sentence:
+    print(token)
+    print(token.embedding)
+```
+
+This will print each token as a long PyTorch vector:
+```console
+Token[0]: "The"
+tensor([-0.0323, -0.3904, -1.1946, 0.1296, 0.5806, ..], device='cuda:0')
+Token[1]: "grass"
+tensor([-0.3973, 0.2652, -0.1337, 0.4473, 1.1641, ..], device='cuda:0')
+Token[2]: "is"
+tensor([ 0.1374, -0.3688, -0.8292, -0.4068, 0.7717, ..], device='cuda:0')
+Token[3]: "green"
+tensor([-0.7722, -0.1152, 0.3661, 0.3570, 0.6573, ..], device='cuda:0')
+Token[4]: "."
+tensor([ 0.1441, -0.1772, -0.5911, 0.2236, -0.0497, ..], device='cuda:0')
+```
+
+*(Output truncated for readability, actually the vectors are much longer.)*
+
+Transformer word embeddings are the most important concept in Flair. Check out more info in this dedicated chapter.
+
+## Example 2: Embedding Documents with Transformers
+
+Sometimes you want to have an **embedding for a whole document**, not only individual words. In this case, use one of the
+DocumentEmbeddings classes in Flair.
+
+Let's again use a standard BERT model to get an embedding for the entire sentence "the grass is green":
+
+```python
+from flair.embeddings import TransformerDocumentEmbeddings
+from flair.data import Sentence
+
+# init embedding
+embedding = TransformerDocumentEmbeddings('bert-base-uncased')
+
+# create a sentence
+sentence = Sentence('The grass is green .')
+
+# embed the sentence
+embedding.embed(sentence)
+```
+
+Now, the whole sentence is embedded. Print the embedding like this:
+
+```python
+# now check out the embedded sentence
+print(sentence.embedding)
+```
+
+Transformer document embeddings are the most important concept in Flair. Check out more info in this dedicated chapter.
+
+
+## How to Stack Embeddings
+
+Flair allows you to combine embeddings into "embedding stacks". When not fine-tuning, using combinations of embeddings often gives best results!
+ +Use the `StackedEmbeddings` class and instantiate it by passing a list of embeddings that you wish to combine. For instance, lets combine classic GloVe embeddings with forward and backward Flair embeddings. + +First, instantiate the two embeddings you wish to combine: + +```python +from flair.embeddings import WordEmbeddings, FlairEmbeddings + +# init standard GloVe embedding +glove_embedding = WordEmbeddings('glove') + +# init Flair forward and backwards embeddings +flair_embedding_forward = FlairEmbeddings('news-forward') +flair_embedding_backward = FlairEmbeddings('news-backward') +``` + +Now instantiate the `StackedEmbeddings` class and pass it a list containing these two embeddings. + +```python +from flair.embeddings import StackedEmbeddings + +# create a StackedEmbedding object that combines glove and forward/backward flair embeddings +stacked_embeddings = StackedEmbeddings([ + glove_embedding, + flair_embedding_forward, + flair_embedding_backward, + ]) +``` + + +That's it! Now just use this embedding like all the other embeddings, i.e. call the `embed()` method over your sentences. + +```python +sentence = Sentence('The grass is green .') + +# just embed a sentence using the StackedEmbedding as you would with any single embedding. +stacked_embeddings.embed(sentence) + +# now check out the embedded tokens. +for token in sentence: + print(token) + print(token.embedding) +``` + +Words are now embedded using a concatenation of three different embeddings. This means that the resulting embedding +vector is still a single PyTorch vector. 
+ + + + diff --git a/docs/tutorial/tutorial-embeddings/flair-embeddings.md b/docs/tutorial/tutorial-embeddings/flair-embeddings.md new file mode 100644 index 000000000..fd78f355f --- /dev/null +++ b/docs/tutorial/tutorial-embeddings/flair-embeddings.md @@ -0,0 +1,143 @@ +# Flair embeddings + +Contextual string embeddings are [powerful embeddings](https://drive.google.com/file/d/17yVpFA7MmXaQFTe-HDpZuqw9fJlmzg56/view?usp=sharing) + that capture latent syntactic-semantic information that goes beyond +standard word embeddings. Key differences are: (1) they are trained without any explicit notion of words and +thus fundamentally model words as sequences of characters. And (2) they are **contextualized** by their +surrounding text, meaning that the *same word will have different embeddings depending on its +contextual use*. + +With Flair, you can use these embeddings simply by instantiating the appropriate embedding class, same as standard word embeddings: + +```python +from flair.embeddings import FlairEmbeddings + +# init embedding +flair_embedding_forward = FlairEmbeddings('news-forward') + +# create a sentence +sentence = Sentence('The grass is green .') + +# embed words in sentence +flair_embedding_forward.embed(sentence) +``` + +You choose which embeddings you load by passing the appropriate string to the constructor of the `FlairEmbeddings` class. +Currently, the following contextual string embeddings are provided (note: replace '*X*' with either '*forward*' or '*backward*'): + +| ID | Language | Embedding | +| ------------- | ------------- | ------------- | +| 'multi-X' | 300+ | [JW300 corpus](http://opus.nlpl.eu/JW300.php), as proposed by [Agić and Vulić (2019)](https://www.aclweb.org/anthology/P19-1310/). 
The corpus is licensed under CC-BY-NC-SA +| 'multi-X-fast' | English, German, French, Italian, Dutch, Polish | Mix of corpora (Web, Wikipedia, Subtitles, News), CPU-friendly | +| 'news-X' | English | Trained with 1 billion word corpus | +| 'news-X-fast' | English | Trained with 1 billion word corpus, CPU-friendly | +| 'mix-X' | English | Trained with mixed corpus (Web, Wikipedia, Subtitles) | +| 'ar-X' | Arabic | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'bg-X' | Bulgarian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'bg-X-fast' | Bulgarian | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia or SETimes) | +| 'cs-X' | Czech | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'cs-v0-X' | Czech | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) | +| 'de-X' | German | Trained with mixed corpus (Web, Wikipedia, Subtitles) | +| 'de-historic-ha-X' | German (historical) | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Historical German trained over *Hamburger Anzeiger* | +| 'de-historic-wz-X' | German (historical) | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Historical German trained over *Wiener Zeitung* | +| 'de-historic-rw-X' | German (historical) | Added by [@redewiedergabe](https://github.com/redewiedergabe): Historical German trained over 100 million tokens | +| 'es-X' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): Trained with Wikipedia | +| 'es-X-fast' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): Trained with Wikipedia, CPU-friendly | +| 'es-clinical-' | Spanish (clinical) | Added by [@matirojasg](https://github.com/flairNLP/flair/issues/2292): Trained 
with Wikipedia | +| 'eu-X' | Basque | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'eu-v0-X' | Basque | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) | +| 'fa-X' | Persian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'fi-X' | Finnish | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'fr-X' | French | Added by [@mhham](https://github.com/mhham): Trained with French Wikipedia | +| 'he-X' | Hebrew | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'hi-X' | Hindi | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'hr-X' | Croatian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'id-X' | Indonesian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'it-X' | Italian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'ja-X' | Japanese | Added by [@frtacoa](https://github.com/zalandoresearch/flair/issues/527): Trained with 439M words of Japanese Web crawls (2048 hidden states, 2 layers)| +| 'nl-X' | Dutch | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'nl-v0-X' | Dutch | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) | +| 'no-X' | Norwegian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'pl-X' | Polish | Added by [@borchmann](https://github.com/applicaai/poleval-2018): Trained with web crawls (Polish part of CommonCrawl) | +| 'pl-opus-X' | Polish | Added by 
[@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'pt-X' | Portuguese | Added by [@ericlief](https://github.com/ericlief/language_models): LM embeddings | +| 'sl-X' | Slovenian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'sl-v0-X' | Slovenian | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia and OpenSubtitles2018) | +| 'sv-X' | Swedish | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'sv-v0-X' | Swedish | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia or OpenSubtitles2018) | +| 'ta-X' | Tamil | Added by [@stefan-it](https://github.com/stefan-it/plur) | +| 'pubmed-X' | English | Added by [@jessepeng](https://github.com/zalandoresearch/flair/pull/519): Trained with 5% of PubMed abstracts until 2015 (1150 hidden states, 3 layers)| +| 'de-impresso-hipe-v1-X' | German (historical) | In-domain data (Swiss and Luxembourgish newspapers) for [CLEF HIPE Shared task](https://impresso.github.io/CLEF-HIPE-2020). More information on the shared task can be found in [this paper](https://zenodo.org/record/3752679#.XqgzxXUzZzU) | +| 'en-impresso-hipe-v1-X' | English (historical) | In-domain data (Chronicling America material) for [CLEF HIPE Shared task](https://impresso.github.io/CLEF-HIPE-2020). More information on the shared task can be found in [this paper](https://zenodo.org/record/3752679#.XqgzxXUzZzU) | +| 'fr-impresso-hipe-v1-X' | French (historical) | In-domain data (Swiss and Luxembourgish newspapers) for [CLEF HIPE Shared task](https://impresso.github.io/CLEF-HIPE-2020). 
More information on the shared task can be found in [this paper](https://zenodo.org/record/3752679#.XqgzxXUzZzU) | +| 'am-X' | Amharic | Based on 6.5m Amharic text corpus crawled from different sources. See [this paper](https://www.mdpi.com/1999-5903/13/11/275) and the official [GitHub Repository](https://github.com/uhh-lt/amharicmodels) for more information. | +| 'uk-X' | Ukrainian | Added by [@dchaplinsky](https://github.com/dchaplinsky): Trained with [UberText](https://lang.org.ua/en/corpora/) corpus. | + +So, if you want to load embeddings from the German forward LM model, instantiate the method as follows: + +```python +flair_de_forward = FlairEmbeddings('de-forward') +``` + +And if you want to load embeddings from the Bulgarian backward LM model, instantiate the method as follows: + +```python +flair_bg_backward = FlairEmbeddings('bg-backward') +``` + +## Recommended Flair usage + +We recommend combining both forward and backward Flair embeddings. Depending on the task, we also recommend adding standard word embeddings into the mix. So, our recommended `StackedEmbedding` for most English tasks is: + + +```python +from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings + +# create a StackedEmbedding object that combines glove and forward/backward flair embeddings +stacked_embeddings = StackedEmbeddings([ + WordEmbeddings('glove'), + FlairEmbeddings('news-forward'), + FlairEmbeddings('news-backward'), + ]) +``` + +That's it! Now just use this embedding like all the other embeddings, i.e. call the `embed()` method over your sentences. + +```python +sentence = Sentence('The grass is green .') + +# just embed a sentence using the StackedEmbedding as you would with any single embedding. +stacked_embeddings.embed(sentence) + +# now check out the embedded tokens. +for token in sentence: + print(token) + print(token.embedding) +``` +Words are now embedded using a concatenation of three different embeddings. 
This combination often gives state-of-the-art accuracy. + + +## Pooled Flair embeddings + +We also developed a pooled variant of the `FlairEmbeddings`. These embeddings differ in that they *constantly evolve over time*, even at prediction time (i.e. after training is complete). This means that the same words in the same sentence at two different points in time may have different embeddings. + +`PooledFlairEmbeddings` manage a 'global' representation of each distinct word by using a pooling operation of all past occurences. More details on how this works may be found in [Akbik et al. (2019)](https://www.aclweb.org/anthology/N19-1078/). + +You can instantiate and use `PooledFlairEmbeddings` like any other embedding: + +```python +from flair.embeddings import PooledFlairEmbeddings + +# init embedding +flair_embedding_forward = PooledFlairEmbeddings('news-forward') + +# create a sentence +sentence = Sentence('The grass is green .') + +# embed words in sentence +flair_embedding_forward.embed(sentence) +``` + +Note that while we get some of our best results with `PooledFlairEmbeddings` they are very ineffective memory-wise since they keep past embeddings of all words in memory. In many cases, regular `FlairEmbeddings` will be nearly as good but with much lower memory requirements. + + diff --git a/docs/tutorial/tutorial-embeddings/index.rst b/docs/tutorial/tutorial-embeddings/index.rst new file mode 100644 index 000000000..f452e9aae --- /dev/null +++ b/docs/tutorial/tutorial-embeddings/index.rst @@ -0,0 +1,16 @@ +Tutorial 3: Embeddings +====================== + +This tutorial shows you how to use Flair to produce embeddings for words and documents. +Embeddings are vector representations that are useful for a variety of reasons. +All Flair models are trained on top of embeddings, so if you want to train your own models, +you should understand how embeddings work. + +.. 
toctree:: + :glob: + + embeddings + transformer-embeddings + flair-embeddings + classic-word-embeddings + other-embeddings diff --git a/docs/tutorial/tutorial-embeddings/other-embeddings.md b/docs/tutorial/tutorial-embeddings/other-embeddings.md new file mode 100644 index 000000000..0edd29141 --- /dev/null +++ b/docs/tutorial/tutorial-embeddings/other-embeddings.md @@ -0,0 +1,321 @@ +# Other embeddings in Flair + +Flair supports many other embedding types. This section introduces these embeddings. + +:::info +We mostly train our models with either TransformerEmbeddings or FlairEmbeddings. The embeddings presented here might be useful +for specific use cases or for comparison purposes. +::: + + +## One-Hot Embeddings + +`OneHotEmbeddings` are embeddings that encode each word in a vocabulary as a one-hot vector, followed by an embedding +layer. These embeddings +thus do not encode any prior knowledge as do most other embeddings. They also differ in that they +require to see a vocabulary (`vocab_dictionary`) during instantiation. Such dictionary can be passed as an argument +during class initialization or constructed directly from a corpus with a `from_corpus` method. The dictionary consists +of all unique tokens contained in the corpus plus an UNK token for all rare words. + +You initialize these embeddings like this: + +```python +from flair.embeddings import OneHotEmbeddings +from flair.datasets import UD_ENGLISH +from flair.data import Sentence + +# load a corpus +corpus = UD_ENGLISH() + +# init embedding +embeddings = OneHotEmbeddings.from_corpus(corpus) + +# create a sentence +sentence = Sentence('The grass is green .') + +# embed words in sentence +embeddings.embed(sentence) +``` + +By default, the 'text' of a token (i.e. its lexical value) is one-hot encoded and the embedding layer has a dimensionality +of 300. However, this layer is randomly initialized, meaning that these embeddings do not make sense unless they are trained in a task. 
+ +### Vocabulary size + +By default, all words that occur in the corpus at least 3 times are part of the vocabulary. You can change +this using the `min_freq` parameter. For instance, if your corpus is very large you might want to set a +higher `min_freq`: + +```python +embeddings = OneHotEmbeddings.from_corpus(corpus, min_freq=10) +``` + +### Embedding dimensionality + +By default, the embeddings have a dimensionality of 300. If you want to try higher or lower values, you can use the +`embedding_length` parameter: + +```python +embeddings = OneHotEmbeddings.from_corpus(corpus, embedding_length=100) +``` + + +### Embedding other tags + +Sometimes, you want to embed something other than text. For instance, sometimes we have part-of-speech tags or +named entity annotation available that we might want to use. If this field exists in your corpus, you can embed +it by passing the field variable. For instance, the UD corpora have a universal part-of-speech tag for each +token ('upos'). Embed it like so: + +```python +from flair.datasets import UD_ENGLISH +from flair.embeddings import OneHotEmbeddings + +# load corpus +corpus = UD_ENGLISH() + +# embed POS tags +embeddings = OneHotEmbeddings.from_corpus(corpus, field='upos') +``` + +This should print a vocabulary of size 18 consisting of universal part-of-speech tags. + + +## Byte Pair Embeddings + +`BytePairEmbeddings` are word embeddings that are precomputed on the subword-level. This means that they are able to +embed any word by splitting words into subwords and looking up their embeddings. `BytePairEmbeddings` were proposed +and computed by [Heinzerling and Strube (2018)](https://www.aclweb.org/anthology/L18-1473) who found that they offer nearly the same accuracy as word embeddings, but at a fraction +of the model size. So they are a great choice if you want to train small models. 
+
+You initialize with a language code (275 languages supported), a number of 'syllables' (one of 1000, 3000, 5000, 10000, 25000, 50000, 100000 or 200000) and
+a number of dimensions (one of 50, 100, 200 or 300). The following initializes and uses byte pair embeddings
+for English:
+
+```python
+from flair.embeddings import BytePairEmbeddings
+
+# init embedding
+embedding = BytePairEmbeddings('en')
+
+# create a sentence
+sentence = Sentence('The grass is green .')
+
+# embed words in sentence
+embedding.embed(sentence)
+```
+
+More information can be found
+on the [byte pair embeddings](https://nlp.h-its.org/bpemb/) web page.
+
+`BytePairEmbeddings` also have a multilingual model capable of embedding any word in any language.
+ You can instantiate it with:
+
+```python
+# init embedding
+embedding = BytePairEmbeddings('multi')
+```
+
+You can also load custom `BytePairEmbeddings` by specifying a path to model_file_path and embedding_file_path arguments. They correspond respectively to a SentencePiece model file and to an embedding file (Word2Vec plain text or GenSim binary). For example:
+
+```python
+# init custom embedding
+embedding = BytePairEmbeddings(model_file_path='your/path/m.model', embedding_file_path='your/path/w2v.txt')
+```
+
+
+## ELMo Embeddings
+
+[ELMo embeddings](http://www.aclweb.org/anthology/N18-1202) were presented by Peters et al. in 2018. They are using
+a bidirectional recurrent neural network to predict the next word in a text.
+We are using the implementation of [AllenNLP](https://allennlp.org/elmo). As this implementation comes with a lot of
+sub-dependencies, which we don't want to include in Flair, you need to first install the library via
+`pip install allennlp==0.9.0` before you can use it in Flair.
+Using the embeddings is as simple as using any other embedding type: + +```python +from flair.embeddings import ELMoEmbeddings + +# init embedding +embedding = ELMoEmbeddings() + +# create a sentence +sentence = Sentence('The grass is green .') + +# embed words in sentence +embedding.embed(sentence) +``` + +ELMo word embeddings can be constructed by combining ELMo layers in different ways. The available combination strategies are: +- `"all"`: Use the concatenation of the three ELMo layers. +- `"top"`: Use the top ELMo layer. +- `"average"`: Use the average of the three ELMo layers. + +By default, the top 3 layers are concatenated to form the word embedding. + +AllenNLP provides the following pre-trained models. To use any of the following models inside Flair +simple specify the embedding id when initializing the `ELMoEmbeddings`. + +| ID | Language | Embedding | +| ------------- | ------------- | ------------- | +| 'small' | English | 1024-hidden, 1 layer, 14.6M parameters | +| 'medium' | English | 2048-hidden, 1 layer, 28.0M parameters | +| 'original' | English | 4096-hidden, 2 layers, 93.6M parameters | +| 'large' | English | | +| 'pt' | Portuguese | | +| 'pubmed' | English biomedical data | [more information](https://allennlp.org/elmo) | + + +## Document Pool Embeddings + +DocumentPoolEmbeddings calculate a pooling operation over all word embeddings in a document. +The default operation is `mean` which gives us the mean of all words in the sentence. +The resulting embedding is taken as document embedding. + +To create a mean document embedding simply create any number of `TokenEmbeddings` first and put them in a list. +Afterwards, initiate the `DocumentPoolEmbeddings` with this list of `TokenEmbeddings`. 
+So, if you want to create a document embedding using GloVe embeddings together with `FlairEmbeddings`, +use the following code: + +```python +from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings + +# initialize the word embeddings +glove_embedding = WordEmbeddings('glove') + +# initialize the document embeddings, mode = mean +document_embeddings = DocumentPoolEmbeddings([glove_embedding]) +``` + +Now, create an example sentence and call the embedding's `embed()` method. + +```python +# create an example sentence +sentence = Sentence('The grass is green . And the sky is blue .') + +# embed the sentence with our document embedding +document_embeddings.embed(sentence) + +# now check out the embedded sentence. +print(sentence.embedding) +``` + +This prints out the embedding of the document. Since the document embedding is derived from word embeddings, its dimensionality depends on the dimensionality of word embeddings you are using. + +You have the following optional constructor arguments: + +| Argument | Default | Description +| -------------------- | ------------------- | ------------------------------------------------------------------------------ +| `fine_tune_mode` | `linear` | One of `linear`, `nonlinear` and `none`. +| `pooling` | `first` | One of `mean`, `max` and `min`. + +### Pooling operation + +Next to the `mean` pooling operation you can also use `min` or `max` pooling. Simply pass the pooling operation you want +to use to the initialization of the `DocumentPoolEmbeddings`: +```python +document_embeddings = DocumentPoolEmbeddings([glove_embedding], pooling='min') +``` + +### Fine-tune mode + +You can also choose which fine-tuning operation you want, i.e. which transformation to apply before word embeddings get +pooled. 
The default operation is 'linear' transformation, but if you only use simple word embeddings that are +not task-trained you should probably use a 'nonlinear' transformation instead: + +```python +# instantiate pre-trained word embeddings +embeddings = WordEmbeddings('glove') + +# document pool embeddings +document_embeddings = DocumentPoolEmbeddings([embeddings], fine_tune_mode='nonlinear') +``` + +If on the other hand you use word embeddings that are task-trained (such as simple one hot encoded embeddings), you +are often better off doing no transformation at all. Do this by passing 'none': + +```python +# instantiate one-hot encoded word embeddings +embeddings = OneHotEmbeddings(corpus) + +# document pool embeddings +document_embeddings = DocumentPoolEmbeddings([embeddings], fine_tune_mode='none') +``` + +## Document RNN Embeddings + +Besides simple pooling we also support a method based on an RNN to obtain a `DocumentEmbeddings`. +The RNN takes the word embeddings of every token in the document as input and provides its last output state as document +embedding. You can choose which type of RNN you wish to use. + +In order to use the `DocumentRNNEmbeddings` you need to initialize them by passing a list of token embeddings to it: + +```python +from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings + +glove_embedding = WordEmbeddings('glove') + +document_embeddings = DocumentRNNEmbeddings([glove_embedding]) +``` + +By default, a GRU-type RNN is instantiated. Now, create an example sentence and call the embedding's `embed()` method. + +```python +# create an example sentence +sentence = Sentence('The grass is green . And the sky is blue .') + +# embed the sentence with our document embedding +document_embeddings.embed(sentence) + +# now check out the embedded sentence. +print(sentence.get_embedding()) +``` + +This will output a single embedding for the complete sentence. 
The embedding dimensionality depends on the number of +hidden states you are using and whether the RNN is bidirectional or not. + +### RNN type + +If you want to use a different type of RNN, you need to set the `rnn_type` parameter in the constructor. So, +to initialize a document RNN embedding with an LSTM, do: + +```python +from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings + +glove_embedding = WordEmbeddings('glove') + +document_lstm_embeddings = DocumentRNNEmbeddings([glove_embedding], rnn_type='LSTM') +``` + +### Need to be trained on a task + +Note that while `DocumentPoolEmbeddings` are immediately meaningful, `DocumentRNNEmbeddings` need to be tuned on the +downstream task. This happens automatically in Flair if you train a new model with these embeddings. + +Once the model is trained, you can access the tuned `DocumentRNNEmbeddings` object directly from the classifier object and use it to embed sentences. + +```python +document_embeddings = classifier.document_embeddings + +sentence = Sentence('The grass is green . And the sky is blue .') + +document_embeddings.embed(sentence) + +print(sentence.get_embedding()) +``` + +`DocumentRNNEmbeddings` have a number of hyper-parameters that can be tuned to improve learning: + +```text +:param hidden_size: the number of hidden states in the rnn. +:param rnn_layers: the number of layers for the rnn. +:param reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear +layer before putting them into the rnn or not. +:param reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output +dimension as before will be taken. +:param bidirectional: boolean value, indicating whether to use a bidirectional rnn or not. +:param dropout: the dropout value to be used. +:param word_dropout: the word dropout value to be used, if 0.0 word dropout is not used. 
+:param locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used. +:param rnn_type: one of 'RNN' or 'LSTM' +``` diff --git a/docs/tutorial/tutorial-embeddings/transformer-embeddings.md b/docs/tutorial/tutorial-embeddings/transformer-embeddings.md new file mode 100644 index 000000000..0682d7ef8 --- /dev/null +++ b/docs/tutorial/tutorial-embeddings/transformer-embeddings.md @@ -0,0 +1,181 @@ +# Transformer embeddings + +Flair supports various Transformer-based architectures like BERT or XLNet from [HuggingFace](https://github.com/huggingface), +with two classes `TransformerWordEmbeddings` (to embed words) and `TransformerDocumentEmbeddings` (to embed documents). + +## Embeddings words + +For instance, to load a standard BERT transformer model, do: + +```python +from flair.embeddings import TransformerWordEmbeddings + +# init embedding +embedding = TransformerWordEmbeddings('bert-base-uncased') + +# create a sentence +sentence = Sentence('The grass is green .') + +# embed words in sentence +embedding.embed(sentence) +``` + +If instead you want to use RoBERTa, do: + +```python +from flair.embeddings import TransformerWordEmbeddings + +# init embedding +embedding = TransformerWordEmbeddings('roberta-base') + +# create a sentence +sentence = Sentence('The grass is green .') + +# embed words in sentence +embedding.embed(sentence) +``` + +[Here](https://huggingface.co/transformers/pretrained_models.html) is a full list of all models (BERT, RoBERTa, XLM, XLNet etc.). You can use any of these models with this class. 
+ + +## Embeddings sentences + +To embed a whole sentence as one (instead of each word in the sentence), simply use the TransformerDocumentEmbeddings +instead: + +```python +from flair.embeddings import TransformerDocumentEmbeddings + +# init embedding +embedding = TransformerDocumentEmbeddings('roberta-base') + +# create a sentence +sentence = Sentence('The grass is green .') + +# embed words in sentence +embedding.embed(sentence) +``` + +## Arguments + +There are several options that you can set when you init the TransformerWordEmbeddings +and TransformerDocumentEmbeddings classes: + +| Argument | Default | Description +| -------------------- | ------------------- | ------------------------------------------------------------------------------ +| `model` | `bert-base-uncased` | The string identifier of the transformer model you want to use (see above) +| `layers` | `all` | Defines the layers of the Transformer-based model that produce the embedding +| `subtoken_pooling` | `first` | See [Pooling operation section](#pooling). +| `layer_mean` | `True` | See [Layer mean section](#layer-mean). +| `fine_tune` | `False` | Whether or not embeddings are fine-tuneable. +| `allow_long_sentences` | `True` | Whether or not texts longer than maximal sequence length are supported. +| `use_context` | `False` | Set to True to include context outside of sentences. This can greatly increase accuracy on some tasks, but slows down embedding generation + + +### Layers + +The `layers` argument controls which transformer layers are used for the embedding. If you set this value to '-1,-2,-3,-4', the top 4 layers are used to make an embedding. If you set it to '-1', only the last layer is used. If you set it to "all", then all layers are used. + +This affects the length of an embedding, since layers are just concatenated. 
+
+```python
+from flair.data import Sentence
+from flair.embeddings import TransformerWordEmbeddings
+
+sentence = Sentence('The grass is green.')
+
+# use only last layer
+embeddings = TransformerWordEmbeddings('bert-base-uncased', layers='-1', layer_mean=False)
+embeddings.embed(sentence)
+print(sentence[0].embedding.size())
+
+sentence.clear_embeddings()
+
+# use last two layers
+embeddings = TransformerWordEmbeddings('bert-base-uncased', layers='-1,-2', layer_mean=False)
+embeddings.embed(sentence)
+print(sentence[0].embedding.size())
+
+sentence.clear_embeddings()
+
+# use ALL layers
+embeddings = TransformerWordEmbeddings('bert-base-uncased', layers='all', layer_mean=False)
+embeddings.embed(sentence)
+print(sentence[0].embedding.size())
+```
+
+This should print:
+```console
+torch.Size([768])
+torch.Size([1536])
+torch.Size([9984])
+```
+
+I.e. the size of the embedding increases the more layers we use (but ONLY if layer_mean is set to False, otherwise the length is always the same).
+
+(pooling)=
+### Pooling operation
+
+Most of the Transformer-based models (except Transformer-XL) use subword tokenization. E.g. the following
+token `puppeteer` could be tokenized into the subwords: `pupp`, `##ete` and `##er`.
+ +We implement different pooling operations for these subwords to generate the final token representation: + +* `first`: only the embedding of the first subword is used +* `last`: only the embedding of the last subword is used +* `first_last`: embeddings of the first and last subwords are concatenated and used +* `mean`: a `torch.mean` over all subword embeddings is calculated and used + +You can choose which one to use by passing this in the constructor: + +```python +# use first and last subtoken for each word +embeddings = TransformerWordEmbeddings('bert-base-uncased', subtoken_pooling='first_last') +embeddings.embed(sentence) +print(sentence[0].embedding.size()) +``` + +(layer-mean)= +### Layer mean + +The Transformer-based models have a certain number of layers. By default, all layers you select are +concatenated as explained above. Alternatively, you can set layer_mean=True to do a mean over all +selected layers. The resulting vector will then always have the same dimensionality as a single layer: + +```python +from flair.embeddings import TransformerWordEmbeddings + +# init embedding +embedding = TransformerWordEmbeddings("roberta-base", layers="all", layer_mean=True) + +# create a sentence +sentence = Sentence("The Oktoberfest is the world's largest Volksfest .") + +# embed words in sentence +embedding.embed(sentence) +``` + +### Fine-tuneable or not + +In some setups, you may wish to fine-tune the transformer embeddings. In this case, set `fine_tune=True` in the init method. +When fine-tuning, you should also only use the topmost layer, so best set `layers='-1'`. + +```python +# use first and last subtoken for each word +embeddings = TransformerWordEmbeddings('bert-base-uncased', fine_tune=True, layers='-1') +embeddings.embed(sentence) +print(sentence[0].embedding) +``` + +This will print a tensor that now has a gradient function and can be fine-tuned if you use it in a training routine. 
+ +```python +tensor([-0.0323, -0.3904, -1.1946, ..., 0.1305, -0.1365, -0.4323], + device='cuda:0', grad_fn=) +``` + +### Models + +Please have a look at the awesome Hugging Face [documentation](https://huggingface.co/transformers/v2.3.0/pretrained_models.html) +for all supported pretrained models! + diff --git a/docs/tutorial/tutorial-training/how-model-training-works.md b/docs/tutorial/tutorial-training/how-model-training-works.md new file mode 100644 index 000000000..59d0e96a6 --- /dev/null +++ b/docs/tutorial/tutorial-training/how-model-training-works.md @@ -0,0 +1,295 @@ +# How model training works in Flair + +In this section, we explain the main ideas of model training in Flair. + +In particular, we give an introduction to the `ModelTrainer` class, and discuss what decisions you have to make to train good models. + +## Example: Training a Part-of-Speech Tagger + +As example in this chapter, we train a simple part-of-speech tagger for English. To make the example run fast + +- we downsample the training data to 10% +- we use only simple classic word embeddings (gloVe) + +Here is the full training code: + +```python +from flair.datasets import UD_ENGLISH +from flair.embeddings import WordEmbeddings +from flair.models import SequenceTagger +from flair.trainers import ModelTrainer + +# 1. load the corpus +corpus = UD_ENGLISH().downsample(0.1) +print(corpus) + +# 2. what label do we want to predict? +label_type = 'upos' + +# 3. make the label dictionary from the corpus +label_dict = corpus.make_label_dictionary(label_type=label_type) +print(label_dict) + +# 4. initialize embeddings +embeddings = WordEmbeddings('glove') + +# 5. initialize sequence tagger +model = SequenceTagger(hidden_size=256, + embeddings=embeddings, + tag_dictionary=label_dict, + tag_type=label_type) + +# 6. initialize trainer +trainer = ModelTrainer(model, corpus) + +# 7. 
start training +trainer.train('resources/taggers/example-upos', + learning_rate=0.1, + mini_batch_size=32, + max_epochs=10) +``` + +This code (1) loads the English universal dependencies dataset as training corpus, (2) create a label dictionary for universal part-of-speech tags from the corpus, (3) initializes embeddings and (4) runs the trainer for 10 epochs. + +Running this script should produce output that looks like this during training: + +``` +2023-02-27 17:07:38,014 ---------------------------------------------------------------------------------------------------- +2023-02-27 17:07:38,016 Model training base path: "resources/taggers/example-upos" +2023-02-27 17:07:38,017 ---------------------------------------------------------------------------------------------------- +2023-02-27 17:07:38,020 Device: cuda:0 +2023-02-27 17:07:38,022 ---------------------------------------------------------------------------------------------------- +2023-02-27 17:07:38,023 Embeddings storage mode: cpu +2023-02-27 17:07:38,025 ---------------------------------------------------------------------------------------------------- +2023-02-27 17:07:39,128 epoch 1 - iter 4/40 - loss 3.28409882 - time (sec): 1.10 - samples/sec: 2611.84 - lr: 0.100000 +2023-02-27 17:07:39,474 epoch 1 - iter 8/40 - loss 3.13510367 - time (sec): 1.45 - samples/sec: 3143.21 - lr: 0.100000 +2023-02-27 17:07:39,910 epoch 1 - iter 12/40 - loss 3.02619775 - time (sec): 1.88 - samples/sec: 3434.39 - lr: 0.100000 +2023-02-27 17:07:40,167 epoch 1 - iter 16/40 - loss 2.95288554 - time (sec): 2.14 - samples/sec: 3783.76 - lr: 0.100000 +2023-02-27 17:07:40,504 epoch 1 - iter 20/40 - loss 2.86820018 - time (sec): 2.48 - samples/sec: 4171.22 - lr: 0.100000 +2023-02-27 17:07:40,843 epoch 1 - iter 24/40 - loss 2.80507526 - time (sec): 2.82 - samples/sec: 4557.72 - lr: 0.100000 +2023-02-27 17:07:41,118 epoch 1 - iter 28/40 - loss 2.74217397 - time (sec): 3.09 - samples/sec: 4878.00 - lr: 0.100000 +2023-02-27 
17:07:41,420 epoch 1 - iter 32/40 - loss 2.69161746 - time (sec): 3.39 - samples/sec: 5072.93 - lr: 0.100000 +2023-02-27 17:07:41,705 epoch 1 - iter 36/40 - loss 2.63837577 - time (sec): 3.68 - samples/sec: 5260.02 - lr: 0.100000 +2023-02-27 17:07:41,972 epoch 1 - iter 40/40 - loss 2.58915523 - time (sec): 3.95 - samples/sec: 5394.33 - lr: 0.100000 +2023-02-27 17:07:41,975 ---------------------------------------------------------------------------------------------------- +2023-02-27 17:07:41,977 EPOCH 1 done: loss 2.5892 - lr 0.100000 +2023-02-27 17:07:42,567 DEV : loss 2.009714126586914 - f1-score (micro avg) 0.41 +2023-02-27 17:07:42,579 BAD EPOCHS (no improvement): 0 +``` + +The output monitors the loss over the epochs. At the end of each epoch, the development score is computed and printed. + +And a **final evaluation report** gets printed in the end: + +``` +Results: +- F-score (micro) 0.7732 +- F-score (macro) 0.6329 +- Accuracy 0.7732 + +By class: + precision recall f1-score support + + NOUN 0.7199 0.7199 0.7199 407 + PUNCT 0.9263 0.9843 0.9544 319 + VERB 0.7521 0.6938 0.7218 258 + PRON 0.7782 0.9300 0.8474 200 + ADP 0.8559 0.9515 0.9011 206 + PROPN 0.6585 0.6398 0.6490 211 + ADJ 0.5654 0.6914 0.6221 175 + DET 0.9572 0.8995 0.9275 199 + AUX 0.8609 0.8784 0.8696 148 + ADV 0.5052 0.5000 0.5026 98 + CCONJ 0.9833 0.9077 0.9440 65 + NUM 0.5435 0.3289 0.4098 76 + PART 0.9091 0.7143 0.8000 56 + SCONJ 0.7083 0.5667 0.6296 30 + SYM 0.3333 0.2143 0.2609 14 + X 0.0000 0.0000 0.0000 15 + INTJ 0.0000 0.0000 0.0000 14 + + accuracy 0.7732 2491 + macro avg 0.6504 0.6247 0.6329 2491 +weighted avg 0.7635 0.7732 0.7655 2491 +``` + +This report gives us a breakdown of the precision, recall and F1 score of all classes, as well as overall. + +Congrats, you just trained your first model! + + +## Step-by-step walkthrough + +The above code showed you how to train a part-of-speech tagger. 
+ +Now let's individually look at each of the main steps in the above script: + +### Step 1: Load a Corpus + +The first thing you need is data to train and evaluate your model on. + +In Flair, training is done using the `Corpus` object that holds three "splits": a `train`, a `dev` and a `test` split. + +:::info + +Splitting your data into three splits is standard procedure in machine learning: the `train` split is used to train the model while the `dev` split is used for model selection and early stopping. The `test` split is used only for the final evaluation. +::: + +In this example, we use the English Universal Dependencies dataset to train on. It contains many sentences fully annotated with both universal and language-specific part-of-speech tags. Running these lines will load and print the corpus: + +```python +# 1. load the corpus +corpus = UD_ENGLISH().downsample(0.1) +print(corpus) +``` + +which should print: + +``` +Corpus: 1254 train + 200 dev + 208 test sentences +``` + +Showing us that our downsampled training data has three splits: a training split of 1254 sentences, a dev split of 200 sentences, and a test split of 208 sentences. + +:::tip +The `Corpus` object has a number of very handy helper functions that let you manipulate the data and compute statistics. For instance, in the code above we called `.downsample(0.1)` to downsample the corpus to 10% of its original size. To learn about more helper functions, check out the corpus tutorial. +::: + +### Step 2: Choose the label type + +After you load the corpus, you need to decide which label type to predict. + +We choose the label type **'upos'**, since we want to predict universal part-of-speech tags in this example. + +```python +# 2. what label do we want to predict? +label_type = 'upos' +``` + +:::info + +You might ask: why is specifying the `label_type` even necessary? Well, some corpora have more than one label type. 
The English UD treebank for instance has both universal PoS tags ('upos') and regular PoS tags ('pos'), plus many other layers of annotation. A tagger is normally trained to predict just one type of annotation.
+
+This means that you need to know which label types a specific corpus has labels for, and choose one of them.
+:::
+
+
+### Step 3: Creating a label dictionary
+
+Our model needs to predict a set of labels. To determine the label set, run `make_label_dictionary` on the corpus and pass the label type you want to predict. In this example, we pass **'upos'** since we want to predict universal part-of-speech tags.
+
+Running these lines will compute and print the label dictionary from the corpus:
+
+```python
+# 3. make the label dictionary from the corpus
+label_dict = corpus.make_label_dictionary(label_type=label_type)
+print(label_dict)
+```
+
+which should print:
+
+```
+Dictionary with 18 tags: <unk>, NOUN, PUNCT, VERB, PRON, ADP, DET, AUX, ADJ, PROPN, ADV, CCONJ, PART, SCONJ, NUM, X, SYM, INTJ
+```
+
+Showing us that our label dictionary has 18 PoS tags, including one generic tag (`<unk>`) for all unknown labels.
+
+### Step 4: Initialize embeddings
+
+All models in Flair require you to choose embeddings. In most cases, you'll want transformer embeddings. Choosing the right embeddings and parameters is crucial in order to train good models.
+
+In our example, we use simple GloVe embeddings:
+
+
+```python
+# 4. initialize embeddings
+embeddings = WordEmbeddings('glove')
+```
+
+But this is only to make the example code run fast. We generally advise to use transformer-based embeddings instead.
+
+### Step 5: Initialize the Model
+
+Depending on what you want to do, you need to initialize the appropriate model type.
+
+For this example, we use the `SequenceTagger` since we do part-of-speech tagging:
+
+```python
+# 5. 
initialize sequence tagger
+model = SequenceTagger(hidden_size=256,
+                       embeddings=embeddings,
+                       tag_dictionary=label_dict,
+                       tag_type=label_type)
+```
+
+Printing it will give you the PyTorch model that is initialized.
+
+:::info
+
+Depending on the task, you need a different model type: For sequence labeling (NER, part-of-speech tagging) you need the `SequenceTagger`. For text classification you need the `TextClassifier`.
+
+For each model type, we are creating dedicated tutorials to better explain what they do.
+:::
+
+### Step 6: Initialize the Trainer
+
+The ModelTrainer is initialized simply by passing the model and the corpus because that is all it needs.
+
+```python
+# 6. initialize trainer
+trainer = ModelTrainer(model, corpus)
+```
+
+### Step 7: Train
+
+Once the trainer is initialized, you can call `train` to launch a standard training run.
+
+```python
+# 7. start training
+trainer.train('resources/taggers/example-upos',
+              learning_rate=0.1,
+              mini_batch_size=32,
+              max_epochs=10)
+```
+
+This will launch a "standard training run" with SGD as optimizer. By default, the learning rate is annealed against the development score: if for 3 epochs there is no improvement on the dev split, the learning rate is halved. If this happens too often, the learning rate will fall below a minimal threshold and training stops early.
+
+The max_epochs parameter is set to a small number in this script to make it run fast, but normally you should use a much higher value (150 or 200).
+
+:::info
+
+There are two main mechanisms to train a model in Flair. (1) The "classic" workflow (SGD with annealing) is invoked as above using the `train()` method. (2) The current state-of-the-art based on fine-tuning (AdamW with One-Cycle) is invoked using the `fine_tune()` method. In most cases, you will want to use the latter.
+:::
+
+### Step 8: Predict
+
+Once the model is trained you can use it to predict tags for new sentences. Just call the `predict` method of the model.
+ +```python +# load the model you trained +model = SequenceTagger.load('resources/taggers/example-upos/final-model.pt') + +# create example sentence +sentence = Sentence('I love Berlin') + +# predict tags and print +model.predict(sentence) + +print(sentence.to_tagged_string()) +``` + +If the model works well, it will correctly tag 'love' as a verb in this example. + +## Summary + +This tutorial gave you a general overview of the main steps to train a model: + +- load a corpus +- choose a label type +- create a label dictionary +- choose embeddings +- initialize model +- initialize trainer +- train + + diff --git a/docs/tutorial/tutorial-training/how-to-load-custom-dataset.md b/docs/tutorial/tutorial-training/how-to-load-custom-dataset.md new file mode 100644 index 000000000..cdde05b4a --- /dev/null +++ b/docs/tutorial/tutorial-training/how-to-load-custom-dataset.md @@ -0,0 +1,160 @@ +# How to load a custom dataset + +This part of the tutorial shows how you can load a corpus for training a model. + +## Reading a dataset in column format + +In cases you want to train over a sequence labeling dataset that is not in the above list, you can load them with the ColumnCorpus object. +Most sequence labeling datasets in NLP use some sort of column format in which each line is a word and each column is +one level of linguistic annotation. See for instance this sentence: + +```console +George N B-PER +Washington N I-PER +went V O +to P O +Washington N B-LOC + +Sam N B-PER +Houston N I-PER +stayed V O +home N O +``` + +The first column is the word itself, the second coarse PoS tags, and the third BIO-annotated NER tags. Empty line separates sentences. To read such a +dataset, define the column structure as a dictionary and instantiate a `ColumnCorpus`. 
+
+```python
+from flair.data import Corpus
+from flair.datasets import ColumnCorpus
+
+# define columns
+columns = {0: 'text', 1: 'pos', 2: 'ner'}
+
+# this is the folder in which train, test and dev files reside
+data_folder = '/path/to/data/folder'
+
+# init a corpus using column format, data folder and the names of the train, dev and test files
+corpus: Corpus = ColumnCorpus(data_folder, columns,
+                              train_file='train.txt',
+                              test_file='test.txt',
+                              dev_file='dev.txt')
+
+```
+
+This gives you a `Corpus` object that contains the train, dev and test splits, each has a list of `Sentence`.
+So, to check how many sentences there are in the training split, do
+
+```python
+len(corpus.train)
+```
+
+You can also access a sentence and check out annotations. Let's assume that the training split is
+read from the example above, then executing these commands
+
+```python
+print(corpus.train[0].to_tagged_string('ner'))
+print(corpus.train[1].to_tagged_string('pos'))
+```
+
+will print the sentences with different layers of annotation:
+
+```console
+George <B-PER> Washington <I-PER> went to Washington <B-LOC> .
+
+Sam <N> Houston <N> stayed <V> home <N>
+```
+
+## Reading a text classification dataset
+
+If you want to use your own text classification dataset, there are currently two methods to go about this:
+load specified text and labels from a simple CSV file or format your data to the
+[FastText format](https://fasttext.cc/docs/en/supervised-tutorial.html).
+
+### Load from simple CSV file
+
+Many text classification datasets are distributed as simple CSV files in which each row corresponds to a data point and
+columns correspond to text, labels, and other metadata. You can load a CSV format classification dataset using
+`CSVClassificationCorpus` by passing in a column format (like in `ColumnCorpus` above). This column format indicates
+which column(s) in the CSV holds the text and which field(s) the label(s). 
By default, Python's CSV library assumes that
+your files are in Excel CSV format, but [you can specify additional parameters](https://docs.python.org/3/library/csv.html#csv-fmt-params)
+if you use custom delimiters or quote characters.
+
+Note: You will need to save your split CSV data files in the `data_folder` path with each file titled appropriately i.e.
+`train.csv` `test.csv` `dev.csv`. This is because the corpus initializers will automatically search for the train,
+dev, test splits in a folder.
+
+```python
+from flair.data import Corpus
+from flair.datasets import CSVClassificationCorpus
+
+# this is the folder in which train, test and dev files reside
+data_folder = '/path/to/data'
+
+# column format indicating which columns hold the text and label(s)
+column_name_map = {4: "text", 1: "label_topic", 2: "label_subtopic"}
+
+# load corpus containing training, test and dev data and if CSV has a header, you can skip it
+corpus: Corpus = CSVClassificationCorpus(data_folder,
+                                         column_name_map,
+                                         skip_header=True,
+                                         delimiter='\t', # tab-separated files
+)
+```
+
+
+### FastText format
+If using `CSVClassificationCorpus` is not practical, you may format your data to the FastText format, in which each line in the file represents a text document. A document can have one or multiple labels that are defined at the beginning of the line starting with the prefix `__label__`. This looks like this:
+
+```bash
+__label__<label_1> <text>
+__label__<label_1> __label__<label_2> <text>
+```
+
+As previously mentioned, to create a `Corpus` for a text classification task, you need to have three files (train, dev, and test) in the
+above format located in one folder. This data folder structure could, for example, look like this for the IMDB task:
+```text
+/resources/tasks/imdb/train.txt
+/resources/tasks/imdb/dev.txt
+/resources/tasks/imdb/test.txt
+```
+Now create a `ClassificationCorpus` by pointing to this folder (`/resources/tasks/imdb`). 
+Thereby, each line in a file is converted to a `Sentence` object annotated with the labels. + +Attention: A text in a line can have multiple sentences. Thus, a `Sentence` object can actually consist of multiple +sentences. + +```python +from flair.data import Corpus +from flair.datasets import ClassificationCorpus + +# this is the folder in which train, test and dev files reside +data_folder = '/path/to/data/folder' + +# load corpus containing training, test and dev data +corpus: Corpus = ClassificationCorpus(data_folder, + test_file='test.txt', + dev_file='dev.txt', + train_file='train.txt', + label_type='topic', + ) +``` + +Note again that our corpus initializers have methods to automatically look for train, dev and test splits in a folder. So in +most cases you don't need to specify the file names yourself. Often, this is enough: + +```python +# this is the folder in which train, test and dev files reside +data_folder = '/path/to/data/folder' + +# load corpus by pointing to folder. Train, dev and test gets identified automatically. +corpus: Corpus = ClassificationCorpus(data_folder, + label_type='topic', + ) +``` + +Since the FastText format does not have columns, you must manually define a name for the annotations. In this +example we chose `label_type='topic'` to denote that we are loading a corpus with topic labels. + + + diff --git a/docs/tutorial/tutorial-training/how-to-load-prepared-dataset.md b/docs/tutorial/tutorial-training/how-to-load-prepared-dataset.md new file mode 100644 index 000000000..428898ed8 --- /dev/null +++ b/docs/tutorial/tutorial-training/how-to-load-prepared-dataset.md @@ -0,0 +1,356 @@ +# How to load a prepared dataset + +This part of the tutorial shows how you can load a corpus for training a model. + +## The Corpus Object + +The `Corpus` represents a dataset that you use to train a model. 
It consists of a list of `train` sentences,
+a list of `dev` sentences, and a list of `test` sentences, which correspond to the training, validation and testing
+split during model training.
+
+The following example snippet instantiates the Universal Dependency Treebank for English as a corpus object:
+
+```python
+import flair.datasets
+corpus = flair.datasets.UD_ENGLISH()
+```
+
+The first time you call this snippet, it triggers a download of the Universal Dependency Treebank for English onto your
+hard drive. It then reads the train, test and dev splits into the `Corpus` which it returns. Check the length of
+the three splits to see how many Sentences are there:
+
+```python
+# print the number of Sentences in the train split
+print(len(corpus.train))
+
+# print the number of Sentences in the test split
+print(len(corpus.test))
+
+# print the number of Sentences in the dev split
+print(len(corpus.dev))
+```
+
+You can also access the Sentence objects in each split directly. For instance, let us look at the first Sentence in
+the test split of the English UD:
+
+```python
+# get the first Sentence in the test split
+sentence = corpus.test[0]
+
+# print with all annotations
+print(sentence)
+
+# print only with POS annotations (better readability)
+print(sentence.to_tagged_string('pos'))
+```
+
+The sentence is fully tagged with syntactic and morphological information. With the latter line,
+you print out only the POS tags:
+
+```console
+Sentence: "What if Google Morphed Into GoogleOS ?" → ["What"/WP, "if"/IN, "Google"/NNP, "Morphed"/VBD, "Into"/IN, "GoogleOS"/NNP, "?"/.]
+```
+
+So the corpus is tagged and ready for training.
+
+### Helper functions
+
+A `Corpus` contains a bunch of useful helper functions.
+For instance, you can downsample the data by calling `downsample()` and passing a ratio. 
So, if you normally get a +corpus like this: + +```python +import flair.datasets +corpus = flair.datasets.UD_ENGLISH() +``` + +then you can downsample the corpus, simply like this: + +```python +import flair.datasets +downsampled_corpus = flair.datasets.UD_ENGLISH().downsample(0.1) +``` + +If you print both corpora, you see that the second one has been downsampled to 10% of the data. + +```python +print("--- 1 Original ---") +print(corpus) + +print("--- 2 Downsampled ---") +print(downsampled_corpus) +``` + +This should print: + +```console +--- 1 Original --- +Corpus: 12543 train + 2002 dev + 2077 test sentences + +--- 2 Downsampled --- +Corpus: 1255 train + 201 dev + 208 test sentences +``` + +### Creating label dictionaries + +For many learning tasks you need to create a "dictionary" that contains all the labels you want to predict. +You can generate this dictionary directly out of the `Corpus` by calling the method `make_label_dictionary` +and passing the desired `label_type`. + +For instance, the UD_ENGLISH corpus instantiated above has multiple layers of annotation like regular +POS tags ('pos'), universal POS tags ('upos'), morphological tags ('tense', 'number'..) and so on. +Create label dictionaries for universal POS tags by passing `label_type='upos'` like this: + +```python +# create label dictionary for a Universal Part-of-Speech tagging task +upos_dictionary = corpus.make_label_dictionary(label_type='upos') + +# print dictionary +print(upos_dictionary) +``` + +This will print out the created dictionary: + +```console +Dictionary with 17 tags: PROPN, PUNCT, ADJ, NOUN, VERB, DET, ADP, AUX, PRON, PART, SCONJ, NUM, ADV, CCONJ, X, INTJ, SYM +``` + +#### Dictionaries for other label types + +If you don't know the label types in a corpus, just call `make_label_dictionary` with +any random label name (e.g. `corpus.make_label_dictionary(label_type='abcd')`). 
This will print +out statistics on all label types in the corpus: + +```console +The corpus contains the following label types: 'lemma' (in 12543 sentences), 'upos' (in 12543 sentences), 'pos' (in 12543 sentences), 'dependency' (in 12543 sentences), 'number' (in 12036 sentences), 'verbform' (in 10122 sentences), 'prontype' (in 9744 sentences), 'person' (in 9381 sentences), 'mood' (in 8911 sentences), 'tense' (in 8747 sentences), 'degree' (in 7148 sentences), 'definite' (in 6851 sentences), 'case' (in 6486 sentences), 'gender' (in 2824 sentences), 'numtype' (in 2771 sentences), 'poss' (in 2516 sentences), 'voice' (in 1085 sentences), 'typo' (in 399 sentences), 'extpos' (in 185 sentences), 'abbr' (in 168 sentences), 'reflex' (in 98 sentences), 'style' (in 31 sentences), 'foreign' (in 5 sentences) +``` + +This means that you can create dictionaries for any of these label types for the UD_ENGLISH corpus. Let's create dictionaries for regular part of speech tags +and a morphological number tagging task: + +```python +# create label dictionary for a regular POS tagging task +pos_dictionary = corpus.make_label_dictionary(label_type='pos') + +# create label dictionary for a morphological number tagging task +number_dictionary = corpus.make_label_dictionary(label_type='number') +``` + +If you print these dictionaries, you will find that the POS dictionary contains 50 tags and the number dictionary only 2 for this corpus (singular and plural). + + +#### Dictionaries for other corpora types + +The method `make_label_dictionary` can be used for any corpus, including text classification corpora: + +```python +# create label dictionary for a text classification task +corpus = flair.datasets.TREC_6() +corpus.make_label_dictionary('question_class') +``` + +### The MultiCorpus Object + +If you want to train multiple tasks at once, you can use the `MultiCorpus` object. +To initiate the `MultiCorpus` you first need to create any number of `Corpus` objects.
Afterwards, you can pass +a list of `Corpus` to the `MultiCorpus` object. For instance, the following snippet loads a combination corpus +consisting of the English, German and Dutch Universal Dependency Treebanks. + +```python +english_corpus = flair.datasets.UD_ENGLISH() +german_corpus = flair.datasets.UD_GERMAN() +dutch_corpus = flair.datasets.UD_DUTCH() + +# make a multi corpus consisting of three UDs +from flair.data import MultiCorpus +multi_corpus = MultiCorpus([english_corpus, german_corpus, dutch_corpus]) +``` + +The `MultiCorpus` inherits from `Corpus`, so you can use it like any other corpus to train your models. + +## Datasets included in Flair + +Flair supports many datasets out of the box. It automatically downloads and sets up the +data the first time you call the corresponding constructor ID. + +The following datasets are supported: + +### Named Entity Recognition + +| Object | Languages | Description | +|-----------------------------|--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 'CONLL_03' | English | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER (requires manual download) | +| 'CONLL_03_GERMAN' | German | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER (requires manual download) | +| 'CONLL_03_DUTCH' | Dutch | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | +| 'CONLL_03_SPANISH' | Spanish | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | +| 'ONTONOTES' | Arabic, English, Chinese | [Ontonotes](https://paperswithcode.com/dataset/ontonotes-5-0/) 18-class NER | +| 'FEWNERD' | English | [FewNERD](https://ningding97.github.io/fewnerd/) 66-class NER | +| 'NER_ARABIC_ANER' | Arabic | [Arabic Named Entity Recognition Corpus](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp) 
4-class NER | +| 'NER_ARABIC_AQMAR' | Arabic | [American and Qatari Modeling of Arabic](http://www.cs.cmu.edu/~ark/AQMAR/) 4-class NER (modified) | +| 'NER_BASQUE' | Basque | [NER dataset for Basque](http://ixa2.si.ehu.eus/eiec/) | +| 'NER_CHINESE_WEIBO' | Chinese | [Weibo NER corpus](https://paperswithcode.com/sota/chinese-named-entity-recognition-on-weibo-ner/). | +| 'NER_DANISH_DANE' | Danish | [DaNE dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank) | +| 'NER_ENGLISH_MOVIE_SIMPLE' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - simple NER | +| 'NER_ENGLISH_MOVIE_COMPLEX' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - complex NER | +| 'NER_ENGLISH_PERSON' | English | [PERSON_NER](https://github.com/das-sudeshna/genid) NER with person names | +| 'NER_ENGLISH_RESTAURANT' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | +| 'NER_ENGLISH_SEC_FILLINGS' | English | [SEC-fillings](https://github.com/juand-r/entity-recognition-datasets) with 4-class NER labels from [Alvarado et al, 2015](https://aclanthology.org/U15-1010/) | +| 'NER_ENGLISH_STACKOVERFLOW' | English | NER on StackOverflow posts | +| 'NER_ENGLISH_TWITTER' | English | [Twitter NER dataset](https://github.com/aritter/twitter_nlp/) | +| 'NER_ENGLISH_WIKIGOLD' | English | [Wikigold](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold) a manually annotated collection of Wikipedia text | +| 'NER_ENGLISH_WNUT_2020' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | +| 'NER_ENGLISH_WEBPAGES' | English | 4-class NER on web pages from [Ratinov and Roth (2009)](https://aclanthology.org/W09-1119/) | +| 'NER_FINNISH' | Finnish | [Finer-data](https://github.com/mpsilfve/finer-data) | +| 'NER_GERMAN_BIOFID' | German |
[CoNLL-03](https://www.aclweb.org/anthology/K19-1081/) Biodiversity literature NER | +| 'NER_GERMAN_EUROPARL' | German | [German Europarl dataset](https://nlpado.de/~sebastian/software/ner_german.shtml) NER in German EU parliament speeches | +| 'NER_GERMAN_GERMEVAL' | German | [GermEval 14 NER](https://sites.google.com/site/germeval2014ner/data/) corpus | +| 'NER_GERMAN_LEGAL' | German | [Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents | +| 'NER_GERMAN_POLITICS' | German | [NEMGP](https://www.thomas-zastrow.de/nlp/) corpus | +| 'NER_HIPE_2022' | 5 languages | NER dataset for [HIPE-2022](https://hipe-eval.github.io/HIPE-2022/) (Identifying Historical People, Places and other Entities) | +| 'NER_HUNGARIAN' | Hungarian | NER on Hungarian business news | +| 'NER_ICELANDIC' | Icelandic | NER on Icelandic | +| 'NER_JAPANESE' | Japanese | [Japanese NER](https://github.com/Hironsan/IOB2Corpus) dataset automatically generated from Wikipedia | +| 'NER_MASAKHANE' | 10 languages | [MasakhaNER: Named Entity Recognition for African Languages](https://github.com/masakhane-io/masakhane-ner) corpora | +| 'NER_SWEDISH' | Swedish | [Swedish Spraakbanken NER](https://github.com/klintan/swedish-ner-corpus/) 4-class NER | +| 'NER_TURKU' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | +| 'NER_UKRAINIAN' | Ukrainian | [lang-uk](https://github.com/lang-uk/flair-ner) NER corpus created by the [Lang-uk community](https://lang.org.ua/en/) | +| 'NER_MULTI_WIKIANN' | 282 languages | Gigantic [corpus for cross-lingual NER derived from Wikipedia](https://elisa-ie.github.io/wikiann/). 
| +| 'NER_MULTI_WIKINER' | 8 languages | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia (English, German, French, Italian, Spanish, Portuguese, Polish, Russian) | +| 'NER_MULTI_XTREME' | 176 languages | [Xtreme](https://github.com/google-research/xtreme) corpus by Google Research for cross-lingual NER consisting of datasets of a total of 176 languages | +| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | + +### Biomedical Named Entity Recognition + +We support 31 biomedical NER datasets, listed + +### Entity Linking +| Object | Languages | Description | +| ------------- | ------------- |------------- | +| 'NEL_ENGLISH_AIDA' | English | [AIDA CoNLL-YAGO Entity Linking corpus](https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/ambiverse-nlu/aida/downloads) on the CoNLL-03 corpus | +| 'NEL_ENGLISH_AQUAINT' | English | Aquaint Entity Linking corpus introduced in [Milne and Witten (2008)](https://www.cms.waikato.ac.nz/~ihw/papers/08-DNM-IHW-LearningToLinkWithWikipedia.pdf) | +| 'NEL_ENGLISH_IITB' | English | ITTB Entity Linking corpus introduced in [Sayali et al. (2009)](https://dl.acm.org/doi/10.1145/1557019.1557073) | +| 'NEL_ENGLISH_REDDIT' | English | Reddit Entity Linking corpus introduced in [Botzer et al. 
(2021)](https://arxiv.org/abs/2101.01228v2) (only gold annotations)| +| 'NEL_ENGLISH_TWEEKI' | English | Tweeki Entity Linking corpus introduced in [Harandizadeh and Singh (2020)](https://aclanthology.org/2020.wnut-1.29.pdf) | +| 'NEL_GERMAN_HIPE' | German | [HIPE](https://impresso.github.io/CLEF-HIPE-2020/) Entity Linking corpus for historical German as a [sentence-segmented version](https://github.com/stefan-it/clef-hipe) | + +### Relation Extraction +| Object | Languages | Description | +| ------------- | ------------- |------------- | +| 'RE_ENGLISH_CONLL04' | English | [CoNLL-04](https://github.com/bekou/multihead_joint_entity_relation_extraction/tree/master/data/CoNLL04) Relation Extraction | +| 'RE_ENGLISH_SEMEVAL2010' | English | [SemEval-2010 Task 8](https://aclanthology.org/S10-1006.pdf) on Multi-Way Classification of Semantic Relations Between Pairs of Nominals | +| 'RE_ENGLISH_TACRED' | English | [TAC Relation Extraction Dataset](https://nlp.stanford.edu/projects/tacred/) with 41 relations (download required) | +| 'RE_ENGLISH_DRUGPROT' | English | [DrugProt corpus: Biocreative VII Track 1](https://zenodo.org/record/5119892#.YSdSaVuxU5k/) - drug and chemical-protein interactions | + + +### GLUE Benchmark +| Object | Languages | Description | +| ------------- | ------------- |------------- | +| 'GLUE_COLA' | English | The Corpus of Linguistic Acceptability from GLUE benchmark | +| 'GLUE_MNLI' | English | The Multi-Genre Natural Language Inference Corpus from the GLUE benchmark | +| 'GLUE_RTE' | English | The RTE task from the GLUE benchmark | +| 'GLUE_QNLI' | English | The Stanford Question Answering Dataset formatted as NLI task from the GLUE benchmark | +| 'GLUE_WNLI' | English | The Winograd Schema Challenge formatted as NLI task from the GLUE benchmark | +| 'GLUE_MRPC' | English | The MRPC task from GLUE benchmark | +| 'GLUE_QQP' | English | The Quora Question Pairs dataset where the task is to determine whether a pair of questions are semantically
equivalent | +| 'SUPERGLUE_RTE' | English | The RTE task from the SuperGLUE benchmark | + +### Universal Proposition Banks + +We also support loading the [Universal Proposition Banks](https://github.com/System-T/UniversalPropositions) +for the purpose of training multilingual frame detection systems. + +| Object | Languages | Description | +| ------------- | ------------- |------------- | +| 'UP_CHINESE' | Chinese | Universal Propositions for [Chinese](https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese) | +| 'UP_ENGLISH'| English | Universal Propositions for [English](https://github.com/System-T/UniversalPropositions/tree/master/UP_English-EWT) | +| 'UP_FINNISH'| Finnish | Universal Propositions for [Finnish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish) +| 'UP_FRENCH'| French | Universal Propositions for [French](https://github.com/System-T/UniversalPropositions/tree/master/UP_French) +| 'UP_GERMAN'| German | Universal Propositions for [German](https://github.com/System-T/UniversalPropositions/tree/master/UP_German) | +| 'UP_ITALIAN', | Italian | Universal Propositions for [Italian](https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian) | +| 'UP_SPANISH' | Spanish | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish) | +| 'UP_SPANISH_ANCORA' | Spanish (Ancora Corpus) | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish-AnCora) | + +### Universal Dependency Treebanks + +| Object | Languages | Description | +|--------------------|-------------------|-------------------------------------------------------------------------------------------------------------------------------------| +| 'UD_ARABIC' | Arabic | Universal Dependency Treebank for [Arabic](https://github.com/UniversalDependencies/UD_Arabic-PADT) | +| 'UD_BASQUE' | Basque | Universal Dependency Treebank for 
[Basque](https://github.com/UniversalDependencies/UD_Basque-BDT) | +| 'UD_BULGARIAN' | Bulgarian | Universal Dependency Treebank for [Bulgarian](https://github.com/UniversalDependencies/UD_Bulgarian-BTB) +| 'UD_CATALAN', | Catalan | Universal Dependency Treebank for [Catalan](https://github.com/UniversalDependencies/UD_Catalan-AnCora) | +| 'UD_CHINESE' | Chinese | Universal Dependency Treebank for [Chinese](https://github.com/UniversalDependencies/UD_Chinese-GSD) | +| 'UD_CHINESE_KYOTO' | Classical Chinese | Universal Dependency Treebank for Classical [Chinese](https://github.com/UniversalDependencies/UD_Classical_Chinese-Kyoto/tree/dev) | +| 'UD_CROATIAN' | Croatian | Universal Dependency Treebank for [Croatian](https://github.com/UniversalDependencies/UD_Croatian-SET) | +| 'UD_CZECH' | Czech | Very large Universal Dependency Treebank for [Czech](https://github.com/UniversalDependencies/UD_Czech-PDT) | +| 'UD_DANISH' | Danish | Universal Dependency Treebank for [Danish](https://github.com/UniversalDependencies/UD_Danish-DDT) | +| 'UD_DUTCH' | Dutch | Universal Dependency Treebank for [Dutch](https://github.com/UniversalDependencies/UD_Dutch-Alpino) | +| 'UD_ENGLISH' | English | Universal Dependency Treebank for [English](https://github.com/UniversalDependencies/UD_English-EWT) | +| 'UD_FINNISH' | Finnish | Universal Dependency Treebank for [Finnish](https://github.com/UniversalDependencies/UD_Finnish-TDT) | +| 'UD_FRENCH' | French | Universal Dependency Treebank for [French](https://github.com/UniversalDependencies/UD_French-GSD) | +| 'UD_GERMAN' | German | Universal Dependency Treebank for [German](https://github.com/UniversalDependencies/UD_German-GSD) | +| 'UD_GERMAN-HDT' | German | Very large Universal Dependency Treebank for [German](https://github.com/UniversalDependencies/UD_German-HDT) | +| 'UD_HEBREW' | Hebrew | Universal Dependency Treebank for [Hebrew](https://github.com/UniversalDependencies/UD_Hebrew-HTB) | +| 'UD_HINDI' | Hindi | Universal Dependency 
Treebank for [Hindi](https://github.com/UniversalDependencies/UD_Hindi-HDTB) | +| 'UD_INDONESIAN' | Indonesian | Universal Dependency Treebank for [Indonesian](https://github.com/UniversalDependencies/UD_Indonesian-GSD) | +| 'UD_ITALIAN' | Italian | Universal Dependency Treebank for [Italian](https://github.com/UniversalDependencies/UD_Italian-ISDT) | +| 'UD_JAPANESE' | Japanese | Universal Dependency Treebank for [Japanese](https://github.com/UniversalDependencies/UD_Japanese-GSD) | +| 'UD_KOREAN' | Korean | Universal Dependency Treebank for [Korean](https://github.com/UniversalDependencies/UD_Korean-Kaist) | +| 'UD_NORWEGIAN', | Norwegian | Universal Dependency Treebank for [Norwegian](https://github.com/UniversalDependencies/UD_Norwegian-Bokmaal) | +| 'UD_PERSIAN' | Persian / Farsi | Universal Dependency Treebank for [Persian](https://github.com/UniversalDependencies/UD_Persian-Seraji) | +| 'UD_POLISH' | Polish | Universal Dependency Treebank for [Polish](https://github.com/UniversalDependencies/UD_Polish-LFG) | +| 'UD_PORTUGUESE' | Portuguese | Universal Dependency Treebank for [Portuguese](https://github.com/UniversalDependencies/UD_Portuguese-Bosque) | +| 'UD_ROMANIAN' | Romanian | Universal Dependency Treebank for [Romanian](https://github.com/UniversalDependencies/UD_Romanian-RRT) | +| 'UD_RUSSIAN' | Russian | Universal Dependency Treebank for [Russian](https://github.com/UniversalDependencies/UD_Russian-SynTagRus) | +| 'UD_SERBIAN' | Serbian | Universal Dependency Treebank for [Serbian](https://github.com/UniversalDependencies/UD_Serbian-SET) | +| 'UD_SLOVAK' | Slovak | Universal Dependency Treebank for [Slovak](https://github.com/UniversalDependencies/UD_Slovak-SNK) | +| 'UD_SLOVENIAN' | Slovenian | Universal Dependency Treebank for [Slovenian](https://github.com/UniversalDependencies/UD_Slovenian-SSJ) | +| 'UD_SPANISH' | Spanish | Universal Dependency Treebank for [Spanish](https://github.com/UniversalDependencies/UD_Spanish-GSD) | +| 'UD_SWEDISH' | 
Swedish | Universal Dependency Treebank for [Swedish](https://github.com/UniversalDependencies/UD_Swedish-Talbanken) | +| 'UD_TURKISH' | Turkish | Universal Dependency Treebank for [Turkish](https://github.com/UniversalDependencies/UD_Turkish-IMST) | +| 'UD_UKRAINIAN' | Ukrainian | Universal Dependency Treebank for [Ukrainian](https://github.com/UniversalDependencies/UD_Ukrainian-IU) | + +### Text Classification +| Object | Languages | Description | +| ------------- | ------------- |------------- | +| 'AMAZON_REVIEWS' | English | [Amazon product reviews](https://nijianmo.github.io/amazon/index.html/) dataset with sentiment annotation | +| 'COMMUNICATIVE_FUNCTIONS' | English | [Communicative functions](https://github.com/Alab-NII/FECFevalDataset) of sentences in scholarly papers | +| 'GERMEVAL_2018_OFFENSIVE_LANGUAGE' | German | Offensive language detection for German | +| 'GO_EMOTIONS' | English | [GoEmotions dataset](https://github.com/google-research/google-research/tree/master/goemotions) Reddit comments labeled with 27 emotions | +| 'IMDB' | English | [IMDB](http://ai.stanford.edu/~amaas/data/sentiment/) dataset of movie reviews with sentiment annotation | +| 'NEWSGROUPS' | English | The popular [20 newsgroups](http://qwone.com/~jason/20Newsgroups/) classification dataset | +| 'YAHOO_ANSWERS' | English | The [10 largest main categories](https://course.fast.ai/datasets#nlp) from the Yahoo!
Answers | +| 'SENTIMENT_140' | English | [Tweets dataset](http://help.sentiment140.com/for-students/) with sentiment annotation | +| 'SENTEVAL_CR' | English | Customer reviews dataset of [SentEval](https://github.com/facebookresearch/SentEval) with sentiment annotation | +| 'SENTEVAL_MR' | English | Movie reviews dataset of [SentEval](https://github.com/facebookresearch/SentEval) with sentiment annotation | +| 'SENTEVAL_SUBJ' | English | Subjectivity dataset of [SentEval](https://github.com/facebookresearch/SentEval) | +| 'SENTEVAL_MPQA' | English | Opinion-polarity dataset of [SentEval](https://github.com/facebookresearch/SentEval) with opinion-polarity annotation | +| 'SENTEVAL_SST_BINARY' | English | Stanford sentiment treebank dataset of [SentEval](https://github.com/facebookresearch/SentEval) with sentiment annotation | +| 'SENTEVAL_SST_GRANULAR' | English | Stanford sentiment treebank dataset of [SentEval](https://github.com/facebookresearch/SentEval) with fine-grained sentiment annotation | +| 'TREC_6', 'TREC_50' | English | The [TREC](http://cogcomp.org/Data/QA/QC/) question classification dataset | + +### Text Regression +| Object | Languages | Description | +| ------------- | ------------- |------------- | +| 'WASSA_ANGER' | English | The [WASSA](https://competitions.codalab.org/competitions/16380#learn_the_details) emotion-intensity detection challenge (anger) | +| 'WASSA_FEAR' | English | The [WASSA](https://competitions.codalab.org/competitions/16380#learn_the_details) emotion-intensity detection challenge (fear) | +| 'WASSA_JOY' | English | The [WASSA](https://competitions.codalab.org/competitions/16380#learn_the_details) emotion-intensity detection challenge (joy) | +| 'WASSA_SADNESS' | English | The [WASSA](https://competitions.codalab.org/competitions/16380#learn_the_details) emotion-intensity detection challenge (sadness) | + +### Other Sequence Labeling + +| Object | Languages | Description | +| ------------- | ------------- |-------------
| +| 'CONLL_2000' | English | Syntactic chunking with [CoNLL-2000](https://www.clips.uantwerpen.be/conll2000/chunking/) | +| 'BIOSCOPE' | English | Negation and speculation scoping with [BioScope](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-S11-S9/) biomedical texts annotated for uncertainty, negation and their scopes | +| 'KEYPHRASE_INSPEC' | English | Keyphrase detection with [INSPEC](https://www.aclweb.org/anthology/W03-1028) original corpus (2000 docs) from INSPEC database, adapted by [midas-research](https://arxiv.org/abs/1910.08840) | +| 'KEYPHRASE_SEMEVAL2017' | English | Keyphrase detection with [SEMEVAL2017](https://arxiv.org/abs/1704.02853) dataset (500 docs) from ScienceDirect, adapted by [midas-research](https://arxiv.org/abs/1910.08840) | +| 'KEYPHRASE_SEMEVAL2010' | English | Keyphrase detection with [SEMEVAL2010](https://www.aclweb.org/anthology/S10-1004/) dataset (~250 docs) from ACM Digital Library, adapted by [midas-research](https://arxiv.org/abs/1910.08840) | + +### Experimental: Similarity Learning +| Object | Languages | Description | +| ------------- | ------------- |------------- | +| 'FeideggerCorpus' | German | [Feidegger](https://github.com/zalandoresearch/feidegger/) dataset fashion images and German-language descriptions | +| 'OpusParallelCorpus' | Any language pair | Parallel corpora of the [OPUS](http://opus.nlpl.eu/) project, currently supports only Tatoeba corpus | + + + diff --git a/docs/tutorial/tutorial-training/how-to-train-sequence-tagger.md b/docs/tutorial/tutorial-training/how-to-train-sequence-tagger.md new file mode 100644 index 000000000..e56e6a537 --- /dev/null +++ b/docs/tutorial/tutorial-training/how-to-train-sequence-tagger.md @@ -0,0 +1,225 @@ +# Train a sequence tagger + +Sequence labeling models are used to model problems such as named entity recognition (NER) and +part-of-speech (PoS) tagging.
+ +This tutorial section shows you how to train state-of-the-art NER models and other taggers in Flair. + +## Training a named entity recognition (NER) model with transformers + +For a state-of-the-art NER system you should fine-tune transformer embeddings, and use full document context +(see our [FLERT](https://arxiv.org/abs/2011.06993) paper for details). + +Use the following script: + +```python +from flair.datasets import CONLL_03 +from flair.embeddings import TransformerWordEmbeddings +from flair.models import SequenceTagger +from flair.trainers import ModelTrainer + +# 1. get the corpus +corpus = CONLL_03() +print(corpus) + +# 2. what label do we want to predict? +label_type = 'ner' + +# 3. make the label dictionary from the corpus +label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False) +print(label_dict) + +# 4. initialize fine-tuneable transformer embeddings WITH document context +embeddings = TransformerWordEmbeddings(model='xlm-roberta-large', + layers="-1", + subtoken_pooling="first", + fine_tune=True, + use_context=True, + ) + +# 5. initialize bare-bones sequence tagger (no CRF, no RNN, no reprojection) +tagger = SequenceTagger(hidden_size=256, + embeddings=embeddings, + tag_dictionary=label_dict, + tag_type='ner', + use_crf=False, + use_rnn=False, + reproject_embeddings=False, + ) + +# 6. initialize trainer +trainer = ModelTrainer(tagger, corpus) + +# 7. run fine-tuning +trainer.fine_tune('resources/taggers/sota-ner-flert', + learning_rate=5.0e-6, + mini_batch_size=4, + mini_batch_chunk_size=1, # remove this parameter to speed up computation if you have a big GPU + ) +``` + +As you can see, we use 'xlm-roberta-large' embeddings, enable fine-tuning and set `use_context` to True. +We also deactivate the RNN, CRF and reprojection in the `SequenceTagger`. This is because the +transformer is so powerful that it does not need these components. We then fine-tune the model with a very small +learning rate on the corpus.
+ +This will give you state-of-the-art numbers similar to the ones reported +in [Schweter and Akbik (2021)](https://arxiv.org/abs/2011.06993). + + +## Training a named entity recognition (NER) model with Flair embeddings + +Alternatively to fine-tuning a very large transformer, you can use a classic training setup without fine-tuning. +In the classic setup, you learn a LSTM-CRF on top of frozen embeddings. We typically use a 'stack' that combines +Flair and GloVe embeddings: + +```python +from flair.datasets import CONLL_03 +from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings +from flair.models import SequenceTagger +from flair.trainers import ModelTrainer + +# 1. get the corpus +corpus = CONLL_03() +print(corpus) + +# 2. what label do we want to predict? +label_type = 'ner' + +# 3. make the label dictionary from the corpus +label_dict = corpus.make_label_dictionary(label_type=label_type, add_unk=False) +print(label_dict) + +# 4. initialize embedding stack with Flair and GloVe +embedding_types = [ + WordEmbeddings('glove'), + FlairEmbeddings('news-forward'), + FlairEmbeddings('news-backward'), +] + +embeddings = StackedEmbeddings(embeddings=embedding_types) + +# 5. initialize sequence tagger +tagger = SequenceTagger(hidden_size=256, + embeddings=embeddings, + tag_dictionary=label_dict, + tag_type=label_type) + +# 6. initialize trainer +trainer = ModelTrainer(tagger, corpus) + +# 7. start training +trainer.train('resources/taggers/sota-ner-flair', + learning_rate=0.1, + mini_batch_size=32, + max_epochs=150) +``` + +This will give you state-of-the-art numbers similar to the ones reported in [Akbik et al. (2018)](https://aclanthology.org/C18-1139.pdf). +The numbers are not quite as high as fine-tuning transformers, but it requires less GPU memory and depending on your +setup may run faster in the end. 
+ + +## Training a part-of-speech tagger + +If you want to train a part-of-speech model instead of NER, simply exchange the corpus and the label type: + +```python +from flair.datasets import UD_ENGLISH +from flair.embeddings import WordEmbeddings, StackedEmbeddings, FlairEmbeddings +from flair.models import SequenceTagger +from flair.trainers import ModelTrainer + +# 1. get the corpus +corpus = UD_ENGLISH() +print(corpus) + +# 2. what label do we want to predict? +label_type = 'upos' + +# 3. make the label dictionary from the corpus +label_dict = corpus.make_label_dictionary(label_type=label_type) +print(label_dict) + +# 4. initialize embeddings +embedding_types = [ + WordEmbeddings('glove'), + FlairEmbeddings('news-forward'), + FlairEmbeddings('news-backward'), +] + +embeddings = StackedEmbeddings(embeddings=embedding_types) + +# 5. initialize sequence tagger +tagger = SequenceTagger(hidden_size=256, + embeddings=embeddings, + tag_dictionary=label_dict, + tag_type=label_type, + use_crf=True) + +# 6. initialize trainer +trainer = ModelTrainer(tagger, corpus) + +# 7. start training +trainer.train('resources/taggers/example-upos', + learning_rate=0.1, + mini_batch_size=32) +``` + +This script will give you the state-of-the-art accuracy reported in [Akbik et al. (2018)](https://aclanthology.org/C18-1139.pdf). + +## Multi-dataset training + +Now, let us train a single model that can PoS tag text in both English and German. To do this, we load both the English +and German UD corpora and create a MultiCorpus object. We also use the new multilingual Flair embeddings for this task. + +All the rest is same as before, e.g.: + +```python +from flair.data import MultiCorpus +from flair.datasets import UD_ENGLISH, UD_GERMAN +from flair.embeddings import FlairEmbeddings, StackedEmbeddings +from flair.models import SequenceTagger +from flair.trainers import ModelTrainer + +# 1. 
get the corpora - English and German UD +corpus = MultiCorpus([UD_ENGLISH(), UD_GERMAN()]).downsample(0.1) + +# 2. what label do we want to predict? +label_type = 'upos' + +# 3. make the label dictionary from the corpus +label_dict = corpus.make_label_dictionary(label_type=label_type) +print(label_dict) + +# 4. initialize embeddings +embedding_types = [ + + # we use multilingual Flair embeddings in this task + FlairEmbeddings('multi-forward'), + FlairEmbeddings('multi-backward'), +] + +embeddings = StackedEmbeddings(embeddings=embedding_types) + +# 5. initialize sequence tagger +tagger = SequenceTagger(hidden_size=256, + embeddings=embeddings, + tag_dictionary=label_dict, + tag_type=label_type, + use_crf=True) + +# 6. initialize trainer +trainer = ModelTrainer(tagger, corpus) + +# 7. start training +trainer.train('resources/taggers/example-universal-pos', + learning_rate=0.1, + mini_batch_size=32, + max_epochs=150, + ) +``` + +This gives you a multilingual model. Try experimenting with more languages! + + diff --git a/docs/tutorial/tutorial-training/how-to-train-text-classifier.md b/docs/tutorial/tutorial-training/how-to-train-text-classifier.md new file mode 100644 index 000000000..39d622746 --- /dev/null +++ b/docs/tutorial/tutorial-training/how-to-train-text-classifier.md @@ -0,0 +1,61 @@ +# Train a text classifier + +This tutorial shows you how to train your own text classifier models with Flair. For instance, you +could train your own sentiment analysis model, or offensive language detection model. + + +## Training a text classification model with transformers + +For text classification, you reach state-of-the-art scores by fine-tuning a transformer. + +Training a model is easy: load the appropriate corpus, make a label dictionary, then fine-tune a `TextClassifier` +model using the `fine_tune()` method of the `ModelTrainer`. 
See the example script below: + +```python +from flair.data import Corpus +from flair.datasets import TREC_6 +from flair.embeddings import TransformerDocumentEmbeddings +from flair.models import TextClassifier +from flair.trainers import ModelTrainer + +# 1. get the corpus +corpus: Corpus = TREC_6() + +# 2. what label do we want to predict? +label_type = 'question_class' + +# 3. create the label dictionary +label_dict = corpus.make_label_dictionary(label_type=label_type) + +# 4. initialize transformer document embeddings (many models are available) +document_embeddings = TransformerDocumentEmbeddings('distilbert-base-uncased', fine_tune=True) + +# 5. create the text classifier +classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, label_type=label_type) + +# 6. initialize trainer +trainer = ModelTrainer(classifier, corpus) + +# 7. run training with fine-tuning +trainer.fine_tune('resources/taggers/question-classification-with-transformer', + learning_rate=5.0e-5, + mini_batch_size=4, + max_epochs=10, + ) +``` + +Once the model is trained you can load it to predict the class of new sentences. Just call the `predict` method of the +model. + +```python +classifier = TextClassifier.load('resources/taggers/question-classification-with-transformer/final-model.pt') + +# create example sentence +sentence = Sentence('Who built the Eiffel Tower ?') + +# predict class and print +classifier.predict(sentence) + +print(sentence.labels) +``` + diff --git a/docs/tutorial/tutorial-training/index.rst b/docs/tutorial/tutorial-training/index.rst new file mode 100644 index 000000000..ce4682eb7 --- /dev/null +++ b/docs/tutorial/tutorial-training/index.rst @@ -0,0 +1,14 @@ +Tutorial 2: Training models +=========================== + +This tutorial illustrates how you can train your own state-of-the-art NLP models with Flair. + +.. 
toctree:: + :glob: + + how-model-training-works + train-vs-fine-tune + how-to-load-prepared-dataset + how-to-load-custom-dataset + how-to-train-sequence-tagger + how-to-train-text-classifier diff --git a/docs/tutorial/tutorial-training/train-vs-fine-tune.md b/docs/tutorial/tutorial-training/train-vs-fine-tune.md new file mode 100644 index 000000000..fd45e90ea --- /dev/null +++ b/docs/tutorial/tutorial-training/train-vs-fine-tune.md @@ -0,0 +1,11 @@ +# Training vs fine-tuning + +There are two broad ways you train a model: The "classic" approach and the fine-tuning approach. This section +explains the differences, and the things you need to do. + + +## Fine-Tuning + + +## Training + From c12139ef8daf6462694ca4361d39fcab5734859d Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 26 Jun 2023 18:30:22 +0200 Subject: [PATCH 072/124] deploy this branch temporarly --- .github/workflows/publish-docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index babe90f08..a5d748a96 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -1,7 +1,7 @@ name: 'Run tests for ci cd' on: push: - branches: [ main ] + branches: [ main, doc-page ] tags: - "*" From 20f2c4225de2cd7107024c94482b8b4041c3043b Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 26 Jun 2023 18:35:10 +0200 Subject: [PATCH 073/124] fix publish docs ci --- .github/workflows/publish-docs.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index a5d748a96..e86d8edf9 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -14,8 +14,6 @@ jobs: steps: - name: Checkout code uses: actions/checkout@v3 - - name: Install poetry - run: pipx install poetry - name: setup python ${{ env.python-version }} uses: actions/setup-python@v4 with: @@ -28,8 +26,6 @@ jobs: run: pip install -r 
docs/requirements.txt - name: Fetch git tags run: git fetch --tags origin - - name: Install Dependencies - run: poetry install - name: Build docs run: | sphinx-multiversion docs doc_build/ From 9266587d5fe1e2aff47bec76f828631f84b44704 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 26 Jun 2023 19:10:18 +0200 Subject: [PATCH 074/124] add edit on github button --- docs/conf.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index b490d9273..1e0008686 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -15,10 +15,16 @@ top_level = project.replace("-", "_") linkcode_url = importlib_metadata.metadata(project)["Home-page"] - +html_show_sourcelink = True smv_current_version = "" # will by overwritten by sphinx-multi-version to the name of the tag or branch. -html_context = {"github_version": ""} # dummy value that sphinx-github-style won't crash when run in temp folder. +html_context = { + "display_github": True, + "github_user": "flairNLP", + "github_repo": "flair", + "github_version": "", + "conf_py_path": "/docs/", +} # dummy value that sphinx-github-style won't crash when run in temp folder. def linkcode_resolve(*args): @@ -45,7 +51,6 @@ def linkcode_resolve(*args): "sphinx_design", ] - # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] From 451e7b69189be9ba93c34aee124ac38bfa0f6ad0 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 3 Jul 2023 12:02:56 +0200 Subject: [PATCH 075/124] hide ToC on main page --- docs/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/index.rst b/docs/index.rst index b7beeaa07..0e37ddc70 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -70,6 +70,7 @@ Flair is a very simple framework for state-of-the-art Natural Language Processin .. 
toctree:: :maxdepth: 3 + :hidden: Tutorials API reference From 61694ef91178e2cf9cb683edd4fee374033a67d8 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 3 Jul 2023 12:24:36 +0200 Subject: [PATCH 076/124] add links to api docs in tutorials --- docs/api/flair.datasets.rst | 1 + docs/api/flair.embeddings.rst | 1 + docs/api/flair.nn.rst | 7 + docs/api/index.rst | 1 + docs/contributing/index.rst | 1 + docs/contributing/updating_documentation.md | 3 +- docs/tutorial/intro.md | 3 +- docs/tutorial/tutorial-basics/basic-types.md | 35 ++- .../tutorial-basics/entity-linking.md | 2 +- .../tutorial-basics/how-predictions-work.md | 4 +- .../tutorial-basics/how-to-tag-corpus.md | 6 +- docs/tutorial/tutorial-basics/index.rst | 2 +- docs/tutorial/tutorial-basics/other-models.md | 1 + .../tutorial-basics/part-of-speech-tagging.md | 9 +- .../tutorial-basics/tagging-entities.md | 6 +- .../classic-word-embeddings.md | 8 +- .../tutorial-embeddings/embeddings.md | 10 +- .../tutorial-embeddings/flair-embeddings.md | 115 ++++---- docs/tutorial/tutorial-embeddings/index.rst | 2 +- .../tutorial-embeddings/other-embeddings.md | 95 ++----- .../how-model-training-works.md | 47 ++-- .../how-to-load-custom-dataset.md | 23 +- .../how-to-load-prepared-dataset.md | 245 +++--------------- .../how-to-train-sequence-tagger.md | 6 +- .../how-to-train-text-classifier.md | 7 +- docs/tutorial/tutorial-training/index.rst | 1 + .../docs/embeddings/TRANSFORMER_EMBEDDINGS.md | 43 ++- 27 files changed, 234 insertions(+), 450 deletions(-) create mode 100644 docs/api/flair.nn.rst diff --git a/docs/api/flair.datasets.rst b/docs/api/flair.datasets.rst index aebd70c2d..9a883c3e6 100644 --- a/docs/api/flair.datasets.rst +++ b/docs/api/flair.datasets.rst @@ -3,5 +3,6 @@ flair.datasets .. 
toctree:: :glob: + :maxdepth: 2 datasets/* diff --git a/docs/api/flair.embeddings.rst b/docs/api/flair.embeddings.rst index 81241a742..3f70e62be 100644 --- a/docs/api/flair.embeddings.rst +++ b/docs/api/flair.embeddings.rst @@ -3,5 +3,6 @@ flair.embeddings .. toctree:: :glob: + :maxdepth: 2 embeddings/* \ No newline at end of file diff --git a/docs/api/flair.nn.rst b/docs/api/flair.nn.rst new file mode 100644 index 000000000..4d42b88e5 --- /dev/null +++ b/docs/api/flair.nn.rst @@ -0,0 +1,7 @@ +flair.nn +======== + +.. automodule:: flair.nn + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/api/index.rst b/docs/api/index.rst index 4788ffb59..0f67f3cf2 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -3,6 +3,7 @@ API Docs .. toctree:: :glob: + :maxdepth: 2 flair flair.* \ No newline at end of file diff --git a/docs/contributing/index.rst b/docs/contributing/index.rst index d1d37e845..ef72362f3 100644 --- a/docs/contributing/index.rst +++ b/docs/contributing/index.rst @@ -2,6 +2,7 @@ Contributing ============ .. 
toctree:: + :maxdepth: 1 writing_a_good_issue local_development diff --git a/docs/contributing/updating_documentation.md b/docs/contributing/updating_documentation.md index 112b85868..7fb28cbcc 100644 --- a/docs/contributing/updating_documentation.md +++ b/docs/contributing/updating_documentation.md @@ -20,7 +20,8 @@ A tutorial should always be easy to understand, and reference api documentation ```{note} You can reference symbols by defining links - `[flair.set_seed](#flair.set_seed)` + e.g.: `[`flair.set_seed`](#flair.set_seed)` for a function + e.g.: `[entity-linking](project:../tutorial/tutorial-basics/entity-linking.md)` for another tutorial ``` ## Docstrings diff --git a/docs/tutorial/intro.md b/docs/tutorial/intro.md index 4a0e5b890..44eb7f1b3 100644 --- a/docs/tutorial/intro.md +++ b/docs/tutorial/intro.md @@ -24,7 +24,8 @@ Let's run **named entity recognition** (NER) over the following example sentenc Our goal is to identify names in this sentence, and their types. -To do this, all you need is to make a `Sentence` for this text, load a pre-trained model and use it to predict tags for the sentence: +To do this, all you need is to make a [`Sentence`](#flair.data.Sentence) for this text, load a pre-trained model and use it to predict tags for the sentence: + ```python from flair.data import Sentence diff --git a/docs/tutorial/tutorial-basics/basic-types.md b/docs/tutorial/tutorial-basics/basic-types.md index 0f84674c3..703a5d7cd 100644 --- a/docs/tutorial/tutorial-basics/basic-types.md +++ b/docs/tutorial/tutorial-basics/basic-types.md @@ -2,18 +2,18 @@ This tutorial explains the basic concepts used in Flair: -- what is a `Sentence` -- what is a `Label` +- what is a [`Sentence`](#flair.data.Sentence) +- what is a [`Label`](#flair.data.Label) You should be familiar with these two concepts in order to get the most out of Flair. ## What is a Sentence -If you want to tag a sentence, you need to first make a `Sentence` object for it. 
+If you want to tag a sentence, you need to first make a [`Sentence`](#flair.data.Sentence) object for it. For example, say you want to tag the text "_The grass is green._". -Let's start by making a `Sentence` object for this sentence. +Let's start by making a [`Sentence`](#flair.data.Sentence) object for this sentence. ```python @@ -35,9 +35,9 @@ Sentence[5]: "The grass is green." The print-out tells us that the sentence consists of 5 tokens. -:::info +```{note} A token is an atomic unit of the text, often a word or punctuation. The printout is therefore telling us that the sentence "_The grass is green._" consists of 5 such atomic units. -::: +``` ### Iterating over the tokens in a Sentence @@ -84,12 +84,11 @@ This print-out includes the token index (3) and the lexical value of the token ( ### Tokenization -When you create a `Sentence` as above, the text is automatically tokenized (segmented into words) using the [segtok](https://pypi.org/project/segtok/) library. - +When you create a [`Sentence`](#flair.data.Sentence) as above, the text is automatically tokenized (segmented into words) using the [segtok](https://pypi.org/project/segtok/) library. -:::info +```{note} You can also use a different tokenizer if you like. To learn more about this, check out our tokenization tutorial. -::: +``` ## What is a Label @@ -141,13 +140,13 @@ Token[4]: "." This shows that there are 5 tokens in the sentence, one of which has a label. -:::info -The `add_label` method used here has two mandatory parameters. -::: +```{note} +The [`add_label`](#flair.data.DataPoint.add_label) method used here has two mandatory parameters. +``` ### Example 2: Labeling a whole sentence -Sometimes you want to label an entire sentence instead of only a token. Do this by calling `add_label` for the whole sentence. +Sometimes you want to label an entire sentence instead of only a token. Do this by calling [`add_label`](#flair.data.DataPoint.add_label) for the whole sentence. 
For example, say we want to add a sentiment label to the sentence "_The grass is green._": @@ -197,7 +196,7 @@ Indicating that the sentence is now labeled with two different types of informat ### Accessing labels -You can iterate through all labels of a sentence using the `.get_labels()` method: +You can iterate through all labels of a sentence using the [`get_labels()`](#flair.data.Sentence.get_labels) method: ```python # iterate over all labels and print @@ -230,16 +229,16 @@ Token[3]: "green" → color (1.0) This printout tells us that there are two labels: The first is for the whole sentence, tagged as POSITIVE. The second is only for the token "green", tagged as "color". -:::info +````{note} -If you only want to iterate over labels of a specific type, add the label name as parameter to get_labels(). For instance, to only iterate over all NER labels, do: +If you only want to iterate over labels of a specific type, add the label name as parameter to [`get_labels()`](#flair.data.Sentence.get_labels). For instance, to only iterate over all NER labels, do: ```python # iterate over all NER labels only for label in sentence.get_labels('ner'): print(label) ``` -::: +```` ### Information for each label diff --git a/docs/tutorial/tutorial-basics/entity-linking.md b/docs/tutorial/tutorial-basics/entity-linking.md index 639b5d1e4..8137c2dc5 100644 --- a/docs/tutorial/tutorial-basics/entity-linking.md +++ b/docs/tutorial/tutorial-basics/entity-linking.md @@ -47,7 +47,7 @@ Entity linking typically works best when applied to a whole document instead of To illustrate how this works, let's use the following short text: "_Bayern played against Barcelona. 
The match took place in Barcelona._" -In this case, split the text into sentences and pass a list of Sentence objects to the .predict() method: +In this case, split the text into sentences and pass a list of Sentence objects to the [`Classifier.predict()`](#flair.nn.Classifier.predict) method: ```python from flair.nn import Classifier diff --git a/docs/tutorial/tutorial-basics/how-predictions-work.md b/docs/tutorial/tutorial-basics/how-predictions-work.md index abc5973c8..9911f6efa 100644 --- a/docs/tutorial/tutorial-basics/how-predictions-work.md +++ b/docs/tutorial/tutorial-basics/how-predictions-work.md @@ -34,7 +34,7 @@ as LOC (location.) ## Getting the predictions A common question that gets asked is **how to access these predictions directly**. You can do this by using -the `get_labels()` method to iterate over all predictions: +the [`get_labels()`](#flair.data.Sentence.get_labels) method to iterate over all predictions: ```python for label in sentence.get_labels(): @@ -52,7 +52,7 @@ The confidence of the prediction is indicated as a score in brackets. ## Values for each prediction -For each prediction, you can even **directly access** the label value, it's score and the entity text: +For each prediction, you can even **directly access** the label value, and all other attributes of the [`Label`](#flair.data.Label) class: ```python # iterate over all labels in the sentence diff --git a/docs/tutorial/tutorial-basics/how-to-tag-corpus.md b/docs/tutorial/tutorial-basics/how-to-tag-corpus.md index 537b10294..8aa75a402 100644 --- a/docs/tutorial/tutorial-basics/how-to-tag-corpus.md +++ b/docs/tutorial/tutorial-basics/how-to-tag-corpus.md @@ -1,9 +1,9 @@ # How to tag a whole corpus Often, you may want to tag an entire text corpus. In this case, you need to split the corpus into sentences and pass a -list of `Sentence` objects to the `.predict()` method. 
+list of [`Sentence`](#flair.data.Sentence) objects to the [`Classifier.predict()`](#flair.nn.Classifier.predict) method. -For instance, you can use the sentence splitter of segtok to split your text: +For instance, you can use a [`SentenceSplitter`](#flair.splitter.SentenceSplitter) to split your text: ```python from flair.nn import Classifier @@ -27,6 +27,6 @@ for sentence in sentences: print(sentence) ``` -Using the `mini_batch_size` parameter of the `.predict()` method, you can set the size of mini batches passed to the +Using the `mini_batch_size` parameter of the [`Classifier.predict()`](#flair.nn.Classifier.predict) method, you can set the size of mini batches passed to the tagger. Depending on your resources, you might want to play around with this parameter to optimize speed. diff --git a/docs/tutorial/tutorial-basics/index.rst b/docs/tutorial/tutorial-basics/index.rst index dcbd8cd18..6e5997023 100644 --- a/docs/tutorial/tutorial-basics/index.rst +++ b/docs/tutorial/tutorial-basics/index.rst @@ -5,7 +5,7 @@ This tutorial shows you in more detail how to tag your text and access predictio and showcases various models we ship with Flair. .. toctree:: - :glob: + :maxdepth: 1 basic-types how-predictions-work diff --git a/docs/tutorial/tutorial-basics/other-models.md b/docs/tutorial/tutorial-basics/other-models.md index adb011d4d..74c39479b 100644 --- a/docs/tutorial/tutorial-basics/other-models.md +++ b/docs/tutorial/tutorial-basics/other-models.md @@ -116,6 +116,7 @@ extractor.predict(sentence) relations = sentence.get_labels('relation') for relation in relations: print(relation) +print("") # Use the `get_labels()` method with parameter 'relation' to iterate over all relation predictions. 
for label in sentence.get_labels('relation'): diff --git a/docs/tutorial/tutorial-basics/part-of-speech-tagging.md b/docs/tutorial/tutorial-basics/part-of-speech-tagging.md index 16d5acb28..b19587100 100644 --- a/docs/tutorial/tutorial-basics/part-of-speech-tagging.md +++ b/docs/tutorial/tutorial-basics/part-of-speech-tagging.md @@ -37,9 +37,9 @@ Sentence[6]: "Dirk went to the store." → ["Dirk"/NNP, "went"/VBD, "to"/IN, "th This printout tells us for instance that "_Dirk_" is a proper noun (tag: NNP), and "_went_" is a past tense verb (tag: VBD). -:::info +```{note} To better understand what each tag means, consult the [tag specification](https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html) of the Penn Treebank. -::: +``` ### ... in German @@ -141,9 +141,6 @@ Sentence: "George Washington went to Washington . Dort kaufte er einen Hut ." However note that they were trained for a mix of European languages and therefore will not work for other languages. -## Tagging Language-Specific Parts-of-Speech (POS) in English - - ## List of POS Models We end this section with a list of all models we currently ship with Flair. @@ -165,7 +162,7 @@ We end this section with a list of all models we currently ship with Flair. | 'pt-pos-clinical' | POS-tagging | Portuguese | [PUCPR](https://github.com/HAILab-PUCPR/portuguese-clinical-pos-tagger) | **92.39** | [LucasFerroHAILab](https://github.com/LucasFerroHAILab) for clinical texts | | '[pos-ukrainian](https://huggingface.co/dchaplinsky/flair-uk-pos)' | POS-tagging | Ukrainian | [Ukrainian UD](https://universaldependencies.org/treebanks/uk_iu/index.html) | **97.93** (F1) | [dchaplinsky](https://github.com/dchaplinsky) | -You choose which pre-trained model you load by passing the appropriate string to the `load()` method of the `Classifier` class. +You choose which pre-trained model you load by passing the appropriate string to the [`Classifier.load()`](#flair.nn.Classifier.load) method. 
A full list of our current and community-contributed models can be browsed on the [__model hub__](https://huggingface.co/models?library=flair&sort=downloads). diff --git a/docs/tutorial/tutorial-basics/tagging-entities.md b/docs/tutorial/tutorial-basics/tagging-entities.md index 77d3d1efc..f05ce16b2 100644 --- a/docs/tutorial/tutorial-basics/tagging-entities.md +++ b/docs/tutorial/tutorial-basics/tagging-entities.md @@ -56,9 +56,9 @@ print(sentence) As you can see, it's the same code, just with '**ner-large**' as model instead of '**ner**'. This model also works with most languages. -:::hint +```{note} If you want the fastest model we ship, you can also try 'ner-fast'. -::: +``` ## Tagging entities in non-English text @@ -196,7 +196,7 @@ We end this section with a list of all models we currently ship with Flair. | '[ner-ukrainian](https://huggingface.co/dchaplinsky/flair-uk-ner)' | NER (4-class) | Ukrainian | [NER-UK dataset](https://github.com/lang-uk/ner-uk) | **86.05** (F1) | [dchaplinsky](https://github.com/dchaplinsky) | -You choose which pre-trained model you load by passing the appropriate string to the `load()` method of the `Classifier` class. +You choose which pre-trained model you load by passing the appropriate string to the [`Classifier.load()`](#flair.nn.Classifier.load) method. A full list of our current and community-contributed models can be browsed on the [__model hub__](https://huggingface.co/models?library=flair&sort=downloads). diff --git a/docs/tutorial/tutorial-embeddings/classic-word-embeddings.md b/docs/tutorial/tutorial-embeddings/classic-word-embeddings.md index 73e70838b..817190071 100644 --- a/docs/tutorial/tutorial-embeddings/classic-word-embeddings.md +++ b/docs/tutorial/tutorial-embeddings/classic-word-embeddings.md @@ -3,7 +3,7 @@ Classic word embeddings are static and word-level, meaning that each distinct word gets exactly one pre-computed embedding. 
Most embeddings fall under this class, including the popular GloVe or Komninos embeddings. -Simply instantiate the `WordEmbeddings` class and pass a string identifier of the embedding you wish to load. So, if +Simply instantiate the [`WordEmbeddings`](#flair.embeddings.token.WordEmbeddings) class and pass a string identifier of the embedding you wish to load. So, if you want to use GloVe embeddings, pass the string 'glove' to the constructor: ```python @@ -12,7 +12,7 @@ from flair.embeddings import WordEmbeddings # init embedding glove_embedding = WordEmbeddings('glove') ``` -Now, create an example sentence and call the embedding's `embed()` method. You can also pass a list of sentences to +Now, create an example sentence and call the embedding's [`embed()`](#flair.embeddings.base.Embeddings.embed) method. You can also pass a list of sentences to this method since some embedding types make use of batching to increase speed. ```python @@ -31,7 +31,7 @@ for token in sentence: This prints out the tokens and their embeddings. GloVe embeddings are Pytorch vectors of dimensionality 100. You choose which pre-trained embeddings you load by passing the appropriate -id string to the constructor of the `WordEmbeddings` class. Typically, you use +id string to the constructor of the [`WordEmbeddings`](#flair.embeddings.token.WordEmbeddings) class. Typically, you use the **two-letter language code** to init an embedding, so 'en' for English and 'de' for German and so on. By default, this will initialize FastText embeddings trained over Wikipedia. You can also always use FastText embeddings over Web crawls, by instantiating with '-crawl'. So 'de-crawl' @@ -111,5 +111,5 @@ word_vectors = gensim.models.KeyedVectors.load_word2vec_format('/path/to/fasttex word_vectors.save('/path/to/converted') ``` -However, FastText embeddings have the functionality of returning vectors for out of vocabulary words using the sub-word information. 
If you want to use this then try `FastTextEmbeddings` class. +However, FastText embeddings have the functionality of returning vectors for out of vocabulary words using the sub-word information. If you want to use this then try [`FastTextEmbeddings`](#flair.embeddings.token.FastTextEmbeddings) class. diff --git a/docs/tutorial/tutorial-embeddings/embeddings.md b/docs/tutorial/tutorial-embeddings/embeddings.md index 0df422cf5..dfb78344b 100644 --- a/docs/tutorial/tutorial-embeddings/embeddings.md +++ b/docs/tutorial/tutorial-embeddings/embeddings.md @@ -8,7 +8,7 @@ top of embeddings, so if you want to train your own models, you should understan Let's use a standard BERT model (bert-base-uncased) to embed the sentence "the grass is green". -Simply instantate `TransformerWordEmbeddings` and call `embed()` over an example sentence: +Simply instantiate [`TransformerWordEmbeddings`](#flair.embeddings.token.TransformerWordEmbeddings) and call [`embed()`](#flair.embeddings.base.Embeddings.embed) over an example sentence: ```python from flair.embeddings import TransformerWordEmbeddings @@ -80,14 +80,14 @@ Now, the whole sentence is embedded. Print the embedding like this: print(sentence.embedding) ``` -Transformer document embeddings are the most important concept in Flair. Check out more info in this dedicated chapter. +[`TransformerDocumentEmbeddings`](#flair.embeddings.document.TransformerDocumentEmbeddings) are the most important concept in Flair. Check out more info in [this](project:transformer-embeddings.md) dedicated chapter. ## How to Stack Embeddings Flair allows you to combine embeddings into "embedding stacks". When not fine-tuning, using combinations of embeddings often gives best results! -Use the `StackedEmbeddings` class and instantiate it by passing a list of embeddings that you wish to combine. For instance, lets combine classic GloVe embeddings with forward and backward Flair embeddings. 
+Use the [`StackedEmbeddings`](#flair.embeddings.token.StackedEmbeddings) class and instantiate it by passing a list of embeddings that you wish to combine. For instance, let's combine classic GloVe [`WordEmbeddings`](#flair.embeddings.token.WordEmbeddings) with forward and backward [`FlairEmbeddings`](#flair.embeddings.token.FlairEmbeddings). First, instantiate the two embeddings you wish to combine: @@ -102,7 +102,7 @@ flair_embedding_forward = FlairEmbeddings('news-forward') flair_embedding_backward = FlairEmbeddings('news-backward') ``` -Now instantiate the `StackedEmbeddings` class and pass it a list containing these two embeddings. +Now instantiate the [`StackedEmbeddings`](#flair.embeddings.token.StackedEmbeddings) class and pass it a list containing these two embeddings. ```python from flair.embeddings import StackedEmbeddings @@ -116,7 +116,7 @@ stacked_embeddings = StackedEmbeddings([ ``` -That's it! Now just use this embedding like all the other embeddings, i.e. call the `embed()` method over your sentences. +That's it! Now just use this embedding like all the other embeddings, i.e. call the [`embed()`](#flair.embeddings.base.Embeddings.embed) method over your sentences. ```python sentence = Sentence('The grass is green .') diff --git a/docs/tutorial/tutorial-embeddings/flair-embeddings.md b/docs/tutorial/tutorial-embeddings/flair-embeddings.md index fd78f355f..817928a2a 100644 --- a/docs/tutorial/tutorial-embeddings/flair-embeddings.md +++ b/docs/tutorial/tutorial-embeddings/flair-embeddings.md @@ -2,10 +2,9 @@ Contextual string embeddings are [powerful embeddings](https://drive.google.com/file/d/17yVpFA7MmXaQFTe-HDpZuqw9fJlmzg56/view?usp=sharing) that capture latent syntactic-semantic information that goes beyond -standard word embeddings. Key differences are: (1) they are trained without any explicit notion of words and -thus fundamentally model words as sequences of characters. 
And (2) they are **contextualized** by their -surrounding text, meaning that the *same word will have different embeddings depending on its -contextual use*. +standard word embeddings. Key differences are: +1) they are trained without any explicit notion of words and thus fundamentally model words as sequences of characters. +2) they are **contextualized** by their surrounding text, meaning that the *same word will have different embeddings depending on its contextual use*. With Flair, you can use these embeddings simply by instantiating the appropriate embedding class, same as standard word embeddings: @@ -22,56 +21,56 @@ sentence = Sentence('The grass is green .') flair_embedding_forward.embed(sentence) ``` -You choose which embeddings you load by passing the appropriate string to the constructor of the `FlairEmbeddings` class. +You choose which embeddings you load by passing the appropriate string to the constructor of the [`FlairEmbeddings`](#flair.embeddings.token.FlairEmbeddings) class. Currently, the following contextual string embeddings are provided (note: replace '*X*' with either '*forward*' or '*backward*'): -| ID | Language | Embedding | -| ------------- | ------------- | ------------- | -| 'multi-X' | 300+ | [JW300 corpus](http://opus.nlpl.eu/JW300.php), as proposed by [Agić and Vulić (2019)](https://www.aclweb.org/anthology/P19-1310/). 
The corpus is licensed under CC-BY-NC-SA -| 'multi-X-fast' | English, German, French, Italian, Dutch, Polish | Mix of corpora (Web, Wikipedia, Subtitles, News), CPU-friendly | -| 'news-X' | English | Trained with 1 billion word corpus | -| 'news-X-fast' | English | Trained with 1 billion word corpus, CPU-friendly | -| 'mix-X' | English | Trained with mixed corpus (Web, Wikipedia, Subtitles) | -| 'ar-X' | Arabic | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'bg-X' | Bulgarian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'bg-X-fast' | Bulgarian | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia or SETimes) | -| 'cs-X' | Czech | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'cs-v0-X' | Czech | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) | -| 'de-X' | German | Trained with mixed corpus (Web, Wikipedia, Subtitles) | -| 'de-historic-ha-X' | German (historical) | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Historical German trained over *Hamburger Anzeiger* | -| 'de-historic-wz-X' | German (historical) | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Historical German trained over *Wiener Zeitung* | -| 'de-historic-rw-X' | German (historical) | Added by [@redewiedergabe](https://github.com/redewiedergabe): Historical German trained over 100 million tokens | -| 'es-X' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): Trained with Wikipedia | -| 'es-X-fast' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): Trained with Wikipedia, CPU-friendly | -| 'es-clinical-' | Spanish (clinical) | Added by [@matirojasg](https://github.com/flairNLP/flair/issues/2292): Trained 
with Wikipedia | -| 'eu-X' | Basque | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'eu-v0-X' | Basque | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) | -| 'fa-X' | Persian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'fi-X' | Finnish | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'fr-X' | French | Added by [@mhham](https://github.com/mhham): Trained with French Wikipedia | -| 'he-X' | Hebrew | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'hi-X' | Hindi | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'hr-X' | Croatian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'id-X' | Indonesian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'it-X' | Italian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'ja-X' | Japanese | Added by [@frtacoa](https://github.com/zalandoresearch/flair/issues/527): Trained with 439M words of Japanese Web crawls (2048 hidden states, 2 layers)| -| 'nl-X' | Dutch | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'nl-v0-X' | Dutch | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) | -| 'no-X' | Norwegian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'pl-X' | Polish | Added by [@borchmann](https://github.com/applicaai/poleval-2018): Trained with web crawls (Polish part of CommonCrawl) | -| 'pl-opus-X' | Polish | Added by 
[@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'pt-X' | Portuguese | Added by [@ericlief](https://github.com/ericlief/language_models): LM embeddings | -| 'sl-X' | Slovenian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'sl-v0-X' | Slovenian | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia and OpenSubtitles2018) | -| 'sv-X' | Swedish | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | -| 'sv-v0-X' | Swedish | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia or OpenSubtitles2018) | -| 'ta-X' | Tamil | Added by [@stefan-it](https://github.com/stefan-it/plur) | -| 'pubmed-X' | English | Added by [@jessepeng](https://github.com/zalandoresearch/flair/pull/519): Trained with 5% of PubMed abstracts until 2015 (1150 hidden states, 3 layers)| -| 'de-impresso-hipe-v1-X' | German (historical) | In-domain data (Swiss and Luxembourgish newspapers) for [CLEF HIPE Shared task](https://impresso.github.io/CLEF-HIPE-2020). More information on the shared task can be found in [this paper](https://zenodo.org/record/3752679#.XqgzxXUzZzU) | -| 'en-impresso-hipe-v1-X' | English (historical) | In-domain data (Chronicling America material) for [CLEF HIPE Shared task](https://impresso.github.io/CLEF-HIPE-2020). More information on the shared task can be found in [this paper](https://zenodo.org/record/3752679#.XqgzxXUzZzU) | -| 'fr-impresso-hipe-v1-X' | French (historical) | In-domain data (Swiss and Luxembourgish newspapers) for [CLEF HIPE Shared task](https://impresso.github.io/CLEF-HIPE-2020). 
More information on the shared task can be found in [this paper](https://zenodo.org/record/3752679#.XqgzxXUzZzU) | -| 'am-X' | Amharic | Based on 6.5m Amharic text corpus crawled from different sources. See [this paper](https://www.mdpi.com/1999-5903/13/11/275) and the official [GitHub Repository](https://github.com/uhh-lt/amharicmodels) for more information. | -| 'uk-X' | Ukrainian | Added by [@dchaplinsky](https://github.com/dchaplinsky): Trained with [UberText](https://lang.org.ua/en/corpora/) corpus. | +| ID | Language | Embedding | +|-------------------------|-------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 'multi-X' | 300+ | [JW300 corpus](http://opus.nlpl.eu/JW300.php), as proposed by [Agić and Vulić (2019)](https://www.aclweb.org/anthology/P19-1310/). 
The corpus is licensed under CC-BY-NC-SA +| 'multi-X-fast' | English, German, French, Italian, Dutch, Polish | Mix of corpora (Web, Wikipedia, Subtitles, News), CPU-friendly | +| 'news-X' | English | Trained with 1 billion word corpus | +| 'news-X-fast' | English | Trained with 1 billion word corpus, CPU-friendly | +| 'mix-X' | English | Trained with mixed corpus (Web, Wikipedia, Subtitles) | +| 'ar-X' | Arabic | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'bg-X' | Bulgarian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'bg-X-fast' | Bulgarian | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia or SETimes) | +| 'cs-X' | Czech | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'cs-v0-X' | Czech | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) | +| 'de-X' | German | Trained with mixed corpus (Web, Wikipedia, Subtitles) | +| 'de-historic-ha-X' | German (historical) | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Historical German trained over *Hamburger Anzeiger* | +| 'de-historic-wz-X' | German (historical) | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Historical German trained over *Wiener Zeitung* | +| 'de-historic-rw-X' | German (historical) | Added by [@redewiedergabe](https://github.com/redewiedergabe): Historical German trained over 100 million tokens | +| 'es-X' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): Trained with Wikipedia | +| 'es-X-fast' | Spanish | Added by [@iamyihwa](https://github.com/zalandoresearch/flair/issues/80): Trained with Wikipedia, CPU-friendly | +| 'es-clinical-' | Spanish (clinical) | Added by [@matirojasg](https://github.com/flairNLP/flair/issues/2292): Trained 
with Wikipedia | +| 'eu-X' | Basque | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'eu-v0-X' | Basque | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) | +| 'fa-X' | Persian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'fi-X' | Finnish | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'fr-X' | French | Added by [@mhham](https://github.com/mhham): Trained with French Wikipedia | +| 'he-X' | Hebrew | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'hi-X' | Hindi | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'hr-X' | Croatian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'id-X' | Indonesian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'it-X' | Italian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'ja-X' | Japanese | Added by [@frtacoa](https://github.com/zalandoresearch/flair/issues/527): Trained with 439M words of Japanese Web crawls (2048 hidden states, 2 layers) | +| 'nl-X' | Dutch | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'nl-v0-X' | Dutch | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): LM embeddings (earlier version) | +| 'no-X' | Norwegian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'pl-X' | Polish | Added by [@borchmann](https://github.com/applicaai/poleval-2018): Trained with web crawls (Polish part of CommonCrawl) | +| 'pl-opus-X' | Polish | Added by 
[@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'pt-X' | Portuguese | Added by [@ericlief](https://github.com/ericlief/language_models): LM embeddings | +| 'sl-X' | Slovenian | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'sl-v0-X' | Slovenian | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia and OpenSubtitles2018) | +| 'sv-X' | Swedish | Added by [@stefan-it](https://github.com/zalandoresearch/flair/issues/614): Trained with Wikipedia/OPUS | +| 'sv-v0-X' | Swedish | Added by [@stefan-it](https://github.com/stefan-it/flair-lms): Trained with various sources (Europarl, Wikipedia or OpenSubtitles2018) | +| 'ta-X' | Tamil | Added by [@stefan-it](https://github.com/stefan-it/plur) | +| 'pubmed-X' | English | Added by [@jessepeng](https://github.com/zalandoresearch/flair/pull/519): Trained with 5% of PubMed abstracts until 2015 (1150 hidden states, 3 layers) | +| 'de-impresso-hipe-v1-X' | German (historical) | In-domain data (Swiss and Luxembourgish newspapers) for [CLEF HIPE Shared task](https://impresso.github.io/CLEF-HIPE-2020). More information on the shared task can be found in [this paper](https://zenodo.org/record/3752679#.XqgzxXUzZzU) | +| 'en-impresso-hipe-v1-X' | English (historical) | In-domain data (Chronicling America material) for [CLEF HIPE Shared task](https://impresso.github.io/CLEF-HIPE-2020). More information on the shared task can be found in [this paper](https://zenodo.org/record/3752679#.XqgzxXUzZzU) | +| 'fr-impresso-hipe-v1-X' | French (historical) | In-domain data (Swiss and Luxembourgish newspapers) for [CLEF HIPE Shared task](https://impresso.github.io/CLEF-HIPE-2020). 
More information on the shared task can be found in [this paper](https://zenodo.org/record/3752679#.XqgzxXUzZzU) | +| 'am-X' | Amharic | Based on 6.5m Amharic text corpus crawled from different sources. See [this paper](https://www.mdpi.com/1999-5903/13/11/275) and the official [GitHub Repository](https://github.com/uhh-lt/amharicmodels) for more information. | +| 'uk-X' | Ukrainian | Added by [@dchaplinsky](https://github.com/dchaplinsky): Trained with [UberText](https://lang.org.ua/en/corpora/) corpus. | So, if you want to load embeddings from the German forward LM model, instantiate the method as follows: @@ -87,7 +86,7 @@ flair_bg_backward = FlairEmbeddings('bg-backward') ## Recommended Flair usage -We recommend combining both forward and backward Flair embeddings. Depending on the task, we also recommend adding standard word embeddings into the mix. So, our recommended `StackedEmbedding` for most English tasks is: +We recommend combining both forward and backward Flair embeddings. Depending on the task, we also recommend adding standard [`WordEmbeddings`](#flair.embeddings.token.WordEmbeddings) into the mix. So, our recommended [`StackedEmbeddings`](#flair.embeddings.token.StackedEmbeddings) for most English tasks is: ```python @@ -101,7 +100,7 @@ stacked_embeddings = StackedEmbeddings([ ]) ``` -That's it! Now just use this embedding like all the other embeddings, i.e. call the `embed()` method over your sentences. +That's it! Now just use this embedding like all the other embeddings, i.e. call the [`embed()`](#flair.embeddings.base.Embeddings.embed) method over your sentences. ```python sentence = Sentence('The grass is green .') @@ -119,11 +118,11 @@ Words are now embedded using a concatenation of three different embeddings. This ## Pooled Flair embeddings -We also developed a pooled variant of the `FlairEmbeddings`. These embeddings differ in that they *constantly evolve over time*, even at prediction time (i.e. after training is complete). 
This means that the same words in the same sentence at two different points in time may have different embeddings. +We also developed a pooled variant of the [`FlairEmbeddings`](#flair.embeddings.token.FlairEmbeddings). These embeddings differ in that they *constantly evolve over time*, even at prediction time (i.e. after training is complete). This means that the same words in the same sentence at two different points in time may have different embeddings. -`PooledFlairEmbeddings` manage a 'global' representation of each distinct word by using a pooling operation of all past occurences. More details on how this works may be found in [Akbik et al. (2019)](https://www.aclweb.org/anthology/N19-1078/). +[`PooledFlairEmbeddings`](#flair.embeddings.token.PooledFlairEmbeddings) manage a 'global' representation of each distinct word by using a pooling operation of all past occurences. More details on how this works may be found in [Akbik et al. (2019)](https://www.aclweb.org/anthology/N19-1078/). -You can instantiate and use `PooledFlairEmbeddings` like any other embedding: +You can instantiate and use [`PooledFlairEmbeddings`](#flair.embeddings.token.PooledFlairEmbeddings) like [`FlairEmbeddings`](#flair.embeddings.token.FlairEmbeddings): ```python from flair.embeddings import PooledFlairEmbeddings @@ -138,6 +137,6 @@ sentence = Sentence('The grass is green .') flair_embedding_forward.embed(sentence) ``` -Note that while we get some of our best results with `PooledFlairEmbeddings` they are very ineffective memory-wise since they keep past embeddings of all words in memory. In many cases, regular `FlairEmbeddings` will be nearly as good but with much lower memory requirements. +Note that while we get some of our best results with [`PooledFlairEmbeddings`](#flair.embeddings.token.PooledFlairEmbeddings) they are very ineffective memory-wise since they keep past embeddings of all words in memory. 
In many cases, regular [`FlairEmbeddings`](#flair.embeddings.token.FlairEmbeddings) will be nearly as good but with much lower memory requirements. diff --git a/docs/tutorial/tutorial-embeddings/index.rst b/docs/tutorial/tutorial-embeddings/index.rst index f452e9aae..71c4acb34 100644 --- a/docs/tutorial/tutorial-embeddings/index.rst +++ b/docs/tutorial/tutorial-embeddings/index.rst @@ -7,7 +7,7 @@ All Flair models are trained on top of embeddings, so if you want to train your you should understand how embeddings work. .. toctree:: - :glob: + :maxdepth: 1 embeddings transformer-embeddings diff --git a/docs/tutorial/tutorial-embeddings/other-embeddings.md b/docs/tutorial/tutorial-embeddings/other-embeddings.md index 0edd29141..d93802e12 100644 --- a/docs/tutorial/tutorial-embeddings/other-embeddings.md +++ b/docs/tutorial/tutorial-embeddings/other-embeddings.md @@ -2,19 +2,19 @@ Flair supports many other embedding types. This section introduces these embeddings. -:::info -We mostly train our models with either TransformerEmbeddings or FlairEmbeddings. The embeddings presented here might be useful +```{note} +We mostly train our models with either [`TransformerEmbeddings`](#flair.embeddings.transformers.TransformerEmbeddings) or [`FlairEmbeddings`](#flair.embeddings.token.FlairEmbeddings). The embeddings presented here might be useful for specific use cases or for comparison purposes. -::: +``` ## One-Hot Embeddings -`OneHotEmbeddings` are embeddings that encode each word in a vocabulary as a one-hot vector, followed by an embedding +[`OneHotEmbeddings`](#flair.embeddings.token.OneHotEmbeddings) are embeddings that encode each word in a vocabulary as a one-hot vector, followed by an embedding layer. These embeddings thus do not encode any prior knowledge as do most other embeddings. They also differ in that they require to see a vocabulary (`vocab_dictionary`) during instantiation. 
Such dictionary can be passed as an argument -during class initialization or constructed directly from a corpus with a `from_corpus` method. The dictionary consists +during class initialization or constructed directly from a corpus with a [`OneHotEmbeddings.from_corpus`](#flair.embeddings.token.OneHotEmbeddings.from_corpus) method. The dictionary consists of all unique tokens contained in the corpus plus an UNK token for all rare words. You initialize these embeddings like this: @@ -83,7 +83,7 @@ This should print a vocabulary of size 18 consisting of universal part-of-speech ## Byte Pair Embeddings -`BytePairEmbeddings` are word embeddings that are precomputed on the subword-level. This means that they are able to +[`BytePairEmbeddings`](#flair.embeddings.token.BytePairEmbeddings) are word embeddings that are precomputed on the subword-level. This means that they are able to embed any word by splitting words into subwords and looking up their embeddings. `BytePairEmbeddings` were proposed and computed by [Heinzerling and Strube (2018)](https://www.aclweb.org/anthology/L18-1473) who found that they offer nearly the same accuracy as word embeddings, but at a fraction of the model size. So they are a great choice if you want to train small models. @@ -108,7 +108,7 @@ embedding.embed(sentence) More information can be found on the [byte pair embeddings](https://nlp.h-its.org/bpemb/) web page. -`BytePairEmbeddings` also have a multilingual model capable of embedding any word in any language. +[`BytePairEmbeddings`](#flair.embeddings.token.BytePairEmbeddings) also have a multilingual model capable of embedding any word in any language. You can instantiate it with: ```python @@ -116,65 +116,22 @@ on the [byte pair embeddings](https://nlp.h-its.org/bpemb/) web page. embedding = BytePairEmbeddings('multi') ``` -You can also load custom `BytePairEmbeddings` by specifying a path to model_file_path and embedding_file_path arguments. 
They correspond respectively to a SentencePiece model file and to an embedding file (Word2Vec plain text or GenSim binary). For example: +You can also load custom [`BytePairEmbeddings`](#flair.embeddings.token.BytePairEmbeddings) by specifying a path to model_file_path and embedding_file_path arguments. They correspond respectively to a SentencePiece model file and to an embedding file (Word2Vec plain text or GenSim binary). For example: ```python # init custom embedding embedding = BytePairEmbeddings(model_file_path='your/path/m.model', embedding_file_path='your/path/w2v.txt') ``` - -## ELMo Embeddings - -[ELMo embeddings](http://www.aclweb.org/anthology/N18-1202) were presented by Peters et al. in 2018. They are using -a bidirectional recurrent neural network to predict the next word in a text. -We are using the implementation of [AllenNLP](https://allennlp.org/elmo). As this implementation comes with a lot of -sub-dependencies, which we don't want to include in Flair, you need to first install the library via -`pip install allennlp==0.9.0` before you can use it in Flair. -Using the embeddings is as simple as using any other embedding type: - -```python -from flair.embeddings import ELMoEmbeddings - -# init embedding -embedding = ELMoEmbeddings() - -# create a sentence -sentence = Sentence('The grass is green .') - -# embed words in sentence -embedding.embed(sentence) -``` - -ELMo word embeddings can be constructed by combining ELMo layers in different ways. The available combination strategies are: -- `"all"`: Use the concatenation of the three ELMo layers. -- `"top"`: Use the top ELMo layer. -- `"average"`: Use the average of the three ELMo layers. - -By default, the top 3 layers are concatenated to form the word embedding. - -AllenNLP provides the following pre-trained models. To use any of the following models inside Flair -simple specify the embedding id when initializing the `ELMoEmbeddings`. 
- -| ID | Language | Embedding | -| ------------- | ------------- | ------------- | -| 'small' | English | 1024-hidden, 1 layer, 14.6M parameters | -| 'medium' | English | 2048-hidden, 1 layer, 28.0M parameters | -| 'original' | English | 4096-hidden, 2 layers, 93.6M parameters | -| 'large' | English | | -| 'pt' | Portuguese | | -| 'pubmed' | English biomedical data | [more information](https://allennlp.org/elmo) | - - ## Document Pool Embeddings -DocumentPoolEmbeddings calculate a pooling operation over all word embeddings in a document. +[`DocumentPoolEmbeddings`](#flair.embeddings.document.DocumentPoolEmbeddings) calculate a pooling operation over all word embeddings in a document. The default operation is `mean` which gives us the mean of all words in the sentence. The resulting embedding is taken as document embedding. -To create a mean document embedding simply create any number of `TokenEmbeddings` first and put them in a list. -Afterwards, initiate the `DocumentPoolEmbeddings` with this list of `TokenEmbeddings`. -So, if you want to create a document embedding using GloVe embeddings together with `FlairEmbeddings`, +To create a mean document embedding simply create any number of [`TokenEmbeddings`](#flair.embeddings.base.TokenEmbeddings) first and put them in a list. +Afterwards, initiate the [`DocumentPoolEmbeddings`](#flair.embeddings.document.DocumentPoolEmbeddings) with this list of [`TokenEmbeddings`](#flair.embeddings.base.TokenEmbeddings). +So, if you want to create a document embedding using GloVe embeddings together with [`FlairEmbeddings`](#flair.embeddings.token.FlairEmbeddings), use the following code: ```python @@ -187,7 +144,7 @@ glove_embedding = WordEmbeddings('glove') document_embeddings = DocumentPoolEmbeddings([glove_embedding]) ``` -Now, create an example sentence and call the embedding's `embed()` method. +Now, create an example sentence and call the embedding's [`embed()`](#flair.embeddings.base.Embeddings.embed) method. 
```python # create an example sentence @@ -244,11 +201,11 @@ document_embeddings = DocumentPoolEmbeddings([embeddings], fine_tune_mode='none' ## Document RNN Embeddings -Besides simple pooling we also support a method based on an RNN to obtain a `DocumentEmbeddings`. +Besides simple pooling we also support a method based on an RNN to obtain a [`DocumentEmbeddings`](#flair.embeddings.base.DocumentEmbeddings). The RNN takes the word embeddings of every token in the document as input and provides its last output state as document embedding. You can choose which type of RNN you wish to use. -In order to use the `DocumentRNNEmbeddings` you need to initialize them by passing a list of token embeddings to it: +In order to use the [`DocumentRNNEmbeddings`](#flair.embeddings.document.DocumentRNNEmbeddings) you need to initialize them by passing a list of token embeddings to it: ```python from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings @@ -258,7 +215,7 @@ glove_embedding = WordEmbeddings('glove') document_embeddings = DocumentRNNEmbeddings([glove_embedding]) ``` -By default, a GRU-type RNN is instantiated. Now, create an example sentence and call the embedding's `embed()` method. +By default, a GRU-type RNN is instantiated. Now, create an example sentence and call the embedding's [`embed()`](#flair.embeddings.base.Embeddings.embed) method. ```python # create an example sentence @@ -289,10 +246,10 @@ document_lstm_embeddings = DocumentRNNEmbeddings([glove_embedding], rnn_type='LS ### Need to be trained on a task -Note that while `DocumentPoolEmbeddings` are immediately meaningful, `DocumentRNNEmbeddings` need to be tuned on the +Note that while [`DocumentPoolEmbeddings`](#flair.embeddings.document.DocumentPoolEmbeddings) are immediately meaningful, [`DocumentRNNEmbeddings`](#flair.embeddings.document.DocumentRNNEmbeddings) need to be tuned on the downstream task. This happens automatically in Flair if you train a new model with these embeddings. 
-Once the model is trained, you can access the tuned `DocumentRNNEmbeddings` object directly from the classifier object and use it to embed sentences. +Once the model is trained, you can access the tuned [`DocumentRNNEmbeddings`](#flair.embeddings.document.DocumentRNNEmbeddings) object directly from the classifier object and use it to embed sentences. ```python document_embeddings = classifier.document_embeddings @@ -304,18 +261,4 @@ document_embeddings.embed(sentence) print(sentence.get_embedding()) ``` -`DocumentRNNEmbeddings` have a number of hyper-parameters that can be tuned to improve learning: - -```text -:param hidden_size: the number of hidden states in the rnn. -:param rnn_layers: the number of layers for the rnn. -:param reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear -layer before putting them into the rnn or not. -:param reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output -dimension as before will be taken. -:param bidirectional: boolean value, indicating whether to use a bidirectional rnn or not. -:param dropout: the dropout value to be used. -:param word_dropout: the word dropout value to be used, if 0.0 word dropout is not used. -:param locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used. -:param rnn_type: one of 'RNN' or 'LSTM' -``` +[`DocumentRNNEmbeddings`](#flair.embeddings.document.DocumentRNNEmbeddings) have a number of hyperparameters that can be tuned, please take a look at their [API docs](#flair.embeddings.document.DocumentRNNEmbeddings) to find out more. 
diff --git a/docs/tutorial/tutorial-training/how-model-training-works.md b/docs/tutorial/tutorial-training/how-model-training-works.md index 59d0e96a6..1380c22e7 100644 --- a/docs/tutorial/tutorial-training/how-model-training-works.md +++ b/docs/tutorial/tutorial-training/how-model-training-works.md @@ -2,7 +2,7 @@ In this section, we explain the main ideas of model training in Flair. -In particular, we give an introduction to the `ModelTrainer` class, and discuss what decisions you have to make to train good models. +In particular, we give an introduction to the [`ModelTrainer`](#flair.trainers.ModelTrainer) class, and discuss what decisions you have to make to train good models. ## Example: Training a Part-of-Speech Tagger @@ -128,16 +128,18 @@ Now let's individually look at each of the main steps in the above script: The first thing you need is data to train and evaluate your model on. -In Flair, training is done using the `Corpus` object that holds three "splits": a `train`, a `dev` and a `test` split. +In Flair, training is done using the [`Corpus`](#flair.data.Corpus) object that holds three "splits": a `train`, a `dev` and a `test` split. -:::info +```{note} Splitting your data into three splits is standard procedure in machine learning: the `train` split is used to train the model while the `dev` split is used for model selection and early stopping. The `test` split is used only for the final evaluation. -::: +``` -In this example, we use the English Universal Dependencies dataset to train on. It contains many sentences fully annotated with both universal and language-specific part-of-speech tags. Running these lines will load and print the corpus: +In this example, we use the [English Universal Dependencies](https://universaldependencies.org/treebanks/en_ewt/index.html) dataset to train on. It contains many sentences fully annotated with both universal and language-specific part-of-speech tags. 
Running these lines will load and print the corpus: ```python +from flair.datasets import UD_ENGLISH + # 1. load the corpus corpus = UD_ENGLISH().downsample(0.1) print(corpus) @@ -151,9 +153,9 @@ Corpus: 1254 train + 200 dev + 208 test sentences Showing us that our downsampled training data has three splits: a training split of 1254 sentences, a dev split of 200 sentences, and a test split of 208 sentences. -:::tip -The `Corpus` object has a number of very handy helper functions that let you manipulate the data and compute statistics. For instance, in the code above we called `.downsample(0.1)` to downsample the corpus to 10% of its original size. To learn about more helper functions, check out the corpus tutorial. -::: +```{note} +The [`Corpus`](#flair.data.Corpus) object has a number of very handy helper functions that let you manipulate the data and compute statistics. For instance, in the code above we called [`Corpus.downsample(0.1)`](#flair.data.Corpus.downsample) to downsample the corpus to 10% of its original size. To learn about more helper functions, check out the [corpus tutorial](how-to-load-prepared-dataset.md). +``` ### Step 2: Choose the label type @@ -166,17 +168,16 @@ We choose the label type **'upos'**, since we want to predict universal part-of- label_type = 'upos' ``` -:::info - +```{note} You might ask: why is specifying the `label_type` even necessary? Well, some corpora have more than one label type. The English UD treebank for instance has both universal PoS tags ('upos') and regular PoS tags ('pos'), plus many other layers of annotation. A tagger is normally trained to predict just type of annotation. This means that you need to know which label types a specific corpus has labels for, and choose one of them. -::: +``` ### Step 3: Creating a label dictionary -Our model needs to predict a set of labels. To determine the label set, run `make_label_dictionary` on the corpus and pass the label type you want to predict. 
In this example, we pass **'upos'** since we want to predict universal part-of-speech tags. +Our model needs to predict a set of labels. To determine the label set, run [`Corpus.make_label_dictionary()`](#flair.data.Corpus.make_label_dictionary) on the corpus and pass the label type you want to predict. In this example, we pass **'upos'** since we want to predict universal part-of-speech tags. Running these lines will compute and print the label dictionary from the corpus: @@ -212,7 +213,7 @@ But this is only to make the example code run fast. We generally advise to use t Depending on what you want to do, you need to initialize the appropriate model type. -For this example, we use the `SequenceLabeler` since we do part-of-speech tagging: +For this example, we use the [`SequenceTagger`](#flair.models.SequenceTagger) since we do part-of-speech tagging: ```python # 5. initialize sequence tagger @@ -224,16 +225,15 @@ model = SequenceTagger(hidden_size=256, Printing it will give you the PyTorch model that is initialized. -:::info - -Depending on the task, you need a different model type: For sequence labeling (NER, part-of-speech tagging) you need the `SequenceLabeler`. For text classification you need the `TextClassifier`. +```{note} +Depending on the task, you need a different model type: For sequence labeling (NER, part-of-speech tagging) you need the [`SequenceTagger`](#flair.models.SequenceTagger). For text classification you need the [`TextClassifier`](#flair.models.TextClassifier). For each model type, we are creating dedicated tutorials to better explain what they do. -::: +``` ### Step 6: Initialize the Trainer -The ModelTrainer is initialized simply by passing the model and the corpus because that is all it needs. +The [`ModelTrainer`](#flair.trainers.ModelTrainer) is initialized simply by passing the model and the corpus because that is all it needs. ```python # 6. 
initialize trainer
trainer = ModelTrainer(model, corpus)
```

### Step 7: Train

-Once the trainer is initialized, you can call `train` to launch a standard training run.
+Once the trainer is initialized, you can call [`ModelTrainer.train()`](#flair.trainers.ModelTrainer.train) to launch a standard training run.

```python
# 7. start training
@@ -256,14 +256,13 @@

This will launch a "standard training run" with SGD as optimizer. By default, the

The max_epochs parameter is set to a small number in this script to make it run fast, but normally you should use a much higher value (150 or 200).

-:::info
-
-There are two main mechanisms to train a model in Flair. (1) The "classic" workflow (SGD with annealing) is invoked as above using the `train()` method. (2) The current state-of-the-art based on fine-tuning (AdamW with One-Cycle) is invoked using the `fine_tune()` method. In most cases, you will want to use the latter.
-:::
+```{note}
+There are two main mechanisms to train a model in Flair. (1) The "classic" workflow (SGD with annealing) is invoked as above using the [`ModelTrainer.train()`](#flair.trainers.ModelTrainer.train) method. (2) The current state-of-the-art based on fine-tuning (AdamW with Linear Learning Rate Schedule) is invoked using the [`ModelTrainer.fine_tune()`](#flair.trainers.ModelTrainer.fine_tune) method. In most cases, you will want to use the latter.
+```

### Step 8: Predict

-Once the model is trained you can use it to predict tags for new sentences. Just call the `predict` method of the model.
+Once the model is trained you can use it to predict tags for new sentences. Just call the [`.predict()`](#flair.nn.Classifier.predict) method of the model. 
```python
# load the model you trained
diff --git a/docs/tutorial/tutorial-training/how-to-load-custom-dataset.md b/docs/tutorial/tutorial-training/how-to-load-custom-dataset.md
index cdde05b4a..1e7fadb0f 100644
--- a/docs/tutorial/tutorial-training/how-to-load-custom-dataset.md
+++ b/docs/tutorial/tutorial-training/how-to-load-custom-dataset.md
@@ -2,9 +2,9 @@

This part of the tutorial shows how you can load a corpus for training a model.

-## Reading a dataset in column format
+## Loading a ColumnCorpus

-In cases you want to train over a sequence labeling dataset that is not in the above list, you can load them with the ColumnCorpus object.
+In cases you want to train over a sequence labeling dataset that is not in the above list, you can load them with the [`ColumnCorpus`](#flair.datasets.sequence_labeling.ColumnCorpus) object.

Most sequence labeling datasets in NLP use some sort of column format in which each line is a word and each column is one level of linguistic annotation. See for instance this sentence:

```
home N O
```

The first column is the word itself, the second coarse PoS tags, and the third BIO-annotated NER tags. Empty line separates sentences. To read such a
-dataset, define the column structure as a dictionary and instantiate a `ColumnCorpus`.
+dataset, define the column structure as a dictionary and instantiate a [`ColumnCorpus`](#flair.datasets.sequence_labeling.ColumnCorpus).

```python
from flair.data import Corpus
@@ -42,7 +42,7 @@

corpus: Corpus = ColumnCorpus(data_folder, columns,
```


-This gives you a `Corpus` object that contains the train, dev and test splits, each has a list of `Sentence`.
+This gives you a [`Corpus`](#flair.data.Corpus) object that contains the train, dev and test splits, each has a list of [`Sentence`](#flair.data.Sentence). 
So, to check how many sentences there are in the training split, do ```python @@ -75,7 +75,7 @@ load specified text and labels from a simple CSV file or format your data to the Many text classification datasets are distributed as simple CSV files in which each row corresponds to a data point and columns correspond to text, labels, and other metadata. You can load a CSV format classification dataset using -`CSVClassificationCorpus` by passing in a column format (like in `ColumnCorpus` above). This column format indicates +[`CSVClassificationCorpus`](#flair.datasets.document_classification.CSVClassificationCorpus) by passing in a column format (like in [`ColumnCorpus`](#flair.datasets.sequence_labeling.ColumnCorpus) above). This column format indicates which column(s) in the CSV holds the text and which field(s) the label(s). By default, Python's CSV library assumes that your files are in Excel CSV format, but [you can specify additional parameters](https://docs.python.org/3/library/csv.html#csv-fmt-params) if you use custom delimiters or quote characters. @@ -104,25 +104,26 @@ corpus: Corpus = CSVClassificationCorpus(data_folder, ### FastText format -If using `CSVClassificationCorpus` is not practical, you may format your data to the FastText format, in which each line in the file represents a text document. A document can have one or multiple labels that are defined at the beginning of the line starting with the prefix `__label__`. This looks like this: +If using [`CSVClassificationCorpus`](#flair.datasets.document_classification.CSVClassificationCorpus) is not practical, you may format your data to the FastText format, in which each line in the file represents a text document. A document can have one or multiple labels that are defined at the beginning of the line starting with the prefix `__label__`. 
This looks like this:

```bash
__label__<label_1> <text>
__label__<label_1> __label__<label_2> <text>
```

-As previously mentioned, to create a `Corpus` for a text classification task, you need to have three files (train, dev, and test) in the
+As previously mentioned, to create a [`Corpus`](#flair.data.Corpus) for a text classification task, you need to have three files (train, dev, and test) in the
above format located in one folder. This data folder structure could, for example, look like this for the IMDB task:

```text
/resources/tasks/imdb/train.txt
/resources/tasks/imdb/dev.txt
/resources/tasks/imdb/test.txt
```
-Now create a `ClassificationCorpus` by pointing to this folder (`/resources/tasks/imdb`).
-Thereby, each line in a file is converted to a `Sentence` object annotated with the labels.
+Now create a [`ClassificationCorpus`](#flair.datasets.document_classification.ClassificationCorpus) by pointing to this folder (`/resources/tasks/imdb`).
+Thereby, each line in a file is converted to a [`Sentence`](#flair.data.Sentence) object annotated with the labels.

-Attention: A text in a line can have multiple sentences. Thus, a `Sentence` object can actually consist of multiple
-sentences.
+```{important}
+A text in a line can have multiple sentences. Thus, a [`Sentence`](#flair.data.Sentence) object can actually consist of multiple sentences.
+```

```python
from flair.data import Corpus
diff --git a/docs/tutorial/tutorial-training/how-to-load-prepared-dataset.md b/docs/tutorial/tutorial-training/how-to-load-prepared-dataset.md
index 428898ed8..ed29bea50 100644
--- a/docs/tutorial/tutorial-training/how-to-load-prepared-dataset.md
+++ b/docs/tutorial/tutorial-training/how-to-load-prepared-dataset.md
@@ -4,19 +4,19 @@

This part of the tutorial shows how you can load a corpus for training a model.

## The Corpus Object

-The `Corpus` represents a dataset that you use to train a model. 
It consists of a list of `train` sentences, +The [`Corpus`](#flair.data.Corpus) represents a dataset that you use to train a model. It consists of a list of `train` sentences, a list of `dev` sentences, and a list of `test` sentences, which correspond to the training, validation and testing split during model training. The following example snippet instantiates the Universal Dependency Treebank for English as a corpus object: ```python -import flair.datasets -corpus = flair.datasets.UD_ENGLISH() +from flair.datasets import UD_ENGLISH +corpus = UD_ENGLISH() ``` The first time you call this snippet, it triggers a download of the Universal Dependency Treebank for English onto your -hard drive. It then reads the train, test and dev splits into the `Corpus` which it returns. Check the length of +hard drive. It then reads the train, test and dev splits into the [`Corpus`](#flair.data.Corpus) which it returns. Check the length of the three splits to see how many Sentences are there: ```python @@ -30,7 +30,7 @@ print(len(corpus.test)) print(len(corpus.dev)) ``` -You can also access the Sentence objects in each split directly. For instance, let us look at the first Sentence in +You can also access the [`Sentence`](#flair.data.Sentence) objects in each split directly. For instance, let us look at the first Sentence in the training split of the English UD: ```python @@ -55,20 +55,20 @@ So the corpus is tagged and ready for training. ### Helper functions -A `Corpus` contains a bunch of useful helper functions. -For instance, you can downsample the data by calling `downsample()` and passing a ratio. So, if you normally get a +A [`Corpus`](#flair.data.Corpus) contains a bunch of useful helper functions. +For instance, you can downsample the data by calling [`Corpus.downsample()`](#flair.data.Corpus.downsample) and passing a ratio. 
So, if you normally get a corpus like this: ```python -import flair.datasets -corpus = flair.datasets.UD_ENGLISH() +from flair.datasets import UD_ENGLISH +corpus = UD_ENGLISH() ``` then you can downsample the corpus, simply like this: ```python -import flair.datasets -downsampled_corpus = flair.datasets.UD_ENGLISH().downsample(0.1) +from flair.datasets import UD_ENGLISH +downsampled_corpus = UD_ENGLISH().downsample(0.1) ``` If you print both corpora, you see that the second one has been downsampled to 10% of the data. @@ -94,7 +94,7 @@ Corpus: 1255 train + 201 dev + 208 test sentences ### Creating label dictionaries For many learning tasks you need to create a "dictionary" that contains all the labels you want to predict. -You can generate this dictionary directly out of the `Corpus` by calling the method `make_label_dictionary` +You can generate this dictionary directly out of the [`Corpus`](#flair.data.Corpus) by calling the method [`Corpus.make_label_dictionary`](#flair.data.Corpus.make_label_dictionary) and passing the desired `label_type`. For instance, the UD_ENGLISH corpus instantiated above has multiple layers of annotation like regular @@ -117,7 +117,7 @@ Dictionary with 17 tags: PROPN, PUNCT, ADJ, NOUN, VERB, DET, ADP, AUX, PRON, PAR #### Dictionaries for other label types -If you don't know the label types in a corpus, just call `make_label_dictionary` with +If you don't know the label types in a corpus, just call [`Corpus.make_label_dictionary`](#flair.data.Corpus.make_label_dictionary) with any random label name (e.g. `corpus.make_label_dictionary(label_type='abcd')`). 
This will print out statistics on all label types in the corpus: @@ -125,7 +125,7 @@ out statistics on all label types in the corpus: The corpus contains the following label types: 'lemma' (in 12543 sentences), 'upos' (in 12543 sentences), 'pos' (in 12543 sentences), 'dependency' (in 12543 sentences), 'number' (in 12036 sentences), 'verbform' (in 10122 sentences), 'prontype' (in 9744 sentences), 'person' (in 9381 sentences), 'mood' (in 8911 sentences), 'tense' (in 8747 sentences), 'degree' (in 7148 sentences), 'definite' (in 6851 sentences), 'case' (in 6486 sentences), 'gender' (in 2824 sentences), 'numtype' (in 2771 sentences), 'poss' (in 2516 sentences), 'voice' (in 1085 sentences), 'typo' (in 399 sentences), 'extpos' (in 185 sentences), 'abbr' (in 168 sentences), 'reflex' (in 98 sentences), 'style' (in 31 sentences), 'foreign' (in 5 sentences) ``` -This means that you can create dictionaries for any of these label types for the UD_ENGLISH corpus. Let's create dictionaries for regular part of speech tags +This means that you can create dictionaries for any of these label types for the [`UD_ENGLISH`](#flair.datasets.treebanks.UD_ENGLISH) corpus. Let's create dictionaries for regular part of speech tags and a morphological number tagging task: ```python @@ -141,216 +141,55 @@ If you print these dictionaries, you will find that the POS dictionary contains #### Dictionaries for other corpora types -The method `make_label_dictionary` can be used for any corpus, including text classification corpora: +The method [`Corpus.make_label_dictionary`](#flair.data.Corpus.make_label_dictionary) can be used for any corpus, including text classification corpora: ```python # create label dictionary for a text classification task -corpus = flair.datasets.TREC_6() +from flair.datasets import TREC_6 +corpus = TREC_6() corpus.make_label_dictionary('question_class') ``` ### The MultiCorpus Object -If you want to train multiple tasks at once, you can use the `MultiCorpus` object. 
-To initiate the `MultiCorpus` you first need to create any number of `Corpus` objects. Afterwards, you can pass -a list of `Corpus` to the `MultiCorpus` object. For instance, the following snippet loads a combination corpus +If you want to train multiple tasks at once, you can use the [`MultiCorpus`](#flair.data.MultiCorpus) object. +To initiate the [`MultiCorpus`](#flair.data.MultiCorpus) you first need to create any number of [`Corpus`](#flair.data.Corpus) objects. Afterwards, you can pass +a list of [`Corpus`](#flair.data.Corpus) to the [`MultiCorpus`](#flair.data.MultiCorpus) object. For instance, the following snippet loads a combination corpus consisting of the English, German and Dutch Universal Dependency Treebanks. ```python -english_corpus = flair.datasets.UD_ENGLISH() -german_corpus = flair.datasets.UD_GERMAN() -dutch_corpus = flair.datasets.UD_DUTCH() +from flair.datasets import UD_ENGLISH, UD_GERMAN, UD_DUTCH +english_corpus = UD_ENGLISH() +german_corpus = UD_GERMAN() +dutch_corpus = UD_DUTCH() # make a multi corpus consisting of three UDs from flair.data import MultiCorpus multi_corpus = MultiCorpus([english_corpus, german_corpus, dutch_corpus]) ``` -The `MultiCorpus` inherits from `Corpus`, so you can use it like any other corpus to train your models. +The [`MultiCorpus`](#flair.data.MultiCorpus) inherits from [`Corpus`](#flair.data.Corpus), so you can use it like any other corpus to train your models. ## Datasets included in Flair -Flair supports many datasets out of the box. It automatically downloads and sets up the -data the first time you call the corresponding constructor ID. +Flair supports many datasets out of the box. It usually automatically downloads and sets up the data the first time you +call the corresponding constructor ID. +The datasets are split into multiple modules, however they all can be imported from `flair.datasets` too. +You can look up the respective modules to find the possible datasets. 
The following datasets are supported: -### Named Entity Recognition - -| Object | Languages | Description | -|-----------------------------|--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| 'CONLL_03' | English | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER (requires manual download) | -| 'CONLL_03_GERMAN' | German | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER (requires manual download) | -| 'CONLL_03_DUTCH' | Dutch | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | -| 'CONLL_03_SPANISH' | Spanish | [CoNLL-03](https://www.clips.uantwerpen.be/conll2002/ner/) 4-class NER | -| 'ONTONOTES' | Arabic, English, Chinese | [Ontonotes](https://paperswithcode.com/dataset/ontonotes-5-0/) 18-class NER | -| 'FEWNERD' | English | [FewNERD](https://ningding97.github.io/fewnerd/) 66-class NER | -| 'NER_ARABIC_ANER' | Arabic | [Arabic Named Entity Recognition Corpus](http://curtis.ml.cmu.edu/w/courses/index.php/ANERcorp) 4-class NER | -| 'NER_ARABIC_AQMAR' | Arabic | [American and Qatari Modeling of Arabic](http://www.cs.cmu.edu/~ark/AQMAR/) 4-class NER (modified) | -| 'NER_BASQUE' | Basque | [NER dataset for Basque](http://ixa2.si.ehu.eus/eiec/) | -| 'NER_CHINESE_WEIBO' | Chinese | [Weibo NER corpus](https://paperswithcode.com/sota/chinese-named-entity-recognition-on-weibo-ner/). 
| -| 'NER_DANISH_DANE' | Danish | [DaNE dataset](https://github.com/alexandrainst/danlp/blob/master/docs/datasets.md#danish-dependency-treebank) | -| 'NER_ENGLISH_MOVIE_SIMPLE' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - simple NER | -| 'NER_ENGLISH_MOVIE_COMPLEX' | English | [NER dataset for movie reviews](https://groups.csail.mit.edu/sls/downloads/movie/) - complex NER | -| 'NER_ENGLISH_PERSON' | English | [PERSON_NER](https://github.com/das-sudeshna/genid) NER with person names | -| 'NER_ENGLISH_RESTAURANT' | English | [NER dataset for restaurant reviews](https://groups.csail.mit.edu/sls/downloads/restaurant/) | -| 'NER_ENGLISH_SEC_FILLINGS' | English | [SEC-fillings](https://github.com/juand-r/entity-recognition-datasets) with 4-class NER labels from (Alvarado et al, 2015)[https://aclanthology.org/U15-1010/] here | -| 'NER_ENGLISH_STACKOVERFLOW' | English | NER on StackOverflow posts | -| 'NER_ENGLISH_TWITTER' | English | [Twitter NER dataset](https://github.com/aritter/twitter_nlp/) | -| 'NER_ENGLISH_WIKIGOLD' | English | [Wikigold](https://github.com/juand-r/entity-recognition-datasets/tree/master/data/wikigold) a manually annotated collection of Wikipedia text | -| 'NER_ENGLISH_WNUT_2020' | English | [WNUT-20](https://github.com/jeniyat/WNUT_2020_NER) named entity extraction | -| 'NER_ENGLISH_WEBPAGES' | English | 4-class NER on web pages from [Ratinov and Roth (2009)](https://aclanthology.org/W09-1119/) | -| 'NER_FINNISH' | Finnish | [Finer-data](https://github.com/mpsilfve/finer-data) | -| 'NER_GERMAN_BIOFID' | German | [CoNLL-03](https://www.aclweb.org/anthology/K19-1081/) Biodiversity literature NER | -| 'NER_GERMAN_EUROPARL' | German | [German Europarl dataset](https://nlpado.de/~sebastian/software/ner_german.shtml) NER in German EU parliament speeches | -| 'NER_GERMAN_GERMEVAL' | German | [GermEval 14 NER](https://sites.google.com/site/germeval2014ner/data/) corpus | -| 'NER_GERMAN_LEGAL' | German | 
[Legal Entity Recognition](https://github.com/elenanereiss/Legal-Entity-Recognition) NER in German Legal Documents | -| 'NER_GERMAN_POLITICS' | German | [NEMGP](https://www.thomas-zastrow.de/nlp/) corpus | -| 'NER_HIPE_2022' | 5 languages | NER dataset for [HIPE-2022](https://hipe-eval.github.io/HIPE-2022/) (Identifying Historical People, Places and other Entities) | -| 'NER_HUNGARIAN' | Hungarian | NER on Hungarian business news | -| 'NER_ICELANDIC' | Icelandic | NER on Icelandic | -| 'NER_JAPANESE' | Japanese | [Japanese NER](https://github.com/Hironsan/IOB2Corpus) dataset automatically generated from Wikipedia | -| 'NER_MASAKHANE' | 10 languages | [MasakhaNER: Named Entity Recognition for African Languages](https://github.com/masakhane-io/masakhane-ner) corpora | -| 'NER_SWEDISH' | Swedish | [Swedish Spraakbanken NER](https://github.com/klintan/swedish-ner-corpus/) 4-class NER | -| 'NER_TURKU' | Finnish | [TURKU_NER](https://github.com/TurkuNLP/turku-ner-corpus) NER corpus created by the Turku NLP Group, University of Turku, Finland | -| 'NER_UKRAINIAN' | Ukrainian | [lang-uk](https://github.com/lang-uk/flair-ner) NER corpus created by the [Lang-uk community](https://lang.org.ua/en/) | -| 'NER_MULTI_WIKIANN' | 282 languages | Gigantic [corpus for cross-lingual NER derived from Wikipedia](https://elisa-ie.github.io/wikiann/). 
| -| 'NER_MULTI_WIKINER' | 8 languages | [WikiNER](https://github.com/dice-group/FOX/tree/master/input/Wikiner) NER dataset automatically generated from Wikipedia (English, German, French, Italian, Spanish, Portuguese, Polish, Russian) | -| 'NER_MULTI_XTREME' | 176 languages | [Xtreme](https://github.com/google-research/xtreme) corpus by Google Research for cross-lingual NER consisting of datasets of a total of 176 languages | -| 'WNUT_17' | English | [WNUT-17](https://noisy-text.github.io/2017/files/) emerging entity detection | - -### Biomedical Named Entity Recognition - -We support 31 biomedical NER datasets, listed - -### Entity Linking -| Object | Languages | Description | -| ------------- | ------------- |------------- | -| 'NEL_ENGLISH_AIDA' | English | [AIDA CoNLL-YAGO Entity Linking corpus](https://www.mpi-inf.mpg.de/departments/databases-and-information-systems/research/ambiverse-nlu/aida/downloads) on the CoNLL-03 corpus | -| 'NEL_ENGLISH_AQUAINT' | English | Aquaint Entity Linking corpus introduced in [Milne and Witten (2008)](https://www.cms.waikato.ac.nz/~ihw/papers/08-DNM-IHW-LearningToLinkWithWikipedia.pdf) | -| 'NEL_ENGLISH_IITB' | English | ITTB Entity Linking corpus introduced in [Sayali et al. (2009)](https://dl.acm.org/doi/10.1145/1557019.1557073) | -| 'NEL_ENGLISH_REDDIT' | English | Reddit Entity Linking corpus introduced in [Botzer et al. 
(2021)](https://arxiv.org/abs/2101.01228v2) (only gold annotations)| -| 'NEL_ENGLISH_TWEEKI' | English | ITTB Entity Linking corpus introduced in [Harandizadeh and Singh (2020)](https://aclanthology.org/2020.wnut-1.29.pdf) | -| 'NEL_GERMAN_HIPE' | German | [HIPE](https://impresso.github.io/CLEF-HIPE-2020/) Entity Linking corpus for historical German as a [sentence-segmented version](https://github.com/stefan-it/clef-hipe) | - -### Relation Extraction -| Object | Languages | Description | -| ------------- | ------------- |------------- | -| 'RE_ENGLISH_CONLL04' | English | [CoNLL-04](https://github.com/bekou/multihead_joint_entity_relation_extraction/tree/master/data/CoNLL04) Relation Extraction | -| 'RE_ENGLISH_SEMEVAL2010' | English | [SemEval-2010 Task 8](https://aclanthology.org/S10-1006.pdf) on Multi-Way Classification of Semantic Relations Between Pairs of Nominals | -| 'RE_ENGLISH_TACRED' | English | [TAC Relation Extraction Dataset](https://nlp.stanford.edu/projects/tacred/) with 41 relations (download required) | -| 'RE_ENGLISH_DRUGPROT' | English | [DrugProt corpus: Biocreative VII Track 1](https://zenodo.org/record/5119892#.YSdSaVuxU5k/) - drug and chemical-protein interactions | - - -### GLUE Benchmark -| Object | Languages | Description | -| ------------- | ------------- |------------- | -| 'GLUE_COLA' | English | The Corpus of Linguistic Acceptability from GLUE benchmark | -| 'GLUE_MNLI' | English | The Multi-Genre Natural Language Inference Corpus from the GLUE benchmark | -| 'GLUE_RTE' | English | The RTE task from the GLUE benchmark | -| 'GLUE_QNLI' | English | The Stanford Question Answering Dataset formated as NLI task from the GLUE benchmark | -| 'GLUE_WNLI' | English | The Winograd Schema Challenge formated as NLI task from the GLUE benchmark | -| 'GLUE_MRPC' | English | The MRPC task from GLUE benchmark | -| 'GLUE_QQP' | English | The Quora Question Pairs dataset where the task is to determine whether a pair of questions are semantically 
equivalent | -| 'SUPERGLUE_RTE' | English | The RTE task from the SuperGLUE benchmark | - -### Universal Proposition Banks - -We also support loading the [Universal Proposition Banks](https://github.com/System-T/UniversalPropositions) -for the purpose of training multilingual frame detection systems. - -| Object | Languages | Description | -| ------------- | ------------- |------------- | -| 'UP_CHINESE' | Chinese | Universal Propositions for [Chinese](https://github.com/System-T/UniversalPropositions/tree/master/UP_Chinese) | -| 'UP_ENGLISH'| English | Universal Propositions for [English](https://github.com/System-T/UniversalPropositions/tree/master/UP_English-EWT) | -| 'UP_FINNISH'| Finnish | Universal Propositions for [Finnish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Finnish) -| 'UP_FRENCH'| French | Universal Propositions for [French](https://github.com/System-T/UniversalPropositions/tree/master/UP_French) -| 'UP_GERMAN'| German | Universal Propositions for [German](https://github.com/System-T/UniversalPropositions/tree/master/UP_German) | -| 'UP_ITALIAN', | Italian | Universal Propositions for [Italian](https://github.com/System-T/UniversalPropositions/tree/master/UP_Italian) | -| 'UP_SPANISH' | Spanish | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish) | -| 'UP_SPANISH_ANCORA' | Spanish (Ancora Corpus) | Universal Propositions for [Spanish](https://github.com/System-T/UniversalPropositions/tree/master/UP_Spanish-AnCora) | - -### Universal Dependency Treebanks - -| Object | Languages | Description | -|--------------------|-------------------|-------------------------------------------------------------------------------------------------------------------------------------| -| 'UD_ARABIC' | Arabic | Universal Dependency Treebank for [Arabic](https://github.com/UniversalDependencies/UD_Arabic-PADT) | -| 'UD_BASQUE' | Basque | Universal Dependency Treebank for 
[Basque](https://github.com/UniversalDependencies/UD_Basque-BDT) | -| 'UD_BULGARIAN' | Bulgarian | Universal Dependency Treebank for [Bulgarian](https://github.com/UniversalDependencies/UD_Bulgarian-BTB) -| 'UD_CATALAN', | Catalan | Universal Dependency Treebank for [Catalan](https://github.com/UniversalDependencies/UD_Catalan-AnCora) | -| 'UD_CHINESE' | Chinese | Universal Dependency Treebank for [Chinese](https://github.com/UniversalDependencies/UD_Chinese-GSD) | -| 'UD_CHINESE_KYOTO' | Classical Chinese | Universal Dependency Treebank for Classical [Chinese](https://github.com/UniversalDependencies/UD_Classical_Chinese-Kyoto/tree/dev) | -| 'UD_CROATIAN' | Croatian | Universal Dependency Treebank for [Croatian](https://github.com/UniversalDependencies/UD_Croatian-SET) | -| 'UD_CZECH' | Czech | Very large Universal Dependency Treebank for [Czech](https://github.com/UniversalDependencies/UD_Czech-PDT) | -| 'UD_DANISH' | Danish | Universal Dependency Treebank for [Danish](https://github.com/UniversalDependencies/UD_Danish-DDT) | -| 'UD_DUTCH' | Dutch | Universal Dependency Treebank for [Dutch](https://github.com/UniversalDependencies/UD_Dutch-Alpino) | -| 'UD_ENGLISH' | English | Universal Dependency Treebank for [English](https://github.com/UniversalDependencies/UD_English-EWT) | -| 'UD_FINNISH' | Finnish | Universal Dependency Treebank for [Finnish](https://github.com/UniversalDependencies/UD_Finnish-TDT) | -| 'UD_FRENCH' | French | Universal Dependency Treebank for [French](https://github.com/UniversalDependencies/UD_French-GSD) | -| 'UD_GERMAN' | German | Universal Dependency Treebank for [German](https://github.com/UniversalDependencies/UD_German-GSD) | -| 'UD_GERMAN-HDT' | German | Very large Universal Dependency Treebank for [German](https://github.com/UniversalDependencies/UD_German-HDT) | -| 'UD_HEBREW' | Hebrew | Universal Dependency Treebank for [Hebrew](https://github.com/UniversalDependencies/UD_Hebrew-HTB) | -| 'UD_HINDI' | Hindi | Universal Dependency 
Treebank for [Hindi](https://github.com/UniversalDependencies/UD_Hindi-HDTB) | -| 'UD_INDONESIAN' | Indonesian | Universal Dependency Treebank for [Indonesian](https://github.com/UniversalDependencies/UD_Indonesian-GSD) | -| 'UD_ITALIAN' | Italian | Universal Dependency Treebank for [Italian](https://github.com/UniversalDependencies/UD_Italian-ISDT) | -| 'UD_JAPANESE' | Japanese | Universal Dependency Treebank for [Japanese](https://github.com/UniversalDependencies/UD_Japanese-GSD) | -| 'UD_KOREAN' | Korean | Universal Dependency Treebank for [Korean](https://github.com/UniversalDependencies/UD_Korean-Kaist) | -| 'UD_NORWEGIAN', | Norwegian | Universal Dependency Treebank for [Norwegian](https://github.com/UniversalDependencies/UD_Norwegian-Bokmaal) | -| 'UD_PERSIAN' | Persian / Farsi | Universal Dependency Treebank for [Persian](https://github.com/UniversalDependencies/UD_Persian-Seraji) | -| 'UD_POLISH' | Polish | Universal Dependency Treebank for [Polish](https://github.com/UniversalDependencies/UD_Polish-LFG) | -| 'UD_PORTUGUESE' | Portuguese | Universal Dependency Treebank for [Portuguese](https://github.com/UniversalDependencies/UD_Portuguese-Bosque) | -| 'UD_ROMANIAN' | Romanian | Universal Dependency Treebank for [Romanian](https://github.com/UniversalDependencies/UD_Romanian-RRT) | -| 'UD_RUSSIAN' | Russian | Universal Dependency Treebank for [Russian](https://github.com/UniversalDependencies/UD_Russian-SynTagRus) | -| 'UD_SERBIAN' | Serbian | Universal Dependency Treebank for [Serbian](https://github.com/UniversalDependencies/UD_Serbian-SET) | -| 'UD_SLOVAK' | Slovak | Universal Dependency Treebank for [Slovak](https://github.com/UniversalDependencies/UD_Slovak-SNK) | -| 'UD_SLOVENIAN' | Slovenian | Universal Dependency Treebank for [Slovenian](https://github.com/UniversalDependencies/UD_Slovenian-SSJ) | -| 'UD_SPANISH' | Spanish | Universal Dependency Treebank for [Spanish](https://github.com/UniversalDependencies/UD_Spanish-GSD) | -| 'UD_SWEDISH' | 
Swedish | Universal Dependency Treebank for [Swedish](https://github.com/UniversalDependencies/UD_Swedish-Talbanken) | -| 'UD_TURKISH' | Turkish | Universal Dependency Treebank for [Tturkish](https://github.com/UniversalDependencies/UD_Turkish-IMST) | -| 'UD_UKRAINIAN' | Ukrainian | Universal Dependency Treebank for [Ukrainian](https://github.com/UniversalDependencies/UD_Ukrainian-IU) | - -### Text Classification -| Object | Languages | Description | -| ------------- | ------------- |------------- | -| 'AMAZON_REVIEWS' | English | [Amazon product reviews](https://nijianmo.github.io/amazon/index.html/) dataset with sentiment annotation | -| 'COMMUNICATIVE_FUNCTIONS' | English | [Communicative functions](https://github.com/Alab-NII/FECFevalDataset) of sentences in scholarly papers | -| 'GERMEVAL_2018_OFFENSIVE_LANGUAGE' | German | Offensive language detection for German | -| 'GO_EMOTIONS' | English | [GoEmotions dataset](https://github.com/google-research/google-research/tree/master/goemotions) Reddit comments labeled with 27 emotions | -| 'IMDB' | English | [IMDB](http://ai.stanford.edu/~amaas/data/sentiment/) dataset of movie reviews with sentiment annotation | -| 'NEWSGROUPS' | English | The popular [20 newsgroups](http://qwone.com/~jason/20Newsgroups/) classification dataset | -| 'YAHOO_ANSWERS' | English | The [10 largest main categories](https://course.fast.ai/datasets#nlp) from the Yahoo! 
Answers | -| 'SENTIMENT_140' | English | [Tweets dataset](http://help.sentiment140.com/for-students/) with sentiment annotation | -| 'SENTEVAL_CR' | English | Customer reviews dataset of [SentEval](https://github.com/facebookresearch/SentEval) with sentiment annotation | -| 'SENTEVAL_MR' | English | Movie reviews dataset of [SentEval](https://github.com/facebookresearch/SentEval) with sentiment annotation | -| 'SENTEVAL_SUBJ' | English | Subjectivity dataset of [SentEval](https://github.com/facebookresearch/SentEval) | -| 'SENTEVAL_MPQA' | English | Opinion-polarity dataset of [SentEval](https://github.com/facebookresearch/SentEval) with opinion-polarity annotation | -| 'SENTEVAL_SST_BINARY' | English | Stanford sentiment treebank dataset of of [SentEval](https://github.com/facebookresearch/SentEval) with sentiment annotation | -| 'SENTEVAL_SST_GRANULAR' | English | Stanford sentiment treebank dataset of of [SentEval](https://github.com/facebookresearch/SentEval) with fine-grained sentiment annotation | -| 'TREC_6', 'TREC_50' | English | The [TREC](http://cogcomp.org/Data/QA/QC/) question classification dataset | - -### Text Regression -| Object | Languages | Description | -| ------------- | ------------- |------------- | -| 'WASSA_ANGER' | English | The [WASSA](https://competitions.codalab.org/competitions/16380#learn_the_details) emotion-intensity detection challenge (anger) | -| 'WASSA_FEAR' | English | The [WASSA](https://competitions.codalab.org/competitions/16380#learn_the_details) emotion-intensity detection challenge (fear) | -| 'WASSA_JOY' | English | The [WASSA](https://competitions.codalab.org/competitions/16380#learn_the_details) emotion-intensity detection challenge (joy) | -| 'WASSA_SADNESS' | English | The [WASSA](https://competitions.codalab.org/competitions/16380#learn_the_details) emotion-intensity detection challenge (sadness) | - -### Other Sequence Labeling - -| Object | Languages | Description | -| ------------- | ------------- |------------- 
| -| 'CONLL_2000' | English | Syntactic chunking with [CoNLL-2000](https://www.clips.uantwerpen.be/conll2000/chunking/) | -| 'BIOSCOPE' | English | Negation and speculation scoping wih [BioScope](https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-S11-S9/) biomedical texts annotated for uncertainty, negation and their scopes | -| 'KEYPHRASE_INSPEC' | English | Keyphrase dectection with [INSPEC](https://www.aclweb.org/anthology/W03-1028) original corpus (2000 docs) from INSPEC database, adapted by [midas-research](https://arxiv.org/abs/1910.08840) | -| 'KEYPHRASE_SEMEVAL2017' | English | Keyphrase dectection with [SEMEVAL2017](https://arxiv.org/abs/1704.02853) dataset (500 docs) from ScienceDirect, adapted by [midas-research](https://arxiv.org/abs/1910.08840) | -| 'KEYPHRASE_SEMEVAL2010' | English | Keyphrase dectection with [SEMEVAL2010](https://www.aclweb.org/anthology/S10-1004/) dataset (~250 docs) from ACM Digital Library, adapted by [midas-research](https://arxiv.org/abs/1910.08840) | - -### Experimental: Similarity Learning -| Object | Languages | Description | -| ------------- | ------------- |------------- | -| 'FeideggerCorpus' | German | [Feidegger](https://github.com/zalandoresearch/feidegger/) dataset fashion images and German-language descriptions | -| 'OpusParallelCorpus' | Any language pair | Parallel corpora of the [OPUS](http://opus.nlpl.eu/) project, currently supports only Tatoeba corpus | - - +| Task | Module | +|-------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------| +| Named Entity Recognition | [flair.datasets.sequence_labeling](#flair.datasets.sequence_labeling) | +| Text Classification | [flair.datasets.document_classification](#flair.datasets.document_classification) | +| Text Regression | [flair.datasets.document_classification](#flair.datasets.document_classification) | +| Biomedical Named 
Entity Recognition | [flair.datasets.biomedical](#flair.datasets.biomedical) | +| Entity Linking | [flair.datasets.entity_linking](#flair.datasets.entity_linking) | +| Relation Extraction | [flair.datasets.relation_extraction](#flair.datasets.relation_extraction) | +| Sequence Labeling | [flair.datasets.sequence_labeling](#flair.datasets.sequence_labeling) | +| Glue Benchmark | [flair.datasets.text_text](#flair.datasets.text_text) and [flair.datasets.document_classification](#flair.datasets.document_classification) | +| Universal Proposition Banks | [flair.datasets.treebanks](#flair.datasets.treebanks) | +| Universal Dependency Treebanks | [flair.datasets.treebanks](#flair.datasets.treebanks) | +| OCR-Layout-NER | [flair.datasets.ocr](#flair.datasets.ocr) | diff --git a/docs/tutorial/tutorial-training/how-to-train-sequence-tagger.md b/docs/tutorial/tutorial-training/how-to-train-sequence-tagger.md index e56e6a537..247b3daa1 100644 --- a/docs/tutorial/tutorial-training/how-to-train-sequence-tagger.md +++ b/docs/tutorial/tutorial-training/how-to-train-sequence-tagger.md @@ -58,8 +58,8 @@ trainer.fine_tune('resources/taggers/sota-ner-flert', ) ``` -As you can see, we use 'xlm-roberta-large' embeddings, enable fine-tuning and set `use_context` to True. -We also deactivate the RNN, CRF and reprojection in the `SequenceTagger`. This is because the +As you can see, we use [`TransformerWordEmbeddings`](#flair.embeddings.token.TransformerWordEmbeddings) based on 'xlm-roberta-large' embeddings. We enable fine-tuning and set `use_context` to True. +We also deactivate the RNN, CRF and reprojection in the [`SequenceTagger`](#flair.models.SequenceTagger). This is because the transformer is so powerful that it does not need these components. We then fine-tune the model with a very small learning rate on the corpus. 
@@ -171,7 +171,7 @@ This script will give you the state-of-the-art accuracy reported in [Akbik et al ## Multi-dataset training Now, let us train a single model that can PoS tag text in both English and German. To do this, we load both the English -and German UD corpora and create a MultiCorpus object. We also use the new multilingual Flair embeddings for this task. +and German UD corpora and create a [`MultiCorpus`](#flair.data.MultiCorpus) object. We also use the new multilingual Flair embeddings for this task. All the rest is same as before, e.g.: diff --git a/docs/tutorial/tutorial-training/how-to-train-text-classifier.md b/docs/tutorial/tutorial-training/how-to-train-text-classifier.md index 39d622746..265689c21 100644 --- a/docs/tutorial/tutorial-training/how-to-train-text-classifier.md +++ b/docs/tutorial/tutorial-training/how-to-train-text-classifier.md @@ -8,8 +8,8 @@ could train your own sentiment analysis model, or offensive language detection m For text classification, you reach state-of-the-art scores by fine-tuning a transformer. -Training a model is easy: load the appropriate corpus, make a label dictionary, then fine-tune a `TextClassifier` -model using the `fine_tune()` method of the `ModelTrainer`. See the example script below: +Training a model is easy: load the appropriate corpus, make a label dictionary, then fine-tune a [`TextClassifier`](#flair.models.TextClassifier) +model using the [`ModelTrainer.fine_tune()`](#flair.trainers.ModelTrainer.fine_tune) method. See the example script below: ```python from flair.data import Corpus @@ -44,8 +44,7 @@ trainer.fine_tune('resources/taggers/question-classification-with-transformer', ) ``` -Once the model is trained you can load it to predict the class of new sentences. Just call the `predict` method of the -model. +Once the model is trained you can load it to predict the class of new sentences. Just call the [`predict`](#flair.nn.DefaultClassifier.predict) method of the model. 
```python classifier = TextClassifier.load('resources/taggers/question-classification-with-transformer/final-model.pt') diff --git a/docs/tutorial/tutorial-training/index.rst b/docs/tutorial/tutorial-training/index.rst index ce4682eb7..70209a3f7 100644 --- a/docs/tutorial/tutorial-training/index.rst +++ b/docs/tutorial/tutorial-training/index.rst @@ -5,6 +5,7 @@ This tutorial illustrates how you can train your own state-of-the-art NLP models .. toctree:: :glob: + :maxdepth: 1 how-model-training-works train-vs-fine-tune diff --git a/resources/docs/embeddings/TRANSFORMER_EMBEDDINGS.md b/resources/docs/embeddings/TRANSFORMER_EMBEDDINGS.md index 75c312ee6..7c7cf4644 100644 --- a/resources/docs/embeddings/TRANSFORMER_EMBEDDINGS.md +++ b/resources/docs/embeddings/TRANSFORMER_EMBEDDINGS.md @@ -1,7 +1,7 @@ # Transformer Embeddings Flair supports various Transformer-based architectures like BERT or XLNet from [HuggingFace](https://github.com/huggingface), -with two classes `TransformerWordEmbeddings` (to embed words) and `TransformerDocumentEmbeddings` (to embed documents). +with two classes [`TransformerWordEmbeddings`](#flair.embeddings.token.TransformerWordEmbeddings) (to embed words or tokens) and [`TransformerDocumentEmbeddings`](#flair.embeddings.document.TransformerDocumentEmbeddings) (to embed documents). ## Embeddings Words with Transformers @@ -35,12 +35,12 @@ sentence = Sentence('The grass is green .') embedding.embed(sentence) ``` -[Here](https://huggingface.co/transformers/pretrained_models.html) is a full list of all models (BERT, RoBERTa, XLM, XLNet etc.). You can use any of these models with this class. +[Here](https://https://huggingface.co/models) you can search for models to use. You can use any NLP model. 
-## Embeddings Documents with Transformers +## Embedding Documents with Transformers -To embed a whole sentence as one (instead of each word in the sentence), simply use the TransformerDocumentEmbeddings +To embed a whole sentence as one (instead of each word in the sentence), simply use the [`TransformerDocumentEmbeddings`](#flair.embeddings.document.TransformerDocumentEmbeddings) instead: ```python @@ -58,18 +58,18 @@ embedding.embed(sentence) ## Arguments -There are several options that you can set when you init the TransformerWordEmbeddings -and TransformerDocumentEmbeddings classes: +There are several options that you can set when you init the [`TransformerWordEmbeddings`](#flair.embeddings.token.TransformerWordEmbeddings) +and [`TransformerDocumentEmbeddings`](#flair.embeddings.document.TransformerDocumentEmbeddings) classes: -| Argument | Default | Description -| -------------------- | ------------------- | ------------------------------------------------------------------------------ -| `model` | `bert-base-uncased` | The string identifier of the transformer model you want to use (see above) -| `layers` | `all` | Defines the layers of the Transformer-based model that produce the embedding -| `subtoken_pooling` | `first` | See [Pooling operation section](#Pooling-operation). -| `layer_mean` | `True` | See [Layer mean section](#Layer-mean). -| `fine_tune` | `False` | Whether or not embeddings are fine-tuneable. -| `allow_long_sentences` | `True` | Whether or not texts longer than maximal sequence length are supported. -| `use_context` | `False` | Set to True to include context outside of sentences. 
This can greatly increase accuracy on some tasks, but slows down embedding generation +| Argument | Default | Description +|------------------------|----------------------| ------------------------------------------------------------------------------ +| `model` | `bert-base-uncased` | The string identifier of the transformer model you want to use (see above) +| `layers` | `all` | Defines the layers of the Transformer-based model that produce the embedding +| `subtoken_pooling` | `first` | See [Pooling operation section](#Pooling-operation). +| `layer_mean` | `True` | See [Layer mean section](#Layer-mean). +| `fine_tune` | `False` | Whether or not embeddings are fine-tuneable. +| `allow_long_sentences` | `True` | Whether or not texts longer than maximal sequence length are supported. +| `use_context` | `False` | Set to True to include context outside of sentences. This can greatly increase accuracy on some tasks, but slows down embedding generation. ### Layers @@ -116,7 +116,7 @@ I.e. the size of the embedding increases the mode layers we use (but ONLY if lay ### Pooling operation -Most of the Transformer-based models (except Transformer-XL) use subword tokenization. E.g. the following +Most of the Transformer-based models use subword tokenization. E.g. the following token `puppeteer` could be tokenized into the subwords: `pupp`, `##ete` and `##er`. We implement different pooling operations for these subwords to generate the final token representation: @@ -138,7 +138,7 @@ print(sentence[0].embedding.size()) ### Layer mean The Transformer-based models have a certain number of layers. By default, all layers you select are -concatenated as explained above. Alternatively, you can set layer_mean=True to do a mean over all +concatenated as explained above. Alternatively, you can set `layer_mean=True` to do a mean over all selected layers. 
The resulting vector will then always have the same dimensionality as a single layer: ```python @@ -175,11 +175,4 @@ tensor([-0.0323, -0.3904, -1.1946, ..., 0.1305, -0.1365, -0.4323], ### Models -Please have a look at the awesome Hugging Face [documentation](https://huggingface.co/transformers/v2.3.0/pretrained_models.html) -for all supported pretrained models! - - -## Next - -You can now either go back to the [embedding overview](/resources/docs/TUTORIAL_EMBEDDINGS_OVERVIEW.md), -or check out [how to train models](/resources/docs/TUTORIAL_TRAINING_OVERVIEW.md). \ No newline at end of file +Please have a look at the awesome Hugging Face [hub](https://huggingface.co/models) for all supported pretrained models! From 85c3de4e3d22fb1623749d97ced25f26a539f1ce Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 3 Jul 2023 18:50:13 +0200 Subject: [PATCH 077/124] add favicon --- docs/_static/favicon.ico | Bin 0 -> 106536 bytes docs/conf.py | 2 ++ .../tutorial-embeddings/other-embeddings.md | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) create mode 100644 docs/_static/favicon.ico diff --git a/docs/_static/favicon.ico b/docs/_static/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..e497a63061dcf2581338da48d85b6acaed77c61d GIT binary patch literal 106536 zcmeHQ1zc2F7atl#8WBN7>}~}`!T2!NTGusJT(CPZu@LO;pv87o6tNS=URNV|qjx*iw+1ylvzHmFr`Bt1 zSYvq9Dl;eTN{g3|AFzCLmBB-u^sjVTQT@@5eF5n${hv>p-mB4rZV#?owXi;L%YE3x z&(k7jcvyL=zmI(oC)}SNuysz_lR2wb9nA3a88diCi>|9Bi`Ca=uTJpwGs= z`+Et~D*vxx=eu#bhU=g8RyDltZS2|QYNy%mDo=l#GxMspPO$UII&LE`ZSVRgFG{6l zhoHN0bBs?9imuL!t>XGui(gHzWb$M5uT@gr(;+_aY;uC3-P1Tr1h7zDHoUt|yXwP! 
z;Z?0*`HzLCLmAKGS6d!v88@->rR_&+1kS%#KkfKHt$rQ5JGQGZv!f5s^Pk1p&t5ra zGsyL8J}5AC)7MvF~4{J71-_%3?Sg=#kEGS4PVk9}Y` z=%`8EwChQC4O!~?pwv;nKID0JjqS3mhnt|E$0v`fmnPbpcs}0TeC51JO~UqEva#DQ z>Q=M4I$J-Rc>inK(!T|7*rsD~la5`o>9w%YW#R0u<@iB6O+FmhefxI#MTsxpuIW`y z@6m?~+a^A=50Cb*G;T(#6BYa4?=*gBy}3ST0%ki-UuWHXmk$5h_DmJNPkho--N7#t z-VEDtIFy&!$2@3V&9evQKkiU}!sLme&blGZn(zJJ$wyWHtk!;s=MvN@QO)pb$x+Q; zL)TxMu{1HDO8r`z^TJ<`8Wn7C?YwF|zw@DY?;bSLdVervK|q-gpC)Ko=ze(ku+qE1 zBNLA6?F~6|X?8~Ztx5H7MP5w3#XBw#K044mPGgcqlQ~0<)qXK5Y=qE1A*FMzvSU<} zUSE3F@5zx{Cj8&8>jdw;;A7af^U=c(``jC6a=q%F{RU{#lKt9>S$xVqU+ThO>+ z=(1U{?H;H4&$;^E@h_Dqk0%2L4bVw+ANleAZ~fy`%lED|VNjaBYwOF-yP{v+d*6|t zV*T*l@V7_CL^(J{oFDz^xpzX!(%0F|FZZkGm>lzR^2=JcuheYt?#{0@JrkDn*KypF zydt3Hu|pBb7AiHK&JRCyCaP(=hxQel*l1^u$~=Lvmra+IlN%YX*nRxUVxta`fiG<~ z*7WSTIdj$10mf(RoO`y%XPeV`mAP#%x8&9E{4+wbV+^ng8hcW>-{4lcMRm@Nx( z@&(tojXD}1xbw%*LA55NrP`0#o-l6L>ee1c&XJS1mI*m{J$=8!d%>vwPJ?W_cw4^} z1<#L(p0~^0{M1;hM!Va$J`tln&v59d3)xP~&bJOZolz@!uE5~z<6SMZmmJZy&9=PJ z^}@K9+fRGBhCZ0FE!xGp!K%J*yZ@mb5n8V@Txc`2H=lb_Z2^6bzjQGRZdvN|{SN*{Y#V`9~PUQrJ& zkGV8wZk=5YM?3T9wY~f-s$yWa>$0|g%(;`@XW79X8CRzax>&znbal_`>08Dv>d8NT z<@B2_k^1LMPPIIjIb}uX5zB|!pR0WQ^SIAFE{vT!?_a$RLk^#4ezN<(zs66S8}W3d z$7zdNne)QeW&OMRmq&j@*>pY8KlhNcDZn#epp(C)_5JoMMwTB`v0mkB4VRCo`015x zw})!=pKPtR>djNjUP~`ol&gQM>gSO3{gb1EwkCDH7j-q_NzAFF)5qpk7j&KBFwV76 zi!NRjo%Z~pJu$_*{FQ3fGiJYk`NlaeMQ>WSmX7?U=r?X%Ryt1pr}6Qnquwu4zq74& zm$NDxt9tit)2$27MA+cUt>d91Yeht*Rjt?V$+NvKGYlJc**LR(>&*1V)z5YsmC}8* zaf-JbjFn6M)Mqo_f3jAxP0TA8?B#cW?6?;+_~|1 zf~dm%aVFE#=0wIEX?W4xuWAD`K{wUMQ#;NXR`sKR#_B!lm^jL*jJO@t=vH>O*w+ue zFWg=~vHqjmC%TN%$%R{l%Ko zaf?z6&R(?b(f!x0@yqWdUhOhtm0-?I(eV>b)6>Ie59nn)U?_h}g?o~n&U-9x;qD-9 z+fj>MV$7P{dRTeu@}p1Qo7B3UC{*7wq>id)KhKj6^{#g4egCEX;<&7R$212AzE4SU z@i*qT3#k*k#?XIvQm-XZ*}SYjmi3(<8QuA{bG7)Rzn1G{i(bBue-|;brB=l<7yg-e zt2uAtsiYVux3%u3-CEYoY#Tc*xRGFJuI9bRqFvy}Miu8bo_FX$%WFx)&v|Tnm9TTs zy94~~kNz1HKgemxzwawM3si;AXtYldrAz`4nh(*i`??)>N$n?su!~R=*Rr z;e^Yjk(E3~*&d3YW8345e$2sVwLj>G>>WKr`-9HHo&41YoAk==(B$Sk{TH7{Wj75E 
zSnd?CvzK7=-?i;lcG*QOg+z>fns@e&Js#dW_Mu_u{A2eH-K%at>VwUqs6#gAg4)_P z>yh^9D6eAKkRcP)f3aOPFk{}Sr_GNvGajXGuQL8~+U)i^+a{e(H%*G{Ao3ov@obxa zH@}`+>ZC;P!(bJs68yfVJuq+!PM~Q<980W(sOyid)NKUsc{Lr z8z)s=eg1sP((-HC*RFCVru~`D7e0(Db7|bBB>!>~6DvFp8L%$1?lg~TeQS6=?Pkmi zUs~P%*vSp^ZY2+~Qp>vK7<$z?WW%TFzQ3+fz1A`Q&+=It!v;02^vK_BLzd&4$+Oq- zay5}H)AhCqVxOpVu<`GIO5@a#%r164-%dGjp)Y^t;~}fl>Q7EPFnQVVpa~&y!59B6 zCu$usGU|Y}nVaFrm)jS|%@-V3d+M?ISZw*TyxkoqKIxhk?Gai&a>I1hZK=H5i=x8$ zb{*myMl?L*wxQqo&QaM~vreAq61k^doW0$kDcM#$`vFsekJw$CKX6&fE0>qKYAY@> z`TdA}_O>>^)V{O2+?7gZqk?_H%5V7B_fVBT`*>`f)*uuT#D}z2+wIpTpFgqe(}GL7 z6Rf%(jVeEmcg@=8mnXM{j}rSWvs`9+CH;Ku{Rczd&AGMpaiVU8haGxGZ;9^QwO+KF zTB3fFVYVha*Jmw$(c1r%)13*I_gy~{IkMYOlSQispK1TcAhEE`h_n1i-ox^v-e||Y zt01tB?f+=)=bii(&l8%x+Ni!Z_IlJky%(*g`9?fedp&RIgSZg8q|KRGXVe0Dqs`KX z+Qb~^2Uxdsnlh;8)#V3oMD!Z@?n0xTgXX0#@1m0tZrsO?EJYS_JV`jF)uq1VwWIx*PpSDo04DOX!QUAW6GqMv5cv+9%nOjK3R>}Pu2 zm)03HHa|T1^401qi`DpZ1`g}_GL!G1w{*?IE1F(I{KLKcG?RudTfOeXjy9%2t0vaq zoSTRJZ1R|9fQQLZ?U*&v`pn{I>{-o2@r^*yIb#4g}iv*|l-*9=o=C%Px*m z?}H^!P=_;JP1gphZEiiR+T$yBPQykz zTP``h?b%NK{(36xyhcn(*`fLCy83aiPPfRoyueHC!G@?#UF#p(Fl^WIbE>NrN14Bi zxL56SuUb#XJx?jqwbr%@hs}msw*4z0Vd>T<{{x|C*Q$k8zId;t`eOfXhPuv=H9d@n zI(^7!e0j!`Wp;n}E4$J_P=DqVoz&W{eU^o{SetNu!@jUMU8@5pGRyJH2RB(E*q>d~ za%#t9k=5Uj4ht^A|3-m+7`*-phc%kl@V?-hDQSxcKzm^A!E0 z0K>&D(*>&+f))!M2Ta*1EgdPcc$AMfP3$>RpC=^K&!L3lB4<#f@5 zZqC=H>J4=q<7b_qzRF?nr;*!DTu0AL>(M;j`1$sLsLGiKMV-!O-?T{o+hMU$~ z&K4+wpJWjRTv@yLPW^>8_VxNaJeAycL(dkWJqF%+pH_2S`O9&C?rBwdQrlNLhnqb= zWxz{LpV{`E>4&C$BSI2})d;-aQ|rwc@0s&=U>2K;A0h;ycF-I;%$2 zoOdZmRDa9w9|FBIL=pdHxF+hP|Nh7J@&8VbO?kM?;h%e#hdz(Es53g-zIE1tY3q1p zoolp-*IUuc&e5s!&Vbn?#(AB5y-R1XVfKi#oeYfmRp$L#^+;z)%1_Nozwct@8{Rfd zrT*fmX8dTwxY<*$ji@_re`@9M1w#%T=wesyMw?Ep7rQN=*6G#bf#@^s2erQ8wx{u0 zjj;AXO&Z)?_`UVgKY_rvtX9dU+no zwa|#lisU^xRA&8o|9+1eyjiYc@up5hoglA4Q5J)Hsoq{ZAo`SjX8ep+3CDknuB>us zqOQf#*_rL*^*kyC#W^@U_12$jd#>{31>qLq?|dgs&c1tj+>Ob?N9_-Y5+A*~{{Ehq z$Chrx!pC}K%o><9zkXV+TMH%)${N#fb*|;XnciyNy1`ma-nB89cc#J6pk~$Wek)sM 
z(!_~Vx@lWI4nLi?Cb{Cf-#@xXu2wfZIkhr>#p=KsJ~p>UrkXlMcP{7K;O~P=UIg<- z&NJK4!$t74N>)p=^=)@>@&tNlW6$_Dw@w?zr#5xDbkj=XVxzwXZXWYwUdxKJw?#&t z(Y36e-dQ9(m^Aieqg|Fd+sZqwOKdP|<+1So8vM*r&#(M->G*T+kZ!gvVNur8Ru8;u zYt!=RnPWFTCHUqpL=*e9mhH?JwyY85J$ab*|FkCed&&PRz|T@+UXw+^7nj*}Gl*@} z#A8F_v-d5pXkI#9bMU)E4b-lzE;A;=;#9hMV-434%l^FJI=)6&hqq~;mb7RXbS(jqfE>Hic+xl_e6O+8t5AyFC&N~$u*`oU`j}QC{4J-)l#`!sh3vB!ifuxrS#99{m3afxb9A9SPp829XR%`O{W%@ zXMY^@cIg0#c`R$~zEy#X{;qv%`uWi0Zs(ToT-3z(Lg%q7bwlbUBrosNEiNF?rdR0S z+S}LfKhYv>uCo;vAeOgdQq);V*1_hEwjsqM$ldt{PTd*|f^F{c-MbQ^TzcE-;0p%2|p zdmN3?ocF|Ryz8wX2a6Y<8?mY@W}*YJFEY) zn-NJKPvR_N)Rt~^x>8wnT>40x+C7^`^ssUnX8UKynTIXys)ac|xpOb0;gUDy+aE|& zn=mfjFV(8&6DRl972o*Br|fQ-OD_{-`SS4Uq}iK}H6H!vnJt-H8vZe2^-ISq{U6(} zTWa=VRr^5OQ`78c#VvP^>3J_L%x-|Ssc^*-t19(3e`vmD{Y1?lYXIXwvQQqazwEz0|2|U9^Zco&s5OQUp#UbtOhlx<~E?R_GK_ z>^#Qd=GCscUK_gx-L+j1JwK_&#ACM4Ce5>&n!0<+mSYzVlnGiqz{Smd#(CSLp|%bx zPpmHXH_zDU>|x&a{tplecjdK+KsW$UOi^1U& zFGMvN(Ad9Er$#kfJ~AH@a`^R!d)6l$>v!7ReCw$uDrR9 z{vq`$lSW;MU1}{5i5`94X?x z5x)C$54Sm!()}v5*Qru#vM^x(%b2nudu!Q;R;-uh-S_D1*lM}uy9s)+RsE+8e0`6< zw_|+ESUYQhJ=Xb_mH4--jB*+N8H;Qfhd{epM>zaE`wHEyL{IXu_TfLr~%;^5{L!a}Z8Cg56S*e^f zCL#0ps|;EY$Q$*j>PQzi(IBV4GcWNIo}WK?H@^D#zS}aj1)DdT3R_;t?l_J=c=g4h zHZFQITb*+oy*Kg0@$KDt(9k{9acU9lE?Chb_V`2Zg!n<-(hf)ddi+%R7lOx@i>6nt z)UQ_JzIisGlg(SceOoS9f%Qh9zz;{SspsTwBwA>A*1}saBv5nBqNvWJ4|R@uUwfAO zp1Yb|Rc?j#{CN7YmEqp!{Y3X%8}5DfEY|U;QJ0sQZ8wD*#C>i%%5qb8%kxPwel5+) zpYVJqxRlW;;?RZFpHIbm*xs;z**8KprW=3cxWON{zMq;reD{^M(?2ED$wkPBK5D1b z-+1meO#d@tc(IH^_18vtH%lekB*>U{+A$2dm-*s=# z$*z%Mc9X)#97)dV|0vk2{Du7qOH&PW?BWLAo%zp-t7FFn^=|ZV^2%YUJtAgz{gAS8 z>#wTXHG4JK+`U!L87u1ab@KPK?!h}yBV=Z1nQ*PPc{{G_!v{?yD+eUrrxv;#Ib zH|gA+*KJs{15LJ%ck#ZO>D6a)aIIYZO_0s|+}^gge%ythdK;iP#eEJ z=>ayZXh%!@r@{+2<1t_#d>4*^kH`S1#$q_!k%m4(Pa_|p58#X6d=~fMJ_lYH`iMS= zAr!%%1!MvO771|Q11}Ng7K@{Bg8+Y|IfsDvU4ixnzQVe^{n?r19EG(*q3A->L!~S-8TT4lD+ISe%4=3b+j10qz3m?+oJLCIcSq)h184_-H3Ba?TZM)6!j(QPum-^U3P5Gl zC)KMv3*zr1fW~U#;nzYM@}_o`48HFNb^~tkw!eLAN7rq 
zk4PVAPe321@jVbY3_NB*G8wJWNKR9*|9$oFw4Nl;d=(0uHuM$F5(DM+POAh|_)&Q) zPr1PN^1v7Z>WAbynonsx{=M;99(vQk`^yY`L}S7GMgXZ~evZHWkbU-JO#~Qv3ylDC z@VOCq51oz36Tc&Y&j9s#dFu0f3q$-5XF%okPK8V>>IAvKn1P>!eE1`_2r=&&eHnN^ z5eNj1{h+)qsBMx=n+m)J?`JUZ6V%F)4@>!9Q2l-H?|&Zti~GH*fuG0(Yxx-jwEHmN zIY4^0@12*0p`YYgl51(Md%$8d<^wmtrQ(`wO%|%({ZOz!TNQJ@8u)Js-v32Fd9lax zQ}R8Zwj#O)-qT*wa`0Y<#dptt13#e|P#tnlYw-RY0la@#%KtCgKh}vNjEkb}#@?dl z47^eej5lPf0pD#qF!UAt0c-$Br&qrpBNNYtOQ5*1HAE48w-{oH^{}pc6 z-#!10d`0SjEdgzMJMb@1I_Aqa{?T3sv#*G8;2^LCn7|8!t%(?P-29vV1K%}(t^}0j zBtY`+_o&1Dy7QH<@HTiK2CM|{wOM?-{11eFFECqG&o5OM{GQ3;pHluSJO^;??iqN0 z57=wqlWIrMN8Nt2d&q&=>eNPc{nALcOKUs=>2$gBl)8nhi?@c}qC*V)L{+|7Jz**T zmDRq#3;x62p#?Aun1DV{I!*%Vxs+8eveG44gR}oTf#1-r3D-d$8Nt9u)DLT`9t^xi z-O;bQ<2P*hL_-m7BCwuCC|sJq$li?Pvae-_LiPAb48d&F?_j|GTN46_CSWSy4q9dL zrX-#d|DUqh2X_H5my=12d{S+I1`NDX>l*>!y#s*#IMMG2OY{?t0DORK$hr1RX_rJ9 zWaJ@Of%=^cbp2-d&)T)2Ua0N@fL$eAN2Et5JOPMb)Nhxw7!8+f^16fn4H)>ORt4`X zGVn^Z1ph6;_gX+bpgF=~Y+yj+80}Y(o$qcIH{eD9G`@U|G2ytF4zvN^&&Djiu6~fU zbp~A|^TzA^^&olKnJKufUUwTfZEnE*n0{I z_+64?0@U0)^(W%vJh$ZVLGMfZ%o1Px8R zQfk9BN8i^Xpzo8tkr+kn9O(BdfGPofUk89qw}|G9W(2IyF9P8x_bWiz<>t~So$4uq zL4dH_*IW~V_iYKFh2}_(K4rTY??tx>rdv|1%5DB^4wNk-tL2UkzUu?b`9tvjJa8Rl zCz(Lm<>u0-`R)<{{5SiW{72cH8DLHLKCoOY>G))85HJRuBA^bQDWe|pq!|T2>8NJq z$N{92DWSho{<9VRv-00Zi2E`aD9fIgbS&@0fL#gXZossJnz#Jf_+bBA+y+!ZD|GY> z4#6dPmE<*Ljg1LNn{;4X4E%-FS(KPegnVOJkll(h%9sQh;~|Uk@-yV?g)F=EQ#k_a zV=?d=P*&fNtnWjxwe7&7#B{HFUAqRwj z_rI`EE2&yy`A)niy|iR|EUw>CtPL^Q5#xI--_M_cx3C2;44{4G;;Ii>=|rQCz*dMs ziRnU0`7gUY{v`Y-`HVn&d@X^u9eV=R*;D{!QN>dgA}& zBxS7yX#Pzil-ZW1&*~+WE(ah#YR?2^*+)fy_c$}p;JfPo-~ruvpr-F9GD7{(I7avY z6s14Nic5Wv)|wy72Beyo2~0BtwDW15#qtXIVK0_}0In?Bit_vs#?LJ*zAO2Ut6%-|1v812j~A|@SdQoagB6aI3F#% z26?(X&l6)R;XC@nKCvc%J3>W&pd8q{R&M{tT)RW5QHo_|rQ3<4EM)saAU&sadW!a(<`vra zCp0wh7MTNIMGnMXR|TLF=)nDLVwi1ASI2z~s({Qf33e*#fSnkOUAZ#q>Ay6+g)2ed z5}>29`#uaTTVc}Qr0*koCjiT zk>`N-6$ts&G1Bk?NLNQ~Kw0%gdSK8c9Frd{#qkdH(uaV&f`Kd+$|@J>8ELP9k9l|o 
zi@I=YfluYiZO&FFC|oX2*h^tespp-lsqZT?BS3dz3$zDshogK43CNe)UY@dvVE-VZ z{oMfEZ-u%L_ObBER%7vnOo(vafU^64H1dc7#w%PG3Pv#AET}|4UUbfu_5cWZ>ZsWL z8PKVS&tLiCd?(3dz9L8PzAQn(A~3wCbtU+30^WBffTy^NS`-dEm9spg1A9OK@5`|G zLMFs}UqIRQL~Es2z!Zh+Lcs{7{Lg9IKP&&$(FW8Fy#=FzP=L;*D7*i2^9;FE@1%R8 zF_>gSIv2Eo1+CL20ON6H+J%7ec?2*TAe)R8EYR1Pv%`PGrF%;VG}hBxBkw#-`(HG# z1EM)BT2XhxSfB!E6Yzd3pzQi#_JNSkECsuBp@W0(gMp*KsS+rkwE81Hr2r^9&D9Jx zf%l{f^8oK%3E;aMK;Ls)fUrAR+<{ByO5Op|(pK=9Xf5(5bdcabGbbSZz~adxUphY{ z55nw1GgQHd;Cp9aF|eit$|sEnF#W#(^XQjp8mDmfHZ~!<2909JfAl@ec z)NiDvt>`n-@e&3Y1}1761te>mc!??z@Qie2il(P%cpA3^z#IkHSm@yBPgQ_6z%M0G zK1J)Jq{GuV1mM1wz(hmCfMgRRFM%Ba&#B!hn;seq3GRhvsDcqo`JaoYKO_HfK3+&? zpNxRR1hhXo8=&mA{w=0Yaw%aC)}{ji+%F}VOu#eR%T_YYqD0Jr1UCh{agl?A?_~k* z9IU)+XC>E%vg=!B`uo9q+OHbQed`0fC%vqa%ggW{y2`J@f4YNXBmw0-2hci-#zQ5S z`I}7tICwAvWuFD?VDU)#bs>6#dbQ zf4~CcveN4!26;RKCMn3iA_p(!e=gqsbo`fWK#&Kk1GFaWkG4qbQG&AU#J|Pyg>(ue z15tl0N%~0daTaivF1KMI2fMnFQY@U>F z@wdI=>-TiW9G&|hXmREEp4?LY7fCk!5%`Zeff)nnz6{*2FRTMJL3_3*;CD~pSNtA? zZ~2TP;5XSDt^@wUy9)sJTyn;1dHXBfF>(X&=K2fzz8OHe%XUa_1q+%B6djjKDttJ; zWwwIF7v~PRa(&mX56<@Beq}?yG*c75)H(#%_6`D^!Mg>(Zs0HQh?61Z;y>cN03w0y zdFu!Fs#j$1S08|Mv8Vpc@}Cdh!`7XF;%A`p7W`ZE<6IT^uMYkjg7;Mj;63b)g``&= z4tM|yz#}>bKp?vVt}V#yhloS`r#oh)@!t&m{~rNqVSUJ)H6WW7ndz1AJCcP6C&2pw zER=rl*SFX_h$gfJRluHs*xsA=oJlUuk6&b`NW9&eU%4dEs1g5U>MqM&DOwkzX->ns?xPM_@XzC|`bgi>`q8^o{MD^}n=A(%T}k z25O-V%*{8w5^NjbyBDwm=p~KjpZOX1-Wd2Da@|Pqi}W`Hk_Yqlrz7A!&I*e1wU-3% z%LA5(KM5ecBtfn<2EygdvnYNl15+OGeFiWRX#F$uT3Xr5@5t5!bdCK|tFyc!eo?*U zt@~x{8jm@zDJ?C@XV4S&25yT{+`7mwXv<40`RR-?bP=Kt`hJ4??Dr-VzxP10P(R@B zQvUydOjV#r;J*q$cQ~1$&yZe_Kso{HLu7OEh=4oHx|iK5!Z$nQyH^l#a4!sFU$+PD z*RW%|i}Bu_fITZi^nW!#IrktbP~YEbQ1HkUAbV&6r^6$i0Im7&(LF!xed%)PO%cZ) zpfv^IYuGMb2k)6X$>4rhdwgG`jetkQ|1&J;-ZtVr-H}842J688`rto(FF`K78RE1C z=u8ixAl;I*`Ma>sXzU`o4}kCOS$vm!eI?lrpszCpP7_df(E!!sH9cQZO%@9EA*?5w z1KfS?q+cv7KNXGlH}X0M%x3RFrn`_!vMRsJyh{21U9zp%h(>?W2bK`9r`rSfa5MmF zpzo`(P)J{j?~t_u)-zE4_@9cg<{iPvThPVCE2Rct&h&fny@kbAPd{oxEdOa=kZjD_ 
zvv<}N`g`dt|LJ>l8<3{5=YJ7+|Aqkmx5fEdx<}gJN9C0=$^{;21Dy!qe>L#m5Gb^5 zB=HAqhcec>>+Z&Lp(&ixsi1{CFSe&NZ6kIuti#hE=@7GLEXS2BuTB99Gx zQd9tSF@$2E(D{v+l!|nlS1zmV`*yR`y487BEA31}f z?}_b)ew77y-y3iNpJ?AFKYq&dj^qYf;}K|2>M3v8;(CgC@VU`jy0pDS>m(o-W zVG#I#6`(y@dFt~ecu(Jmya?VmVo~%lf&GRr?G1sJQvlhCzm}7x{OHD*FS^OV8{dW$ zpL?@ZLbZN3NtW_o^i%R5ebx%7gFe*@2x9RJE}a2R2J-7iG(JQDXMg~}2X=h6EDEjT zdUW=U%lBt;!TY8_ zN7RWMK(a7F-gWgwgyjR!$1-PsJeji};xiyc?oZ%*J%HQmEz0>n(i0MnA-$!*bl!$+ zEfz(WQ@*nXgLD5X>>U7V;C(0HPt?tFfLm8d<3IO~_&|0Xgx$anU>PtG7!Uq8XHn?2 z6XZ!fz!2%R0k6CX_)Yes()zZnXGAY|K8MCBf*;bF0t^H1HCg#;EU z&bq2n$VJ~T<-byD_WMW_dnm>LWGVuEvx}MYX@1Pxr^vpF)>{D9{62!7Ky&naV-^b8 zZb+*g*a&L?+D3jNE9ewPFc8a=B&!wl4GWT0$OhmEa9#}h&I5zr(D!YC=IHx+Eab8= zl}6`}^bGZ54!DZJ-1(E2@0_3ZSRNA$d{b((=Ua#km@;?&KinA=>yCvWuM!xrBQ{O*L0RO9i|N20o?H};1b`?O?c(X{$ zz*}HRz&#?(fVQ|dZvgmzfPq*aOMItw8})Aj*^N@WcmYJQxBxc{2n6O3!2g;ojEl3* zex$Te$2Eb&48(UTP+d#IdG4F6a0|_MuI|8lO+W|yuLRz^5s>d?;3dGF4Iw%*S>%jA z*pt}}_yhB~J3&nRGmK3Gvh{#cJ8Z6!<#0>+FGnT)z(LT5>HuL3q_6vEKaAFd{(6BT zBNjzo>lp_m>j3(OzMm^%FXvA&gd>m-)_S6mfC~X@zb-%rpe^1tW8f>qm?f+ZR7U?d zC7}PSqwlK$MW*csX7?9GLmkq0+b=Vau2-RXmd4}_yzFch7GI>J=mUI@u?&5a(ZK%` z@xLzvUtw3)*2aO`J2LgoFlN8uT{^BSTFG*R2VZLgbe4r6O|SYC&q>#_yqNi~1^(-T z|EA!*9RpwJpM6D*fO_D+H39rD3;t^XrRz00_^D_R*kM-(xbFl?%3X{M2>s|~;3P0Z z(X>m$fBQ91%KvYlN6MrS<9c~uEDO@>=UV$gw|51gGn*y;4O3;7yEJWa4S?UK0Ay_j zky7}7wUqxqZfk)Jy&lk(0N&FcKFNTeB-+7!@c$oR0c@{SSp2vZ^0Spp%u`M9RTDfX z+l~8x0WN^|%vle(rRQHumNg}k8+dO3 zKpzLYJ+Tev0=Q(;c?&oOEMf7}orx(C27Du#=xzF_0hkbA7vK(T0?5Ae5{q)*2&tuH zD4_;Q`LBdZRWQkH`tw92E#{Z_UmianuX#L!oB(w zJZuBeUqSv@HhV5X*?w^m|9dappOYL*KpuaFgx>+=71f_4kscAm_zlEPL4FlBJh`4o zN0y&UhkMWYc@Em$5tK(u8HCSiFM0hGM)JL!{#@4q zBEG`@Sjvd~qxi9$2d*?f)g$*u@ALX43841{`uSXH;`~tcIe$$2Eb{W1XAYEJL4GEH zxLYydzwi@DOd`}inZJVkvEuO5f0(~7{W<-QeV@Ny#LGSAe&JWk^S&Uz6%*b;OdWWV z$B*Zps6g&DT&Dba{4^|ZfApU7C!yTZ{8r+?cqJaM^7`4da^ACk>O22+d!_*~el_+z zpWfr&e}1O_<;*X9A`I_6a>8pdjqpFe1NoV@LqYgpGfE@jrv{M2OT3rE&(#H!9>Pod 
zC4MvcDe9Nx3!6Tr!~GF`l7D&qXgs<8^7^y#ywByYjhbl(cmgWCD)^7Tc067A)2Z8GVbOChNJ#$W;bV6j0iSzOztPMqnDJ(D)&svZ!8Bi99 z#hxzcE(f~10-nHO0O!FOTX(wSHy)t<5ISE#cLZ=}ipUftj$Hgc~3vX7^=@>#5* z=fmx6=qIcIJ{S3HL~$1y-=x*R{RXi87xo9%0<^w=2oPU$?Ex$3r~09Kp}j;(6KzP~ z0{fo&Og&Ja{>p!_C8T?ZZD40k``%%I5FomXQf^ssah6W-H_~2;w$RqlH`N?=O`0WM z1+1PI=P}Em{M|v%E`ZKV$fCQrpJ&3x5aWnwprLna6|@IUw1<+b266c{QGU{4MVDZ? z^VTEjQxBp1Jxgp{z#fmje=5$=GX0#+>QevAOIJzxsh`n!iZNES5N80i0i}+Q&{tOk zh9l2s0H@C`sq)Iohvtp@=o52|ywb`WZ;psew|=ym5Syo2)psppHkvx;*}3}p87p}%Ehn4;6E z%=Y6gqHq6=D$X$x^X_jb`{^R`p3cJ4{Jk7~sXucT!5e20yhVO?>eR$qWmO3 z)&yDszXK%mo`dXL*!d+6@o^UIvo%npa~wti!b(UB^~uO~ij<$`2jb;Q*v58-TLEjC z0_P{l(j-^GmL7LV!Bz(G=*$GI85GHz#wf%SOf%Rf)GEqYfN`}4(js|Yk^E^*^BQH? zWx6G$9PYPP*ybK>LIbcw{$6bUiprQIi{CZ$7g`x^g^XCpKk#4?(xLrpMaoZWpA&jM zA5DQm=FCRJ8W`=z9qHXykOqccoM{>YR46PFsQ)v_m&Ptd${&n2Y**Mkzleu#grg2z zfEx3`ql|cusGvfh8c#(qgk2UZU zT6|Hjay&p@G=?c6>;8lC50xWng@WLFXc!L!O+m{}1ZT z^42-@K7t;F$*Z6^xKqglAQ_z6W2OQ$-9a4;lV|?PO9S$tIam>ycQ4k)o${trEPl+N zWdl;nfSyNyB6A|m9SgK}WGHNJhIrK9NT!mD|EOoWQ?gI7lvhe3NE`h?xES<6XDS~} zY`EQ68#I%WWWJ9OSJCm0>OPggu9L*YLj!<~EX4dCO1X=$;*SBR^qz;EOb6K0yD*yjcx#X{`_p95V724KVf=RW|iW zVPr_ik}7}!-jRM>k#XrM%D+-(-ZEZ*9v9?8g}G6R8fSnFt(+bp9$;G0in zx1_4VURKw@H?<1#n}WDk|I1Gl1%2BPV;@09oL$Z7Q+R(JP-G6EHPJR1#gzG&zOBUO zrAS>E=MT*Bnv7+6UpQU zxcKtAD8DN1ds>6I&j3Z*0EH#`XzqCnKyOChBp1$rd@u?;b;Yx#zyW~zGOd%T-AF4F z@tNWeK^^b_9r$kpiqJ#z_z8K-Eg25w{{?Z6E0~`oZA5<(K>>&RaW|u7*EDy9k2>CP@j?&PTF(yL($>9<&xKr zIY$rQ>*`;Ke5KJ+*k_QfL>oW@jTiFt8OeMS5ofQw<&_o={ebT1>xy&)fV6Q+GCyVf zrhFA`_Z%G*ZgtR|hbk3tcg{(O96!{L9f%6qoW-lhgo>N56plDt1 zft^ANMa!9Ac-;9~24kWF<^_MyPU~chv5dYzk$&+SWBN((hU^Gvj#tL~LiM@^wkk@w zzYq7kYcSt?@Da{MJ|riRyehBG05Xxthw2&i_#5I~28z2KL?g{H`SG$S?@)H!{Z0CM zAE%%_YV+I}cL{C& z3w=4p#5w@2aV6_YS{Y?ML;9i*`u-`_*gN>@yr()x`FR)Wgr+H*G z!pJ+1O4b9(A|dEkRe>+k{(`6ye1Nj+0NX&{TEG$KWTk!=3}v?jRu-cCG-iYXgBTq= zeN+A$`U71Wz;GikL3QZXWS;+qon2+X2VoWAe+KI8KHx?=;%}(f3X=kP7lU(^3`&$D6^RHPHI`@umiqbbt+oAp1 z<2#mCfI|C@VmU!LN>1LBJoy4^yI~mXEl~avpX+cpI(_4j#;f7%m__n_FbjXkGPsvp 
z)E@Tg1+Txb2ZS^_0`jZ@5|PJ6U^?ukEC98V<~_=$4OB(Hn*^F~ftI{&HXiT!fGcd> zY7|5t+78Wovw{5dn!gA$dcRUh*Hq zJWt;<%Gt9kh!&LJ8S!oaTs=yrNx$Di_kW1}?vO%bN0TFb=?37GJ?!#CrnJdR4OAq+g^y?uq<5qWqR9zZM{Oz2-|m5nh7kL7*!% z554=)Hh;l-GhdrWoK?^;^b#~cTxvfg=a5dH>oc@RMtVhW^l$20U4YscV~fwGtO(`i zqN4n+m~$dP8^^B{*h`Gp_bYe~HuDme!M7S{ocBQdy#UP@+kv&f45T{%>%x{M{+VUI z)w|e1<22ApyyD6qgJ+%PvA1RJtoBW5jPiHHoQ-b^rPVg`$)r1~OMDLF_^GH1ecdSF zI#+&_5Tz6`yd*>I~T^zu!LDxJlR2S zf$@%Xl^X$)`v{{A`~*roSzSD&7gs7+@95&2?olq z@*e}n7gzn3R62A~i}=a}Pv%+|<{lHe=m0k@FgG>ui4yoZ*OUAX;(I4}R_tjr1M$5R zxIKaHlb{5d4v%rlP+aaai#_c`DWhzz>d!IS5Z%jJ_F#IbHH z;4D|S8MqN1`rseBSHpRM5wta~Ym%7DLE_U?*Us>f_T{BX0Vo*d{t8Mt^X;yHK{ zgG4ANMH8toRgf*xN}^!dB7-C*H@H^tW{a%w4}T&n{KKD!y&r>fGgyC?*rj_S8UurY zoO>dw7yX_H%!fSm2X&wPEfPKw0<*dJ;|AC&MA!yzLK%c2leM{0S z>vs&=*Bt2B>geq&=5GxX?Y=@A(0l?=L~kvNR$0%fT^+=kkbt1(t#8@F4U~HS;7T z(~J5coozvO;iS)nZa|c;{##}y&+Jk^5f7S1>Ft+q-y3O82B_`jm*z+C zX{k*5JKuPwp4k?b<$7=J`)cOtp!p9+L~QpzLUBLIyxiqa-$ zc+nxG9R}pbCoU~&&z-)g?0g=855!wVWL_>Wd0nb^d1$Bf?gOp!Ay?`voNZ}>G3FZL zmc$x3zjB|#*ic_8kp(=&*_1k;t zL$6Op=+jsiGVL`6F6n}ZMp^mO9dO4`XUh>Lx`42<(%_!coG}_YKuxIx%J~p>IjRO; zBHU3a{1fRd1$F{hrwZ=?G=6gVk&AVrC>UiX``AC=znka1WY~g!-$vZ_a*`=uFq~;H z^h?vl+)L*f$R2^}nf7})p3r&kOOzPaPi0Yt5qMV^eFm+Uy)idh$VXV=LLq%)-7;`#-)-u@3<%-3) zkk?w_{S`UrPec48#$JNT#j(cAMMMD}(54Mray7`|UxdV5&GeEq_jm|2>|h%Xuk~%M*A_5mIzn`^yrWPl<2fu?9do zA)-G8?R8UrdkVM{MicZ`#yDt!G0E_2+fagLHDCrn?T6(2vrs{y(n z12q5aly*k1r0J47Kg>U6{hL&VgL%HLB0|7}E)Atr=;w%?rFy?NAx!(X; zVxuig8?e2C*%4e%9;}t%f;a>GBReZC)>VTGdn{OZ@pz16DdEC43f59Q9%edEo&&iK zFnb~wmQLi7S(w<(qEB(bFLr4lAU`krlGyc75xW(6p3HNk3zx}H!X-NBInhUMHs^Y> ze$I`BANeA8PsYvW^Tlo~%(3u1J2qDACdCr{u$yDuWIiRDEpp&fxwB&pJhjN>8+cN^ zWP>=dJD06D`iu{r&lll4NQ~>ixC_Zxi~M=q3t~SBOI3h&K#rZ3q2hL0G{&M|GzG>2 zK>*3g$pFn4&_9c!;L;d88U3iKL4dGGx)6}d{E>WuIi)eu-3z<`C_iqlm*&zOfN+9+ zkU`tP_DG9MOJNuGx=4#;C&+L4<|&O1dPeoL4EDVWpSMCBEyQ&LQUF=$%k!M->niH8 z7VKsUTt_f<6v)^(kj_RCT|j=ckPLhY^KA{L5f{J+v>!8s{~O3JO_CK!Hx>v`+aUc{ zerq2}>nz4~nQ{$B7UUQwgq2nAv>xyUs7)HcmIHhemBDXw^bwMy_W`t4kw!boc`FRI 
z38b%sSOO4chm80nA8a<=@>yn^h7E%@!psFoj*vz}GW2cvuAva`sf@VS(2fc|lS#Ti z@f{+v_9g-_KFBNs;>xP~l^l->av|RtBOS?gfn;4gMjMl9TY-3_Uy=;Z{hn7)zB%ui zH7)Z!b7^U1Vr&|a6E@EeOv|HkRO{_1teM>9&rL9S`u@>CK#mnm= z4vphnnI+wi$b-&sX<pZ5iK;HK?f+J^Px8AbN29)-trv4wKv2SjimisJd-tc-^-J2ju&ZGK)h>!WZ5Nu z(|AVv3nVL2{nD5q`JR5?qct4+KJPp7Kg!8a$`9z+`XP+8bfu*w?HSo_c>}WQ@wSv~ za~`1%G!d8hDJ#CTJSCqIpJer83Kx`%cSBfO8%w60_czs(qID&gd<@-+ zDtKXw^tS;o0pfk${IBBKOn~YlLJs+SHjiv7i7>WEAIgT^+F zmiwTQ){rg$wM%JvNB!@vkC2?O8T!8*9Tj-=JD!mY z%H=2NzQyzIhF;8SL!K=14TH2Spv%^k41_U<#zJZN(_JSsX-t+3ugGucqzxfI(ft%< zq{Y+oW9A=evJ1&WQz8G!ob#on$2~*a*9Gm9kOs+wk-%Z-W53v|M;IHt%R5%U)}LN1~?;|##@I~Bi!K~oFpO$*)D!OxmV=K-6p zzoBo6KH6PQ*02{Vi+7H|KR^<2AMiyyd(fy2-DghN!u-Ho-wNr`JV$4>o8?UddfTcf z<7m|J5QOP~@YPHMvWiXrhjFbA(v1dUfXaFESGHfzKb7Xe#Xus;)(-hk%^A5ImGgKRxAu+4ftBIRzh zFuIByJq1Rp{JuL3y?`G0;tE^_k^x#Dp}#YG#CBNw%UoxHmsJoh0^s^Nrnzbr03M4-UN!@k10+8YP4s>YswU?T zej1NBdhdhAT9fRm|+20M-b{HAp- zM?cX?{e$F^+Q4ey7HFyjSF$dKbMaZ(@(SK_^pQ()CXJcgGnz-Z{OQ?PENMIX4`h?fO6o9LHES8ss#e*k}D%q1SuJjKN& zcLU^|yxHgSqdZh!jQuX$Ac^>qNK@(=56p`+H~j{T$9xDaB~+C(NB?IpdDNN|6_U>i r`W=f>@-dTpqQ!b*R$VwBfJ?s#d>|IcARx?hgrzvI9eBJqShD^fw0BJL literal 0 HcmV?d00001 diff --git a/docs/conf.py b/docs/conf.py index 1e0008686..ffcf7c6a8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -107,3 +107,5 @@ def linkcode_resolve(*args): # Determines whether remote or local git branches/tags are preferred if their output dirs conflict smv_prefer_remote_refs = False + +html_favicon = "_static/favicon.ico" diff --git a/docs/tutorial/tutorial-embeddings/other-embeddings.md b/docs/tutorial/tutorial-embeddings/other-embeddings.md index d93802e12..814cf0f8d 100644 --- a/docs/tutorial/tutorial-embeddings/other-embeddings.md +++ b/docs/tutorial/tutorial-embeddings/other-embeddings.md @@ -3,7 +3,7 @@ Flair supports many other embedding types. This section introduces these embeddings. 
```{note} -We mostly train our models with either [`TransformerEmbeddings`](#flair.embeddings.transformers.TransformerEmbeddings) or [`FlairEmbeddings`](#flair.embeddings.token.FlairEmbeddings). The embeddings presented here might be useful +We mostly train our models with either [`TransformerEmbeddings`](#flair.embeddings.transformer.TransformerEmbeddings) or [`FlairEmbeddings`](#flair.embeddings.token.FlairEmbeddings). The embeddings presented here might be useful for specific use cases or for comparison purposes. ``` From 8c570ce73b51ee9eb093e058aa435b9fa20ff1d1 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 3 Jul 2023 19:04:06 +0200 Subject: [PATCH 078/124] fix sphinx warnings in docstrings --- flair/data.py | 72 ++-- flair/datasets/base.py | 10 +- flair/datasets/biomedical.py | 15 +- flair/datasets/document_classification.py | 43 ++- flair/datasets/entity_linking.py | 83 +++-- flair/datasets/sequence_labeling.py | 312 ++++++++---------- flair/datasets/text_text.py | 24 +- flair/embeddings/document.py | 77 +++-- flair/embeddings/legacy.py | 32 +- flair/embeddings/token.py | 60 ++-- flair/embeddings/transformer.py | 5 +- flair/models/entity_linker_model.py | 14 +- flair/models/lemmatizer_model.py | 34 +- flair/models/multitask_model.py | 39 ++- flair/models/pairwise_classification_model.py | 13 +- flair/models/relation_classifier_model.py | 23 +- flair/models/sequence_tagger_model.py | 40 ++- flair/models/tars_model.py | 43 ++- flair/models/text_classification_model.py | 17 +- flair/nn/model.py | 17 +- flair/splitter.py | 5 +- flair/trainers/plugins/base.py | 7 +- 22 files changed, 487 insertions(+), 498 deletions(-) diff --git a/flair/data.py b/flair/data.py index 9ef97048b..24d8cef05 100644 --- a/flair/data.py +++ b/flair/data.py @@ -69,8 +69,10 @@ def remove_item(self, item: str): def add_item(self, item: str) -> int: """Add string - if already in dictionary returns its ID. if not in dictionary, it will get a new ID. 
- :param item: a string for which to assign an id. - :return: ID of string + Args: + item: a string for which to assign an id. + + Returns: ID of string """ bytes_item = item.encode("utf-8") if bytes_item not in self.item2idx: @@ -81,8 +83,10 @@ def add_item(self, item: str) -> int: def get_idx_for_item(self, item: str) -> int: """Returns the ID of the string, otherwise 0. - :param item: string for which ID is requested - :return: ID of string, otherwise 0 + Args: + item: string for which ID is requested + + Returns: ID of string, otherwise 0 """ item_encoded = item.encode("utf-8") if item_encoded in self.item2idx: @@ -99,8 +103,10 @@ def get_idx_for_item(self, item: str) -> int: def get_idx_for_items(self, items: List[str]) -> List[int]: """Returns the IDs for each item of the list of string, otherwise 0 if not found. - :param items: List of string for which IDs are requested - :return: List of ID of strings + Args: + items: List of string for which IDs are requested + + Returns: List of ID of strings """ if not hasattr(self, "item2idx_not_encoded"): d = {key.decode("UTF-8"): value for key, value in self.item2idx.items()} @@ -706,15 +712,17 @@ def __init__( ) -> None: """Class to hold all metadata related to a text. - Metadata can be tokens, predictions, language code, ... - :param text: original string (sentence), or a list of string tokens (words) - :param use_tokenizer: a custom tokenizer (default is :class:`SpaceTokenizer`) - more advanced options are :class:`SegTokTokenizer` to use segtok or :class:`SpacyTokenizer` - to use Spacy library if available). Check the implementations of abstract class Tokenizer or - implement your own subclass (if you need it). If instead of providing a Tokenizer, this parameter - is just set to True (deprecated), :class:`SegtokTokenizer` will be used. 
- :param language_code: Language of the sentence - :param start_position: Start char offset of the sentence in the superordinate document + Metadata can be tokens, labels, predictions, language code, etc. + + Args: + text: original string (sentence), or a pre tokenized list of tokens. + use_tokenizer: Specify a custom tokenizer to split the text into tokens. The Default is + :class:`flair.tokenization.SegTokTokenizer`. If `use_tokenizer` is set to False, + :class:`flair.tokenization.SpaceTokenizer` will be used instead. The tokenizer will be ignored, + if `text` refers to pretokenized tokens. + language_code: Language of the sentence. If not provided, [langdetect](https://pypi.org/project/langdetect/) + will be called when the language_code is accessed for the first time. + start_position: Start char offset of the sentence in the superordinate document. """ super().__init__() @@ -1354,11 +1362,14 @@ def make_vocab_dictionary(self, max_tokens=-1, min_freq=1) -> Dictionary: By defining `max_tokens` you can set the maximum number of tokens that should be contained in the dictionary. If there are more than `max_tokens` tokens in the corpus, the most frequent tokens are added first. - If `min_freq` is set the a value greater than 1 only tokens occurring more than `min_freq` times are considered + If `min_freq` is set to a value greater than 1 only tokens occurring more than `min_freq` times are considered to be added to the dictionary. 
- :param max_tokens: the maximum number of tokens that should be added to the dictionary (-1 = take all tokens) - :param min_freq: a token needs to occur at least `min_freq` times to be added to the dictionary (-1 = there is no limitation) - :return: dictionary of tokens + + Args: + max_tokens: the maximum number of tokens that should be added to the dictionary (-1 = take all tokens) + min_freq: a token needs to occur at least `min_freq` times to be added to the dictionary (-1 = there is no limitation) + + Returns: dictionary of tokens """ tokens = self._get_most_common_tokens(max_tokens, min_freq) @@ -1563,12 +1574,13 @@ def add_label_noise( ): """Generates uniform label noise distribution in the chosen dataset split. - :label_type: the type of labels for which the noise should be simulated. - :labels: an array with unique labels of said type (retrievable from label dictionary). - :noise_share: the desired share of noise in the train split. - :split: in which dataset split the noise is to be simulated. - :noise_transition_matrix: provides pre-defined probabilities for label flipping based on the - initial label value (relevant for class-dependent label noise simulation). + Args: + label_type: the type of labels for which the noise should be simulated. + labels: an array with unique labels of said type (retrievable from label dictionary). + noise_share: the desired share of noise in the train split. + split: in which dataset split the noise is to be simulated. + noise_transition_matrix: provides pre-defined probabilities for label flipping based on the initial + label value (relevant for class-dependent label noise simulation). """ import numpy as np @@ -1664,7 +1676,14 @@ def get_all_sentences(self) -> ConcatDataset: @deprecated(version="0.8", reason="Use 'make_label_dictionary' instead.") def make_tag_dictionary(self, tag_type: str) -> Dictionary: - # Make the tag dictionary + """Create a tag dictionary of a given label type. 
+ + Args: + tag_type: the label type to gather the tag labels + + Returns: A Dictionary containing the labeled tags, including "O" and "" and "" + + """ tag_dictionary: Dictionary = Dictionary(add_unk=False) tag_dictionary.add_item("O") for sentence in _iter_dataset(self.get_all_sentences()): @@ -1729,7 +1748,6 @@ class ConcatFlairDataset(Dataset): This class is useful to assemble different existing datasets. Args: - ---- datasets (sequence): List of datasets to be concatenated """ diff --git a/flair/datasets/base.py b/flair/datasets/base.py index 1ec7a0bbf..023eeb894 100644 --- a/flair/datasets/base.py +++ b/flair/datasets/base.py @@ -77,11 +77,11 @@ def __init__( ) -> None: """Instantiate StringDataset. - :param texts: a string or List of string that make up StringDataset - :param use_tokenizer: Custom tokenizer to use (default is SpaceTokenizer, - more advanced options are SegTokTokenizer to use segtok or SpacyTokenizer to use Spacy library models - if available). Check the code of subclasses of Tokenizer to implement your own (if you need it). - If instead of providing a function, this parameter is just set to True, SegTokTokenizer will be used. + Args: + texts: a string or List of string that make up StringDataset + use_tokenizer: + Custom tokenizer to use. If instead of providing a function, this parameter is just set to True, + :class:`flair.tokenization.SegTokTokenizer` will be used. """ # cast to list if necessary if isinstance(texts, str): diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py index abb06cecb..2ce701d51 100644 --- a/flair/datasets/biomedical.py +++ b/flair/datasets/biomedical.py @@ -331,8 +331,8 @@ def __init__( ) -> None: """Initialize CoNLLWriter. - :param sentence_splitter: Implementation of :class:`SentenceSplitter` which - segments the text into sentences and tokens + Args: + sentence_splitter: Sentence splitter which segments the text into sentences and tokens. 
""" self.sentence_splitter = sentence_splitter @@ -408,15 +408,12 @@ class HunerDataset(ColumnCorpus, ABC): """Base class for HUNER datasets. Every subclass has to implement the following methods: - - `to_internal', which reads the complete data set (incl. train, dev, test) and returns the corpus - as InternalBioNerDataset - - `split_url', which returns the base url (i.e. without '.train', '.dev', '.test') to the HUNER split files + - "to_internal", which reads the complete data set (incl. train, dev, test) and returns the corpus as InternalBioNerDataset + - "split_url", which returns the base url (i.e. without '.train', '.dev', '.test') to the HUNER split files For further information see: - - Weber et al.: 'HUNER: improving biomedical NER with pretraining' - https://academic.oup.com/bioinformatics/article-abstract/36/1/295/5523847?redirectedFrom=fulltext - - HUNER github repository: - https://github.com/hu-ner/huner + - Weber et al.: 'HUNER: improving biomedical NER with pretraining' https://academic.oup.com/bioinformatics/article-abstract/36/1/295/5523847?redirectedFrom=fulltext + - HUNER github repository: https://github.com/hu-ner/huner """ @abstractmethod diff --git a/flair/datasets/document_classification.py b/flair/datasets/document_classification.py index 32fb5e64c..2c0d6b341 100644 --- a/flair/datasets/document_classification.py +++ b/flair/datasets/document_classification.py @@ -44,22 +44,20 @@ def __init__( ) -> None: """Instantiates a Corpus from text classification-formatted task data. 
- :param data_folder: base folder with the task data - :param label_type: name of the label - :param train_file: the name of the train file - :param test_file: the name of the test file - :param dev_file: the name of the dev file, if None, dev data is sampled from train - :param truncate_to_max_tokens: If set, truncates each Sentence to a maximum number of tokens - :param truncate_to_max_chars: If set, truncates each Sentence to a maximum number of chars - :param filter_if_longer_than: If set, filters documents that are longer that the specified number of tokens. - :param tokenizer: Tokenizer for dataset, default is SegtokTokenizer - :param memory_mode: Set to what degree to keep corpus in memory ('full', 'partial' or 'disk'). Use 'full' - if full corpus and all embeddings fits into memory for speedups during training. Otherwise use 'partial' and if - even this is too much for your memory, use 'disk'. - :param label_name_map: Optionally map label names to different schema. - :param allow_examples_without_labels: set to True to allow Sentences without label in the corpus. - :param encoding: Default is 'utf-8' but some datasets are in 'latin-1 - :return: a Corpus with annotated train, dev and test data + Args: + data_folder: base folder with the task data + label_type: name of the label + train_file: the name of the train file + test_file: the name of the test file + dev_file: the name of the dev file, if None, dev data is sampled from train + truncate_to_max_tokens: If set, truncates each Sentence to a maximum number of tokens + truncate_to_max_chars: If set, truncates each Sentence to a maximum number of chars + filter_if_longer_than: If set, filters documents that are longer that the specified number of tokens. + tokenizer: Tokenizer for dataset, default is SegtokTokenizer + memory_mode: Set to what degree to keep corpus in memory ('full', 'partial' or 'disk'). Use 'full' if full corpus and all embeddings fits into memory for speedups during training. 
Otherwise use 'partial' and if even this is too much for your memory, use 'disk'.
+            label_name_map: Optionally map label names to different schema.
+            allow_examples_without_labels: set to True to allow Sentences without label in the corpus.
+            encoding: Default is 'utf-8' but some datasets are in 'latin-1'
         """
         # find train, dev and test files if not specified
         dev_file, test_file, train_file = find_train_dev_test_files(data_folder, dev_file, test_file, train_file)
@@ -759,14 +757,13 @@ def __init__(
     ) -> None:
         """Initialize the IMDB move review sentiment corpus.
 
-        :param base_path: Provide this only if you store the IMDB corpus in a specific folder, otherwise use default.
-        :param tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
-        :param rebalance_corpus: Default splits for this corpus have a strange 50/50 train/test split that are impractical.
-        With rebalance_corpus=True (default setting), corpus is rebalanced to a 80/10/10 train/dev/test split. If you
-        want to use original splits, set this to False.
-        :param memory_mode: Set to 'partial' because this is a huge corpus, but you can also set to 'full' for faster
+        Args:
+            base_path: Provide this only if you store the IMDB corpus in a specific folder, otherwise use default.
+            tokenizer: Custom tokenizer to use (default is SegtokTokenizer)
+            rebalance_corpus: Whether to use an 80/10/10 data split instead of the original 50/0/50 split.
+            memory_mode: Set to 'partial' because this is a huge corpus, but you can also set to 'full' for faster
             processing or 'none' for less memory.
-        :param corpusargs: Other args for ClassificationCorpus.
+            corpusargs: Other args for ClassificationCorpus.
""" base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) diff --git a/flair/datasets/entity_linking.py b/flair/datasets/entity_linking.py index bac2beb13..801144ca8 100644 --- a/flair/datasets/entity_linking.py +++ b/flair/datasets/entity_linking.py @@ -797,10 +797,11 @@ def __init__( see https://arxiv.org/abs/2101.01228v2 The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -940,10 +941,10 @@ def __init__( def _text_to_cols(self, sentence: Sentence, links: list, outfile): """Convert a tokenized sentence into column format. 
- :param sentence: Flair Sentence object containing a tokenized post title or comment thread - :param links: array containing information about the starting and ending position of an entity mention, as well - as its corresponding wiki tag - :param outfile: file, to which the output is written + Args: + sentence: Flair Sentence object containing a tokenized post title or comment thread + links: array containing information about the starting and ending position of an entity mention, as well as its corresponding wiki tag + outfile: file, to which the output is written """ for i in range(0, len(sentence)): # If there are annotated entity mentions for given post title or a comment thread @@ -1002,10 +1003,10 @@ def _text_to_cols(self, sentence: Sentence, links: list, outfile): def _fill_annot_array(self, annot_array: list, key: str, post_flag: bool) -> list: """Fills the array containing information about the entity mention annotations. - :param annot_array: array to be filled - :param key: reddit id, on which the post title/comment thread is matched with its corresponding annotation - :param post_flag: flag indicating whether the annotations are collected for the post titles (=True) - or comment threads (=False) + Args: + annot_array: array to be filled + key: reddit id, on which the post title/comment thread is matched with its corresponding annotation + post_flag: flag indicating whether the annotations are collected for the post titles or comment threads """ while True: # Check if further annotations belong to the current post title or comment thread as well @@ -1024,8 +1025,8 @@ def _fill_annot_array(self, annot_array: list, key: str, post_flag: bool) -> lis def _fill_curr_comment(self, fix_flag: bool): """Extends the string containing the current comment thread, which is passed to _text_to_cols method, when the comments are parsed. 
- :param fix_flag: flag indicating whether the method is called when the incorrectly imported rows are parsed (=True) - or regular rows (=False) + Args: + fix_flag: flag indicating whether the method is called when the incorrectly imported rows are parsed or regular rows """ next_row = None while True: @@ -1151,19 +1152,18 @@ def split_span(word_fields: List[str], datasetname: str): txt_out.write("\n") -def determine_tsv_file(filename: str, data_folder: Path, cut_multisense: bool = True): +def determine_tsv_file(filename: str, data_folder: Path, cut_multisense: bool = True) -> str: """Checks if the converted .tsv file already exists and if not, creates it. - Returns name of the file. - ---------- - string : str - String that contains the name of the file. - data_folder : str - String that contains the name of the folder in which the CoNLL file should reside. - cut_multisense : bool, optional - Boolean that determines whether or not the wn30_key tag should be cut if it contains multiple possible senses. - If True only the first listed sense will be used. Otherwise the whole list of senses will be detected - as one new sense. The default is True. + Args: + filename: The name of the file. + data_folder: The name of the folder in which the CoNLL file should reside. + cut_multisense: Determines whether the wn30_key tag should be cut if it contains multiple possible senses. + If True only the first listed sense will be used. Otherwise, the whole list of senses will be detected + as one new sense. The default is True. + + Returns: + the name of the file. """ if cut_multisense is True and filename not in [ "semeval2007task17", @@ -1211,27 +1211,18 @@ def __init__( If the constructor is called for the first time the data is automatically downloaded and transformed from xml to a tab separated column format. Since only the WordNet 3.0 version for senses is consistently available for all provided datasets we will only consider this version. 
Also we ignore the id annotation used in datasets that were originally created for evaluation tasks - :param filenames: Here you can pass a single datasetname or a list of ddatasetnames. The available names are: - 'masc', 'omsti', 'raganato_ALL', 'raganato_semeval2007', 'raganato_semeval2013', 'raganato_semeval2015', 'raganato_senseval2', 'raganato_senseval3', - 'semcor', 'semeval2007task17', 'semeval2007task7', 'semeval2013task12', 'semeval2015task13', 'senseval2', 'senseval2_lexical_sample_test', - 'senseval2_lexical_sample_train', 'senseval3task1', 'senseval3task6_test', 'senseval3task6_train', 'trainomatic', 'wngt'. - So you can pass for example filenames = ['masc', 'omsti', 'wngt']. Default two mid-sized datasets 'masc' and 'semcor' are loaded. - :param base_path: You can override this to point to a specific folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - :param cut_multisense: Boolean that determines whether or not the wn30_key tag should be cut if it contains - multiple possible senses. If True only the first listed sense will be used and the - suffix '_cut' will be added to the name of the CoNLL file. Otherwise the whole list of - senses will be detected as one new sense. The default is True. - :param columns: Columns to consider when loading the dataset. You can add 1: "lemma" or 2: "pos" to the default dict {0: "text", 3: "sense"} - if you want to use additional pos and/or lemma for the words. - :param banned_sentences: Optionally remove sentences from the corpus. 
Works only if `in_memory` is true - :param sample_missing_splits_in_multicorpus: Whether to sample missing splits when loading the multicorpus (this is redundant if - sample_missing_splits_in_each_corpus is True) - :param sample_missing_splits_in_each_corpus: Whether to sample missing splits when loading each single corpus given in filenames. - :param use_raganato_ALL_as_test_data: If True, the raganato_ALL dataset (Raganato et al. "Word Sense Disambiguation: A unified evaluation framework and empirical compariso") - will be used as test data. Note that the sample_missing_splits parameters are set to 'only_dev' in this case if set to True. - :param name: Name of your (costum) corpus + + Args: + filenames: Here you can pass a single datasetname or a list of datasetnames. The available names are: 'masc', 'omsti', 'raganato_ALL', 'raganato_semeval2007', 'raganato_semeval2013', 'raganato_semeval2015', 'raganato_senseval2', 'raganato_senseval3', 'semcor', 'semeval2007task17', 'semeval2007task7', 'semeval2013task12', 'semeval2015task13', 'senseval2', 'senseval2_lexical_sample_test', 'senseval2_lexical_sample_train', 'senseval3task1', 'senseval3task6_test', 'senseval3task6_train', 'trainomatic', 'wngt', + base_path: You can override this to point to a specific folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + cut_multisense: Boolean that determines whether the wn30_key tag should be cut if it contains multiple possible senses. If True only the first listed sense will be used and the suffix '_cut' will be added to the name of the CoNLL file. Otherwise the whole list of senses will be detected as one new sense. The default is True. + columns: Columns to consider when loading the dataset. You can add 1: "lemma" or 2: "pos" to the default dict {0: "text", 3: "sense"} if you want to use additional pos and/or lemma for the words. + banned_sentences: Optionally remove sentences from the corpus. 
Works only if `in_memory` is true + sample_missing_splits_in_multicorpus: Whether to sample missing splits when loading the multicorpus (this is redundant if sample_missing_splits_in_each_corpus is True) + sample_missing_splits_in_each_corpus: Whether to sample missing splits when loading each single corpus given in filenames. + use_raganato_ALL_as_test_data: If True, the raganato_ALL dataset (Raganato et al. "Word Sense Disambiguation: A unified evaluation framework and empirical compariso") will be used as test data. Note that the sample_missing_splits parameters are set to 'only_dev' in this case if set to True. + name: Name of your corpus """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 0a5bf1b58..849c7c899 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -281,20 +281,19 @@ def __init__( ) -> None: r"""Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000. - :param data_folder: base folder with the task data - :param column_format: a map specifying the column format - :param train_files: the name of the train files - :param test_files: the name of the test files - :param dev_files: the name of the dev files, if empty, dev data is sampled from train - :param column_delimiter: default is to split on any separatator, but you can overwrite for instance with "\t" - to split only on tabs - :param comment_symbol: if set, lines that begin with this symbol are treated as comments - :param document_separator_token: If provided, sentences that function as document boundaries are so marked - :param skip_first_line: set to True if your dataset has a header line - :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads - :param label_name_map: Optionally map tag names to different schema. 
- :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true - :return: a Corpus with annotated train, dev and test data + Args: + data_folder: base folder with the task data + column_format: a map specifying the column format + train_files: the name of the train files + test_files: the name of the test files + dev_files: the name of the dev files, if empty, dev data is sampled from train + column_delimiter: default is to split on any separatator, but you can overwrite for instance with "\t" to split only on tabs + comment_symbol: if set, lines that begin with this symbol are treated as comments + document_separator_token: If provided, sentences that function as document boundaries are so marked + skip_first_line: set to True if your dataset has a header line + in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads + label_name_map: Optionally map tag names to different schema. + banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true """ # get train data train: Optional[Dataset] = ( @@ -386,20 +385,19 @@ def __init__( ) -> None: r"""Instantiates a Corpus from CoNLL column-formatted task data such as CoNLL03 or CoNLL2000. 
- :param data_folder: base folder with the task data - :param column_format: a map specifying the column format - :param train_file: the name of the train file - :param test_file: the name of the test file - :param dev_file: the name of the dev file, if None, dev data is sampled from train - :param column_delimiter: default is to split on any separatator, but you can overwrite for instance with "\t" - to split only on tabs - :param comment_symbol: if set, lines that begin with this symbol are treated as comments - :param document_separator_token: If provided, sentences that function as document boundaries are so marked - :param skip_first_line: set to True if your dataset has a header line - :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads - :param label_name_map: Optionally map tag names to different schema. - :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true - :return: a Corpus with annotated train, dev and test data + Args: + data_folder: base folder with the task data + column_format: a map specifying the column format + train_file: the name of the train file + test_file: the name of the test file + dev_file: the name of the dev file, if None, dev data is sampled from train + column_delimiter: default is to split on any separatator, but you can overwrite for instance with "\t" to split only on tabs + comment_symbol: if set, lines that begin with this symbol are treated as comments + document_separator_token: If provided, sentences that function as document boundaries are so marked + skip_first_line: set to True if your dataset has a header line + in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads + label_name_map: Optionally map tag names to different schema. + banned_sentences: Optionally remove sentences from the corpus. 
Works only if `in_memory` is true """ # find train, dev and test files if not specified dev_file, test_file, train_file = find_train_dev_test_files( @@ -440,17 +438,16 @@ def __init__( ) -> None: r"""Instantiates a column dataset. - :param path_to_column_file: path to the file with the column-formatted data - :param column_name_map: a map specifying the column format - :param column_delimiter: default is to split on any separatator, but you can overwrite for instance with "\t" - to split only on tabs - :param comment_symbol: if set, lines that begin with this symbol are treated as comments - :param in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads - :param document_separator_token: If provided, sentences that function as document boundaries are so marked - :param skip_first_line: set to True if your dataset has a header line - :param label_name_map: Optionally map tag names to different schema. - :param banned_sentences: Optionally remove sentences from the corpus. Works only if `in_memory` is true - :return: a dataset with annotated data + Args: + path_to_column_file: path to the file with the column-formatted data + column_name_map: a map specifying the column format + column_delimiter: default is to split on any separatator, but you can overwrite for instance with "\t" to split only on tabs + comment_symbol: if set, lines that begin with this symbol are treated as comments + in_memory: If set to True, the dataset is kept in memory as Sentence objects, otherwise does disk reads + document_separator_token: If provided, sentences that function as document boundaries are so marked + skip_first_line: set to True if your dataset has a header line + label_name_map: Optionally map tag names to different schema. + banned_sentences: Optionally remove sentences from the corpus. 
Works only if `in_memory` is true """ path_to_column_file = Path(path_to_column_file) assert path_to_column_file.exists() @@ -969,19 +966,11 @@ def _process_coref_span_annotations_for_word( This method mutates the clusters and coref_stacks dictionaries. - # Parameters - label : `str` - The coref label for this word. - word_index : `int` - The word index into the sentence. - clusters : `DefaultDict[int, List[Tuple[int, int]]]` - A dictionary mapping cluster ids to lists of inclusive spans into the - sentence. - coref_stacks : `DefaultDict[int, List[int]]` - Stacks for each cluster id to hold the start indices of active spans (spans - which we are inside of when processing a given word). Spans with the same id - can be nested, which is why we collect these opening spans on a stack, e.g: - [Greg, the baker who referred to [himself]_ID1 as 'the bread man']_ID1 + Args: + label: The coref label for this word. + word_index : The word index into the sentence. + clusters : A dictionary mapping cluster ids to lists of inclusive spans into the sentence. + coref_stacks : Stacks for each cluster id to hold the start indices of open spans. Spans with the same id can be nested, which is why we collect these opening spans on a stack, e.g: [Greg, the baker who referred to [himself]_ID1 as 'the bread man']_ID1 """ if label != "-": for segment in label.split("|"): @@ -1013,17 +1002,6 @@ def _process_span_annotations_for_word( span_labels: List[List[str]], current_span_labels: List[Optional[str]], ) -> None: - """Given a sequence of different label types for a single word and the current span label we are inside, compute the BIO tag for each label and append to a list. - - # Parameters - annotations : `List[str]` - A list of labels to compute BIO tags for. - span_labels : `List[List[str]]` - A list of lists, one for each annotation, to incrementally collect - the BIO tags for a sequence. 
- current_span_labels : `List[Optional[str]]` - The currently open span per annotation type, or `None` if there is no open span. - """ for annotation_index, annotation in enumerate(annotations): # strip all bracketing information to # get the actual propbank label. @@ -1306,11 +1284,9 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -1371,10 +1347,9 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. 
""" base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -1602,10 +1577,10 @@ def __init__( Column order is swapped The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -1746,11 +1721,10 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. 
+ document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -1853,10 +1827,9 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. """ # column format columns = {0: "ner", 1: "text"} @@ -1896,10 +1869,9 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. """ # column format columns = {0: "ner", 1: "text"} @@ -1929,22 +1901,20 @@ def __init__( class NER_ENGLISH_SEC_FILLINGS(ColumnCorpus): - """Initialize corpus of SEC-fillings annotated with English NER tags. 
- - See paper "Domain Adaption of Named Entity Recognition to Support Credit Risk Assessment" by Alvarado et al, 2015: https://aclanthology.org/U15-1010/ - - :param base_path: Path to the CoNLL-03 corpus (i.e. 'conll_03' folder) on your machine - POS tags or chunks respectively - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object - """ - def __init__( self, base_path: Optional[Union[str, Path]] = None, in_memory: bool = True, **corpusargs, ) -> None: + """Initialize corpus of SEC-fillings annotated with English NER tags. + + See paper "Domain Adaption of Named Entity Recognition to Support Credit Risk Assessment" by Alvarado et al, 2015: https://aclanthology.org/U15-1010/ + + Args: + base_path: Path to the CoNLL-03 corpus (i.e. 'conll_03' folder) on your machine + in_memory: If True, keeps dataset in memory giving speedups in training. + """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) # column format @@ -2024,11 +1994,10 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. 
+ document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -2118,10 +2087,10 @@ def __init__( The corpus will be downoaded from https://raw.githubusercontent.com/aritter/twitter_nlp/master/data/annotated/ner.txt. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -2160,10 +2129,9 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. 
+ in_memory: If True, keeps dataset in memory giving speedups in training. """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -2212,11 +2180,11 @@ def __init__( The corpus was introduced in the paper "Design Challenges and Misconceptions in Named Entity Recognition" by Ratinov and Roth (2009): https://aclanthology.org/W09-1119/. The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - POS tags instead - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -2279,10 +2247,10 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. 
+ in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -2340,10 +2308,10 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -2457,10 +2425,10 @@ def __init__( The first time you call this constructor it will automatically download the dataset. - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. 
base_path: Path to the CoNLL-03 corpus (i.e. 'conll_03' folder) on your machine
""" base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4148,10 +4116,10 @@ def __init__( The dataset is downloaded from https://github.com/System-T/UniversalPropositions - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4195,10 +4163,10 @@ def __init__( The dataset is downloaded from https://github.com/System-T/UniversalPropositions - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. 
+ document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4243,10 +4211,10 @@ def __init__( The dataset is downloaded from https://github.com/System-T/UniversalPropositions - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4290,10 +4258,10 @@ def __init__( The dataset is downloaded from https://github.com/System-T/UniversalPropositions - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. 
+ document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4337,10 +4305,10 @@ def __init__( The dataset is downloaded from https://github.com/System-T/UniversalPropositions - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4384,10 +4352,10 @@ def __init__( The dataset is downloaded from https://github.com/System-T/UniversalPropositions - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. 
+ document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4431,10 +4399,10 @@ def __init__( The dataset is downloaded from https://github.com/System-T/UniversalPropositions - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. + document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4478,10 +4446,10 @@ def __init__( The dataset is downloaded from https://github.com/System-T/UniversalPropositions - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. - :param document_as_sequence: If True, all sentences of a document are read into a single Sentence object + Args: + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. 
+ document_as_sequence: If True, all sentences of a document are read into a single Sentence object """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -4741,10 +4709,10 @@ def __init__( - Domain-specific classification (DSC). Participants will be asked to deploy a different model for each of the above types, trying to increase the accuracy for each considered type. - :param domains: Domains to be used. Supported are "WN" (Wikinews), "FIC" (fiction), "ADG" (De Gasperi subset) and "all". - :param base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this - to point to a different folder but typically this should not be necessary. - :param in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. + Args: + domains: Domains to be used. Supported are "WN" (Wikinews), "FIC" (fiction), "ADG" (De Gasperi subset) and "all". + base_path: Default is None, meaning that corpus gets auto-downloaded and loaded. You can override this to point to a different folder but typically this should not be necessary. + in_memory: If True, keeps dataset in memory giving speedups in training. Not recommended due to heavy RAM usage. """ supported_domains = ["WN", "FIC", "ADG"] diff --git a/flair/datasets/text_text.py b/flair/datasets/text_text.py index e1727e0cc..e96a5317e 100644 --- a/flair/datasets/text_text.py +++ b/flair/datasets/text_text.py @@ -327,18 +327,18 @@ def __init__( seperated by e.g. '\t' (just like in the glue RTE-dataset https://gluebenchmark.com/tasks) . For each data pair we create a flair.data.DataPair object. - :param path_to_data: path to the data file - :param columns: list of integers that indicate the respective columns. The first entry is the column - for the first sentence, the second for the second sentence and the third for the label. 
in_memory: If True, data will be saved in list of flair.data.DataPair objects, otherwise we use lists with simple strings which need less space
pooling: a string which can be any value from ['mean', 'max', 'min']
- :param train_dataset: the train dataset which will be used to construct a vectorizer - :param vectorizer_params: parameters given to Scikit-learn's TfidfVectorizer constructor + Args: + train_dataset: the train dataset which will be used to construct a vectorizer + vectorizer_params: parameters given to Scikit-learn's TfidfVectorizer constructor """ super().__init__() @@ -237,23 +238,22 @@ def __init__( dropout: float = 0.5, word_dropout: float = 0.0, locked_dropout: float = 0.0, - rnn_type="GRU", + rnn_type: str="GRU", fine_tune: bool = True, ) -> None: """Instantiates an RNN that works upon some token embeddings. - :param embeddings: a list of token embeddings - :param hidden_size: the number of hidden states in the rnn - :param rnn_layers: the number of layers for the rnn - :param reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear - layer before putting them into the rnn or not - :param reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output - dimension as before will be taken. - :param bidirectional: boolean value, indicating whether to use a bidirectional rnn or not - :param dropout: the dropout value to be used - :param word_dropout: the word dropout value to be used, if 0.0 word dropout is not used - :param locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used - :param rnn_type: 'GRU' or 'LSTM' + Args: + embeddings: a list of token embeddings + hidden_size: the number of hidden states in the rnn + rnn_layers: the number of layers for the rnn + reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear layer before putting them into the rnn or not + reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output dimension as before will be taken. 
batch_size: int number of sentences to be processed in one batch
+ + Args: + embeddings: a list of token embeddings + kernels: list of (number of kernels, kernel size) + reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear layer before putting them into the rnn or not + reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output dimension as before will be taken. + dropout: the dropout value to be used + word_dropout: the word dropout value to be used, if 0.0 word dropout is not used + locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used """ super().__init__() diff --git a/flair/embeddings/legacy.py b/flair/embeddings/legacy.py index 8e1f29a25..6096fa758 100644 --- a/flair/embeddings/legacy.py +++ b/flair/embeddings/legacy.py @@ -172,14 +172,16 @@ def __init__( cache_directory: Optional[Path] = None, ) -> None: """Initializes contextual string embeddings using a character-level language model. - :param model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast', 'news-backward-fast', + + Args: + model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast', 'news-backward-fast', 'mix-forward', 'mix-backward', 'german-forward', 'german-backward', 'polish-backward', 'polish-forward' depending on which character language model is desired. - :param detach: if set to False, the gradient will propagate into the language model. this dramatically slows down + detach: if set to False, the gradient will propagate into the language model. this dramatically slows down training and often leads to worse results, so not recommended. - :param use_cache: if set to False, will not write embeddings to file for later retrieval. this saves disk space but will + use_cache: if set to False, will not write embeddings to file for later retrieval. 
this saves disk space but will not allow re-use of once computed embeddings that do not fit into memory - :param cache_directory: if cache_directory is not set, the cache will be written to ~/.flair/embeddings. otherwise the cache + cache_directory: if cache_directory is not set, the cache will be written to ~/.flair/embeddings. otherwise the cache is written to the provided directory. """ super().__init__() @@ -527,17 +529,17 @@ def __init__( locked_dropout: float = 0.0, ) -> None: """The constructor takes a list of embeddings to be combined. - :param embeddings: a list of token embeddings - :param hidden_size: the number of hidden states in the lstm - :param rnn_layers: the number of layers for the lstm - :param reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear - layer before putting them into the lstm or not - :param reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output - dimension as before will be taken. - :param bidirectional: boolean value, indicating whether to use a bidirectional lstm or not - :param dropout: the dropout value to be used - :param word_dropout: the word dropout value to be used, if 0.0 word dropout is not used - :param locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used. + + Args: + embeddings: a list of token embeddings + hidden_size: the number of hidden states in the lstm + rnn_layers: the number of layers for the lstm + reproject_words: boolean value, indicating whether to reproject the token embeddings in a separate linear layer before putting them into the lstm or not. + reproject_words_dimension: output dimension of reprojecting token embeddings. If None the same output dimension as before will be taken. 
+ bidirectional: boolean value, indicating whether to use a bidirectional lstm or not + dropout: the dropout value to be used + word_dropout: the word dropout value to be used, if 0.0 word dropout is not used + locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used. """ super().__init__() diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index 82facf130..1c52eaec5 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -41,13 +41,13 @@ def __init__( ) -> None: """Bidirectional transformer embeddings of words from various transformer architectures. - :param model: name of transformer model (see https://huggingface.co/transformers/pretrained_models.html for - options) - :param layers: string indicating which layers to take for embedding (-1 is topmost layer) - :param subtoken_pooling: how to get from token piece embeddings to token embedding. Either take the first - subtoken ('first'), the last subtoken ('last'), both first and last ('first_last') or a mean over all ('mean') - :param layer_mean: If True, uses a scalar mix of layers as embedding - :param fine_tune: If True, allows transformers to be fine-tuned during training + Args: + model: name of transformer model (see https://huggingface.co/transformers/pretrained_models.html for options) + layers: string indicating which layers to take for embedding (-1 is topmost layer) + subtoken_pooling: how to get from token piece embeddings to token embedding. Either take the first + subtoken ('first'), the last subtoken ('last'), both first and last ('first_last') or a mean over all ('mean') + layer_mean: If True, uses a scalar mix of layers as embedding + fine_tune: If True, allows transformers to be fine-tuned during training """ TransformerEmbeddings.__init__( self, @@ -166,9 +166,10 @@ def __init__( """Initializes classic word embeddings. Constructor downloads required files if not there. 
- :param embeddings: one of: 'glove', 'extvec', 'crawl' or two-letter language code or custom - If you want to use a custom embedding file, just pass the path to the embeddings as embeddings variable. - set stable=True to use the stable embeddings as described in https://arxiv.org/abs/2110.02861 + + Args: + embeddings: one of: 'glove', 'extvec', 'crawl' or two-letter language code or a path to a custom embedding + stable: if True, use the stable embeddings as described in https://arxiv.org/abs/2110.02861 """ self.instance_parameters = self.get_instance_parameters(locals=locals()) @@ -570,18 +571,12 @@ def __init__( ) -> None: """Initializes contextual string embeddings using a character-level language model. - :param model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast', 'news-backward-fast', - 'mix-forward', 'mix-backward', 'german-forward', 'german-backward', 'polish-backward', 'polish-forward', - etc (see https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/FLAIR_EMBEDDINGS.md) - depending on which character language model is desired. - :param fine_tune: if set to True, the gradient will propagate into the language model. This dramatically slows - down training and often leads to overfitting, so use with caution. - :param chars_per_chunk: max number of chars per rnn pass to control speed/memory tradeoff. Higher means faster - but requires more memory. Lower means slower but less memory. - :param with_whitespace: If True, use hidden state after whitespace after word. If False, use hidden - state at last character of word. - :param tokenized_lm: Whether this lm is tokenized. Default is True, but for LMs trained over unprocessed text - False might be better. 
+ Args: + model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast', 'news-backward-fast', 'mix-forward', 'mix-backward', 'german-forward', 'german-backward', 'polish-backward', 'polish-forward' depending on which character language model is desired. + fine_tune: if set to True, the gradient will propagate into the language model. This dramatically slows down training and often leads to overfitting, so use with caution. + chars_per_chunk: max number of chars per rnn pass to control speed/memory tradeoff. Higher means faster but requires more memory. Lower means slower but less memory. + with_whitespace: If True, use hidden state after whitespace after word. If False, use hidden state at last character of word. + tokenized_lm: Whether this lm is tokenized. Default is True, but for LMs trained over unprocessed text False might be better. """ super().__init__() self.instance_parameters = self.get_instance_parameters(locals=locals()) @@ -1016,8 +1011,9 @@ def __init__( Constructor downloads required embedding file and stores in cache if use_local is False. - :param embeddings: path to your embeddings '.bin' file - :param use_local: set this to False if you are using embeddings from a remote source + Args: + embeddings: path to your embeddings '.bin' file + use_local: set this to False if you are using embeddings from a remote source """ self.instance_parameters = self.get_instance_parameters(locals=locals()) @@ -1104,10 +1100,11 @@ def __init__( ) -> None: """Initializes one-hot encoded word embeddings and a trainable embedding layer. 
- :param vocab_dictionary: the vocabulary that will be encoded - :param field: by default, the 'text' of tokens is embedded, but you can also embed tags such as 'pos' - :param embedding_length: dimensionality of the trainable embedding layer - :param stable: set stable=True to use the stable embeddings as described in https://arxiv.org/abs/2110.02861 + Args: + vocab_dictionary: the vocabulary that will be encoded + field: by default, the 'text' of tokens is embedded, but you can also embed tags such as 'pos' + embedding_length: dimensionality of the trainable embedding layer + stable: if True, use the stable embeddings as described in https://arxiv.org/abs/2110.02861 """ super().__init__() self.name = f"one-hot-{field}" @@ -1498,9 +1495,10 @@ def __init__(self, embeddings: str, model: str = "skip", size: int = 100) -> Non See: http://www.nilc.icmc.usp.br/embeddings Constructor downloads required files if not there. - :param embeddings: one of: 'fasttext', 'glove', 'wang2vec' or 'word2vec' - :param model: one of: 'skip' or 'cbow'. This is not applicable to glove. - :param size: one of: 50, 100, 300, 600 or 1000. + Args: + embeddings: one of: 'fasttext', 'glove', 'wang2vec' or 'word2vec' + model: one of: 'skip' or 'cbow'. This is not applicable to glove. + size: one of: 50, 100, 300, 600 or 1000. """ self.instance_parameters = self.get_instance_parameters(locals=locals()) diff --git a/flair/embeddings/transformer.py b/flair/embeddings/transformer.py index 9d2a8b5ab..a78f5ca54 100644 --- a/flair/embeddings/transformer.py +++ b/flair/embeddings/transformer.py @@ -1387,7 +1387,8 @@ def export_onnx( ) -> TransformerOnnxEmbeddings: """Export TransformerEmbeddings to OnnxFormat. - :param example_sentences: a list of sentences that will be used for tracing. It is recommended to take 2-4 - sentences with some variation. + Args: + example_sentences: a list of sentences that will be used for tracing. It is recommended to take 2-4 + sentences with some variation. 
""" return self.onnx_cls.export_from_embedding(path, self, example_sentences, **kwargs) diff --git a/flair/models/entity_linker_model.py b/flair/models/entity_linker_model.py index ae4f2ecb0..75af25043 100644 --- a/flair/models/entity_linker_model.py +++ b/flair/models/entity_linker_model.py @@ -94,12 +94,14 @@ def __init__( ) -> None: """Initializes an EntityLinker. - :param embeddings: embeddings used to embed the words/sentences - :param label_dictionary: dictionary that gives ids to all classes. Should contain - :param pooling_operation: either 'average', 'first', 'last' or 'first&last'. Specifies the way of how text representations of entity mentions (with more than one word) are handled. - E.g. 'average' means that as text representation we take the average of the embeddings of the words in the mention. 'first&last' concatenates - the embedding of the first and the embedding of the last word. - :param label_type: name of the label you use. + Args: + embeddings: embeddings used to embed the tokens of the sentences. + label_dictionary: dictionary that gives ids to all classes. Should contain . + pooling_operation: either `average`, `first`, `last` or `first_last`. Specifies the way of how text + representations of entity mentions (with more than one token) are handled. E.g. `average` means that as + text representation we take the average of the embeddings of the token in the mention. + `first_last` concatenates the embedding of the first and the embedding of the last token. + label_type: name of the label you use. """ super().__init__( embeddings=embeddings, diff --git a/flair/models/lemmatizer_model.py b/flair/models/lemmatizer_model.py index dfeef8e9b..d647da935 100644 --- a/flair/models/lemmatizer_model.py +++ b/flair/models/lemmatizer_model.py @@ -40,22 +40,24 @@ def __init__( that predicts the lemma of the given token one letter at a time. 
Note that one can use data in which only those words are annotated that differ from their lemma or data in which all words are annotated with a (maybe equal) lemma. - :param embeddings: Embedding used to encode sentence - :param rnn_input_size: Input size of the RNN('s). Each letter of a token is represented by a hot-one-vector - over the given character dictionary. This vector is transformed to a input_size vector with a linear layer. - :param rnn_hidden_size: size of the hidden state of the RNN('s). - :param rnn_layers: Number of stacked RNN cells - :param beam_size: Number of hypothesis used when decoding the output of the RNN. Only used in prediction. - :param char_dict: Dictionary of characters the model is able to process. The dictionary must contain for - the handling of unknown characters. If None, a standard dictionary will be loaded. One can either hand - over a path to a dictionary or the dictionary itself. - :param label_type: Name of the gold labels to use. - :param max_sequence_length_dependent_on_input: If set to True, the maximum length of a decoded sequence in - the prediction depends on the sentences you want to lemmatize. To be precise the maximum length is - computed as the length of the longest token in the sentences plus one. - :param max_sequence_length: If set to True and max_sequence_length_dependend_on_input is False a fixed - maximum length for the decoding will be used for all sentences. - :param use_attention: whether to use attention. Only sensible if encoding via RNN + + Args: + embeddings: Embedding used to encode sentence + rnn_input_size: Input size of the RNN('s). Each letter of a token is represented by a hot-one-vector over + the given character dictionary. This vector is transformed to a input_size vector with a linear layer. + rnn_hidden_size: size of the hidden state of the RNN('s). + rnn_layers: Number of stacked RNN cells + beam_size: Number of hypothesis used when decoding the output of the RNN. Only used in prediction. 
+            char_dict: Dictionary of characters the model is able to process. The dictionary must contain <unk> for
+                the handling of unknown characters. If None, a standard dictionary will be loaded. One can either hand
+                over a path to a dictionary or the dictionary itself.
+            label_type: Name of the gold labels to use.
+            max_sequence_length_dependent_on_input: If set to True, the maximum length of a decoded sequence in
+                the prediction depends on the sentences you want to lemmatize. To be precise the maximum length is
+                computed as the length of the longest token in the sentences plus one.
+            max_sequence_length: If set to True and max_sequence_length_dependent_on_input is False a fixed
+                maximum length for the decoding will be used for all sentences.
+            use_attention: whether to use attention. Only sensible if encoding via RNN
        """
        super().__init__()

diff --git a/flair/models/multitask_model.py b/flair/models/multitask_model.py
index a68c69862..100826699 100644
--- a/flair/models/multitask_model.py
+++ b/flair/models/multitask_model.py
@@ -33,7 +33,11 @@ def __init__(
    ) -> None:
        """Instantiates the MultiTaskModel.

-        :param models: Key (Task ID) - Value (flair.nn.Model) Pairs to stack model
+        Args:
+            models: The child models used during multitask training.
+            task_ids: If given, add each corresponding model a specified task id. Otherwise, tasks get the ids 'Task_0', 'Task_1', ...
+            loss_factors: If given, weight the losses of the corresponding models during training.
+            use_all_tasks: If True, each sentence will be trained on all tasks in parallel, otherwise each epoch 1 task will be sampled to train the sentence on.
        """
        super().__init__()

@@ -64,8 +68,10 @@ def _prepare_tensors(self, data_points: List[DT]) -> Tuple[torch.Tensor, ...]:

    def forward_loss(self, sentences: Union[List[Sentence], Sentence]) -> Tuple[torch.Tensor, int]:
        """Calls the respective forward loss of each model and sums them weighted by their loss factors.
- :param sentences: batch of sentences - :return: loss + Args: + sentences: batch of sentences + + Returns: loss and sample count """ batch_split = self.split_batch_to_task_ids(sentences, all_tasks=self.use_all_tasks) loss = torch.tensor(0.0, device=flair.device) @@ -90,9 +96,12 @@ def split_batch_to_task_ids(sentences: Union[List[Sentence], Sentence], all_task If single sentence is assigned to several tasks (i.e. same corpus but different tasks), then the model assignment for this batch is randomly chosen. - :param sentences: batch of sentences - :param all_tasks: use all tasks of each sentence. If deactivated, a random task will be sampled - :return: Key-value pairs as (task_id, list of sentences ids in batch) + + Args: + sentences: batch of sentences + all_tasks: use all tasks of each sentence. If deactivated, a random task will be sampled + + Returns: Key-value pairs as (task_id, list of sentences ids in batch) """ batch_to_task_mapping: Dict[str, List[int]] = {} for sentence_id, sentence in enumerate(sentences): @@ -123,12 +132,13 @@ def evaluate( ) -> Result: """Evaluates the model. Returns a Result object containing evaluation results and a loss value. 
- :param sentences: batch of sentences - :param embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed), - 'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU) - :param mini_batch_size: size of batches - :param evaluate_all: choose if all tasks should be evaluated, or a single one, depending on gold_label_type - :return: Tuple of Result object and loss value (float) + Args: + sentences: batch of sentences + embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed), 'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU) + mini_batch_size: size of batches + evaluate_all: choose if all tasks should be evaluated, or a single one, depending on gold_label_type + + Returns: Tuple of Result object and loss value (float) """ if not evaluate_all: if gold_label_type not in self.tasks: @@ -204,10 +214,7 @@ def evaluate( ) def _get_state_dict(self): - """Returns the state dict of the multitask model which has multiple models underneath. - - :return model_state: model state for the multitask model - """ + """Returns the state dict of the multitask model which has multiple models underneath.""" initial_model_state = super()._get_state_dict() initial_model_state["state_dict"] = {} # the model state is stored per model already. model_state = { diff --git a/flair/models/pairwise_classification_model.py b/flair/models/pairwise_classification_model.py index e6598d87a..c776ee973 100644 --- a/flair/models/pairwise_classification_model.py +++ b/flair/models/pairwise_classification_model.py @@ -26,13 +26,12 @@ def __init__( ) -> None: """Initializes a TextPairClassifier. 
- :param embeddings: embeddings used to embed each data point - :param label_dictionary: dictionary of labels you want to predict - :param multi_label: auto-detected by default, but you can set this to True to force multi-label prediction - or False to force single-label prediction - :param multi_label_threshold: If multi-label you can set the threshold to make predictions - :param loss_weights: Dictionary of weights for labels for the loss function - (if any label's weight is unspecified it will default to 1.0) + Args: + embeddings: embeddings used to embed each data point + label_dictionary: dictionary of labels you want to predict + multi_label: auto-detected by default, but you can set this to True to force multi-label prediction or False to force single-label prediction + multi_label_threshold: If multi-label you can set the threshold to make predictions + loss_weights: Dictionary of weights for labels for the loss function. If any label's weight is unspecified it will default to 1.0 """ super().__init__( **classifierargs, diff --git a/flair/models/relation_classifier_model.py b/flair/models/relation_classifier_model.py index 43b7dc203..4fa99979f 100644 --- a/flair/models/relation_classifier_model.py +++ b/flair/models/relation_classifier_model.py @@ -219,18 +219,20 @@ class _Entity(NamedTuple): class RelationClassifier(flair.nn.DefaultClassifier[EncodedSentence, EncodedSentence]): """Relation Classifier to predict the relation between two entities. - ---- Task ---- + Task + ---- Relation Classification (RC) is the task of identifying the semantic relation between two entities in a text. In contrast to (end-to-end) Relation Extraction (RE), RC requires pre-labelled entities. Example: - ------- + -------- For the `founded_by` relation from `ORG` (head) to `PER` (tail) and the sentence "Larry Page and Sergey Brin founded Google .", we extract the relations - founded_by(head='Google', tail='Larry Page') and - founded_by(head='Google', tail='Sergey Brin'). 
- ---- Architecture ---- + Architecture + ------------ The Relation Classifier Model builds upon a text classifier. The model generates an encoded sentence for each entity pair in the cross product of all entities in the original sentence. @@ -241,7 +243,9 @@ class RelationClassifier(flair.nn.DefaultClassifier[EncodedSentence, EncodedSent The implemented encoding strategies are taken from this paper by Zhou et al.: https://arxiv.org/abs/2102.01373 - Note: Currently, the model has no multi-label support. + .. warning:: + Currently, the model has no multi-label support. + """ def __init__( @@ -369,11 +373,14 @@ def _entity_pair_permutations( If the passed sentence contains relation annotations, the relation gold label will be yielded along with the participating entities. The permutations are constructed by a filtered cross-product - under the specification of `self.entity_label_types` and `self.entity_pair_labels`. + under the specification of :py:meth:~`flair.models.RelationClassifier.entity_label_types` + and :py:meth:~`flair.models.RelationClassifier.entity_pair_labels`. - :param sentence: A flair `Sentence` object with entity annotations - :yields: Tuples of (HEAD, TAIL, gold_label). - The head and tail `_Entity`s have span references to the passed sentence. + Args: + sentence: A Sentence with entity annotations + + Yields: + Tuples of (HEAD, TAIL, gold_label): The head and tail `_Entity`s` have span references to the passed sentence. """ valid_entities: List[_Entity] = list(self._valid_entities(sentence)) diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py index 32d0ec11b..61f32294f 100644 --- a/flair/models/sequence_tagger_model.py +++ b/flair/models/sequence_tagger_model.py @@ -48,27 +48,25 @@ def __init__( """Sequence Tagger class for predicting labels for single tokens. Can be parameterized by several attributes. In case of multitask learning, pass shared embeddings or shared rnn into respective attributes. 
- :param embeddings: Embeddings to use during training and prediction - :param tag_dictionary: Dictionary containing all tags from corpus which can be predicted - :param tag_type: type of tag which is going to be predicted in case a corpus has multiple annotations - :param use_rnn: If true, use a RNN, else Linear layer. - :param rnn: (Optional) Takes a torch.nn.Module as parameter by which you can pass a shared RNN between - different tasks. - :param rnn_type: Specifies the RNN type to use, default is 'LSTM', can choose between 'GRU' and 'RNN' as well. - :param hidden_size: Hidden size of RNN layer - :param rnn_layers: number of RNN layers - :param bidirectional: If True, RNN becomes bidirectional - :param use_crf: If True, use a Conditional Random Field for prediction, else linear map to tag space. - :param reproject_embeddings: If True, add a linear layer on top of embeddings, if you want to imitate - fine tune non-trainable embeddings. - :param dropout: If > 0, then use dropout. - :param word_dropout: If > 0, then use word dropout. - :param locked_dropout: If > 0, then use locked dropout. - :param train_initial_hidden_state: if True, trains initial hidden state of RNN - :param loss_weights: Dictionary of weights for labels for the loss function - (if any label's weight is unspecified it will default to 1.0) - :param init_from_state_dict: Indicator whether we are loading a model from state dict - since we need to transform previous models' weights into CRF instance weights + + Args: + embeddings: Embeddings to use during training and prediction + tag_dictionary: Dictionary containing all tags from corpus which can be predicted + tag_type: type of tag which is going to be predicted in case a corpus has multiple annotations + use_rnn: If true, use a RNN, else Linear layer. + rnn: Takes a torch.nn.Module as parameter by which you can pass a shared RNN between different tasks. 
+ rnn_type: Specifies the RNN type to use, default is 'LSTM', can choose between 'GRU' and 'RNN' as well. + hidden_size: Hidden size of RNN layer + rnn_layers: number of RNN layers + bidirectional: If True, RNN becomes bidirectional + use_crf: If True, use a Conditional Random Field for prediction, else linear map to tag space. + reproject_embeddings: If True, add a linear layer on top of embeddings, if you want to imitate fine tune non-trainable embeddings. + dropout: If > 0, then use dropout. + word_dropout: If > 0, then use word dropout. + locked_dropout: If > 0, then use locked dropout. + train_initial_hidden_state: if True, trains initial hidden state of RNN + loss_weights: Dictionary of weights for labels for the loss function. If any label's weight is unspecified it will default to 1.0. + init_from_state_dict: Indicator whether we are loading a model from state dict since we need to transform previous models' weights into CRF instance weights """ super().__init__() diff --git a/flair/models/tars_model.py b/flair/models/tars_model.py index 6bee5aee1..dbcce0229 100644 --- a/flair/models/tars_model.py +++ b/flair/models/tars_model.py @@ -331,20 +331,19 @@ def __init__( label_dictionary: Optional[Dictionary] = None, label_type: Optional[str] = None, embeddings: Union[TransformerWordEmbeddings, str] = "bert-base-uncased", - num_negative_labels_to_sample: int = 2, + num_negative_labels_to_sample: Optional[int] = 2, prefix: bool = True, **tagger_args, ) -> None: """Initializes a TarsTagger. - :param task_name: a string depicting the name of the task - :param label_dictionary: dictionary of labels you want to predict - :param embeddings: name of the pre-trained transformer model e.g., - 'bert-base-uncased' etc - :param num_negative_labels_to_sample: number of negative labels to sample for each - positive labels against a sentence during training. Defaults to 2 negative - labels for each positive label. 
The model would sample all the negative labels - if None is passed. That slows down the training considerably. + Args: + task_name: a string depicting the name of the task + label_dictionary: dictionary of labels you want to predict + embeddings: name of the pre-trained transformer model e.g., 'bert-base-uncased' + num_negative_labels_to_sample: number of negative labels to sample for each positive labels against a + sentence during training. Defaults to 2 negative labels for each positive label. The model would sample + all the negative labels if None is passed. That slows down the training considerably. """ super().__init__() @@ -664,24 +663,24 @@ def __init__( label_dictionary: Optional[Dictionary] = None, label_type: Optional[str] = None, embeddings: Union[TransformerDocumentEmbeddings, str] = "bert-base-uncased", - num_negative_labels_to_sample: int = 2, + num_negative_labels_to_sample: Optional[int] = 2, prefix: bool = True, **tagger_args, ) -> None: """Initializes a TarsClassifier. - :param task_name: a string depicting the name of the task - :param label_dictionary: dictionary of labels you want to predict - :param embeddings: name of the pre-trained transformer model e.g., - 'bert-base-uncased' etc - :param num_negative_labels_to_sample: number of negative labels to sample for each - positive labels against a sentence during training. Defaults to 2 negative - labels for each positive label. The model would sample all the negative labels - if None is passed. That slows down the training considerably. - :param multi_label: auto-detected by default, but you can set this to True - to force multi-label predictionor False to force single-label prediction - :param multi_label_threshold: If multi-label you can set the threshold to make predictions - :param beta: Parameter for F-beta score for evaluation and training annealing + Args: + task_name: a string depicting the name of the task. + label_dictionary: dictionary of labels you want to predict. 
+ embeddings: name of the pre-trained transformer model e.g., 'bert-base-uncased'. + num_negative_labels_to_sample: number of negative labels to sample for each positive labels against a + sentence during training. Defaults to 2 negative labels for each positive label. + The model would sample all the negative labels if None is passed. + That slows down the training considerably. + multi_label: auto-detected by default, but you can set this to True to force multi-label predictions + or False to force single-label predictions. + multi_label_threshold: If multi-label you can set the threshold to make predictions. + beta: Parameter for F-beta score for evaluation and training annealing. """ super().__init__() diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py index 25a6b1d11..8ac13d946 100644 --- a/flair/models/text_classification_model.py +++ b/flair/models/text_classification_model.py @@ -29,14 +29,15 @@ def __init__( ) -> None: """Initializes a TextClassifier. - :param embeddings: embeddings used to embed each data point - :param label_dictionary: dictionary of labels you want to predict - :param multi_label: auto-detected by default, but you can set this to True to force multi-label prediction - or False to force single-label prediction - :param multi_label_threshold: If multi-label you can set the threshold to make predictions - :param beta: Parameter for F-beta score for evaluation and training annealing - :param loss_weights: Dictionary of weights for labels for the loss function - (if any label's weight is unspecified it will default to 1.0) + Args: + embeddings: embeddings used to embed each data point + label_dictionary: dictionary of labels you want to predict + multi_label: auto-detected by default, but you can set this to True to force multi-label predictions + or False to force single-label predictions. 
+            multi_label_threshold: If multi-label you can set the threshold to make predictions
+            beta: Parameter for F-beta score for evaluation and training annealing
+            loss_weights: Dictionary of weights for labels for the loss function. If any label's weight is
+                unspecified it will default to 1.0
         """
         super().__init__(
             **classifierargs,
diff --git a/flair/nn/model.py b/flair/nn/model.py
index 2e77d67d3..979e27da7 100644
--- a/flair/nn/model.py
+++ b/flair/nn/model.py
@@ -767,14 +767,15 @@ def predict(
     ):
         """Predicts the class labels for the given sentences. The labels are directly added to the sentences.

-        :param sentences: list of sentences
-        :param mini_batch_size: mini batch size to use
-        :param return_probabilities_for_all_classes : return probabilities for all classes instead of only best predicted # noqa: E501
-        :param verbose: set to True to display a progress bar
-        :param return_loss: set to True to return loss
-        :param label_name: set this to change the name of the label type that is predicted
-        :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively. # noqa: E501
-        'gpu' to store embeddings in GPU memory.
+        Args:
+            sentences: list of sentences to predict
+            mini_batch_size: the amount of sentences that will be predicted within one batch
+            return_probabilities_for_all_classes: return probabilities for all classes instead of only best predicted
+            verbose: set to True to display a progress bar
+            return_loss: set to True to return loss
+            label_name: set this to change the name of the label type that is predicted
+            embedding_storage_mode: default is 'none' which is the best in most cases.
+                Only set to 'cpu' or 'gpu' if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively. 'gpu' to store embeddings in GPU memory.
""" if label_name is None: label_name = self.label_type if self.label_type is not None else "label" diff --git a/flair/splitter.py b/flair/splitter.py index cf57b36bd..90464bfc2 100644 --- a/flair/splitter.py +++ b/flair/splitter.py @@ -97,8 +97,9 @@ class SpacySentenceSplitter(SentenceSplitter): Implementation of :class:`SentenceSplitter`, using models from Spacy. - :param model Spacy V2 model or the name of the model to load. - :param tokenizer Custom tokenizer to use (default :class:`SpacyTokenizer`) + Args: + model: Spacy V2 model or the name of the model to load. + tokenizer: Custom tokenizer to use (default :class:`SpacyTokenizer`) """ def __init__(self, model: Union[Any, str], tokenizer: Optional[Tokenizer] = None) -> None: diff --git a/flair/trainers/plugins/base.py b/flair/trainers/plugins/base.py index 993273599..9b5c3b607 100644 --- a/flair/trainers/plugins/base.py +++ b/flair/trainers/plugins/base.py @@ -77,8 +77,9 @@ def validate_event(self, *events: EventIdenifier): def register_hook(self, func: Callable, *events: EventIdenifier): """Register a hook. - :param func: Function to be called when the event is emitted. - :param *events: List of events to call this function on. + Args: + func: Function to be called when the event is emitted. + *events: List of events to call this function on. 
""" self.validate_event(*events) @@ -194,7 +195,7 @@ def attach_to(self, pluggable: Pluggable): try: func = getattr(self, name) - # get attribute hook events (mayr aise an AttributeError) + # get attribute hook events (may raise an AttributeError) events = func._plugin_hook_events # register function as a hook From f010c09cf36ad946ebe739b43b0d81af3aabdd8f Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 10 Jul 2023 17:53:47 +0200 Subject: [PATCH 079/124] fix transformer embeddings tutorial --- .../transformer-embeddings.md | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/docs/tutorial/tutorial-embeddings/transformer-embeddings.md b/docs/tutorial/tutorial-embeddings/transformer-embeddings.md index 0682d7ef8..435a7b722 100644 --- a/docs/tutorial/tutorial-embeddings/transformer-embeddings.md +++ b/docs/tutorial/tutorial-embeddings/transformer-embeddings.md @@ -1,7 +1,7 @@ # Transformer embeddings Flair supports various Transformer-based architectures like BERT or XLNet from [HuggingFace](https://github.com/huggingface), -with two classes `TransformerWordEmbeddings` (to embed words) and `TransformerDocumentEmbeddings` (to embed documents). +with two classes [`TransformerWordEmbeddings`](#flair.embeddings.token.TransformerWordEmbeddings) (to embed words) and [`TransformerDocumentEmbeddings`](#flair.embeddings.document.TransformerDocumentEmbeddings) (to embed documents). ## Embeddings words @@ -35,12 +35,12 @@ sentence = Sentence('The grass is green .') embedding.embed(sentence) ``` -[Here](https://huggingface.co/transformers/pretrained_models.html) is a full list of all models (BERT, RoBERTa, XLM, XLNet etc.). You can use any of these models with this class. +Use the [Huggingface Model hub](https://huggingface.co/models) to find any open source text embedding model to use. 
## Embeddings sentences -To embed a whole sentence as one (instead of each word in the sentence), simply use the TransformerDocumentEmbeddings +To embed a whole sentence as one (instead of each word in the sentence), simply use the [`TransformerDocumentEmbeddings`](#flair.embeddings.document.TransformerDocumentEmbeddings) instead: ```python @@ -58,18 +58,18 @@ embedding.embed(sentence) ## Arguments -There are several options that you can set when you init the TransformerWordEmbeddings -and TransformerDocumentEmbeddings classes: +There are several options that you can set when you init the [`TransformerWordEmbeddings`](#flair.embeddings.document.TransformerWordEmbeddings) +and [`TransformerDocumentEmbeddings`](#flair.embeddings.document.TransformerDocumentEmbeddings) classes: -| Argument | Default | Description -| -------------------- | ------------------- | ------------------------------------------------------------------------------ -| `model` | `bert-base-uncased` | The string identifier of the transformer model you want to use (see above) -| `layers` | `all` | Defines the layers of the Transformer-based model that produce the embedding -| `subtoken_pooling` | `first` | See [Pooling operation section](#pooling). -| `layer_mean` | `True` | See [Layer mean section](#layer-mean). -| `fine_tune` | `False` | Whether or not embeddings are fine-tuneable. -| `allow_long_sentences` | `True` | Whether or not texts longer than maximal sequence length are supported. -| `use_context` | `False` | Set to True to include context outside of sentences. 
This can greatly increase accuracy on some tasks, but slows down embedding generation +| Argument | Default | Description +|------------------------|---------------------|-------------------------------------------------------------------------------------------------------------------------------------------- +| `model` | `bert-base-uncased` | The string identifier of the transformer model you want to use (see above) | +| `layers` | `all` | Defines the layers of the Transformer-based model that produce the embedding | +| `subtoken_pooling` | `first` | See [Pooling operation section](#pooling). | +| `layer_mean` | `True` | See [Layer mean section](#layer-mean). | +| `fine_tune` | `False` | Whether or not embeddings are fine-tuneable. | +| `allow_long_sentences` | `True` | Whether or not texts longer than maximal sequence length are supported. | +| `use_context` | `False` | Set to True to include context outside of sentences. This can greatly increase accuracy on some tasks, but slows down embedding generation | ### Layers @@ -176,6 +176,5 @@ tensor([-0.0323, -0.3904, -1.1946, ..., 0.1305, -0.1365, -0.4323], ### Models -Please have a look at the awesome Hugging Face [documentation](https://huggingface.co/transformers/v2.3.0/pretrained_models.html) -for all supported pretrained models! +Please have a look at the awesome [Huggingface Model hub](https://huggingface.co/models) to find any open source text embedding model to use. 
From 67a7b2e90f64fba8f3dda1e0e11dc663bdc33a94 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 10 Jul 2023 18:47:24 +0200 Subject: [PATCH 080/124] fix :param usage outside of datasets folder --- flair/datasets/base.py | 31 ++--- flair/datasets/relation_extraction.py | 5 - flair/embeddings/base.py | 11 +- flair/file_utils.py | 11 +- flair/inference_utils.py | 8 +- flair/models/clustering.py | 25 ++-- flair/models/lemmatizer_model.py | 24 ++-- flair/models/pairwise_regression_model.py | 4 +- flair/models/regexp_tagger.py | 18 ++- flair/models/relation_classifier_model.py | 113 +++++++++--------- flair/models/relation_extractor_model.py | 12 +- flair/models/sequence_tagger_model.py | 68 ++++++----- flair/models/sequence_tagger_utils/crf.py | 14 ++- flair/models/sequence_tagger_utils/viterbi.py | 40 ++++--- flair/models/tars_model.py | 19 +-- flair/models/word_tagger_model.py | 7 +- flair/nn/distance/cosine.py | 6 +- flair/nn/model.py | 37 +++--- flair/samplers.py | 9 +- flair/tokenization.py | 5 +- flair/trainers/plugins/base.py | 12 +- flair/trainers/plugins/functional/amp.py | 6 +- .../plugins/functional/anneal_on_plateau.py | 16 +-- .../plugins/functional/checkpoints.py | 7 +- .../plugins/functional/linear_scheduler.py | 20 +--- .../plugins/functional/weight_extractor.py | 9 +- flair/trainers/plugins/loggers/loss_file.py | 20 +--- .../plugins/loggers/metric_history.py | 6 +- flair/trainers/plugins/loggers/tensorboard.py | 11 +- flair/trainers/plugins/metric_records.py | 10 +- flair/trainers/trainer.py | 11 +- flair/training_utils.py | 16 ++- flair/visual/ner_html.py | 14 ++- 33 files changed, 309 insertions(+), 316 deletions(-) diff --git a/flair/datasets/base.py b/flair/datasets/base.py index 023eeb894..e0601e5a4 100644 --- a/flair/datasets/base.py +++ b/flair/datasets/base.py @@ -44,7 +44,8 @@ class FlairDatapointDataset(FlairDataset, Generic[DT]): def __init__(self, datapoints: Union[DT, List[DT]]) -> None: """Instantiate FlairDatapointDataset. 
- :param sentences: DT or List of DT that make up FlairDatapointDataset + Args: + datapoints: DT or List of DT that make up FlairDatapointDataset """ # cast to list if necessary if not isinstance(datapoints, list): @@ -130,19 +131,21 @@ def __init__( 'Plats': 'Abrahamsby' } - :param query: Query, e.g. {'Län': 'Stockholms län'} - :param host: Host, e.g. 'localhost', - :param port: Port, e.g. 27017 - :param database: Database, e.g. 'rosenberg', - :param collection: Collection, e.g. 'book', - :param text_field: Text field, e.g. 'Beskrivning', - :param categories_field: List of category fields, e.g ['Län', 'Härad', 'Tingslag', 'Församling', 'Plats'], - :param max_tokens_per_doc: Takes at most this amount of tokens per document. If set to -1 all documents are taken as is. - :param max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens - :param max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars - :param tokenizer: Custom tokenizer to use (default SegtokTokenizer) - :param in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings - :return: list of sentences + Args: + query: Query, e.g. {'Län': 'Stockholms län'} + host: Host, e.g. 'localhost', + port: Port, e.g. 27017 + database: Database, e.g. 'rosenberg', + collection: Collection, e.g. 'book', + text_field: Text field, e.g. 'Beskrivning', + categories_field: List of category fields, e.g ['Län', 'Härad', 'Tingslag', 'Församling', 'Plats'], + max_tokens_per_doc: Takes at most this amount of tokens per document. If set to -1 all documents are taken as is. 
+ max_tokens_per_doc: If set, truncates each Sentence to a maximum number of Tokens + max_chars_per_doc: If set, truncates each Sentence to a maximum number of chars + tokenizer: Custom tokenizer to use (default SegtokTokenizer) + in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings + + Returns: list of sentences """ # first, check if pymongo is installed try: diff --git a/flair/datasets/relation_extraction.py b/flair/datasets/relation_extraction.py index 0421802de..30709a14c 100644 --- a/flair/datasets/relation_extraction.py +++ b/flair/datasets/relation_extraction.py @@ -44,9 +44,6 @@ def __init__( """SemEval-2010 Task 8 on Multi-Way Classification of Semantic Relations Between Pairs of Nominals. see https://aclanthology.org/S10-1006.pdf - :param base_path: - :param in_memory: - :param augment_train: """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) @@ -229,8 +226,6 @@ def __init__(self, base_path: Optional[Union[str, Path]] = None, in_memory: bool with 41 relations from https://nlp.stanford.edu/projects/tacred/. Manual download is required for this dataset. - :param base_path: - :param in_memory: """ base_path = flair.cache_root / "datasets" if not base_path else Path(base_path) diff --git a/flair/embeddings/base.py b/flair/embeddings/base.py index bf3e7645a..f0da70cc4 100644 --- a/flair/embeddings/base.py +++ b/flair/embeddings/base.py @@ -123,7 +123,9 @@ def __init__(self, mixture_size: int, trainable: bool = False) -> None: """Inits scalar mix implementation. ``mixture = gamma * sum(s_k * tensor_k)`` where ``s = softmax(w)``, with ``w`` and ``gamma`` scalar parameters. 
- :param mixture_size: size of mixtures (usually the number of layers) + + Args: + mixture_size: size of mixtures (usually the number of layers) """ super().__init__() self.mixture_size = mixture_size @@ -157,8 +159,11 @@ def forward(self, tensors: List[torch.Tensor]) -> torch.Tensor: Computes a weighted average of the ``tensors``. The input tensors an be any shape with at least two dimensions, but must all be the same shape. - :param tensors: list of input tensors - :return: computed weighted average of input tensors + + Args: + tensors: list of input tensors + + Returns: computed weighted average of input tensors """ if len(tensors) != self.mixture_size: log.error( diff --git a/flair/file_utils.py b/flair/file_utils.py index 7968fefc2..7f0ba5f9e 100644 --- a/flair/file_utils.py +++ b/flair/file_utils.py @@ -45,8 +45,6 @@ def load_big_file(f: str): """Workaround for loading a big pickle file. Files over 2GB cause pickle errors on certain Mac and Windows distributions. - :param f: - :return: """ with open(f, "rb") as f_in: # mmap seems to be much more memory efficient @@ -148,10 +146,11 @@ def unzip_file(file: Union[str, Path], unzip_to: Union[str, Path]): def unpack_file(file: Path, unpack_to: Path, mode: Optional[str] = None, keep: bool = True): """Unpacks an archive file to the given location. 
- :param file Archive file to unpack - :param unpack_to Destination where to store the output - :param mode Type of the archive (zip, tar, gz, targz, rar) - :param keep Indicates whether to keep the archive after extraction or delete it + Args: + file: Archive file to unpack + unpack_to: Destination where to store the output + mode: Type of the archive (zip, tar, gz, targz, rar) + keep: Indicates whether to keep the archive after extraction or delete it """ if mode == "zip" or (mode is None and str(file).endswith("zip")): from zipfile import ZipFile diff --git a/flair/inference_utils.py b/flair/inference_utils.py index 035025c00..031067153 100644 --- a/flair/inference_utils.py +++ b/flair/inference_utils.py @@ -64,10 +64,10 @@ class WordEmbeddingsStore: def __init__(self, embedding: WordEmbeddings, backend="sqlite", verbose=True) -> None: """Instantiates the WordEmbeddingsStore. - :param embedding: Flair WordEmbeddings instance. - :param backend: cache database backend name e.g ``'sqlite'``, ``'lmdb'``. - Default value is ``'sqlite'``. - :param verbose: If `True` print information on standard output + Args: + embedding: The WordEmbeddings to store. + backend: cache database backend name e.g ``'sqlite'``, ``'lmdb'``. Default value is ``'sqlite'``. + verbose: If `True` print information on standard output """ self.items = "" diff --git a/flair/models/clustering.py b/flair/models/clustering.py index 00eb88563..b027d6b9c 100644 --- a/flair/models/clustering.py +++ b/flair/models/clustering.py @@ -22,8 +22,9 @@ class ClusteringModel: def __init__(self, model: Union[ClusterMixin, BaseEstimator], embeddings: DocumentEmbeddings) -> None: """Instantiate the ClusteringModel. - :param model: the clustering algorithm from sklearn this wrapper will use. - :param embeddings: the flair DocumentEmbedding this wrapper uses to calculate a vector for each sentence. + Args: + model: the clustering algorithm from sklearn this wrapper will use. 
+ embeddings: the flair DocumentEmbedding this wrapper uses to calculate a vector for each sentence. """ self.model = model self.embeddings = embeddings @@ -31,7 +32,8 @@ def __init__(self, model: Union[ClusterMixin, BaseEstimator], embeddings: Docume def fit(self, corpus: Corpus, **kwargs): """Trains the model. - :param corpus: the flair corpus this wrapper will use for fitting the model. + Args: + corpus: the flair corpus this wrapper will use for fitting the model. """ X = self._convert_dataset(corpus) @@ -42,7 +44,8 @@ def fit(self, corpus: Corpus, **kwargs): def predict(self, corpus: Corpus): """Predict labels given a list of sentences and returns the respective class indices. - :param corpus: the flair corpus this wrapper will use for predicting the labels. + Args: + corpus: the flair corpus this wrapper will use for predicting the labels. """ X = self._convert_dataset(corpus) log.info("Start the prediction " + str(self.model) + " with " + str(len(X)) + " Datapoints.") @@ -57,7 +60,8 @@ def predict(self, corpus: Corpus): def save(self, model_file: Union[str, Path]): """Saves current model. - :param model_file: path where to save the model. + Args: + model_file: path where to save the model. """ joblib.dump(pickle.dumps(self), str(model_file)) @@ -67,7 +71,8 @@ def save(self, model_file: Union[str, Path]): def load(model_file: Union[str, Path]): """Loads a model from a given path. - :param model_file: path to the file where the model is saved. + Args: + model_file: path to the file where the model is saved. """ log.info("Loading model from: " + str(model_file)) return pickle.loads(joblib.load(str(model_file))) @@ -80,7 +85,8 @@ def _convert_dataset( Turns the corpora into X, y datasets as required for most sklearn clustering models. Ref.: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.cluster - :param label_type: the label from sentences will be extracted. If the value is none this will be skipped. 
+ Args: + label_type: the label from sentences will be extracted. If the value is none this will be skipped. """ log.info("Embed sentences...") sentences = [] @@ -107,8 +113,9 @@ def evaluate(self, corpus: Corpus, label_type: str): Also, the result of the evaluation is logged. - :param corpus: the flair corpus this wrapper will use for evaluation. - :param label_type: the label from the sentence will be used for the evaluation. + Args: + corpus: the flair corpus this wrapper will use for evaluation. + label_type: the label from the sentence will be used for the evaluation. """ X, Y = self._convert_dataset(corpus, label_type=label_type) predict = self.model.predict(X) diff --git a/flair/models/lemmatizer_model.py b/flair/models/lemmatizer_model.py index d647da935..835649110 100644 --- a/flair/models/lemmatizer_model.py +++ b/flair/models/lemmatizer_model.py @@ -167,9 +167,11 @@ def words_to_char_indices( indices representing characters in self.char_dict. One can manually set the vector length with the parameter seq_length, though the vector length is always at least maximum string length in the list. - :param end_symbol: add self.end_index at the end of each representation - :param start_symbol: add self.start_index in front of of each representation - :param padding_in_front: whether to fill up with self.dummy_index in front or in back of strings + + Args: + end_symbol: add self.end_index at the end of each representation + start_symbol: add self.start_index in front of each representation + padding_in_front: whether to fill up with self.dummy_index in front or in back of strings """ # add additional columns for special symbols if necessary c = int(end_symbol) + int(start_symbol) @@ -405,15 +407,13 @@ def predict( ): """Predict lemmas of words for a given (list of) sentence(s). 
- :param sentences: sentences to predict - :param label_name: label name used for predicted lemmas - :param mini_batch_size: number of tokens that are send through the RNN simultaneously, assuming batching_in_rnn - is set to True - :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if - you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively. - :param return_loss: whether or not to compute and return loss. Setting it to True only makes sense if labels - are provided - :param verbose: If True, lemmatized sentences will be printed in the console. + Args: + sentences: sentences to predict + label_name: label name used for predicted lemmas + mini_batch_size: number of tokens that are send through the RNN simultaneously, assuming batching_in_rnn is set to True + embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively. + return_loss: whether to compute and return loss. Setting it to True only makes sense if labels are provided + verbose: If True, lemmatized sentences will be printed in the console. """ if isinstance(sentences, Sentence): sentences = [sentences] diff --git a/flair/models/pairwise_regression_model.py b/flair/models/pairwise_regression_model.py index 0cec52070..174bc8a16 100644 --- a/flair/models/pairwise_regression_model.py +++ b/flair/models/pairwise_regression_model.py @@ -37,8 +37,8 @@ def __init__( ) -> None: """Initialize the Text Pair Regression Model. 
- :param embeddings: embeddings used to embed each data point - :param label_name: + Args: + embeddings: embeddings used to embed each data point """ super().__init__() diff --git a/flair/models/regexp_tagger.py b/flair/models/regexp_tagger.py index a6b7f6c80..35c244d96 100644 --- a/flair/models/regexp_tagger.py +++ b/flair/models/regexp_tagger.py @@ -10,7 +10,8 @@ class TokenCollection: """A utility class for RegexpTagger to hold all tokens for a given Sentence and define some functionality. - :param sentence: A Sentence object + Args: + sentence: A Sentence object """ sentence: Sentence @@ -33,8 +34,10 @@ def get_token_span(self, span: Tuple[int, int]) -> Span: spanning the tokens included in the interval. If the interval is overlapping with a token span, a ValueError is raised - :param span: Start and end pos of the requested span as tuple - :return: A span object spanning the requested token interval + Args: + span: Start and end pos of the requested span as tuple + + Returns: A span object spanning the requested token interval """ span_start: int = self.__tokens_start_pos.index(span[0]) span_end: int = self.__tokens_end_pos.index(span[1]) @@ -52,7 +55,8 @@ def __init__(self, mapping: Union[List[Tuple[str, str]], Tuple[str, str]]) -> No If a match violates (in this case overlaps) a token span, an exception is raised. - :param mapping: A list of tuples or a single tuple representing a mapping as regexp -> label + Args: + mapping: A list of tuples or a single tuple representing a mapping as regexp -> label """ self._regexp_mapping: Dict[str, typing.Pattern] = {} self.register_labels(mapping=mapping) @@ -64,7 +68,8 @@ def registered_labels(self): def register_labels(self, mapping: Union[List[Tuple[str, str]], Tuple[str, str]]): """Register a regexp -> label mapping. 
- :param mapping: A list of tuples or a single tuple representing a mapping as regexp -> label + Args: + mapping: A list of tuples or a single tuple representing a mapping as regexp -> label """ mapping = self._listify(mapping) @@ -79,7 +84,8 @@ def register_labels(self, mapping: Union[List[Tuple[str, str]], Tuple[str, str]] def remove_labels(self, labels: Union[List[str], str]): """Remove a registered regexp -> label mapping given by label. - :param labels: A list of labels or a single label as strings. + Args: + labels: A list of labels or a single label as strings. """ labels = self._listify(labels) diff --git a/flair/models/relation_classifier_model.py b/flair/models/relation_classifier_model.py index 4fa99979f..c819dd135 100644 --- a/flair/models/relation_classifier_model.py +++ b/flair/models/relation_classifier_model.py @@ -264,34 +264,18 @@ def __init__( ) -> None: """Initializes a `RelationClassifier`. - :param embeddings: The document embeddings used to embed each sentence - :param label_dictionary: A Dictionary containing all predictable labels from the corpus - :param label_type: The label type which is going to be predicted, in case a corpus has multiple annotations - :param entity_label_types: A label type or sequence of label types of the required relation entities. - You can also specify a label filter in a dictionary with the label type as key and - the valid entity labels as values in a set. - E.g. to use only 'PER' and 'ORG' labels from a NER-tagger: `{'ner': {'PER', 'ORG'}}`. - To use all labels from 'ner', pass 'ner'. - :param entity_pair_labels: A set of valid relation entity pair combinations, used as relation candidates. - Specify valid entity pairs in a set of tuples of labels (, ). - E.g. for the `born_in` relation, only relations from 'PER' to 'LOC' make sense. - Here, relations from 'PER' to 'PER' are not meaningful, so - it is advised to specify the `entity_pair_labels` as `{('PER', 'ORG')}`. 
- This setting may help to reduce the number of relation candidates. - Leaving this parameter as `None` (default) disables the relation-candidate-filter, - i.e. the model classifies the relation for each entity pair - in the cross product of *all* entity pairs (inefficient). - :param entity_threshold: Only pre-labelled entities above this threshold are taken into account by the model. - :param cross_augmentation: If `True`, use cross augmentation to transform `Sentence`s into `EncodedSentenece`s. - When cross augmentation is enabled, the transformation functions, - e.g. `transform_corpus`, generate an encoded sentence for each entity pair - in the cross product of all entities in the original sentence. - When disabling cross augmentation, the transform functions only generate - encoded sentences for each gold relation annotation in the original sentence. - :param encoding_strategy: An instance of a class conforming the :class:`EncodingStrategy` protocol - :param zero_tag_value: The label to use for out-of-class relations - :param allow_unk_tag: If `False`, removes `` from the passed label dictionary, otherwise do nothing. - :param classifierargs: The remaining parameters passed to the underlying `DefaultClassifier` + Args: + embeddings: The document embeddings used to embed each sentence + label_dictionary: A Dictionary containing all predictable labels from the corpus + label_type: The label type which is going to be predicted, in case a corpus has multiple annotations + entity_label_types: A label type or sequence of label types of the required relation entities. You can also specify a label filter in a dictionary with the label type as key and the valid entity labels as values in a set. E.g. to use only 'PER' and 'ORG' labels from a NER-tagger: `{'ner': {'PER', 'ORG'}}`. To use all labels from 'ner', pass 'ner'. + entity_pair_labels: A set of valid relation entity pair combinations, used as relation candidates. 
Specify valid entity pairs in a set of tuples of labels (, ). E.g. for the `born_in` relation, only relations from 'PER' to 'LOC' make sense. Here, relations from 'PER' to 'PER' are not meaningful, so it is advised to specify the `entity_pair_labels` as `{('PER', 'ORG')}`. This setting may help to reduce the number of relation candidates. Leaving this parameter as `None` (default) disables the relation-candidate-filter, i.e. the model classifies the relation for each entity pair in the cross product of *all* entity pairs (inefficient). + entity_threshold: Only pre-labelled entities above this threshold are taken into account by the model. + cross_augmentation: If `True`, use cross augmentation to transform `Sentence`s into `EncodedSentenece`s. When cross augmentation is enabled, the transformation functions, e.g. `transform_corpus`, generate an encoded sentence for each entity pair in the cross product of all entities in the original sentence. When disabling cross augmentation, the transform functions only generate encoded sentences for each gold relation annotation in the original sentence. + encoding_strategy: An instance of a class conforming the :class:`EncodingStrategy` protocol + zero_tag_value: The label to use for out-of-class relations + allow_unk_tag: If `False`, removes `` from the passed label dictionary, otherwise do nothing. + classifierargs: The remaining parameters passed to the underlying :class:`flair.models.DefaultClassifier` """ # Set label type and prepare label dictionary self._label_type = label_type @@ -345,10 +329,13 @@ def __init__( self.to(flair.device) def _valid_entities(self, sentence: Sentence) -> Iterator[_Entity]: - """Yields all valid entities, filtered under the specification of `self.entity_label_types`. + """Yields all valid entities, filtered under the specification of :attr:`~entity_label_types`. 
+ + Args: + sentence: A Sentence object with entity annotations - :param sentence: A flair `Sentence` object with entity annotations - :return: Valid entities as `_Entity` + Yields: + Valid entities as `_Entity` """ for label_type, valid_labels in self.entity_label_types.items(): for entity_span in sentence.get_spans(label_type=label_type): @@ -416,14 +403,16 @@ def _encode_sentence( tail: _Entity, gold_label: Optional[str] = None, ) -> EncodedSentence: - """Returns a new `Sentence` object with masked/marked head and tail spans according to the encoding strategy. + """Returns a new Sentence object with masked/marked head and tail spans according to the encoding strategy. - If provided, the encoded sentence also has the corresponding gold label annotation from `self.label_type`. + If provided, the encoded sentence also has the corresponding gold label annotation from :attr:`~label_type`. - :param head: The head `_Entity` - :param tail: The tail `_Entity` - :param gold_label: An optional gold label of the induced relation by the head and tail entity - :return: The `EncodedSentence` (with gold annotations) + Args: + head: The head Entity + tail: The tail Entity + gold_label: An optional gold label of the induced relation by the head and tail entity + + Returns: The EncodedSentence with Gold Annotations """ # Some sanity checks original_sentence: Sentence = head.span.sentence @@ -478,9 +467,10 @@ def _encode_sentence_for_inference( **exactly** one induced relation annotation, the gold annotation or `self.zero_tag_value`. - The created relations have head and tail spans from the original passed sentence. 
- :param sentence: A flair `Sentence` object with entity annotations - :return: Encoded sentences annotated with their gold relation and - the corresponding relation in the original sentence + Args: + sentence: A flair `Sentence` object with entity annotations + + Returns: Encoded sentences annotated with their gold relation and the corresponding relation in the original sentence """ for head, tail, gold_label in self._entity_pair_permutations(sentence): masked_sentence: EncodedSentence = self._encode_sentence( @@ -518,11 +508,14 @@ def transform_sentence(self, sentences: Union[Sentence, List[Sentence]]) -> List """Transforms sentences into encoded sentences specific to the `RelationClassifier`. For more information on the internal sentence transformation procedure, - see the :class:`RelationClassifier` architecture and - the different :class:`EncodingStrategy` variants docstrings. + see the :class:`flair.models.RelationClassifier` architecture and + the different :class:`flair.models.relation_classifier_model.EncodingStrategy` variants docstrings. + + Args: + sentences: sentences to transform - :param sentences: A (list) of sentence(s) to transform - :return: A list of encoded sentences specific to the `RelationClassifier` + Returns: + A list of encoded sentences specific to the `RelationClassifier` """ if not isinstance(sentences, list): sentences = [sentences] @@ -541,8 +534,10 @@ def transform_dataset(self, dataset: Dataset[Sentence]) -> FlairDatapointDataset see the :class:`RelationClassifier` architecture and the different :class:`EncodingStrategy` variants docstrings. 
- :param dataset: A dataset of sentences to transform - :return: A dataset of encoded sentences specific to the `RelationClassifier` + Args: + dataset: A dataset of sentences to transform + + Returns: A dataset of encoded sentences specific to the `RelationClassifier` """ data_loader: DataLoader = DataLoader(dataset, batch_size=1) original_sentences: List[Sentence] = [batch[0] for batch in iter(data_loader)] @@ -556,8 +551,10 @@ def transform_corpus(self, corpus: Corpus[Sentence]) -> Corpus[EncodedSentence]: see the :class:`RelationClassifier` architecture and the different :class:`EncodingStrategy` variants docstrings. - :param corpus: A corpus of sentences to transform - :return: A corpus of encoded sentences specific to the `RelationClassifier` + Args: + corpus: A corpus of sentences to transform + + Returns: A corpus of encoded sentences specific to the `RelationClassifier` """ return Corpus( train=self.transform_dataset(corpus.train) if corpus.train is not None else None, @@ -612,16 +609,16 @@ def predict( Standard `Sentence` objects and `EncodedSentences` specific to the `RelationClassifier` are allowed as input. The (relation) labels are directly added to the sentences. - :param sentences: A list of (encoded) sentences. - :param mini_batch_size: The mini batch size to use - :param return_probabilities_for_all_classes: Return probabilities for all classes instead of only best predicted - :param verbose: Set to display a progress bar - :param return_loss: Set to return loss - :param label_name: Set to change the predicted label type name - :param embedding_storage_mode: The default is 'none', which is always best. - Only set to 'cpu' or 'gpu' if you wish to predict - and keep the generated embeddings in CPU or GPU memory, respectively. - :return: The loss and the total number of classes, if `return_loss` is set + Args: + sentences: A list of (encoded) sentences. 
+ mini_batch_size: The mini batch size to use + return_probabilities_for_all_classes: Return probabilities for all classes instead of only best predicted + verbose: Set to display a progress bar + return_loss: Set to return loss + label_name: Set to change the predicted label type name + embedding_storage_mode: The default is 'none', which is always best. Only set to 'cpu' or 'gpu' if you wish to predict and keep the generated embeddings in CPU or GPU memory, respectively. + + Returns: The loss and the total number of classes, if `return_loss` is set """ prediction_label_type: str = self.label_type if label_name is None else label_name diff --git a/flair/models/relation_extractor_model.py b/flair/models/relation_extractor_model.py index 83e063b72..c270f87aa 100644 --- a/flair/models/relation_extractor_model.py +++ b/flair/models/relation_extractor_model.py @@ -25,12 +25,12 @@ def __init__( ) -> None: """Initializes a RelationClassifier. - :param document_embeddings: embeddings used to embed each data point - :param label_dictionary: dictionary of labels you want to predict - :param beta: Parameter for F-beta score for evaluation and training annealing - :param train_on_gold_pairs_only: Set true to not train to predict no relation. - :param loss_weights: Dictionary of weights for labels for the loss function - (if any label's weight is unspecified it will default to 1.0) + Args: + document_embeddings: embeddings used to embed each data point + label_dictionary: dictionary of labels you want to predict + beta: Parameter for F-beta score for evaluation and training annealing + train_on_gold_pairs_only: Set true to not train to predict no relation. 
+ loss_weights: Dictionary of weights for labels for the loss function any unspecified labels will default to a weight of 1.0 """ # pooling operation to get embeddings for entites self.pooling_operation = pooling_operation diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py index 61f32294f..76646c562 100644 --- a/flair/models/sequence_tagger_model.py +++ b/flair/models/sequence_tagger_model.py @@ -206,7 +206,8 @@ def label_type(self): def _init_loss_weights(self, loss_weights: Dict[str, float]) -> torch.Tensor: """Initializes the loss weights based on given dictionary. - :param loss_weights: dictionary - contains loss weights + Args: + loss_weights: dictionary - contains loss weights """ n_classes = len(self.label_dictionary) weight_list = [1.0 for _ in range(n_classes)] @@ -219,7 +220,8 @@ def _init_loss_weights(self, loss_weights: Dict[str, float]) -> torch.Tensor: def _init_initial_hidden_state(self, num_directions: int): """Initializes hidden states given the number of directions in RNN. - :param num_directions: Number of directions in RNN. + Args: + num_directions: Number of directions in RNN. """ hs_initializer = torch.nn.init.xavier_normal_ lstm_init_h = torch.nn.Parameter( @@ -243,11 +245,12 @@ def RNN( ) -> torch.nn.RNN: """Static wrapper function returning an RNN instance from PyTorch. 
- :param rnn_type: Type of RNN from torch.nn - :param rnn_layers: number of layers to include - :param hidden_size: hidden size of RNN cell - :param bidirectional: If True, RNN cell is bidirectional - :param rnn_input_dim: Input dimension to RNN cell + Args: + rnn_type: Type of RNN from torch.nn + rnn_layers: number of layers to include + hidden_size: hidden size of RNN cell + bidirectional: If True, RNN cell is bidirectional + rnn_input_dim: Input dimension to RNN cell """ if rnn_type in ["LSTM", "GRU", "RNN"]: RNN = getattr(torch.nn, rnn_type)( @@ -289,8 +292,9 @@ def _prepare_tensors(self, data_points: Union[List[Sentence], Sentence]) -> Tupl def forward(self, sentence_tensor: torch.Tensor, lengths: torch.LongTensor): """Forward propagation through network. - :param sentence_tensor: A tensor representing the batch of sentences. - :param lengths: A IntTensor representing the lengths of the respective sentences. + Args: + sentence_tensor: A tensor representing the batch of sentences. + lengths: A IntTensor representing the lengths of the respective sentences. """ if self.use_dropout: sentence_tensor = self.dropout(sentence_tensor) @@ -365,8 +369,10 @@ def _get_scores_from_features(features: torch.Tensor, lengths: torch.Tensor): Trims current batch tensor in shape (batch size, sequence length, tagset size) in such a way that all pads are going to be removed. - :param features: torch.tensor containing all features from forward propagation - :param lengths: length from each sentence in batch in order to trim padding tokens + + Args: + features: all features from forward propagation + lengths: length from each sentence in batch in order to trim padding tokens """ features_formatted = [] for feat, length in zip(features, lengths): @@ -378,7 +384,8 @@ def _get_scores_from_features(features: torch.Tensor, lengths: torch.Tensor): def _get_gold_labels(self, sentences: List[Sentence]) -> List[str]: """Extracts gold labels from each sentence. 
- :param sentences: List of sentences in batch + Args: + sentences: List of sentences in batch """ # spans need to be encoded as token-level predictions if self.predict_spans: @@ -430,13 +437,14 @@ def predict( ): """Predicts labels for current batch with CRF or Softmax. - :param sentences: List of sentences in batch - :param mini_batch_size: batch size for test data - :param return_probabilities_for_all_classes: Whether to return probabilities for all classes - :param verbose: whether to use progress bar - :param label_name: which label to predict - :param return_loss: whether to return loss value - :param embedding_storage_mode: determines where to store embeddings - can be "gpu", "cpu" or None. + Args: + sentences: List of sentences in batch + mini_batch_size: batch size for test data + return_probabilities_for_all_classes: Whether to return probabilities for all classes + verbose: whether to use progress bar + label_name: which label to predict + return_loss: whether to return loss value + embedding_storage_mode: determines where to store embeddings - can be "gpu", "cpu" or None. """ if label_name is None: label_name = self.tag_type @@ -532,9 +540,10 @@ def predict( def _standard_inference(self, features: torch.Tensor, batch: List[Sentence], probabilities_for_all_classes: bool): """Softmax over emission scores from forward propagation. 
- :param features: sentence tensor from forward propagation - :param batch: list of sentence - :param probabilities_for_all_classes: whether to return score for each tag in tag dictionary + Args: + features: sentence tensor from forward propagation + batch: sentences + probabilities_for_all_classes: whether to return score for each tag in tag dictionary """ softmax_batch = F.softmax(features, dim=1).cpu() scores_batch, prediction_batch = torch.max(softmax_batch, dim=1) @@ -562,7 +571,8 @@ def _standard_inference(self, features: torch.Tensor, batch: List[Sentence], pro def _all_scores_for_token(self, sentences: List[Sentence], scores: torch.Tensor, lengths: List[int]): """Returns all scores for each tag in tag dictionary. - :param scores: Scores for current sentence. + Args: + scores: Scores for current sentence. """ scores = scores.numpy() tokens = [token for sentence in sentences for token in sentence] @@ -916,11 +926,13 @@ def push_to_hub( ): """Uploads the Sequence Tagger model to a Hugging Face Hub repository. - :param repo_id: A namespace (user or an organization) and a repo name separated by a `/`. - :param token: An authentication token (See https://huggingface.co/settings/token). - :param private: Whether the repository is private. - :param commit_message: Message to commit while pushing. - :return: The url of the repository. + Args: + repo_id: A namespace (user or an organization) and a repo name separated by a `/`. + token: An authentication token (See https://huggingface.co/settings/token). + private: Whether the repository is private. + commit_message: Message to commit while pushing. + + Returns: The url of the repository. 
""" # Lazy import from huggingface_hub import create_repo, model_info, upload_folder diff --git a/flair/models/sequence_tagger_utils/crf.py b/flair/models/sequence_tagger_utils/crf.py index 0b4e78e6c..085339dce 100644 --- a/flair/models/sequence_tagger_utils/crf.py +++ b/flair/models/sequence_tagger_utils/crf.py @@ -17,9 +17,10 @@ class CRF(torch.nn.Module): def __init__(self, tag_dictionary, tagset_size: int, init_from_state_dict: bool) -> None: """Initialize the Conditional Random Field. - :param tag_dictionary: tag dictionary in order to find ID for start and stop tags - :param tagset_size: number of tag from tag dictionary - :param init_from_state_dict: whether we load pretrained model from state dict + Args: + tag_dictionary: tag dictionary in order to find ID for start and stop tags + tagset_size: number of tag from tag dictionary + init_from_state_dict: whether we load pretrained model from state dict """ super().__init__() @@ -37,9 +38,10 @@ def __init__(self, tag_dictionary, tagset_size: int, init_from_state_dict: bool) def forward(self, features: torch.Tensor) -> torch.Tensor: """Forward propagation of Conditional Random Field. 
- :param features: output from RNN / Linear layer in shape (batch size, seq len, hidden size) - :return: CRF scores (emission scores for each token + transitions prob from previous state) in - shape (batch_size, seq len, tagset size, tagset size) + Args: + features: output from RNN / Linear layer in shape (batch size, seq len, hidden size) + + Returns: CRF scores (emission scores for each token + transitions prob from previous state) in shape (batch_size, seq len, tagset size, tagset size) """ batch_size, seq_len = features.size()[:2] diff --git a/flair/models/sequence_tagger_utils/viterbi.py b/flair/models/sequence_tagger_utils/viterbi.py index 1cae3c008..3fa41df6d 100644 --- a/flair/models/sequence_tagger_utils/viterbi.py +++ b/flair/models/sequence_tagger_utils/viterbi.py @@ -19,7 +19,8 @@ class ViterbiLoss(torch.nn.Module): def __init__(self, tag_dictionary: Dictionary) -> None: """Create an instance of the Viterbi loss. - :param tag_dictionary: tag_dictionary of task + Args: + tag_dictionary: tag_dictionary of task """ super().__init__() self.tag_dictionary = tag_dictionary @@ -30,10 +31,11 @@ def __init__(self, tag_dictionary: Dictionary) -> None: def forward(self, features_tuple: tuple, targets: torch.Tensor) -> torch.Tensor: """Forward propagation of Viterbi Loss. - :param features_tuple: CRF scores from forward method in shape (batch size, seq len, tagset size, tagset size), - lengths of sentences in batch, transitions from CRF - :param targets: true tags for sentences which will be converted to matrix indices. - :return: summed Viterbi Loss over all data points + Args: + features_tuple: CRF scores from forward method in shape (batch size, seq len, tagset size, tagset size), lengths of sentences in batch, transitions from CRF + targets: true tags for sentences which will be converted to matrix indices. 
+ + Returns: summed Viterbi Loss over all data points """ features, lengths, transitions = features_tuple @@ -82,9 +84,11 @@ def forward(self, features_tuple: tuple, targets: torch.Tensor) -> torch.Tensor: def _log_sum_exp(tensor, dim): """Calculates the log-sum-exponent of a tensor's dimension in a numerically stable way. - :param tensor: tensor - :param dim: dimension to calculate log-sum-exp of - :return: log-sum-exp + Args: + tensor: tensor + dim: dimension to calculate log-sum-exp of + + Returns: log-sum-exp """ m, _ = torch.max(tensor, dim) m_expanded = m.unsqueeze(dim).expand_as(tensor) @@ -99,8 +103,9 @@ def _format_targets(self, targets: torch.Tensor, lengths: torch.IntTensor): from previous tag 5 and could directly be addressed through the 1-dim indices (10 + tagset_size * 5) = 70, if our tagset consists of 12 tags. - :param targets: targets as in tag dictionary - :param lengths: lengths of sentences in batch + Args: + targets: targets as in tag dictionary + lengths: lengths of sentences in batch """ targets_per_sentence = [] @@ -127,7 +132,8 @@ class ViterbiDecoder: def __init__(self, tag_dictionary: Dictionary) -> None: """Initialize the Viterbi Decoder. - :param tag_dictionary: Dictionary of tags for sequence labeling task + Args: + tag_dictionary: Dictionary of tags for sequence labeling task """ self.tag_dictionary = tag_dictionary self.tagset_size = len(tag_dictionary) @@ -139,10 +145,11 @@ def decode( ) -> Tuple[List, List]: """Decoding function returning the most likely sequence of tags. 
- :param features_tuple: CRF scores from forward method in shape (batch size, seq len, tagset size, tagset size), - lengths of sentence in batch, transitions of CRF - :param probabilities_for_all_classes: whether to return probabilities for all tags - :return: decoded sequences + Args: + features_tuple: CRF scores from forward method in shape (batch size, seq len, tagset size, tagset size), lengths of sentence in batch, transitions of CRF + probabilities_for_all_classes: whether to return probabilities for all tags + + Returns: decoded sequences """ features, lengths, transitions = features_tuple all_tags = [] @@ -220,7 +227,8 @@ def decode( def _all_scores_for_token(self, scores: torch.Tensor, lengths: torch.IntTensor, sentences: List[Sentence]): """Returns all scores for each tag in tag dictionary. - :param scores: Scores for current sentence. + Args: + scores: Scores for current sentence. """ scores = scores.numpy() prob_tags_per_sentence = [] diff --git a/flair/models/tars_model.py b/flair/models/tars_model.py index dbcce0229..d36dcc347 100644 --- a/flair/models/tars_model.py +++ b/flair/models/tars_model.py @@ -183,11 +183,13 @@ def add_and_switch_to_new_task( Sets necessary attributes and finally 'switches' to the new task. Parameters are similar to the constructor except for model choice, batch size and negative sampling. This method does not store the resultant model onto disk. - :param task_name: a string depicting the name of the task - :param label_dictionary: dictionary of the labels you want to predict - :param label_type: string to identify the label type ('ner', 'sentiment', etc.) - :param multi_label: whether this task is a multi-label prediction problem - :param force_switch: if True, will overwrite existing task with same name + + Args: + task_name: a string depicting the name of the task + label_dictionary: dictionary of the labels you want to predict + label_type: string to identify the label type ('ner', 'sentiment', etc.) 
+ multi_label: whether this task is a multi-label prediction problem + force_switch: if True, will overwrite existing task with same name """ if task_name in self._task_specific_attributes and not force_switch: log.warning(f"Task `{task_name}` already exists in TARS model. Switching to it.") @@ -262,9 +264,10 @@ def predict_zero_shot( ): """Make zero shot predictions from the TARS model. - :param sentences: input sentence objects to classify - :param candidate_label_set: set of candidate labels - :param multi_label: indicates whether multi-label or single class prediction. Defaults to True. + Args: + sentences: input sentence objects to classify + candidate_label_set: set of candidate labels + multi_label: indicates whether multi-label or single class prediction. Defaults to True. """ # check if candidate_label_set is empty if candidate_label_set is None or len(candidate_label_set) == 0: diff --git a/flair/models/word_tagger_model.py b/flair/models/word_tagger_model.py index d00d12b6c..aa111e5c4 100644 --- a/flair/models/word_tagger_model.py +++ b/flair/models/word_tagger_model.py @@ -25,9 +25,10 @@ def __init__( ) -> None: """Initializes a TokenClassifier. - :param embeddings: word embeddings used in tagger - :param tag_dictionary: dictionary of tags you want to predict - :param tag_type: string identifier for tag type + Args: + embeddings: word embeddings used in tagger + tag_dictionary: dictionary of tags you want to predict + tag_type: string identifier for tag type """ # if the classifier predicts BIO/BIOES span labels, the internal label dictionary must be computed if label_dictionary.span_labels: diff --git a/flair/nn/distance/cosine.py b/flair/nn/distance/cosine.py index a19cc8452..188b73f37 100644 --- a/flair/nn/distance/cosine.py +++ b/flair/nn/distance/cosine.py @@ -6,8 +6,10 @@ def dot_product(a: torch.Tensor, b: torch.Tensor, normalize=False): """Computes dot product for pairs of vectors. 
- :param normalize: Vectors are normalized (leads to cosine similarity) - :return: Matrix with res[i][j] = dot_product(a[i], b[j]) + Args: + normalize: Vectors are normalized (leads to cosine similarity) + + Returns: Matrix with res[i][j] = dot_product(a[i], b[j]) """ if len(a.shape) == 1: a = a.unsqueeze(0) diff --git a/flair/nn/model.py b/flair/nn/model.py index 979e27da7..7a95be519 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -62,10 +62,14 @@ def evaluate( """Evaluates the model. Returns a Result object containing evaluation results and a loss value. Implement this to enable evaluation. - :param data_loader: DataLoader that iterates over dataset to be evaluated - :param out_path: Optional output path to store predictions - :param embedding_storage_mode: One of 'none', 'cpu' or 'gpu'. 'none' means all embeddings are deleted and freshly recomputed, 'cpu' means all embeddings are stored on CPU, or 'gpu' means all embeddings are stored on GPU # noqa: E501 - :return: Returns a Tuple consisting of a Result object and a loss float value + + Args: + data_loader: DataLoader that iterates over dataset to be evaluated + out_path: Optional output path to store predictions + embedding_storage_mode: One of 'none', 'cpu' or 'gpu'. 'none' means all embeddings are deleted and freshly recomputed, 'cpu' means all embeddings are stored on CPU, or 'gpu' means all embeddings are stored on GPU + + Returns: + The evaluation results. """ raise NotImplementedError @@ -100,7 +104,8 @@ def _fetch_model(model_name) -> str: def save(self, model_file: Union[str, Path], checkpoint: bool = False): """Saves the current model to the provided file. - :param model_file: the model file + Args: + model_file: the model file """ model_state = self._get_state_dict() @@ -115,8 +120,10 @@ def save(self, model_file: Union[str, Path], checkpoint: bool = False): def load(cls, model_path: Union[str, Path, Dict[str, Any]]) -> "Model": """Loads the model from the given file. 
- :param model_path: the model file or the already loaded state dict - :return: the loaded text classifier model + Args: + model_path: the model file or the already loaded state dict + + Returns: the loaded text classifier model """ # if this class is abstract, go through all inheriting classes and try to fetch and load the model if inspect.isabstract(cls): @@ -498,13 +505,15 @@ def predict( """Predicts the class labels for the given sentences. The labels are directly added to the sentences. - :param sentences: list of sentences - :param mini_batch_size: mini batch size to use - :param return_probabilities_for_all_classes : return probabilities for all classes instead of only best predicted # noqa: E501 - :param verbose: set to True to display a progress bar - :param return_loss: set to True to return loss - :param label_name: set this to change the name of the label type that is predicted # noqa: E501 - :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively. 'gpu' to store embeddings in GPU memory. # noqa: E501 + + Args: + sentences: list of sentences + mini_batch_size: mini batch size to use + return_probabilities_for_all_classes: return probabilities for all classes instead of only best predicted + verbose: set to True to display a progress bar + return_loss: set to True to return loss + label_name: set this to change the name of the label type that is predicted # noqa: E501 + embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively. 'gpu' to store embeddings in GPU memory. 
# noqa: E501 """ raise NotImplementedError diff --git a/flair/samplers.py b/flair/samplers.py index e20646624..d8a5c22f3 100644 --- a/flair/samplers.py +++ b/flair/samplers.py @@ -13,7 +13,8 @@ class FlairSampler(Sampler): def set_dataset(self, data_source): """Initialize the data source for the FlairSampler. - :param data_source: dataset to sample from. + Args: + data_source: dataset to sample from. """ self.data_source = data_source self.num_samples = len(self.data_source) @@ -31,7 +32,8 @@ def __init__(self) -> None: def set_dataset(self, data_source): """Initialize the dataset used for sampling. - :param data_source: + Args: + data_source: """ self.data_source = data_source self.num_samples = len(self.data_source) @@ -91,7 +93,8 @@ class ExpandingChunkSampler(FlairSampler): def __init__(self, step=3) -> None: """Initialize the ExpandingChunkSampler. - :param step: every *step* epochs the block size increments by one. + Args: + step: every *step* epochs the block size increments by one. """ super().__init__(None) self.block_size = 1 diff --git a/flair/tokenization.py b/flair/tokenization.py index ab4c0d239..af77e5f57 100644 --- a/flair/tokenization.py +++ b/flair/tokenization.py @@ -31,9 +31,8 @@ def name(self) -> str: class SpacyTokenizer(Tokenizer): """Tokenizer using spacy under the hood. - Implementation of :class:`Tokenizer`, using models from Spacy. - - :param model a Spacy V2 model or the name of the model to load. + Args: + model: a Spacy V2 model or the name of the model to load. """ def __init__(self, model) -> None: diff --git a/flair/trainers/plugins/base.py b/flair/trainers/plugins/base.py index 9b5c3b607..709866dad 100644 --- a/flair/trainers/plugins/base.py +++ b/flair/trainers/plugins/base.py @@ -38,7 +38,8 @@ class Pluggable: def __init__(self, *, plugins: Sequence[PluginArgument] = []) -> None: """Initialize a `Pluggable`. - :param plugins: Plugins which should be attached to this `Pluggable`. 
+ Args: + plugins: Plugins which should be attached to this `Pluggable`. """ self._hook_handles: Dict[EventIdenifier, Dict[HookHandleId, HookHandle]] = defaultdict(dict) @@ -124,10 +125,11 @@ def __init__( ) -> None: """Intitialize `HookHandle`. - :param _id: Id, the callback is stored as in the `Pluggable`. - :param *events: List of events, the callback is registered for. - :param func: The callback function. - :param pluggable: The `Pluggable` where the callback is registered. + Args: + _id: Id, the callback is stored as in the `Pluggable`. + events: List of events, the callback is registered for. + func: The callback function. + pluggable: The `Pluggable` where the callback is registered. """ pluggable.validate_event(*events) diff --git a/flair/trainers/plugins/functional/amp.py b/flair/trainers/plugins/functional/amp.py index a0040b76a..411b7d372 100644 --- a/flair/trainers/plugins/functional/amp.py +++ b/flair/trainers/plugins/functional/amp.py @@ -39,11 +39,7 @@ def backward(self, loss): @TrainerPlugin.hook def after_setup(self, **kw): - """Wraps with AMP. - - :param kw: - :return: - """ + """Wraps with AMP.""" optimizer = self.trainer.optimizer self.trainer.model, self.trainer.optimizer = self.amp.initialize( diff --git a/flair/trainers/plugins/functional/anneal_on_plateau.py b/flair/trainers/plugins/functional/anneal_on_plateau.py index f33c60b77..69646ef5d 100644 --- a/flair/trainers/plugins/functional/anneal_on_plateau.py +++ b/flair/trainers/plugins/functional/anneal_on_plateau.py @@ -51,13 +51,7 @@ def after_setup( optimizer, **kw, ): - """Initialize different schedulers, including anneal target for AnnealOnPlateau, batch_growth_annealing, loading schedulers. 
- - :param train_with_dev: - :param optimizer: - :param kw: - :return: - """ + """Initialize different schedulers, including anneal target for AnnealOnPlateau, batch_growth_annealing, loading schedulers.""" # minimize training loss if training with dev data, else maximize dev score anneal_mode = "min" if train_with_dev else "max" @@ -75,13 +69,7 @@ def after_setup( @TrainerPlugin.hook def after_evaluation(self, current_model_is_best, validation_scores, **kw): - """Scheduler step of AnnealOnPlateau. - - :param current_model_is_best: - :param validation_scores: - :param kw: - :return: - """ + """Scheduler step of AnnealOnPlateau.""" reduced_learning_rate: bool = self.scheduler.step(*validation_scores) self.store_learning_rate() diff --git a/flair/trainers/plugins/functional/checkpoints.py b/flair/trainers/plugins/functional/checkpoints.py index f3bfc3ff6..f1f7020f4 100644 --- a/flair/trainers/plugins/functional/checkpoints.py +++ b/flair/trainers/plugins/functional/checkpoints.py @@ -19,12 +19,7 @@ def __init__( @TrainerPlugin.hook def after_training_epoch(self, epoch, **kw): - """Saves the model each k epochs. - - :param epoch: - :param kw: - :return: - """ + """Saves the model each k epochs.""" if self.save_model_each_k_epochs > 0 and epoch % self.save_model_each_k_epochs == 0: log.info( f"Saving model at current epoch since 'save_model_each_k_epochs={self.save_model_each_k_epochs}' " diff --git a/flair/trainers/plugins/functional/linear_scheduler.py b/flair/trainers/plugins/functional/linear_scheduler.py index 08aca32c2..ad1cb8612 100644 --- a/flair/trainers/plugins/functional/linear_scheduler.py +++ b/flair/trainers/plugins/functional/linear_scheduler.py @@ -31,14 +31,7 @@ def after_setup( max_epochs, **kw, ): - """Initialize different schedulers, including anneal target for AnnealOnPlateau, batch_growth_annealing, loading schedulers. 
- - :param dataset_size: - :param mini_batch_size: - :param max_epochs: - :param kw: - :return: - """ + """Initialize different schedulers, including anneal target for AnnealOnPlateau, batch_growth_annealing, loading schedulers.""" # calculate warmup steps steps_per_epoch = (dataset_size + mini_batch_size - 1) / mini_batch_size num_train_steps = int(steps_per_epoch * max_epochs) @@ -52,21 +45,14 @@ def after_setup( @TrainerPlugin.hook def before_training_epoch(self, **kw): - """Load state for anneal_with_restarts, batch_growth_annealing, logic for early stopping. - - :param kw: - :return: - """ + """Load state for anneal_with_restarts, batch_growth_annealing, logic for early stopping.""" self.store_learning_rate() self.previous_learning_rate = self.current_learning_rate @TrainerPlugin.hook def after_training_batch(self, optimizer_was_run: bool, **kw): - """Do the scheduler step if one-cycle or linear decay. + """Do the scheduler step if one-cycle or linear decay.""" - :param kw: - :return: - """ # skip if no optimization has happened. if not optimizer_was_run: return diff --git a/flair/trainers/plugins/functional/weight_extractor.py b/flair/trainers/plugins/functional/weight_extractor.py index 3edd5ff79..a6ed5eab2 100644 --- a/flair/trainers/plugins/functional/weight_extractor.py +++ b/flair/trainers/plugins/functional/weight_extractor.py @@ -11,14 +11,7 @@ def __init__(self, base_path) -> None: @TrainerPlugin.hook def after_training_batch(self, batch_no, epoch, total_number_of_batches, **kw): - """Extracts weights. 
- - :param batch_no: - :param epoch: - :param total_number_of_batches: - :param kw: - :return: - """ + """Extracts weights.""" modulo = max(1, int(total_number_of_batches / 10)) iteration = epoch * total_number_of_batches + batch_no diff --git a/flair/trainers/plugins/loggers/loss_file.py b/flair/trainers/plugins/loggers/loss_file.py index 9658aa6b0..f19ef918b 100644 --- a/flair/trainers/plugins/loggers/loss_file.py +++ b/flair/trainers/plugins/loggers/loss_file.py @@ -60,21 +60,12 @@ def __init__( @TrainerPlugin.hook def before_training_epoch(self, epoch, **kw): - """Get the current epoch for loss file logging. - - :param epoch: - :param kw: - :return: - """ + """Get the current epoch for loss file logging.""" self.current_row = {MetricName("epoch"): epoch} @TrainerPlugin.hook def metric_recorded(self, record): - """Add the metric of a record to the current row. - - :param record: - :return: - """ + """Add the metric of a record to the current row.""" if record.name in self.headers and self.current_row is not None: if record.name == "learning_rate" and not record.is_scalar: # record is a list of scalars @@ -90,12 +81,7 @@ def metric_recorded(self, record): @TrainerPlugin.hook def after_evaluation(self, epoch, **kw): - """This prints all relevant metrics. - - :param epoch: - :param kw: - :return: - """ + """This prints all relevant metrics.""" if self.loss_txt is not None: self.current_row[MetricName("timestamp")] = f"{datetime.now():%H:%M:%S}" diff --git a/flair/trainers/plugins/loggers/metric_history.py b/flair/trainers/plugins/loggers/metric_history.py index fe9166fc1..46802824d 100644 --- a/flair/trainers/plugins/loggers/metric_history.py +++ b/flair/trainers/plugins/loggers/metric_history.py @@ -30,9 +30,5 @@ def metric_recorded(self, record): @TrainerPlugin.hook def after_training(self, **kw): - """Returns metric history. 
- - :param kw: - :return: - """ + """Returns metric history.""" self.trainer.return_values.update(self.metric_history) diff --git a/flair/trainers/plugins/loggers/tensorboard.py b/flair/trainers/plugins/loggers/tensorboard.py index 1700768dc..9158c510e 100644 --- a/flair/trainers/plugins/loggers/tensorboard.py +++ b/flair/trainers/plugins/loggers/tensorboard.py @@ -13,8 +13,9 @@ class TensorboardLogger(TrainerPlugin): def __init__(self, log_dir=None, comment="", tracked_metrics=()) -> None: """Initializes the TensorboardLogger. - :param log_dir: Directory into which tensorboard log files will be written # noqa: E501 - :param tracked_metrics: List of tuples that specify which metrics (in addition to the main_score) shall be plotted in tensorboard, could be [("macro avg", 'f1-score'), ("macro avg", 'precision')] for example # noqa: E501 + Args: + log_dir: Directory into which tensorboard log files will be written + tracked_metrics: List of tuples that specify which metrics (in addition to the main_score) shall be plotted in tensorboard, could be [("macro avg", 'f1-score'), ("macro avg", 'precision')] for example """ super().__init__() self.comment = comment @@ -50,10 +51,6 @@ def metric_recorded(self, record): @TrainerPlugin.hook def _training_finally(self, **kw): - """Closes the writer. - - :param kw: - :return: - """ + """Closes the writer.""" assert self.writer is not None self.writer.close() diff --git a/flair/trainers/plugins/metric_records.py b/flair/trainers/plugins/metric_records.py index 6648f9f22..d4742e109 100644 --- a/flair/trainers/plugins/metric_records.py +++ b/flair/trainers/plugins/metric_records.py @@ -76,11 +76,11 @@ def __init__( ) -> None: """Create a metric record. - :param name: Name of the metric. - :param typ: Type of metric. - :param value: Value of the metric (can be anything: scalar, tensor, - image, etc.). - :param walltime: Time of recording this metric. + Args: + name: Name of the metric. + typ: Type of metric. 
+ value: Value of the metric (can be anything: scalar, tensor, image, etc.). + walltime: Time of recording this metric. """ self.name: MetricName = MetricName(name) self.typ: RecordType = typ diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index efd17ea61..8fe18bf3e 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -55,8 +55,9 @@ class ModelTrainer(Pluggable): def __init__(self, model: flair.nn.Model, corpus: Corpus) -> None: """Initialize a model trainer. - :param model: The model that you want to train. The model should inherit from flair.nn.Model # noqa: E501 - :param corpus: The dataset used to train the model, should be of type Corpus + Args: + model: The model that you want to train. The model should inherit from flair.nn.Model # noqa: E501 + corpus: The dataset used to train the model, should be of type Corpus """ super().__init__() self.model: flair.nn.Model = model @@ -837,11 +838,7 @@ def _publish_eval_result(self, result, prefix=(), **kw): ) def _initialize_model_card(self, **training_parameters): - """Initializes model card with library versions and parameters. - - :param training_parameters: - :return: - """ + """Initializes model card with library versions and parameters.""" # create a model card for this model with Flair and PyTorch version model_card = { "flair_version": flair.__version__, diff --git a/flair/training_utils.py b/flair/training_utils.py index e465f86c1..387ad63ae 100644 --- a/flair/training_utils.py +++ b/flair/training_utils.py @@ -328,9 +328,11 @@ def load_state_dict(self, state_dict): def init_output_file(base_path: Union[str, Path], file_name: str) -> Path: """Creates a local file which can be appended to. 
- :param base_path: the path to the directory - :param file_name: the file name - :return: the created file + Args: + base_path: the path to the directory + file_name: the file name + + Returns: the created file """ base_path = Path(base_path) base_path.mkdir(parents=True, exist_ok=True) @@ -343,9 +345,11 @@ def init_output_file(base_path: Union[str, Path], file_name: str) -> Path: def convert_labels_to_one_hot(label_list: List[List[str]], label_dict: Dictionary) -> List[List[int]]: """Convert list of labels to a one hot list. - :param label_list: list of labels - :param label_dict: label dictionary - :return: converted label list + Args: + label_list: list of labels + label_dict: label dictionary + + Returns: converted label list """ return [[1 if label in labels else 0 for label in label_dict.get_items()] for labels in label_list] diff --git a/flair/visual/ner_html.py b/flair/visual/ner_html.py index fb7a09127..0cdc151a5 100644 --- a/flair/visual/ner_html.py +++ b/flair/visual/ner_html.py @@ -56,12 +56,14 @@ def render_ner_html( ) -> str: """Create the html code to visualize some sentences. 
- :param sentences: single sentence or list of sentences to convert to HTML - :param title: title of the HTML page - :param colors: dict where keys are tags and values are color HTML codes - :param default_color: color to use if colors parameter is missing a tag - :param wrap_page: if True method returns result of processing sentences wrapped by <html> and <body> tags, otherwise - without these tags # noqa: E501 - :return: HTML as a string + Args: + sentences: single sentence or list of sentences to convert to HTML + title: title of the HTML page + colors: dict where keys are tags and values are color HTML codes + default_color: color to use if colors parameter is missing a tag + wrap_page: if True method returns result of processing sentences wrapped by <html> and <body> tags, otherwise - without these tags + + Returns: HTML as a string """ if isinstance(sentences, Sentence): sentences = [sentences] From 0548a960f8f7a6d00fff0b6c094f0b200e3c0006 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 10 Jul 2023 18:47:39 +0200 Subject: [PATCH 081/124] fix updating documentation syntax error --- docs/contributing/updating_documentation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributing/updating_documentation.md b/docs/contributing/updating_documentation.md index 7fb28cbcc..f97679362 100644 --- a/docs/contributing/updating_documentation.md +++ b/docs/contributing/updating_documentation.md @@ -20,7 +20,7 @@ A tutorial should always be easy to understand, and reference api documentation ```{note} You can reference symbols by defining links - e.g.: `[`flair.set_seed`](#flair.set_seed)` for a function + e.g.: ``[`flair.set_seed`](#flair.set_seed)`` for a function e.g.: `[entity-linking](project:../tutorial/tutorial-basics/entity-linking.md)` for another tutorial ``` From d04eba51e236f24a069108121af06e4088637b3e Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 17 Jul 2023 14:29:04 +0200 Subject: [PATCH 082/124] remove sidebars 
from index page --- docs/conf.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index ffcf7c6a8..0ddb62909 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -85,7 +85,8 @@ def linkcode_resolve(*args): "globaltoc.html", "searchbox.html", "versioning.html", - ] + ], + "index": [] } smv_latest_version = importlib_metadata.version(project) From 4da11a5f9b72137098206850df732e4a3777e198 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 17 Jul 2023 14:38:35 +0200 Subject: [PATCH 083/124] fix wrong link --- docs/tutorial/tutorial-embeddings/transformer-embeddings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tutorial/tutorial-embeddings/transformer-embeddings.md b/docs/tutorial/tutorial-embeddings/transformer-embeddings.md index 435a7b722..eae5eb35b 100644 --- a/docs/tutorial/tutorial-embeddings/transformer-embeddings.md +++ b/docs/tutorial/tutorial-embeddings/transformer-embeddings.md @@ -58,7 +58,7 @@ embedding.embed(sentence) ## Arguments -There are several options that you can set when you init the [`TransformerWordEmbeddings`](#flair.embeddings.document.TransformerWordEmbeddings) +There are several options that you can set when you init the [`TransformerWordEmbeddings`](#flair.embeddings.token.TransformerWordEmbeddings) and [`TransformerDocumentEmbeddings`](#flair.embeddings.document.TransformerDocumentEmbeddings) classes: | Argument | Default | Description From 20389cf3da8641b5856bcb81e78bd60867b1a483 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 17 Jul 2023 15:16:38 +0200 Subject: [PATCH 084/124] add custom version-switcher --- docs/_static/custom-icon.js | 16 ++++++++++++++ docs/_templates/version-switcher.html | 30 +++++++++++++++++++++++++++ docs/_templates/versioning.html | 5 +---- docs/conf.py | 13 ++++++++++++ 4 files changed, 60 insertions(+), 4 deletions(-) create mode 100644 docs/_static/custom-icon.js create mode 100644 docs/_templates/version-switcher.html diff 
--git a/docs/_static/custom-icon.js b/docs/_static/custom-icon.js new file mode 100644 index 000000000..cd949b3b7 --- /dev/null +++ b/docs/_static/custom-icon.js @@ -0,0 +1,16 @@ +/******************************************************************************* + * Set a custom icon for pypi as it's not available in the fa built-in brands + */ +FontAwesome.library.add( + (faListOldStyle = { + prefix: "fa-custom", + iconName: "pypi", + icon: [ + 17.313, // viewBox width + 19.807, // viewBox height + [], // ligature + "e001", // unicode codepoint - private use area + "m10.383 0.2-3.239 1.1769 3.1883 1.1614 3.239-1.1798zm-3.4152 1.2411-3.2362 1.1769 3.1855 1.1614 3.2369-1.1769zm6.7177 0.00281-3.2947 1.2009v3.8254l3.2947-1.1988zm-3.4145 1.2439-3.2926 1.1981v3.8254l0.17548-0.064132 3.1171-1.1347zm-6.6564 0.018325v3.8247l3.244 1.1805v-3.8254zm10.191 0.20931v2.3137l3.1777-1.1558zm3.2947 1.2425-3.2947 1.1988v3.8254l3.2947-1.1988zm-8.7058 0.45739c0.00929-1.931e-4 0.018327-2.977e-4 0.027485 0 0.25633 0.00851 0.4263 0.20713 0.42638 0.49826 1.953e-4 0.38532-0.29327 0.80469-0.65542 0.93662-0.36226 0.13215-0.65608-0.073306-0.65613-0.4588-6.28e-5 -0.38556 0.2938-0.80504 0.65613-0.93662 0.068422-0.024919 0.13655-0.038114 0.20156-0.039466zm5.2913 0.78369-3.2947 1.1988v3.8247l3.2947-1.1981zm-10.132 1.239-3.2362 1.1769 3.1883 1.1614 3.2362-1.1769zm6.7177 0.00213-3.2926 1.2016v3.8247l3.2926-1.2009zm-3.4124 1.2439-3.2947 1.1988v3.8254l3.2947-1.1988zm-6.6585 0.016195v3.8275l3.244 1.1805v-3.8254zm16.9 0.21143-3.2947 1.1988v3.8247l3.2947-1.1981zm-3.4145 1.2411-3.2926 1.2016v3.8247l3.2926-1.2009zm-3.4145 1.2411-3.2926 1.2016v3.8247l3.2926-1.2009zm-3.4124 1.2432-3.2947 1.1988v3.8254l3.2947-1.1988zm-6.6585 0.019027v3.8247l3.244 1.1805v-3.8254zm13.485 1.4497-3.2947 1.1988v3.8247l3.2947-1.1981zm-3.4145 1.2411-3.2926 1.2016v3.8247l3.2926-1.2009zm2.4018 0.38127c0.0093-1.83e-4 0.01833-3.16e-4 0.02749 0 0.25633 0.0085 0.4263 0.20713 0.42638 0.49826 1.97e-4 0.38532-0.29327 0.80469-0.65542 
0.93662-0.36188 0.1316-0.65525-0.07375-0.65542-0.4588-1.95e-4 -0.38532 0.29328-0.80469 0.65542-0.93662 0.06842-0.02494 0.13655-0.03819 0.20156-0.03947zm-5.8142 0.86403-3.244 1.1805v1.4201l3.244 1.1805z", // svg path (https://simpleicons.org/icons/pypi.svg) + ], + }) +); diff --git a/docs/_templates/version-switcher.html b/docs/_templates/version-switcher.html new file mode 100644 index 000000000..1d21c6c65 --- /dev/null +++ b/docs/_templates/version-switcher.html @@ -0,0 +1,30 @@ +{# As the version switcher will only work when JavaScript is enabled, we add it through JavaScript. + #} + diff --git a/docs/_templates/versioning.html b/docs/_templates/versioning.html index a6f92873c..0c8784af0 100644 --- a/docs/_templates/versioning.html +++ b/docs/_templates/versioning.html @@ -9,10 +9,7 @@

{{ _('Versions') }}

(stable) {% endif %} {% else %} - latest (dev) - {% endif %} - {% if item == current_version %} - [x] + latest ({{ item.name }}) {% endif %} {%- endfor %} diff --git a/docs/conf.py b/docs/conf.py index 0ddb62909..0eb6f934e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -26,6 +26,19 @@ "conf_py_path": "/docs/", } # dummy value that sphinx-github-style won't crash when run in temp folder. +html_js_files = ["custom-icon.js"] + +html_theme_options = { + "navbar_end": ["theme-switcher", "version-switcher", "navbar-icon-links"], + "github_url": linkcode_url, + "icon_links": [ + { + "name": "PyPI", + "url": "https://pypi.org/project/flair", + "icon": "fa-custom fa-pypi", + }, + ], +} def linkcode_resolve(*args): # use smv_current_version as the git url From 08396d1fdccf348ec1e69ba3fdc2a382f40f010b Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 17 Jul 2023 15:57:56 +0200 Subject: [PATCH 085/124] add custom pypi icon --- docs/_static/custom-icon.js | 16 ---------------- docs/conf.py | 6 ++---- 2 files changed, 2 insertions(+), 20 deletions(-) delete mode 100644 docs/_static/custom-icon.js diff --git a/docs/_static/custom-icon.js b/docs/_static/custom-icon.js deleted file mode 100644 index cd949b3b7..000000000 --- a/docs/_static/custom-icon.js +++ /dev/null @@ -1,16 +0,0 @@ -/******************************************************************************* - * Set a custom icon for pypi as it's not available in the fa built-in brands - */ -FontAwesome.library.add( - (faListOldStyle = { - prefix: "fa-custom", - iconName: "pypi", - icon: [ - 17.313, // viewBox width - 19.807, // viewBox height - [], // ligature - "e001", // unicode codepoint - private use area - "m10.383 0.2-3.239 1.1769 3.1883 1.1614 3.239-1.1798zm-3.4152 1.2411-3.2362 1.1769 3.1855 1.1614 3.2369-1.1769zm6.7177 0.00281-3.2947 1.2009v3.8254l3.2947-1.1988zm-3.4145 1.2439-3.2926 1.1981v3.8254l0.17548-0.064132 3.1171-1.1347zm-6.6564 0.018325v3.8247l3.244 1.1805v-3.8254zm10.191 
0.20931v2.3137l3.1777-1.1558zm3.2947 1.2425-3.2947 1.1988v3.8254l3.2947-1.1988zm-8.7058 0.45739c0.00929-1.931e-4 0.018327-2.977e-4 0.027485 0 0.25633 0.00851 0.4263 0.20713 0.42638 0.49826 1.953e-4 0.38532-0.29327 0.80469-0.65542 0.93662-0.36226 0.13215-0.65608-0.073306-0.65613-0.4588-6.28e-5 -0.38556 0.2938-0.80504 0.65613-0.93662 0.068422-0.024919 0.13655-0.038114 0.20156-0.039466zm5.2913 0.78369-3.2947 1.1988v3.8247l3.2947-1.1981zm-10.132 1.239-3.2362 1.1769 3.1883 1.1614 3.2362-1.1769zm6.7177 0.00213-3.2926 1.2016v3.8247l3.2926-1.2009zm-3.4124 1.2439-3.2947 1.1988v3.8254l3.2947-1.1988zm-6.6585 0.016195v3.8275l3.244 1.1805v-3.8254zm16.9 0.21143-3.2947 1.1988v3.8247l3.2947-1.1981zm-3.4145 1.2411-3.2926 1.2016v3.8247l3.2926-1.2009zm-3.4145 1.2411-3.2926 1.2016v3.8247l3.2926-1.2009zm-3.4124 1.2432-3.2947 1.1988v3.8254l3.2947-1.1988zm-6.6585 0.019027v3.8247l3.244 1.1805v-3.8254zm13.485 1.4497-3.2947 1.1988v3.8247l3.2947-1.1981zm-3.4145 1.2411-3.2926 1.2016v3.8247l3.2926-1.2009zm2.4018 0.38127c0.0093-1.83e-4 0.01833-3.16e-4 0.02749 0 0.25633 0.0085 0.4263 0.20713 0.42638 0.49826 1.97e-4 0.38532-0.29327 0.80469-0.65542 0.93662-0.36188 0.1316-0.65525-0.07375-0.65542-0.4588-1.95e-4 -0.38532 0.29328-0.80469 0.65542-0.93662 0.06842-0.02494 0.13655-0.03819 0.20156-0.03947zm-5.8142 0.86403-3.244 1.1805v1.4201l3.244 1.1805z", // svg path (https://simpleicons.org/icons/pypi.svg) - ], - }) -); diff --git a/docs/conf.py b/docs/conf.py index 0eb6f934e..462a6da64 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -26,8 +26,6 @@ "conf_py_path": "/docs/", } # dummy value that sphinx-github-style won't crash when run in temp folder. 
-html_js_files = ["custom-icon.js"] - html_theme_options = { "navbar_end": ["theme-switcher", "version-switcher", "navbar-icon-links"], "github_url": linkcode_url, @@ -35,7 +33,7 @@ { "name": "PyPI", "url": "https://pypi.org/project/flair", - "icon": "fa-custom fa-pypi", + "icon": "fas fa-box", }, ], } @@ -82,7 +80,7 @@ def linkcode_resolve(*args): # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -# html_static_path = ["_static"] +html_static_path = ["_static"] # Napoleon settings napoleon_include_init_with_doc = True From 177a13895f534be243ed7e00b11adaacf050e481 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 17 Jul 2023 16:10:52 +0200 Subject: [PATCH 086/124] fix CI-CD name --- .github/workflows/publish-docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index e86d8edf9..ef7d61f08 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -1,4 +1,4 @@ -name: 'Run tests for ci cd' +name: 'Build doc page' on: push: branches: [ main, doc-page ] From c86215da3d8b368637eaa72d019214466ccf9616 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 17 Jul 2023 16:26:33 +0200 Subject: [PATCH 087/124] add 404 redirect --- .github/workflows/publish-docs.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index ef7d61f08..90cd22fe8 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -32,8 +32,10 @@ jobs: - name: Add redirect to stable doc run: | cp assets/redirect.html doc_build/index.html + cp assets/redirect.html doc_build/404.html cp assets/README.md doc_build/README.md sed -i "s/\[VERSION\]/$(python -c 'import 
flair;print(flair.__version__)')/g" doc_build/index.html + sed -i "s/\[VERSION\]/$(python -c 'import flair;print(flair.__version__)')/g" doc_build/404.html - name: Deploy uses: peaceiris/actions-gh-pages@v3 with: From 35cf20c9dab1c33c994f525080b889fffa67517e Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 24 Jul 2023 13:41:36 +0200 Subject: [PATCH 088/124] global config for autodocs --- docs/api/datasets/base.rst | 5 +--- docs/api/datasets/biomedical.rst | 5 +--- docs/api/datasets/document_classification.rst | 5 +--- docs/api/datasets/entity_linking.rst | 5 +--- docs/api/datasets/ocr.rst | 5 +--- docs/api/datasets/relation_extraction.rst | 5 +--- docs/api/datasets/sequence_labeling.rst | 5 +--- docs/api/datasets/text_image.rst | 5 +--- docs/api/datasets/text_text.rst | 5 +--- docs/api/datasets/treebanks.rst | 5 +--- docs/api/embeddings/base.rst | 5 +--- docs/api/embeddings/document.rst | 5 +--- docs/api/embeddings/image.rst | 5 +--- docs/api/embeddings/legacy.rst | 5 +--- docs/api/embeddings/token.rst | 5 +--- docs/api/embeddings/transformer.rst | 5 +--- docs/api/flair.data.rst | 5 +--- docs/api/flair.models.rst | 5 +--- docs/api/flair.nn.rst | 5 +--- docs/api/flair.rst | 5 +--- docs/api/flair.splitter.rst | 5 +--- docs/api/flair.tokenization.rst | 5 +--- docs/api/flair.trainers.plugins.rst | 5 +--- docs/api/flair.trainers.rst | 5 +--- docs/conf.py | 24 +++++++++++++------ flair/embeddings/document.py | 2 +- 26 files changed, 42 insertions(+), 104 deletions(-) diff --git a/docs/api/datasets/base.rst b/docs/api/datasets/base.rst index b6d0a7a70..e42784deb 100644 --- a/docs/api/datasets/base.rst +++ b/docs/api/datasets/base.rst @@ -1,7 +1,4 @@ flair.datasets.base =================== -.. automodule:: flair.datasets.base - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. 
automodule:: flair.datasets.base \ No newline at end of file diff --git a/docs/api/datasets/biomedical.rst b/docs/api/datasets/biomedical.rst index 5ffcfa6b0..d59bd8c58 100644 --- a/docs/api/datasets/biomedical.rst +++ b/docs/api/datasets/biomedical.rst @@ -1,7 +1,4 @@ flair.datasets.biomedical ========================= -.. automodule:: flair.datasets.biomedical - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.datasets.biomedical \ No newline at end of file diff --git a/docs/api/datasets/document_classification.rst b/docs/api/datasets/document_classification.rst index 06ce075df..d8303f3ae 100644 --- a/docs/api/datasets/document_classification.rst +++ b/docs/api/datasets/document_classification.rst @@ -1,7 +1,4 @@ flair.datasets.document_classification ====================================== -.. automodule:: flair.datasets.document_classification - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.datasets.document_classification \ No newline at end of file diff --git a/docs/api/datasets/entity_linking.rst b/docs/api/datasets/entity_linking.rst index d3066675b..cdb2b3235 100644 --- a/docs/api/datasets/entity_linking.rst +++ b/docs/api/datasets/entity_linking.rst @@ -1,7 +1,4 @@ flair.datasets.entity_linking ============================= -.. automodule:: flair.datasets.entity_linking - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.datasets.entity_linking \ No newline at end of file diff --git a/docs/api/datasets/ocr.rst b/docs/api/datasets/ocr.rst index 0d33b55b4..3f8534044 100644 --- a/docs/api/datasets/ocr.rst +++ b/docs/api/datasets/ocr.rst @@ -1,7 +1,4 @@ flair.datasets.ocr ================== -.. automodule:: flair.datasets.ocr - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. 
automodule:: flair.datasets.ocr \ No newline at end of file diff --git a/docs/api/datasets/relation_extraction.rst b/docs/api/datasets/relation_extraction.rst index c6df41c7d..62dcdd55d 100644 --- a/docs/api/datasets/relation_extraction.rst +++ b/docs/api/datasets/relation_extraction.rst @@ -1,7 +1,4 @@ flair.datasets.relation_extraction ================================== -.. automodule:: flair.datasets.relation_extraction - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.datasets.relation_extraction \ No newline at end of file diff --git a/docs/api/datasets/sequence_labeling.rst b/docs/api/datasets/sequence_labeling.rst index cfa52a3ed..875d4831b 100644 --- a/docs/api/datasets/sequence_labeling.rst +++ b/docs/api/datasets/sequence_labeling.rst @@ -1,7 +1,4 @@ flair.datasets.sequence_labeling ================================ -.. automodule:: flair.datasets.sequence_labeling - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.datasets.sequence_labeling \ No newline at end of file diff --git a/docs/api/datasets/text_image.rst b/docs/api/datasets/text_image.rst index dbdf8f86d..f14e56491 100644 --- a/docs/api/datasets/text_image.rst +++ b/docs/api/datasets/text_image.rst @@ -1,7 +1,4 @@ flair.datasets.text_image ========================= -.. automodule:: flair.datasets.text_image - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.datasets.text_image \ No newline at end of file diff --git a/docs/api/datasets/text_text.rst b/docs/api/datasets/text_text.rst index 33e4bd6c4..f88dfd1ae 100644 --- a/docs/api/datasets/text_text.rst +++ b/docs/api/datasets/text_text.rst @@ -1,7 +1,4 @@ flair.datasets.text_text ========================= -.. automodule:: flair.datasets.text_text - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. 
automodule:: flair.datasets.text_text \ No newline at end of file diff --git a/docs/api/datasets/treebanks.rst b/docs/api/datasets/treebanks.rst index 3f294fa3a..0d6c14a28 100644 --- a/docs/api/datasets/treebanks.rst +++ b/docs/api/datasets/treebanks.rst @@ -1,7 +1,4 @@ flair.datasets.treebanks ======================== -.. automodule:: flair.datasets.treebanks - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.datasets.treebanks \ No newline at end of file diff --git a/docs/api/embeddings/base.rst b/docs/api/embeddings/base.rst index 1d17b64de..1bf51ffa7 100644 --- a/docs/api/embeddings/base.rst +++ b/docs/api/embeddings/base.rst @@ -1,7 +1,4 @@ flair.embeddings.base ===================== -.. automodule:: flair.embeddings.base - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.embeddings.base \ No newline at end of file diff --git a/docs/api/embeddings/document.rst b/docs/api/embeddings/document.rst index d289cd016..ca870fc8e 100644 --- a/docs/api/embeddings/document.rst +++ b/docs/api/embeddings/document.rst @@ -1,7 +1,4 @@ flair.embeddings.document ========================= -.. automodule:: flair.embeddings.document - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.embeddings.document \ No newline at end of file diff --git a/docs/api/embeddings/image.rst b/docs/api/embeddings/image.rst index 594ca9f3c..2a701b9e0 100644 --- a/docs/api/embeddings/image.rst +++ b/docs/api/embeddings/image.rst @@ -1,7 +1,4 @@ flair.embeddings.image ====================== -.. automodule:: flair.embeddings.image - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. 
automodule:: flair.embeddings.image \ No newline at end of file diff --git a/docs/api/embeddings/legacy.rst b/docs/api/embeddings/legacy.rst index 51865b4e7..974a777eb 100644 --- a/docs/api/embeddings/legacy.rst +++ b/docs/api/embeddings/legacy.rst @@ -5,7 +5,4 @@ flair.embeddings.legacy All embeddings in `flair.embeddings.legacy` are considered deprecated. there is no guarantee that they are still working and we recommend using different embeddings instead. -.. automodule:: flair.embeddings.legacy - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.embeddings.legacy \ No newline at end of file diff --git a/docs/api/embeddings/token.rst b/docs/api/embeddings/token.rst index 8d475bd65..3705fedb1 100644 --- a/docs/api/embeddings/token.rst +++ b/docs/api/embeddings/token.rst @@ -1,7 +1,4 @@ flair.embeddings.token ====================== -.. automodule:: flair.embeddings.token - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.embeddings.token \ No newline at end of file diff --git a/docs/api/embeddings/transformer.rst b/docs/api/embeddings/transformer.rst index 4d4b678ec..2bda02f77 100644 --- a/docs/api/embeddings/transformer.rst +++ b/docs/api/embeddings/transformer.rst @@ -1,7 +1,4 @@ flair.embeddings.transformer ============================ -.. automodule:: flair.embeddings.transformer - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.embeddings.transformer \ No newline at end of file diff --git a/docs/api/flair.data.rst b/docs/api/flair.data.rst index 9d93885dd..00dd67a52 100644 --- a/docs/api/flair.data.rst +++ b/docs/api/flair.data.rst @@ -1,7 +1,4 @@ flair.data ========== -.. automodule:: flair.data - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. 
automodule:: flair.data \ No newline at end of file diff --git a/docs/api/flair.models.rst b/docs/api/flair.models.rst index 5fe0cf440..8679b3fb7 100644 --- a/docs/api/flair.models.rst +++ b/docs/api/flair.models.rst @@ -1,7 +1,4 @@ flair.models ============ -.. automodule:: flair.models - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.models \ No newline at end of file diff --git a/docs/api/flair.nn.rst b/docs/api/flair.nn.rst index 4d42b88e5..4eb066d3e 100644 --- a/docs/api/flair.nn.rst +++ b/docs/api/flair.nn.rst @@ -1,7 +1,4 @@ flair.nn ======== -.. automodule:: flair.nn - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.nn \ No newline at end of file diff --git a/docs/api/flair.rst b/docs/api/flair.rst index 056e6182b..4e12a0382 100644 --- a/docs/api/flair.rst +++ b/docs/api/flair.rst @@ -1,7 +1,4 @@ flair ===== -.. automodule:: flair - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair \ No newline at end of file diff --git a/docs/api/flair.splitter.rst b/docs/api/flair.splitter.rst index c577b3b94..5863df578 100644 --- a/docs/api/flair.splitter.rst +++ b/docs/api/flair.splitter.rst @@ -1,7 +1,4 @@ flair.splitter ============== -.. automodule:: flair.splitter - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.splitter \ No newline at end of file diff --git a/docs/api/flair.tokenization.rst b/docs/api/flair.tokenization.rst index 92baa459d..00f2bc4bf 100644 --- a/docs/api/flair.tokenization.rst +++ b/docs/api/flair.tokenization.rst @@ -1,7 +1,4 @@ flair.tokenization ================== -.. automodule:: flair.tokenization - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. 
automodule:: flair.tokenization \ No newline at end of file diff --git a/docs/api/flair.trainers.plugins.rst b/docs/api/flair.trainers.plugins.rst index 6a4c6cd2c..4bb766876 100644 --- a/docs/api/flair.trainers.plugins.rst +++ b/docs/api/flair.trainers.plugins.rst @@ -1,7 +1,4 @@ flair.trainers.plugins ====================== -.. automodule:: flair.trainers.plugins - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.trainers.plugins \ No newline at end of file diff --git a/docs/api/flair.trainers.rst b/docs/api/flair.trainers.rst index dabe55cb5..db11b5029 100644 --- a/docs/api/flair.trainers.rst +++ b/docs/api/flair.trainers.rst @@ -1,7 +1,4 @@ flair.trainers ============== -.. automodule:: flair.trainers - :members: - :undoc-members: - :show-inheritance: \ No newline at end of file +.. automodule:: flair.trainers \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index 462a6da64..97b76fde1 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -30,14 +30,15 @@ "navbar_end": ["theme-switcher", "version-switcher", "navbar-icon-links"], "github_url": linkcode_url, "icon_links": [ - { - "name": "PyPI", - "url": "https://pypi.org/project/flair", - "icon": "fas fa-box", - }, - ], + { + "name": "PyPI", + "url": "https://pypi.org/project/flair", + "icon": "fas fa-box", + }, + ], } + def linkcode_resolve(*args): # use smv_current_version as the git url real_linkcode_url = linkcode_url + f"/blob/{smv_current_version}/" + "{filepath}#L{linestart}-L{linestop}" @@ -86,6 +87,15 @@ def linkcode_resolve(*args): napoleon_include_init_with_doc = True napoleon_include_private_with_doc = True +autodoc_default_options = { + "member-order": "bysource", + "undoc-members": True, + "members": True, + "show-inheritance": True, + "private-members": False, + "inherited": True, +} + source_suffix = { ".rst": "restructuredtext", ".md": "markdown", @@ -97,7 +107,7 @@ def linkcode_resolve(*args): "searchbox.html", "versioning.html", ], 
- "index": [] + "index": [], } smv_latest_version = importlib_metadata.version(project) diff --git a/flair/embeddings/document.py b/flair/embeddings/document.py index f3482cfd2..4e0517da1 100644 --- a/flair/embeddings/document.py +++ b/flair/embeddings/document.py @@ -238,7 +238,7 @@ def __init__( dropout: float = 0.5, word_dropout: float = 0.0, locked_dropout: float = 0.0, - rnn_type: str="GRU", + rnn_type: str = "GRU", fine_tune: bool = True, ) -> None: """Instantiates an RNN that works upon some token embeddings. From c31ab096cd846d18614d0aeb9b60636f0f109761 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 24 Jul 2023 14:54:35 +0200 Subject: [PATCH 089/124] add docs to variables missing --- flair/datasets/base.py | 1 + flair/embeddings/base.py | 1 + flair/embeddings/document.py | 5 +++ flair/embeddings/token.py | 36 +++++++++++++------ flair/embeddings/transformer.py | 3 ++ flair/models/clustering.py | 4 +-- flair/models/entity_linker_model.py | 2 ++ flair/models/lemmatizer_model.py | 9 ++++- flair/models/multitask_model.py | 25 ++++--------- flair/models/pairwise_classification_model.py | 10 ++++-- flair/models/pairwise_regression_model.py | 9 ++++- flair/models/relation_extractor_model.py | 13 ++++--- flair/models/sequence_tagger_model.py | 9 +++-- flair/models/sequence_tagger_utils/viterbi.py | 7 ++-- flair/models/tars_model.py | 7 ++++ flair/models/text_classification_model.py | 2 ++ flair/models/word_tagger_model.py | 6 ++-- flair/nn/distance/cosine.py | 2 ++ flair/nn/model.py | 13 +++++-- flair/samplers.py | 6 +--- flair/trainers/plugins/loggers/tensorboard.py | 2 ++ flair/trainers/plugins/metric_records.py | 1 + flair/visual/ner_html.py | 1 + 23 files changed, 114 insertions(+), 60 deletions(-) diff --git a/flair/datasets/base.py b/flair/datasets/base.py index e0601e5a4..ca6ead406 100644 --- a/flair/datasets/base.py +++ b/flair/datasets/base.py @@ -144,6 +144,7 @@ def __init__( max_chars_per_doc: If set, truncates each Sentence to a maximum number 
of chars tokenizer: Custom tokenizer to use (default SegtokTokenizer) in_memory: If True, keeps dataset as Sentences in memory, otherwise only keeps strings + tag_type: The tag type to assign labels to. Returns: list of sentences """ diff --git a/flair/embeddings/base.py b/flair/embeddings/base.py index f0da70cc4..d3cacd75a 100644 --- a/flair/embeddings/base.py +++ b/flair/embeddings/base.py @@ -126,6 +126,7 @@ def __init__(self, mixture_size: int, trainable: bool = False) -> None: Args: mixture_size: size of mixtures (usually the number of layers) + trainable: whether or not the weights should be learnable. """ super().__init__() self.mixture_size = mixture_size diff --git a/flair/embeddings/document.py b/flair/embeddings/document.py index 4e0517da1..35e455de4 100644 --- a/flair/embeddings/document.py +++ b/flair/embeddings/document.py @@ -43,6 +43,8 @@ cls_pooling: Pooling strategy for combining token level embeddings. options are 'cls', 'max', 'mean'. layer_mean: If True, uses a scalar mix of layers as embedding fine_tune: If True, allows transformers to be fine-tuned during training + is_token_embedding: If True, the embedding can be used as TokenEmbedding too. + **kwargs: Arguments propagated to :meth:`flair.embeddings.transformer.TransformerEmbeddings.__init__` """ TransformerEmbeddings.__init__( self, @@ -175,6 +177,7 @@ Args: train_dataset: the train dataset which will be used to construct a vectorizer + vectorizer: a precalculated vectorizer. If provided, requires the train_dataset to be an empty list. vectorizer_params: parameters given to Scikit-learn's TfidfVectorizer constructor """ super().__init__() @@ -254,6 +257,7 @@ word_dropout: the word dropout value to be used, if 0.0 word dropout is not used locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used rnn_type: 'GRU' or 'LSTM' + fine_tune: if True, allow to finetune the embeddings.
""" super().__init__() @@ -621,6 +625,7 @@ def __init__( dropout: the dropout value to be used word_dropout: the word dropout value to be used, if 0.0 word dropout is not used locked_dropout: the locked dropout value to be used, if 0.0 locked dropout is not used + fine_tune: if True, allow to finetune the embeddings. """ super().__init__() diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index 1c52eaec5..457cf7b94 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -43,11 +43,9 @@ def __init__( Args: model: name of transformer model (see https://huggingface.co/transformers/pretrained_models.html for options) - layers: string indicating which layers to take for embedding (-1 is topmost layer) - subtoken_pooling: how to get from token piece embeddings to token embedding. Either take the first - subtoken ('first'), the last subtoken ('last'), both first and last ('first_last') or a mean over all ('mean') - layer_mean: If True, uses a scalar mix of layers as embedding - fine_tune: If True, allows transformers to be fine-tuned during training + is_document_embedding: If True, the embedding can be used as DocumentEmbedding too. + allow_long_sentences: If True, too long sentences will be patched and strided and afterwards combined. + **kwargs: Arguments propagated to :met:`flair.embeddings.transformer.TransformerEmbeddings.__init__` """ TransformerEmbeddings.__init__( self, @@ -169,7 +167,14 @@ def __init__( Args: embeddings: one of: 'glove', 'extvec', 'crawl' or two-letter language code or a path to a custom embedding + field: if given, the word-embeddings embed the data for the specific label-type instead of the plain text. + fine_tune: If True, allows word-embeddings to be fine-tuned during training + force_cpu: If True, stores the large embedding matrix not on the gpu to save memory. 
`force_cpu` can only be used if `fine_tune` is False stable: if True, use the stable embeddings as described in https://arxiv.org/abs/2110.02861 + no_header: only for reading plain word2vec text files. If true, the reader assumes the first line to not contain embedding length and vocab size. + vocab: If the embeddings are already loaded in memory, provide the vocab here. + embedding_length: If the embeddings are already loaded in memory, provide the embedding_length here. + name: The name of the embeddings. """ self.instance_parameters = self.get_instance_parameters(locals=locals()) @@ -572,11 +577,20 @@ def __init__( """Initializes contextual string embeddings using a character-level language model. Args: - model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast', 'news-backward-fast', 'mix-forward', 'mix-backward', 'german-forward', 'german-backward', 'polish-backward', 'polish-forward' depending on which character language model is desired. - fine_tune: if set to True, the gradient will propagate into the language model. This dramatically slows down training and often leads to overfitting, so use with caution. - chars_per_chunk: max number of chars per rnn pass to control speed/memory tradeoff. Higher means faster but requires more memory. Lower means slower but less memory. - with_whitespace: If True, use hidden state after whitespace after word. If False, use hidden state at last character of word. - tokenized_lm: Whether this lm is tokenized. Default is True, but for LMs trained over unprocessed text False might be better. + model: model string, one of 'news-forward', 'news-backward', 'news-forward-fast', 'news-backward-fast', + 'mix-forward', 'mix-backward', 'german-forward', 'german-backward', 'polish-backward', 'polish-forward' depending on which character language model is desired. + fine_tune: if set to True, the gradient will propagate into the language model. 
+ This dramatically slows down training and often leads to overfitting, so use with caution. + chars_per_chunk: max number of chars per rnn pass to control speed/memory tradeoff. + Higher means faster but requires more memory. Lower means slower but less memory. + with_whitespace: If True, use hidden state after whitespace after word. + If False, use hidden state at last character of word. + tokenized_lm: Whether this lm is tokenized. Default is True, + but for LMs trained over unprocessed text False might be better. + has_decoder: Whether to load the decoder-head of the LanguageModel. This should only be true, if you intend + to generate text. + is_lower: Whether this lm is trained on lower-cased data. + name: The name of the embeddings """ super().__init__() self.instance_parameters = self.get_instance_parameters(locals=locals()) @@ -1014,6 +1028,8 @@ Args: embeddings: path to your embeddings '.bin' file use_local: set this to False if you are using embeddings from a remote source + field: if given, the word-embeddings embed the data for the specific label-type instead of the plain text. + name: The name of the embeddings. """ self.instance_parameters = self.get_instance_parameters(locals=locals()) diff --git a/flair/embeddings/transformer.py b/flair/embeddings/transformer.py index a78f5ca54..64b043a7a 100644 --- a/flair/embeddings/transformer.py +++ b/flair/embeddings/transformer.py @@ -1388,7 +1388,10 @@ """Export TransformerEmbeddings to OnnxFormat. Args: + path: the path to save the embeddings. Notice that the embeddings are stored as external file, + hence it matters if the path is an absolute path or a relative one. example_sentences: a list of sentences that will be used for tracing. It is recommended to take 2-4 sentences with some variation.
+ **kwargs: the parameters passed to :met:`TransformerOnnxEmbeddings.export_from_embedding` """ return self.onnx_cls.export_from_embedding(path, self, example_sentences, **kwargs) diff --git a/flair/models/clustering.py b/flair/models/clustering.py index b027d6b9c..e9902f6f6 100644 --- a/flair/models/clustering.py +++ b/flair/models/clustering.py @@ -34,6 +34,7 @@ def fit(self, corpus: Corpus, **kwargs): Args: corpus: the flair corpus this wrapper will use for fitting the model. + **kwargs: parameters propagated to the models `.fit()` method. """ X = self._convert_dataset(corpus) @@ -84,9 +85,6 @@ def _convert_dataset( Turns the corpora into X, y datasets as required for most sklearn clustering models. Ref.: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.cluster - - Args: - label_type: the label from sentences will be extracted. If the value is none this will be skipped. """ log.info("Embed sentences...") sentences = [] diff --git a/flair/models/entity_linker_model.py b/flair/models/entity_linker_model.py index 75af25043..686cf2921 100644 --- a/flair/models/entity_linker_model.py +++ b/flair/models/entity_linker_model.py @@ -102,6 +102,8 @@ def __init__( text representation we take the average of the embeddings of the token in the mention. `first_last` concatenates the embedding of the first and the embedding of the last token. label_type: name of the label you use. + candidates: If provided, use a :class:`CandidateGenerator` for prediction candidates. + **classifierargs: The arguments propagated to :met:`flair.nn.DefaultClassifier.__init__` """ super().__init__( embeddings=embeddings, diff --git a/flair/models/lemmatizer_model.py b/flair/models/lemmatizer_model.py index 835649110..6700b089d 100644 --- a/flair/models/lemmatizer_model.py +++ b/flair/models/lemmatizer_model.py @@ -42,6 +42,10 @@ def __init__( in which all words are annotated with a (maybe equal) lemma. 
Args: + encode_characters: If True, use a character embedding to additionally encode tokens per character. + start_symbol_for_encoding: If True, use a start symbol for encoding characters. + end_symbol_for_encoding: If True, use an end symbol for encoding characters. + bidirectional_encoding: If True, the character encoding is bidirectional. embeddings: Embedding used to encode sentence rnn_input_size: Input size of the RNN('s). Each letter of a token is represented by a hot-one-vector over the given character dictionary. This vector is transformed to a input_size vector with a linear layer. @@ -163,12 +167,14 @@ def words_to_char_indices( ): """For a given list of strings this function creates index vectors that represent the characters of the strings. - Each string is represented by sequence_length (maximum string length + entries for special symbold) many + Each string is represented by sequence_length (maximum string length + entries for special symbol) many indices representing characters in self.char_dict. One can manually set the vector length with the parameter seq_length, though the vector length is always at least maximum string length in the list. Args: + seq_length: the maximum sequence length to use, if None the maximum is taken.. + tokens: the texts of the toekens to encode end_symbol: add self.end_index at the end of each representation start_symbol: add self.start_index in front of each representation padding_in_front: whether to fill up with self.dummy_index in front or in back of strings @@ -414,6 +420,7 @@ def predict( embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively. return_loss: whether to compute and return loss. Setting it to True only makes sense if labels are provided verbose: If True, lemmatized sentences will be printed in the console. + return_probabilities_for_all_classes: unused parameter. 
""" if isinstance(sentences, Sentence): sentences = [sentences] diff --git a/flair/models/multitask_model.py b/flair/models/multitask_model.py index 100826699..98cda7b4a 100644 --- a/flair/models/multitask_model.py +++ b/flair/models/multitask_model.py @@ -6,7 +6,7 @@ import torch import flair.nn -from flair.data import DT, Dictionary, Sentence +from flair.data import DT, Sentence from flair.file_utils import cached_path from flair.nn import Classifier from flair.training_utils import Result @@ -121,22 +121,19 @@ def evaluate( data_points, gold_label_type: str, out_path: Optional[Union[str, Path]] = None, - embedding_storage_mode: str = "none", - mini_batch_size: int = 32, main_evaluation_metric: Tuple[str, str] = ("micro avg", "f1-score"), - exclude_labels: List[str] = [], - gold_label_dictionary: Optional[Dictionary] = None, - return_loss: bool = True, evaluate_all: bool = True, **evalargs, ) -> Result: """Evaluates the model. Returns a Result object containing evaluation results and a loss value. Args: - sentences: batch of sentences - embeddings_storage_mode: One of 'none' (all embeddings are deleted and freshly recomputed), 'cpu' (embeddings are stored on CPU) or 'gpu' (embeddings are stored on GPU) - mini_batch_size: size of batches + data_points: batch of sentences + gold_label_type: if evaluate_all is False, specify the task to evaluate by the task_id. + out_path: if not None, predictions will be created and saved at the respective file. 
+ main_evaluation_metric: Specify which metric to highlight as main_score evaluate_all: choose if all tasks should be evaluated, or a single one, depending on gold_label_type + **evalargs: arguments propagated to :meth:`flair.nn.Model.evaluate` Returns: Tuple of Result object and loss value (float) """ @@ -154,12 +151,7 @@ def evaluate( data, gold_label_type=self.tasks[gold_label_type].label_type, out_path=out_path, - embedding_storage_mode=embedding_storage_mode, - mini_batch_size=mini_batch_size, main_evaluation_metric=main_evaluation_metric, - exclude_labels=exclude_labels, - gold_label_dictionary=gold_label_dictionary, - return_loss=return_loss, **evalargs, ) @@ -175,12 +167,7 @@ def evaluate( data_points=[data_points[i] for i in split], gold_label_type=self.tasks[task_id].label_type, out_path=f"{out_path}_{task_id}.txt" if out_path is not None else None, - embedding_storage_mode=embedding_storage_mode, - mini_batch_size=mini_batch_size, main_evaluation_metric=main_evaluation_metric, - exclude_labels=exclude_labels, - gold_label_dictionary=gold_label_dictionary, - return_loss=return_loss, **evalargs, ) diff --git a/flair/models/pairwise_classification_model.py b/flair/models/pairwise_classification_model.py index c776ee973..ca50d1b87 100644 --- a/flair/models/pairwise_classification_model.py +++ b/flair/models/pairwise_classification_model.py @@ -27,11 +27,17 @@ def __init__( """Initializes a TextPairClassifier. Args: + label_type: label_type: name of the label + embed_separately: if True, the sentence embeddings will be concatenated, + if False both sentences will be combined and newly embedded. 
embeddings: embeddings used to embed each data point label_dictionary: dictionary of labels you want to predict - multi_label: auto-detected by default, but you can set this to True to force multi-label prediction or False to force single-label prediction + multi_label: auto-detected by default, but you can set this to True to force multi-label prediction + or False to force single-label prediction multi_label_threshold: If multi-label you can set the threshold to make predictions - loss_weights: Dictionary of weights for labels for the loss function. If any label's weight is unspecified it will default to 1.0 + loss_weights: Dictionary of weights for labels for the loss function. + If any label's weight is unspecified it will default to 1.0 + **classifierargs: The arguments propagated to :met:`flair.nn.DefaultClassifier.__init__` """ super().__init__( **classifierargs, diff --git a/flair/models/pairwise_regression_model.py b/flair/models/pairwise_regression_model.py index 174bc8a16..1841e0767 100644 --- a/flair/models/pairwise_regression_model.py +++ b/flair/models/pairwise_regression_model.py @@ -33,11 +33,18 @@ def __init__( locked_dropout: float = 0.0, word_dropout: float = 0.0, decoder: Optional[torch.nn.Module] = None, - **classifierargs, ) -> None: """Initialize the Text Pair Regression Model. Args: + label_type: name of the label + embed_separately: if True, the sentence embeddings will be concatenated, + if False both sentences will be combined and newly embedded. + dropout: dropout + locked_dropout: locked_dropout + word_dropout: word_dropout + decoder: if provided, a that specific layer will be used as decoder, + otherwise a linear layer with random parameters will be created. 
embeddings: embeddings used to embed each data point """ super().__init__() diff --git a/flair/models/relation_extractor_model.py b/flair/models/relation_extractor_model.py index c270f87aa..a13a80fff 100644 --- a/flair/models/relation_extractor_model.py +++ b/flair/models/relation_extractor_model.py @@ -26,11 +26,14 @@ def __init__( """Initializes a RelationClassifier. Args: - document_embeddings: embeddings used to embed each data point - label_dictionary: dictionary of labels you want to predict - beta: Parameter for F-beta score for evaluation and training annealing - train_on_gold_pairs_only: Set true to not train to predict no relation. - loss_weights: Dictionary of weights for labels for the loss function any unspecified labels will default to a weight of 1.0 + embeddings: embeddings used to embed each data point + label_type: name of the label + entity_label_type: name of the labels used to represent entities + entity_pair_filters: if provided, only classify pairs that apply the filter + pooling_operation: either "first" or "first_last" how the embeddings of the entities + should be used to create relation embeddings + train_on_gold_pairs_only: if True, relations with "O" (no relation) label will be ignored in training. + **classifierargs: The arguments propagated to :met:`flair.nn.DefaultClassifier.__init__` """ # pooling operation to get embeddings for entites self.pooling_operation = pooling_operation diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py index 76646c562..c6defd24a 100644 --- a/flair/models/sequence_tagger_model.py +++ b/flair/models/sequence_tagger_model.py @@ -67,6 +67,8 @@ def __init__( train_initial_hidden_state: if True, trains initial hidden state of RNN loss_weights: Dictionary of weights for labels for the loss function. If any label's weight is unspecified it will default to 1.0. 
init_from_state_dict: Indicator whether we are loading a model from state dict since we need to transform previous models' weights into CRF instance weights + allow_unk_predictions: If True, allows spans to predict too. + tag_format: the format to encode spans as tags, either "BIO" or "BIOES" """ super().__init__() @@ -445,6 +447,7 @@ def predict( label_name: which label to predict return_loss: whether to return loss value embedding_storage_mode: determines where to store embeddings - can be "gpu", "cpu" or None. + force_token_predictions: add labels per token instead of span labels, even if `self.predict_spans` is True """ if label_name is None: label_name = self.tag_type @@ -569,11 +572,7 @@ def _standard_inference(self, features: torch.Tensor, batch: List[Sentence], pro return predictions, all_tags def _all_scores_for_token(self, sentences: List[Sentence], scores: torch.Tensor, lengths: List[int]): - """Returns all scores for each tag in tag dictionary. - - Args: - scores: Scores for current sentence. - """ + """Returns all scores for each tag in tag dictionary.""" scores = scores.numpy() tokens = [token for sentence in sentences for token in sentence] prob_all_tags = [ diff --git a/flair/models/sequence_tagger_utils/viterbi.py b/flair/models/sequence_tagger_utils/viterbi.py index 3fa41df6d..050697b18 100644 --- a/flair/models/sequence_tagger_utils/viterbi.py +++ b/flair/models/sequence_tagger_utils/viterbi.py @@ -148,6 +148,7 @@ def decode( Args: features_tuple: CRF scores from forward method in shape (batch size, seq len, tagset size, tagset size), lengths of sentence in batch, transitions of CRF probabilities_for_all_classes: whether to return probabilities for all tags + sentences: list of the respective sentences with extracted features. 
Returns: decoded sequences """ @@ -225,11 +226,7 @@ def decode( return tags, all_tags def _all_scores_for_token(self, scores: torch.Tensor, lengths: torch.IntTensor, sentences: List[Sentence]): - """Returns all scores for each tag in tag dictionary. - - Args: - scores: Scores for current sentence. - """ + """Returns all scores for each tag in tag dictionary.""" scores = scores.numpy() prob_tags_per_sentence = [] for scores_sentence, length, sentence in zip(scores, lengths, sentences): diff --git a/flair/models/tars_model.py b/flair/models/tars_model.py index d36dcc347..e18db60c9 100644 --- a/flair/models/tars_model.py +++ b/flair/models/tars_model.py @@ -343,10 +343,14 @@ def __init__( Args: task_name: a string depicting the name of the task label_dictionary: dictionary of labels you want to predict + label_type: label_type: name of the label embeddings: name of the pre-trained transformer model e.g., 'bert-base-uncased' num_negative_labels_to_sample: number of negative labels to sample for each positive labels against a sentence during training. Defaults to 2 negative labels for each positive label. The model would sample all the negative labels if None is passed. That slows down the training considerably. + prefix: if True, the label will be concatenated at the start, else on the end. + **tagger_args: The arguments propagated to :met:`FewshotClassifier.__init__` + """ super().__init__() @@ -675,6 +679,7 @@ def __init__( Args: task_name: a string depicting the name of the task. label_dictionary: dictionary of labels you want to predict. + label_type: label_type: name of the label embeddings: name of the pre-trained transformer model e.g., 'bert-base-uncased'. num_negative_labels_to_sample: number of negative labels to sample for each positive labels against a sentence during training. Defaults to 2 negative labels for each positive label. @@ -684,6 +689,8 @@ def __init__( or False to force single-label predictions. 
multi_label_threshold: If multi-label you can set the threshold to make predictions. beta: Parameter for F-beta score for evaluation and training annealing. + prefix: if True, the label will be concatenated at the start, else on the end. + **tagger_args: The arguments propagated to :met:`FewshotClassifier.__init__` """ super().__init__() diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py index 8ac13d946..00e832338 100644 --- a/flair/models/text_classification_model.py +++ b/flair/models/text_classification_model.py @@ -32,12 +32,14 @@ def __init__( Args: embeddings: embeddings used to embed each data point label_dictionary: dictionary of labels you want to predict + label_type: string identifier for tag type multi_label: auto-detected by default, but you can set this to True to force multi-label predictions or False to force single-label predictions. multi_label_threshold: If multi-label you can set the threshold to make predictions beta: Parameter for F-beta score for evaluation and training annealing loss_weights: Dictionary of weights for labels for the loss function. 
If any label's weight is unspecified it will default to 1.0 + **classifierargs: The arguments propagated to :met:`flair.nn.DefaultClassifier.__init__` """ super().__init__( **classifierargs, diff --git a/flair/models/word_tagger_model.py b/flair/models/word_tagger_model.py index aa111e5c4..d55eb8692 100644 --- a/flair/models/word_tagger_model.py +++ b/flair/models/word_tagger_model.py @@ -27,8 +27,10 @@ def __init__( Args: embeddings: word embeddings used in tagger - tag_dictionary: dictionary of tags you want to predict - tag_type: string identifier for tag type + label_dictionary: dictionary of labels or BIO/BIOES tags you want to predict + label_type: string identifier for tag type + span_encoding: the format to encode spans as tags, either "BIO" or "BIOES" + **classifierargs: The arguments propagated to :met:`flair.nn.DefaultClassifier.__init__` """ # if the classifier predicts BIO/BIOES span labels, the internal label dictionary must be computed if label_dictionary.span_labels: diff --git a/flair/nn/distance/cosine.py b/flair/nn/distance/cosine.py index 188b73f37..d92188ab1 100644 --- a/flair/nn/distance/cosine.py +++ b/flair/nn/distance/cosine.py @@ -7,6 +7,8 @@ def dot_product(a: torch.Tensor, b: torch.Tensor, normalize=False): """Computes dot product for pairs of vectors. Args: + a: the left tensor + b: the right tensor normalize: Vectors are normalized (leads to cosine similarity) Returns: Matrix with res[i][j] = dot_product(a[i], b[j]) diff --git a/flair/nn/model.py b/flair/nn/model.py index 7a95be519..dd58c0456 100644 --- a/flair/nn/model.py +++ b/flair/nn/model.py @@ -64,9 +64,17 @@ def evaluate( Implement this to enable evaluation. Args: - data_loader: DataLoader that iterates over dataset to be evaluated + data_points: The labeled data_points to evaluate. + gold_label_type: The label type indicating the gold labels out_path: Optional output path to store predictions - embedding_storage_mode: One of 'none', 'cpu' or 'gpu'. 
'none' means all embeddings are deleted and freshly recomputed, 'cpu' means all embeddings are stored on CPU, or 'gpu' means all embeddings are stored on GPU + embedding_storage_mode: One of 'none', 'cpu' or 'gpu'. 'none' means all embeddings are deleted and freshly + recomputed, 'cpu' means all embeddings are stored on CPU, or 'gpu' means all embeddings are stored on GPU + mini_batch_size: The batch_size to use for predictions + main_evaluation_metric: Specify which metric to highlight as main_score + exclude_labels: Specify classes that won't be considered in evaluation + gold_label_dictionary: Specify which classes should be considered, all other classes will be taken as . + return_loss: Weather to additionally compute the loss on the data-points. + **kwargs: Arguments that will be ignored. Returns: The evaluation results. @@ -106,6 +114,7 @@ def save(self, model_file: Union[str, Path], checkpoint: bool = False): Args: model_file: the model file + checkpoint: currently unused. """ model_state = self._get_state_dict() diff --git a/flair/samplers.py b/flair/samplers.py index d8a5c22f3..135dfb331 100644 --- a/flair/samplers.py +++ b/flair/samplers.py @@ -30,11 +30,7 @@ def __init__(self) -> None: super().__init__(None) def set_dataset(self, data_source): - """Initialize the dataset used for sampling. 
- - Args: - data_source: - """ + """Initialize the dataset used for sampling.""" self.data_source = data_source self.num_samples = len(self.data_source) self.indices = list(range(len(data_source))) diff --git a/flair/trainers/plugins/loggers/tensorboard.py b/flair/trainers/plugins/loggers/tensorboard.py index 9158c510e..8fc8af9e9 100644 --- a/flair/trainers/plugins/loggers/tensorboard.py +++ b/flair/trainers/plugins/loggers/tensorboard.py @@ -15,6 +15,8 @@ def __init__(self, log_dir=None, comment="", tracked_metrics=()) -> None: Args: log_dir: Directory into which tensorboard log files will be written + comment: The comment to specify Comment log_dir suffix appended to the default + ``log_dir``. If ``log_dir`` is assigned, this argument has no effect. tracked_metrics: List of tuples that specify which metrics (in addition to the main_score) shall be plotted in tensorboard, could be [("macro avg", 'f1-score'), ("macro avg", 'precision')] for example """ super().__init__() diff --git a/flair/trainers/plugins/metric_records.py b/flair/trainers/plugins/metric_records.py index d4742e109..034c02185 100644 --- a/flair/trainers/plugins/metric_records.py +++ b/flair/trainers/plugins/metric_records.py @@ -80,6 +80,7 @@ def __init__( name: Name of the metric. typ: Type of metric. value: Value of the metric (can be anything: scalar, tensor, image, etc.). + global_step: The time_step of the log. This should be incremented the next time this metric is logged again. E.g. if you log every epoch, set the global_step to the current epoch. walltime: Time of recording this metric. 
""" self.name: MetricName = MetricName(name) diff --git a/flair/visual/ner_html.py b/flair/visual/ner_html.py index 0cdc151a5..c71e10837 100644 --- a/flair/visual/ner_html.py +++ b/flair/visual/ner_html.py @@ -62,6 +62,7 @@ def render_ner_html( colors: dict where keys are tags and values are color HTML codes default_color: color to use if colors parameter is missing a tag wrap_page: if True method returns result of processing sentences wrapped by <html> and <body> tags, otherwise - without these tags + label_name: the label name to specify which labels of the sentence are visualized. Returns: HTML as a string """ From f1a4d963f8b6851dbfb7397495f015e9363738b6 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 31 Jul 2023 15:45:36 +0200 Subject: [PATCH 090/124] don't use deprecated BPEmbSerializable --- flair/embeddings/token.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index 457cf7b94..038666bf7 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -1435,7 +1435,7 @@ def __init__( ), "Need to specify model_file_path and embedding_file_path if no language is given in BytePairEmbeddings(...)" dim = None # type: ignore[assignment] - self.embedder = BPEmbSerializable( + self.embedder = BPEmb( lang=language, vs=syllables, dim=dim, From 3ddc564568271c88ca1b5e0e34ef902dba2ab3aa Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 31 Jul 2023 19:11:16 +0200 Subject: [PATCH 091/124] fix rust & mypy errors --- flair/models/multitask_model.py | 2 +- flair/trainers/trainer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/flair/models/multitask_model.py b/flair/models/multitask_model.py index 98cda7b4a..d7fb262da 100644 --- a/flair/models/multitask_model.py +++ b/flair/models/multitask_model.py @@ -116,7 +116,7 @@ def split_batch_to_task_ids(sentences: Union[List[Sentence], Sentence], all_task batch_to_task_mapping[multitask_id.value] = [sentence_id] 
return batch_to_task_mapping - def evaluate( + def evaluate( # type: ignore[override] self, data_points, gold_label_type: str, diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 8fe18bf3e..43e99d6ca 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -461,7 +461,7 @@ def train_custom( if inspect.isclass(sampler): sampler = sampler() # set dataset to sample from - sampler.set_dataset(train_data) + sampler.set_dataset(train_data) # type: ignore[union-attr] shuffle = False # this field stores the names of all dynamic embeddings in the model (determined after first forward pass) From c3ea2b8135d0f1a79b89c76bce64ea99467e4b25 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 11 Sep 2023 10:32:18 +0200 Subject: [PATCH 092/124] fix ruff errors --- flair/datasets/entity_linking.py | 2 +- flair/datasets/sequence_labeling.py | 7 +++---- flair/models/sequence_tagger_utils/viterbi.py | 2 +- flair/trainers/plugins/functional/linear_scheduler.py | 1 - flair/trainers/trainer.py | 2 ++ 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/flair/datasets/entity_linking.py b/flair/datasets/entity_linking.py index 801144ca8..a515e0f3a 100644 --- a/flair/datasets/entity_linking.py +++ b/flair/datasets/entity_linking.py @@ -946,7 +946,7 @@ def _text_to_cols(self, sentence: Sentence, links: list, outfile): links: array containing information about the starting and ending position of an entity mention, as well as its corresponding wiki tag outfile: file, to which the output is written """ - for i in range(0, len(sentence)): + for i in range(len(sentence)): # If there are annotated entity mentions for given post title or a comment thread if links: # Keep track which is the correct corresponding entity link, in cases where there is >1 link in a sentence diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index 849c7c899..d2e4215c8 100644 --- a/flair/datasets/sequence_labeling.py +++ 
b/flair/datasets/sequence_labeling.py @@ -1178,8 +1178,7 @@ def dataset_document_iterator(cls, file_path: Union[Path, str]) -> Iterator[List def sentence_iterator(cls, file_path: Union[Path, str]) -> Iterator: """An iterator over the sentences in an individual CONLL formatted file.""" for document in cls.dataset_document_iterator(file_path): - for sentence in document: - yield sentence + yield from document class CONLL_03(ColumnCorpus): @@ -2490,7 +2489,7 @@ def _add_IOB_tags(self, data_file: Union[str, Path], encoding: str = "utf8", ner """ def add_I_prefix(current_line: List[str], ner: int, tag: str): - for i in range(0, len(current_line)): + for i in range(len(current_line)): if i == 0: f.write(line_list[i]) elif i == ner: @@ -2508,7 +2507,7 @@ def add_I_prefix(current_line: List[str], ner: int, tag: str): if len(line_list) > 2: # word with tags ner_tag = line_list[ner_column] if ner_tag in ["0", "O"]: # no chunk - for i in range(0, len(line_list)): + for i in range(len(line_list)): if i == 0: f.write(line_list[i]) elif i == ner_column: diff --git a/flair/models/sequence_tagger_utils/viterbi.py b/flair/models/sequence_tagger_utils/viterbi.py index 050697b18..73c10fb67 100644 --- a/flair/models/sequence_tagger_utils/viterbi.py +++ b/flair/models/sequence_tagger_utils/viterbi.py @@ -119,7 +119,7 @@ def _format_targets(self, targets: torch.Tensor, lengths: torch.IntTensor): matrix_indices = [ [self.tag_dictionary.get_idx_for_item(START_TAG) + (s[0] * self.tagset_size)] - + [s[i] + (s[i + 1] * self.tagset_size) for i in range(0, len(s) - 1)] + + [s[i] + (s[i + 1] * self.tagset_size) for i in range(len(s) - 1)] for s in targets_per_sentence ] diff --git a/flair/trainers/plugins/functional/linear_scheduler.py b/flair/trainers/plugins/functional/linear_scheduler.py index ad1cb8612..51b295c7d 100644 --- a/flair/trainers/plugins/functional/linear_scheduler.py +++ b/flair/trainers/plugins/functional/linear_scheduler.py @@ -52,7 +52,6 @@ def before_training_epoch(self, 
**kw): @TrainerPlugin.hook def after_training_batch(self, optimizer_was_run: bool, **kw): """Do the scheduler step if one-cycle or linear decay.""" - # skip if no optimization has happened. if not optimizer_was_run: return diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 43e99d6ca..7db6a7d17 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -348,6 +348,7 @@ def train_custom( monitor_train_sample: Set this to evaluate on a sample of the train data at the end of each epoch. If you set an int, it will sample this many sentences to evaluate on. If you set a float, it will sample a percentage of data points from train. + max_grad_norm: If not None, gradients are clipped to this value before an optimizer.step is called. use_final_model_for_eval: If True, the final model is used for the final evaluation. If False, the model from the best epoch as determined by main_evaluation_metric is used for the final evaluation. gold_label_dictionary_for_eval: Set to force evaluation to use a particular label dictionary @@ -364,6 +365,7 @@ def train_custom( be saved each 5 epochs. Default is 0 which means no model saving. create_file_logs: If True, logging output is written to a file create_loss_file: If True, a loss file logging output is created + use_amp: If True, uses the torch automatic mixed precision write_weights: If True, write weights to weights.txt on each batch logging event. 
plugins: Any additional plugins you want to pass to the trainer **kwargs: Additional arguments, for instance for the optimizer From 111b0e6a918a212315b4659f8e4a743de028011d Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 11 Sep 2023 11:09:24 +0200 Subject: [PATCH 093/124] update publish-docs python version --- .github/workflows/publish-docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index 90cd22fe8..a7f0da96e 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -10,7 +10,7 @@ jobs: name: Build the docs using Sphinx and push to gh-pages runs-on: ubuntu-latest env: - python-version: 3.7 + python-version: 3.8 steps: - name: Checkout code uses: actions/checkout@v3 From 8d543b3d912521489f55ec3225e73fb6bb1af72b Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 18 Sep 2023 11:30:12 +0200 Subject: [PATCH 094/124] fix dependencies --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index cfb2a9a5a..b6a98f118 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,4 @@ -sphinx-github-style +sphinx-github-style<=1.0.2 # 1.0.3 changes logic that breaks with sphinx-multiversion sphinx-autodoc-typehints myst-parser sphinx From 9d49e9a18adf9199a9d19af4df3df63f62b72e10 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 18 Sep 2023 11:30:24 +0200 Subject: [PATCH 095/124] fix method references --- flair/embeddings/document.py | 2 +- flair/embeddings/token.py | 2 +- flair/embeddings/transformer.py | 2 +- flair/models/entity_linker_model.py | 2 +- flair/models/pairwise_classification_model.py | 2 +- flair/models/relation_extractor_model.py | 2 +- flair/models/tars_model.py | 4 ++-- flair/models/text_classification_model.py | 2 +- flair/models/word_tagger_model.py | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff 
--git a/flair/embeddings/document.py b/flair/embeddings/document.py index 35e455de4..8779d418e 100644 --- a/flair/embeddings/document.py +++ b/flair/embeddings/document.py @@ -44,7 +44,7 @@ def __init__( layer_mean: If True, uses a scalar mix of layers as embedding fine_tune: If True, allows transformers to be fine-tuned during training is_token_embedding: If True, the embedding can be used as TokenEmbedding too. - **kwargs: Arguments propagated to :met:`flair.embeddings.transformer.TransformerEmbeddings.__init__` + **kwargs: Arguments propagated to :meth:`flair.embeddings.transformer.TransformerEmbeddings.__init__` """ TransformerEmbeddings.__init__( self, diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py index 038666bf7..b4e3edc1f 100644 --- a/flair/embeddings/token.py +++ b/flair/embeddings/token.py @@ -45,7 +45,7 @@ def __init__( model: name of transformer model (see https://huggingface.co/transformers/pretrained_models.html for options) is_document_embedding: If True, the embedding can be used as DocumentEmbedding too. allow_long_sentences: If True, too long sentences will be patched and strided and afterwards combined. - **kwargs: Arguments propagated to :met:`flair.embeddings.transformer.TransformerEmbeddings.__init__` + **kwargs: Arguments propagated to :meth:`flair.embeddings.transformer.TransformerEmbeddings.__init__` """ TransformerEmbeddings.__init__( self, diff --git a/flair/embeddings/transformer.py b/flair/embeddings/transformer.py index 64b043a7a..b3b838507 100644 --- a/flair/embeddings/transformer.py +++ b/flair/embeddings/transformer.py @@ -1392,6 +1392,6 @@ def export_onnx( hence it matters if the path is an absolue path or a relative one. example_sentences: a list of sentences that will be used for tracing. It is recommended to take 2-4 sentences with some variation. 
- **kwargs: the parameters passed to :met:`TransformerOnnxEmbeddings.export_from_embedding` + **kwargs: the parameters passed to :meth:`TransformerOnnxEmbeddings.export_from_embedding` """ return self.onnx_cls.export_from_embedding(path, self, example_sentences, **kwargs) diff --git a/flair/models/entity_linker_model.py b/flair/models/entity_linker_model.py index 686cf2921..3e46f3a2d 100644 --- a/flair/models/entity_linker_model.py +++ b/flair/models/entity_linker_model.py @@ -103,7 +103,7 @@ def __init__( `first_last` concatenates the embedding of the first and the embedding of the last token. label_type: name of the label you use. candidates: If provided, use a :class:`CandidateGenerator` for prediction candidates. - **classifierargs: The arguments propagated to :met:`flair.nn.DefaultClassifier.__init__` + **classifierargs: The arguments propagated to :meth:`flair.nn.DefaultClassifier.__init__` """ super().__init__( embeddings=embeddings, diff --git a/flair/models/pairwise_classification_model.py b/flair/models/pairwise_classification_model.py index ca50d1b87..2d8f76842 100644 --- a/flair/models/pairwise_classification_model.py +++ b/flair/models/pairwise_classification_model.py @@ -37,7 +37,7 @@ def __init__( multi_label_threshold: If multi-label you can set the threshold to make predictions loss_weights: Dictionary of weights for labels for the loss function. 
If any label's weight is unspecified it will default to 1.0 - **classifierargs: The arguments propagated to :met:`flair.nn.DefaultClassifier.__init__` + **classifierargs: The arguments propagated to :meth:`flair.nn.DefaultClassifier.__init__` """ super().__init__( **classifierargs, diff --git a/flair/models/relation_extractor_model.py b/flair/models/relation_extractor_model.py index a13a80fff..795e8a517 100644 --- a/flair/models/relation_extractor_model.py +++ b/flair/models/relation_extractor_model.py @@ -33,7 +33,7 @@ def __init__( pooling_operation: either "first" or "first_last" how the embeddings of the entities should be used to create relation embeddings train_on_gold_pairs_only: if True, relations with "O" (no relation) label will be ignored in training. - **classifierargs: The arguments propagated to :met:`flair.nn.DefaultClassifier.__init__` + **classifierargs: The arguments propagated to :meth:`flair.nn.DefaultClassifier.__init__` """ # pooling operation to get embeddings for entites self.pooling_operation = pooling_operation diff --git a/flair/models/tars_model.py b/flair/models/tars_model.py index e18db60c9..6a9287147 100644 --- a/flair/models/tars_model.py +++ b/flair/models/tars_model.py @@ -349,7 +349,7 @@ def __init__( sentence during training. Defaults to 2 negative labels for each positive label. The model would sample all the negative labels if None is passed. That slows down the training considerably. prefix: if True, the label will be concatenated at the start, else on the end. - **tagger_args: The arguments propagated to :met:`FewshotClassifier.__init__` + **tagger_args: The arguments propagated to :meth:`FewshotClassifier.__init__` """ super().__init__() @@ -690,7 +690,7 @@ def __init__( multi_label_threshold: If multi-label you can set the threshold to make predictions. beta: Parameter for F-beta score for evaluation and training annealing. prefix: if True, the label will be concatenated at the start, else on the end. 
- **tagger_args: The arguments propagated to :met:`FewshotClassifier.__init__` + **tagger_args: The arguments propagated to :meth:`FewshotClassifier.__init__` """ super().__init__() diff --git a/flair/models/text_classification_model.py b/flair/models/text_classification_model.py index 00e832338..1b330a0da 100644 --- a/flair/models/text_classification_model.py +++ b/flair/models/text_classification_model.py @@ -39,7 +39,7 @@ def __init__( beta: Parameter for F-beta score for evaluation and training annealing loss_weights: Dictionary of weights for labels for the loss function. If any label's weight is unspecified it will default to 1.0 - **classifierargs: The arguments propagated to :met:`flair.nn.DefaultClassifier.__init__` + **classifierargs: The arguments propagated to :meth:`flair.nn.DefaultClassifier.__init__` """ super().__init__( **classifierargs, diff --git a/flair/models/word_tagger_model.py b/flair/models/word_tagger_model.py index d55eb8692..2d32a54b0 100644 --- a/flair/models/word_tagger_model.py +++ b/flair/models/word_tagger_model.py @@ -30,7 +30,7 @@ def __init__( label_dictionary: dictionary of labels or BIO/BIOES tags you want to predict label_type: string identifier for tag type span_encoding: the format to encode spans as tags, either "BIO" or "BIOES" - **classifierargs: The arguments propagated to :met:`flair.nn.DefaultClassifier.__init__` + **classifierargs: The arguments propagated to :meth:`flair.nn.DefaultClassifier.__init__` """ # if the classifier predicts BIO/BIOES span labels, the internal label dictionary must be computed if label_dictionary.span_labels: From 64e4e8ca87832911127e5ce1e2b86ed679e6d1a4 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 18 Sep 2023 11:44:21 +0200 Subject: [PATCH 096/124] fix documented minimal python version --- docs/contributing/local_development.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/contributing/local_development.md b/docs/contributing/local_development.md 
index bd9acc455..87439439f 100644 --- a/docs/contributing/local_development.md +++ b/docs/contributing/local_development.md @@ -6,8 +6,8 @@ the code should hopefully be easy. ## Setup -Flair requires python-3.7 or higher. To make sure our code also runs on the oldest supported -python version, it is recommended to use python-3.7.x for flair development. +Flair requires python-3.8 or higher. To make sure our code also runs on the oldest supported +python version, it is recommended to use python-3.8.x for flair development. Create a python environment of your preference and run: ```bash From ac9220b0be1e9d5ba1e9b6048263638d42792e66 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 18 Sep 2023 13:15:23 +0200 Subject: [PATCH 097/124] fix ruff error --- flair/datasets/sequence_labeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/datasets/sequence_labeling.py b/flair/datasets/sequence_labeling.py index d2e4215c8..b3e40342b 100644 --- a/flair/datasets/sequence_labeling.py +++ b/flair/datasets/sequence_labeling.py @@ -1125,7 +1125,7 @@ def _conll_rows_to_sentence(cls, conll_rows: List[str]) -> Dict: speakers.append(speaker if speaker != "-" else None) named_entities = span_labels[0] - srl_frames = [(predicate, labels) for predicate, labels in zip(verbal_predicates, span_labels[1:])] + srl_frames = list(zip(verbal_predicates, span_labels[1:])) # this would not be reached if parse_pieces contained None, hence the cast parse_tree = "".join(cast(List[str], parse_pieces)) if all(parse_pieces) else None From a951be172012f72b30aacd8d37f427a266168095 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 18 Sep 2023 14:27:04 +0200 Subject: [PATCH 098/124] remove banch from doc config --- docs/conf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/conf.py b/docs/conf.py index 97b76fde1..8f448b937 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -116,7 +116,7 @@ def linkcode_resolve(*args): smv_tag_whitelist = 
r"^\d+\.\d+\.\d+$" # Whitelist pattern for branches (set to None to ignore all branches) -smv_branch_whitelist = r"^master|doc-page$" +smv_branch_whitelist = r"^master$" # Whitelist pattern for remotes (set to None to use local branches only) smv_remote_whitelist = r"^origin$" From 9646f1eb802eab56346b96ecb2ffc0c339db81d7 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 18 Sep 2023 16:10:40 +0200 Subject: [PATCH 099/124] fix build doc page action trigger --- .github/workflows/publish-docs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index a7f0da96e..5c83adc85 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -1,7 +1,7 @@ name: 'Build doc page' on: push: - branches: [ main, doc-page ] + branches: [ master ] tags: - "*" From 1a0ad4bcda3bafdd660590b198b767564c077fa0 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 25 Sep 2023 11:02:29 +0200 Subject: [PATCH 100/124] fix sphinx theme not being orange --- docs/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/requirements.txt b/docs/requirements.txt index b6a98f118..8d7ae05d7 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -4,5 +4,5 @@ myst-parser sphinx importlib-metadata sphinx-multiversion -pydata-sphinx-theme +pydata-sphinx-theme<0.14 sphinx_design \ No newline at end of file From 3ad9b601139200a49282151d3b952bfc4c625814 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 2 Oct 2023 16:57:12 +0200 Subject: [PATCH 101/124] add from state & to state information for plugins --- flair/trainers/plugins/base.py | 23 +++++++- flair/trainers/plugins/functional/amp.py | 52 ------------------- .../plugins/functional/anneal_on_plateau.py | 27 ++++++++++ .../plugins/functional/checkpoints.py | 9 ++++ .../plugins/functional/linear_scheduler.py | 15 ++++-- .../plugins/functional/weight_extractor.py | 9 ++++ 
flair/trainers/plugins/loggers/log_file.py | 6 ++- flair/trainers/plugins/loggers/loss_file.py | 14 ++++- .../plugins/loggers/metric_history.py | 8 ++- flair/trainers/plugins/loggers/tensorboard.py | 10 ++++ flair/trainers/plugins/loggers/wandb.py | 18 ++++++- 11 files changed, 127 insertions(+), 64 deletions(-) delete mode 100644 flair/trainers/plugins/functional/amp.py diff --git a/flair/trainers/plugins/base.py b/flair/trainers/plugins/base.py index 709866dad..12d969357 100644 --- a/flair/trainers/plugins/base.py +++ b/flair/trainers/plugins/base.py @@ -1,9 +1,12 @@ +import abc +import importlib import logging from collections import defaultdict from inspect import isclass, signature from itertools import count from queue import Queue from typing import ( + Any, Callable, Dict, Iterator, @@ -175,7 +178,7 @@ def __call__(self, *args, **kw): raise err -class BasePlugin: +class BasePlugin(abc.ABC): """Base class for all plugins.""" def __init__(self) -> None: @@ -259,8 +262,20 @@ def pluggable(self) -> Optional[Pluggable]: def __str__(self) -> str: return self.__class__.__name__ + def get_state(self) -> Dict[str, Any]: + return {"__cls__": f"{self.__module__}.{self.__class__.__name__}"} -class TrainerPlugin(BasePlugin): + @classmethod + def from_state(cls, state: Dict[str, Any]) -> "BasePlugin": + if "__cls__" in state: + module_name, class_name = state.pop("__cls__").rsplit(".", 1) + module = importlib.import_module(module_name) + plugin_cls: BasePlugin = getattr(module, class_name) + return plugin_cls.from_state(state) + return cls(**state) + + +class TrainerPlugin(BasePlugin, abc.ABC): @property def trainer(self): return self.pluggable @@ -272,3 +287,7 @@ def model(self): @property def corpus(self): return self.trainer.corpus + + @classmethod + def from_state(cls, state: Dict[str, Any]) -> "TrainerPlugin": + return cast(TrainerPlugin, super().from_state(state)) diff --git a/flair/trainers/plugins/functional/amp.py b/flair/trainers/plugins/functional/amp.py 
deleted file mode 100644 index 411b7d372..000000000 --- a/flair/trainers/plugins/functional/amp.py +++ /dev/null @@ -1,52 +0,0 @@ -from flair.trainers.plugins.base import TrainerPlugin - - -class AmpPlugin(TrainerPlugin): - """Simple plugin for AMP.""" - - def __init__(self, opt_level) -> None: - super().__init__() - - self.opt_level = opt_level - - self.wrapped_backward = None - - try: - from apex import amp - - self.amp = amp - except ImportError as exc: - raise RuntimeError( - "Failed to import apex. Please install apex from " - "https://www.github.com/nvidia/apex " - "to enable mixed-precision training." - ) from exc - - def detach(self, *args, **kwargs): - # TODO: what does this do? - super().detach(*args, **kwargs) - - # unwrap trainer backward function - self.trainer.backward = self.wrapped_backward - self.wrapped_backward = None - - def backward(self, loss): - assert self.amp is not None - optimizer = self.trainer.optimizer - - with self.amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - - @TrainerPlugin.hook - def after_setup(self, **kw): - """Wraps with AMP.""" - optimizer = self.trainer.optimizer - - self.trainer.model, self.trainer.optimizer = self.amp.initialize( - self.model, optimizer, opt_level=self.opt_level - ) - - # replace trainers backward function - self.wrapped_backward = self.trainer.backward - - self.trainer.backward = self.backward diff --git a/flair/trainers/plugins/functional/anneal_on_plateau.py b/flair/trainers/plugins/functional/anneal_on_plateau.py index 69646ef5d..a91e0a4c1 100644 --- a/flair/trainers/plugins/functional/anneal_on_plateau.py +++ b/flair/trainers/plugins/functional/anneal_on_plateau.py @@ -1,5 +1,6 @@ import logging import os +from typing import Any, Dict from flair.trainers.plugins.base import TrainerPlugin, TrainingInterrupt from flair.trainers.plugins.metric_records import MetricRecord @@ -34,6 +35,8 @@ def __init__( self.anneal_factor = anneal_factor self.patience = patience 
self.initial_extra_patience = initial_extra_patience + self.initial_best_value = None + self.initial_bad_epochs = None def store_learning_rate(self): optimizer = self.trainer.optimizer @@ -64,6 +67,8 @@ def after_setup( verbose=False, optimizer=self.trainer.optimizer, ) + if self.initial_best_value is not None: + self.scheduler.best self.store_learning_rate() @@ -106,3 +111,25 @@ def __str__(self) -> str: f"anneal_factor: '{self.anneal_factor}', " f"min_learning_rate: '{self.min_learning_rate}'" ) + + def get_state(self) -> Dict[str, Any]: + return { + **super().get_state(), + "base_path": str(self.base_path), + "min_learning_rate": self.min_learning_rate, + "anneal_factor": self.anneal_factor, + "patience": self.patience, + "initial_extra_patience": self.initial_extra_patience, + "anneal_with_restarts": self.anneal_with_restarts, + "bad_epochs": self.scheduler.num_bad_epochs, + "current_best": self.scheduler.best, + } + + @classmethod + def from_state(cls, state: Dict[str, Any]) -> "AnnealingPlugin": + num_bad_epochs = state.pop("bad_epochs") + current_best = state.pop("current_best") + plugin = cls(**state) + plugin.initial_best_value = current_best + plugin.initial_bad_epochs = num_bad_epochs + return plugin diff --git a/flair/trainers/plugins/functional/checkpoints.py b/flair/trainers/plugins/functional/checkpoints.py index f1f7020f4..75ecb9bd9 100644 --- a/flair/trainers/plugins/functional/checkpoints.py +++ b/flair/trainers/plugins/functional/checkpoints.py @@ -1,4 +1,5 @@ import logging +from typing import Any, Dict from flair.trainers.plugins.base import TrainerPlugin @@ -27,3 +28,11 @@ def after_training_epoch(self, epoch, **kw): ) model_name = "model_epoch_" + str(epoch) + ".pt" self.model.save(self.base_path / model_name, checkpoint=self.save_optimizer_state) + + def get_state(self) -> Dict[str, Any]: + return { + **super().get_state(), + "base_path": str(self.base_path), + "save_model_each_k_epochs": self.save_model_each_k_epochs, + 
"save_optimizer_state": self.save_optimizer_state, + } diff --git a/flair/trainers/plugins/functional/linear_scheduler.py b/flair/trainers/plugins/functional/linear_scheduler.py index 51b295c7d..1000be6dd 100644 --- a/flair/trainers/plugins/functional/linear_scheduler.py +++ b/flair/trainers/plugins/functional/linear_scheduler.py @@ -1,4 +1,5 @@ import logging +from typing import Any, Dict from flair.optim import LinearSchedulerWithWarmup from flair.trainers.plugins.base import TrainerPlugin @@ -9,7 +10,7 @@ class LinearSchedulerPlugin(TrainerPlugin): """Plugin for LinearSchedulerWithWarmup.""" - def __init__(self, warmup_fraction: float, **kwargs) -> None: + def __init__(self, warmup_fraction: float) -> None: super().__init__() self.warmup_fraction = warmup_fraction @@ -29,7 +30,7 @@ def after_setup( dataset_size, mini_batch_size, max_epochs, - **kw, + **kwargs, ): """Initialize different schedulers, including anneal target for AnnealOnPlateau, batch_growth_annealing, loading schedulers.""" # calculate warmup steps @@ -44,13 +45,13 @@ def after_setup( self.store_learning_rate() @TrainerPlugin.hook - def before_training_epoch(self, **kw): + def before_training_epoch(self, **kwargs): """Load state for anneal_with_restarts, batch_growth_annealing, logic for early stopping.""" self.store_learning_rate() self.previous_learning_rate = self.current_learning_rate @TrainerPlugin.hook - def after_training_batch(self, optimizer_was_run: bool, **kw): + def after_training_batch(self, optimizer_was_run: bool, **kwargs): """Do the scheduler step if one-cycle or linear decay.""" # skip if no optimization has happened. 
if not optimizer_was_run: @@ -60,3 +61,9 @@ def after_training_batch(self, optimizer_was_run: bool, **kw): def __str__(self) -> str: return f"LinearScheduler | warmup_fraction: '{self.warmup_fraction}'" + + def get_state(self) -> Dict[str, Any]: + return { + **super().get_state(), + "warmup_fraction": self.warmup_fraction, + } diff --git a/flair/trainers/plugins/functional/weight_extractor.py b/flair/trainers/plugins/functional/weight_extractor.py index a6ed5eab2..ef5afe081 100644 --- a/flair/trainers/plugins/functional/weight_extractor.py +++ b/flair/trainers/plugins/functional/weight_extractor.py @@ -1,3 +1,5 @@ +from typing import Any, Dict + from flair.trainers.plugins.base import TrainerPlugin from flair.training_utils import WeightExtractor @@ -7,6 +9,7 @@ class WeightExtractorPlugin(TrainerPlugin): def __init__(self, base_path) -> None: super().__init__() + self.base_path = base_path self.weight_extractor = WeightExtractor(base_path) @TrainerPlugin.hook @@ -17,3 +20,9 @@ def after_training_batch(self, batch_no, epoch, total_number_of_batches, **kw): if (iteration + 1) % modulo == 0: self.weight_extractor.extract_weights(self.model.state_dict(), iteration) + + def get_state(self) -> Dict[str, Any]: + return { + **super().get_state(), + "base_path": str(self.base_path), + } diff --git a/flair/trainers/plugins/loggers/log_file.py b/flair/trainers/plugins/loggers/log_file.py index ed2272f6a..a9b7453a0 100644 --- a/flair/trainers/plugins/loggers/log_file.py +++ b/flair/trainers/plugins/loggers/log_file.py @@ -1,5 +1,6 @@ import logging from pathlib import Path +from typing import Any, Dict from flair.trainers.plugins.base import TrainerPlugin from flair.training_utils import add_file_handler @@ -12,10 +13,13 @@ class LogFilePlugin(TrainerPlugin): def __init__(self, base_path) -> None: super().__init__() - + self.base_path = base_path self.log_handler = add_file_handler(log, Path(base_path) / "training.log") @TrainerPlugin.hook("_training_exception", 
"after_training") def close_file_handler(self, **kw): self.log_handler.close() log.removeHandler(self.log_handler) + + def get_state(self) -> Dict[str, Any]: + return {**super().get_state(), "base_path": str(self.base_path)} diff --git a/flair/trainers/plugins/loggers/loss_file.py b/flair/trainers/plugins/loggers/loss_file.py index f19ef918b..5ebeaddd3 100644 --- a/flair/trainers/plugins/loggers/loss_file.py +++ b/flair/trainers/plugins/loggers/loss_file.py @@ -1,5 +1,5 @@ from datetime import datetime -from typing import Dict, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple, Union from flair.trainers.plugins.base import TrainerPlugin from flair.trainers.plugins.metric_records import MetricName @@ -15,9 +15,10 @@ def __init__( super().__init__() self.first_epoch = epoch + 1 - + self.last_epoch = self.first_epoch # prepare loss logging file and set up header self.loss_txt = init_output_file(base_path, "loss.tsv") + self.base_path = base_path # set up all metrics to collect self.metrics_to_collect = metrics_to_collect @@ -58,6 +59,14 @@ def __init__( # initialize the first log line self.current_row: Optional[Dict[MetricName, str]] = None + def get_state(self) -> Dict[str, Any]: + return { + **super().get_state(), + "base_path": str(self.base_path), + "metrics_to_collect": self.metrics_to_collect, + "first_epoch": self.last_epoch, + } + @TrainerPlugin.hook def before_training_epoch(self, epoch, **kw): """Get the current epoch for loss file logging.""" @@ -83,6 +92,7 @@ def metric_recorded(self, record): def after_evaluation(self, epoch, **kw): """This prints all relevant metrics.""" if self.loss_txt is not None: + self.last_epoch = epoch self.current_row[MetricName("timestamp")] = f"{datetime.now():%H:%M:%S}" # output log file diff --git a/flair/trainers/plugins/loggers/metric_history.py b/flair/trainers/plugins/loggers/metric_history.py index 46802824d..8d7c946e8 100644 --- a/flair/trainers/plugins/loggers/metric_history.py +++ 
b/flair/trainers/plugins/loggers/metric_history.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, Mapping +from typing import Any, Dict, Mapping from flair.trainers.plugins.base import TrainerPlugin @@ -32,3 +32,9 @@ def metric_recorded(self, record): def after_training(self, **kw): """Returns metric history.""" self.trainer.return_values.update(self.metric_history) + + def get_state(self) -> Dict[str, Any]: + return { + **super().get_state(), + "metrics_to_collect": dict(self.metrics_to_collect), + } diff --git a/flair/trainers/plugins/loggers/tensorboard.py b/flair/trainers/plugins/loggers/tensorboard.py index 8fc8af9e9..59bba9f2e 100644 --- a/flair/trainers/plugins/loggers/tensorboard.py +++ b/flair/trainers/plugins/loggers/tensorboard.py @@ -1,5 +1,6 @@ import logging import os +from typing import Any, Dict from flair.trainers.plugins.base import TrainerPlugin from flair.training_utils import log_line @@ -22,6 +23,7 @@ def __init__(self, log_dir=None, comment="", tracked_metrics=()) -> None: super().__init__() self.comment = comment self.tracked_metrics = tracked_metrics + self.log_dir = log_dir try: from torch.utils.tensorboard import SummaryWriter @@ -56,3 +58,11 @@ def _training_finally(self, **kw): """Closes the writer.""" assert self.writer is not None self.writer.close() + + def get_state(self) -> Dict[str, Any]: + return { + **super().get_state(), + "log_dir": str(self.log_dir) if self.log_dir is not None else None, + "comment": self.comment, + "tracked_metrics": self.tracked_metrics, + } diff --git a/flair/trainers/plugins/loggers/wandb.py b/flair/trainers/plugins/loggers/wandb.py index 1393872a4..419dac040 100644 --- a/flair/trainers/plugins/loggers/wandb.py +++ b/flair/trainers/plugins/loggers/wandb.py @@ -1,4 +1,5 @@ import logging +from typing import Any, Dict from flair.trainers.plugins.base import TrainerPlugin @@ -32,8 +33,8 @@ def emit(self, record): class WandbLogger(TrainerPlugin): - def __init__(self, wandb, emit_alerts=True, 
alert_level=logging.WARNING, **kwargs) -> None: - super().__init__(**kwargs) + def __init__(self, wandb, emit_alerts=True, alert_level=logging.WARNING) -> None: + super().__init__() self.wandb = wandb self.emit_alerts = emit_alerts @@ -70,3 +71,16 @@ def metric_recorded(self, record): @TrainerPlugin.hook def _training_finally(self, **kw): self.writer.close() + + def get_state(self) -> Dict[str, Any]: + return { + **super().get_state(), + "emit_alerts": self.emit_alerts, + "alert_level": self.alert_level, + } + + @classmethod + def from_state(cls, state: Dict[str, Any]) -> "WandbLogger": + import wandb + + return cls(wandb=wandb, **state) From 8ac66dad5865d81c29345155c70431277693db4b Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 2 Oct 2023 17:12:17 +0200 Subject: [PATCH 102/124] store types as strings and plugins as dictionaries --- flair/trainers/plugins/base.py | 19 ++----------------- .../plugins/functional/anneal_on_plateau.py | 13 ------------- flair/trainers/plugins/loggers/wandb.py | 6 ------ flair/trainers/trainer.py | 6 +++++- 4 files changed, 7 insertions(+), 37 deletions(-) diff --git a/flair/trainers/plugins/base.py b/flair/trainers/plugins/base.py index 12d969357..958d57b78 100644 --- a/flair/trainers/plugins/base.py +++ b/flair/trainers/plugins/base.py @@ -1,5 +1,3 @@ -import abc -import importlib import logging from collections import defaultdict from inspect import isclass, signature @@ -178,7 +176,7 @@ def __call__(self, *args, **kw): raise err -class BasePlugin(abc.ABC): +class BasePlugin: """Base class for all plugins.""" def __init__(self) -> None: @@ -265,17 +263,8 @@ def __str__(self) -> str: def get_state(self) -> Dict[str, Any]: return {"__cls__": f"{self.__module__}.{self.__class__.__name__}"} - @classmethod - def from_state(cls, state: Dict[str, Any]) -> "BasePlugin": - if "__cls__" in state: - module_name, class_name = state.pop("__cls__").rsplit(".", 1) - module = importlib.import_module(module_name) - plugin_cls: BasePlugin = 
getattr(module, class_name) - return plugin_cls.from_state(state) - return cls(**state) - -class TrainerPlugin(BasePlugin, abc.ABC): +class TrainerPlugin(BasePlugin): @property def trainer(self): return self.pluggable @@ -287,7 +276,3 @@ def model(self): @property def corpus(self): return self.trainer.corpus - - @classmethod - def from_state(cls, state: Dict[str, Any]) -> "TrainerPlugin": - return cast(TrainerPlugin, super().from_state(state)) diff --git a/flair/trainers/plugins/functional/anneal_on_plateau.py b/flair/trainers/plugins/functional/anneal_on_plateau.py index a91e0a4c1..0bdf1e6ea 100644 --- a/flair/trainers/plugins/functional/anneal_on_plateau.py +++ b/flair/trainers/plugins/functional/anneal_on_plateau.py @@ -35,8 +35,6 @@ def __init__( self.anneal_factor = anneal_factor self.patience = patience self.initial_extra_patience = initial_extra_patience - self.initial_best_value = None - self.initial_bad_epochs = None def store_learning_rate(self): optimizer = self.trainer.optimizer @@ -67,8 +65,6 @@ def after_setup( verbose=False, optimizer=self.trainer.optimizer, ) - if self.initial_best_value is not None: - self.scheduler.best self.store_learning_rate() @@ -124,12 +120,3 @@ def get_state(self) -> Dict[str, Any]: "bad_epochs": self.scheduler.num_bad_epochs, "current_best": self.scheduler.best, } - - @classmethod - def from_state(cls, state: Dict[str, Any]) -> "AnnealingPlugin": - num_bad_epochs = state.pop("bad_epochs") - current_best = state.pop("current_best") - plugin = cls(**state) - plugin.initial_best_value = current_best - plugin.initial_bad_epochs = num_bad_epochs - return plugin diff --git a/flair/trainers/plugins/loggers/wandb.py b/flair/trainers/plugins/loggers/wandb.py index 419dac040..8608fcdbd 100644 --- a/flair/trainers/plugins/loggers/wandb.py +++ b/flair/trainers/plugins/loggers/wandb.py @@ -78,9 +78,3 @@ def get_state(self) -> Dict[str, Any]: "emit_alerts": self.emit_alerts, "alert_level": self.alert_level, } - - @classmethod - def 
from_state(cls, state: Dict[str, Any]) -> "WandbLogger": - import wandb - - return cls(wandb=wandb, **state) diff --git a/flair/trainers/trainer.py b/flair/trainers/trainer.py index 7db6a7d17..22fb672bd 100644 --- a/flair/trainers/trainer.py +++ b/flair/trainers/trainer.py @@ -860,7 +860,11 @@ def _initialize_model_card(self, **training_parameters): k: str(v) if isinstance(v, Path) else v for k, v in training_parameters.items() } - plugins = [plugin.__class__ for plugin in model_card["training_parameters"]["plugins"]] + model_card["training_parameters"] = { + k: f"{v.__module__}.{v.__name__}" if inspect.isclass(v) else v for k, v in training_parameters.items() + } + + plugins = [plugin.get_state() for plugin in model_card["training_parameters"]["plugins"]] model_card["training_parameters"]["plugins"] = plugins return model_card From 7679c1e60ddff3c69cd6b18fba83d4caa9521434 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 2 Oct 2023 17:25:12 +0200 Subject: [PATCH 103/124] fixup last epoch not being required --- flair/trainers/plugins/loggers/loss_file.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/flair/trainers/plugins/loggers/loss_file.py b/flair/trainers/plugins/loggers/loss_file.py index 5ebeaddd3..29c42fc93 100644 --- a/flair/trainers/plugins/loggers/loss_file.py +++ b/flair/trainers/plugins/loggers/loss_file.py @@ -15,7 +15,6 @@ def __init__( super().__init__() self.first_epoch = epoch + 1 - self.last_epoch = self.first_epoch # prepare loss logging file and set up header self.loss_txt = init_output_file(base_path, "loss.tsv") self.base_path = base_path @@ -64,7 +63,6 @@ def get_state(self) -> Dict[str, Any]: **super().get_state(), "base_path": str(self.base_path), "metrics_to_collect": self.metrics_to_collect, - "first_epoch": self.last_epoch, } @TrainerPlugin.hook @@ -92,7 +90,6 @@ def metric_recorded(self, record): def after_evaluation(self, epoch, **kw): """This prints all relevant metrics.""" if self.loss_txt is not None: - self.last_epoch 
= epoch self.current_row[MetricName("timestamp")] = f"{datetime.now():%H:%M:%S}" # output log file From 033398f6aeb4aa88c3ed7506e393fdbab9f6a507 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Fri, 6 Oct 2023 22:29:13 +0200 Subject: [PATCH 104/124] add ci to delete old workflow runs --- .github/workflows/ci.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 62eddad8a..750f32a9e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,6 +5,14 @@ on: branches: [master] jobs: + steps: + - name: Delete workflow runs + uses: MajorScruffy/delete-old-workflow-runs@v0.2.0 + with: + repository: flairNLP/flair + older-than-seconds: 21600000 # 8 months -> 8 * 30 * 25 * 60 * 60 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} test: runs-on: ubuntu-latest env: From b690fc83d6117e378a3cb375ba75cc20088d2668 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Fri, 6 Oct 2023 22:33:51 +0200 Subject: [PATCH 105/124] add ci to delete old workflow runs --- .github/workflows/ci.yml | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 750f32a9e..a7d38779b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,14 +5,16 @@ on: branches: [master] jobs: - steps: - - name: Delete workflow runs - uses: MajorScruffy/delete-old-workflow-runs@v0.2.0 - with: - repository: flairNLP/flair - older-than-seconds: 21600000 # 8 months -> 8 * 30 * 25 * 60 * 60 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + delete: + runs-on: ubuntu-latest + steps: + - name: Delete workflow runs + uses: MajorScruffy/delete-old-workflow-runs@v0.2.0 + with: + repository: flairNLP/flair + older-than-seconds: 21600000 # 8 months -> 8 * 30 * 25 * 60 * 60 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} test: runs-on: ubuntu-latest env: From a81e1e62c9d0c5a5161b9e8caf3cc77494e5630a Mon Sep 17 00:00:00 2001 From: 
Benedikt Fuchs Date: Fri, 6 Oct 2023 22:56:17 +0200 Subject: [PATCH 106/124] cleanup disk space on github action --- .github/workflows/ci.yml | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a7d38779b..4eb8880a5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,16 +5,6 @@ on: branches: [master] jobs: - delete: - runs-on: ubuntu-latest - steps: - - name: Delete workflow runs - uses: MajorScruffy/delete-old-workflow-runs@v0.2.0 - with: - repository: flairNLP/flair - older-than-seconds: 21600000 # 8 months -> 8 * 30 * 25 * 60 * 60 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} test: runs-on: ubuntu-latest env: @@ -27,6 +17,11 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.8 + - name: Cleanup Disk Space + - run: | + sudo swapoff -a + sudo rm -f /swapfile + sudo apt clean - name: Install Flair dependencies run: pip install -e . - name: Install unittest dependencies From b5076bb1fff49342524221c58513ed340a37494b Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Fri, 6 Oct 2023 22:57:18 +0200 Subject: [PATCH 107/124] cleanup disk space on github action --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4eb8880a5..9e5dd862e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,7 +18,7 @@ jobs: with: python-version: 3.8 - name: Cleanup Disk Space - - run: | + run: | sudo swapoff -a sudo rm -f /swapfile sudo apt clean From 9e6ff65850a4e58344d25604110764ca7861be41 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 9 Oct 2023 12:51:21 +0200 Subject: [PATCH 108/124] use torch cpu on cicd --- .github/workflows/ci.yml | 7 ++----- .github/workflows/publish-docs.yml | 2 ++ 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9e5dd862e..ce30e78f3 100644 
--- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,11 +17,8 @@ jobs: uses: actions/setup-python@v4 with: python-version: 3.8 - - name: Cleanup Disk Space - run: | - sudo swapoff -a - sudo rm -f /swapfile - sudo apt clean + - name: Install Torch cpu + run: pip install torch --index-url https://download.pytorch.org/whl/cpu - name: Install Flair dependencies run: pip install -e . - name: Install unittest dependencies diff --git a/.github/workflows/publish-docs.yml b/.github/workflows/publish-docs.yml index 5c83adc85..24a424adb 100644 --- a/.github/workflows/publish-docs.yml +++ b/.github/workflows/publish-docs.yml @@ -18,6 +18,8 @@ jobs: uses: actions/setup-python@v4 with: python-version: ${{ env.python-version }} + - name: Install Torch cpu + run: pip install torch --index-url https://download.pytorch.org/whl/cpu - name: Install Flair dependencies run: pip install -e . - name: Install unittest dependencies From 247c5ab1196f12004147d5f51f75343bbcf915d0 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 9 Oct 2023 13:12:07 +0200 Subject: [PATCH 109/124] fix issue condition --- .github/workflows/issues.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/issues.yml b/.github/workflows/issues.yml index 772773c25..403e16c56 100644 --- a/.github/workflows/issues.yml +++ b/.github/workflows/issues.yml @@ -3,7 +3,7 @@ on: issue_comment jobs: issue_commented: name: Issue comment - if: ${{ !github.event.issue.pull_request && github.event.issue.author == github.even.issue_comment.author }} + if: ${{ github.event.issue.pull_request && github.event.issue.author == github.even.issue_comment.author }} runs-on: ubuntu-latest steps: - uses: actions-ecosystem/action-remove-labels@v1 From 682dad835433e9ba87c65a5e49e7eb76cd21e4dc Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 9 Oct 2023 13:34:13 +0200 Subject: [PATCH 110/124] remove pass statements --- flair/datasets/base.py | 1 - flair/datasets/biomedical.py | 1 - 
flair/embeddings/document.py | 1 - flair/embeddings/image.py | 1 - flair/embeddings/legacy.py | 1 - 5 files changed, 5 deletions(-) diff --git a/flair/datasets/base.py b/flair/datasets/base.py index ca6ead406..2ba0aabab 100644 --- a/flair/datasets/base.py +++ b/flair/datasets/base.py @@ -156,7 +156,6 @@ def __init__( log.warning('ATTENTION! The library "pymongo" is not installed!') log.warning('To use MongoDataset, please first install with "pip install pymongo"') log.warning("-" * 100) - pass self.in_memory = in_memory self.tokenizer = tokenizer diff --git a/flair/datasets/biomedical.py b/flair/datasets/biomedical.py index 2ce701d51..b0289c098 100644 --- a/flair/datasets/biomedical.py +++ b/flair/datasets/biomedical.py @@ -132,7 +132,6 @@ def filter_and_map_entities( new_entities.append(new_entity) else: logging.debug(f"Skip entity type {entity.type}") - pass mapped_entities_per_document[id] = new_entities return InternalBioNerDataset(documents=dataset.documents, entities_per_document=mapped_entities_per_document) diff --git a/flair/embeddings/document.py b/flair/embeddings/document.py index 8779d418e..c1e73442e 100644 --- a/flair/embeddings/document.py +++ b/flair/embeddings/document.py @@ -556,7 +556,6 @@ def __init__( log.warning('ATTENTION! The library "sentence-transformers" is not installed!') log.warning('To use Sentence Transformers, please first install with "pip install sentence-transformers"') log.warning("-" * 100) - pass self.model_name = model self.model = SentenceTransformer( diff --git a/flair/embeddings/image.py b/flair/embeddings/image.py index faf3a78b1..ae4d68fa4 100644 --- a/flair/embeddings/image.py +++ b/flair/embeddings/image.py @@ -104,7 +104,6 @@ def __init__(self, name, pretrained=True, transforms=None) -> None: log.warning('ATTENTION! 
The library "torchvision" is not installed!') log.warning('To use convnets pretraned on ImageNet, please first install with "pip install torchvision"') log.warning("-" * 100) - pass model_info = { "resnet50": (torchvision.models.resnet50, lambda x: list(x)[:-1], 2048), diff --git a/flair/embeddings/legacy.py b/flair/embeddings/legacy.py index 6096fa758..b2658e2d2 100644 --- a/flair/embeddings/legacy.py +++ b/flair/embeddings/legacy.py @@ -40,7 +40,6 @@ def __init__( log.warning('ATTENTION! The library "allennlp" is not installed!') log.warning('To use ELMoEmbeddings, please first install with "pip install allennlp==0.9.0"') log.warning("-" * 100) - pass assert embedding_mode in ["all", "top", "average"] From 40b700a3fbab42fa16012221bc58a6f40dbe9fac Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 9 Oct 2023 14:40:07 +0200 Subject: [PATCH 111/124] reduce ci log size --- .github/workflows/ci.yml | 2 +- flair/embeddings/image.py | 2 +- flair/optim.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ce30e78f3..9d9741401 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,4 +33,4 @@ jobs: - name: Run tests run: | python -c 'import flair' - pytest --runintegration --durations=0 -vv + pytest --runintegration -vv diff --git a/flair/embeddings/image.py b/flair/embeddings/image.py index ae4d68fa4..df6d1fadd 100644 --- a/flair/embeddings/image.py +++ b/flair/embeddings/image.py @@ -31,7 +31,7 @@ def embedding_type(self) -> str: def to_params(self) -> Dict[str, Any]: # legacy pickle-like saving for image embeddings, as implementation details are not obvious - return self.__getstate__() # type: ignore[operator] + return self.__getstate__() @classmethod def from_params(cls, params: Dict[str, Any]) -> "Embeddings": diff --git a/flair/optim.py b/flair/optim.py index c41e1b54c..4c2f88052 100644 --- a/flair/optim.py +++ b/flair/optim.py @@ -3,7 +3,7 @@ import torch from 
torch.optim import Optimizer from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau, _LRScheduler -from torch.optim.optimizer import required # type: ignore[attr-defined] +from torch.optim.optimizer import required log = logging.getLogger("flair") From 5631d270766503c1ce4a2bb0e64bede66b319a15 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Wed, 11 Oct 2023 20:33:19 +0200 Subject: [PATCH 112/124] fix wrong import --- flair/trainers/plugins/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/flair/trainers/plugins/__init__.py b/flair/trainers/plugins/__init__.py index be02970a0..925b30cf8 100644 --- a/flair/trainers/plugins/__init__.py +++ b/flair/trainers/plugins/__init__.py @@ -1,5 +1,4 @@ from .base import BasePlugin, Pluggable, TrainerPlugin, TrainingInterrupt -from .functional.amp import AmpPlugin from .functional.anneal_on_plateau import AnnealingPlugin from .functional.checkpoints import CheckpointPlugin from .functional.linear_scheduler import LinearSchedulerPlugin @@ -11,7 +10,6 @@ from .metric_records import MetricName, MetricRecord __all__ = [ - "AmpPlugin", "AnnealingPlugin", "CheckpointPlugin", "LinearSchedulerPlugin", From df71f8501bd96cdea8d2efb8f3b5089c2a40dce8 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Wed, 11 Oct 2023 21:10:06 +0200 Subject: [PATCH 113/124] fix typing for scheduler --- flair/trainers/plugins/functional/anneal_on_plateau.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flair/trainers/plugins/functional/anneal_on_plateau.py b/flair/trainers/plugins/functional/anneal_on_plateau.py index 0bdf1e6ea..62bf62da2 100644 --- a/flair/trainers/plugins/functional/anneal_on_plateau.py +++ b/flair/trainers/plugins/functional/anneal_on_plateau.py @@ -35,6 +35,7 @@ def __init__( self.anneal_factor = anneal_factor self.patience = patience self.initial_extra_patience = initial_extra_patience + self.scheduler: AnnealOnPlateau def store_learning_rate(self): optimizer = self.trainer.optimizer From 
a290eb43883e42be40cdf3aef78e47ed9c1f369f Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Wed, 11 Oct 2023 22:06:25 +0200 Subject: [PATCH 114/124] fix anneal plugin --- .../plugins/functional/anneal_on_plateau.py | 2 -- test_emb.py | 27 +++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 test_emb.py diff --git a/flair/trainers/plugins/functional/anneal_on_plateau.py b/flair/trainers/plugins/functional/anneal_on_plateau.py index 62bf62da2..e0bff9a19 100644 --- a/flair/trainers/plugins/functional/anneal_on_plateau.py +++ b/flair/trainers/plugins/functional/anneal_on_plateau.py @@ -118,6 +118,4 @@ def get_state(self) -> Dict[str, Any]: "patience": self.patience, "initial_extra_patience": self.initial_extra_patience, "anneal_with_restarts": self.anneal_with_restarts, - "bad_epochs": self.scheduler.num_bad_epochs, - "current_best": self.scheduler.best, } diff --git a/test_emb.py b/test_emb.py new file mode 100644 index 000000000..362db8562 --- /dev/null +++ b/test_emb.py @@ -0,0 +1,27 @@ +from flair.data import Sentence +from flair.embeddings import TransformerWordEmbeddings + +phrase_0 = Sentence("a uui") +embeddings_a = TransformerWordEmbeddings( + 'roberta-base', + use_context=True, + use_context_separator=False, +) +ebd_a = embeddings_a.embed(phrase_0) + +phrase_1 = Sentence("a uui") +embeddings_b = TransformerWordEmbeddings( + 'roberta-base', + use_context=True, + use_context_separator=False, +) +ebd_b = embeddings_b.embed(phrase_1) +ebd_b = [phrase_1] +ebd_a = [phrase_0] + +print( + "token run 0:", ebd_a[-1][-1], "\n", + "embedding end run 0:", ebd_a[-1][-1].embedding.tolist()[-2:], "\n", + "token run 1: ", ebd_b[-1][-1], "\n", + "embedding end run 1:", ebd_b[-1][-1].embedding.tolist()[-2:] +) \ No newline at end of file From c3ee3ab745efc47e1b46a1072ccf978b0381efdc Mon Sep 17 00:00:00 2001 From: mauryaland Date: Thu, 12 Oct 2023 14:47:03 +0200 Subject: [PATCH 115/124] fix E721 Do not compare types, use `isinstance()` --- 
flair/models/sequence_tagger_utils/viterbi.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flair/models/sequence_tagger_utils/viterbi.py b/flair/models/sequence_tagger_utils/viterbi.py index b2e1c608e..06f5e236b 100644 --- a/flair/models/sequence_tagger_utils/viterbi.py +++ b/flair/models/sequence_tagger_utils/viterbi.py @@ -232,13 +232,13 @@ def _all_scores_for_token( scores = scores.numpy() for i_batch, batch in enumerate(scores): for i, (tag_id, tag_scores) in enumerate(zip(tag_seq, batch)): - if type(tag_id) != int and tag_id.item() != np.argmax(tag_scores): + if not isinstance(tag_id, int) and tag_id.item() != np.argmax(tag_scores): swap_index_score = np.argmax(tag_scores) scores[i_batch][i][tag_id.item()], scores[i_batch][i][swap_index_score] = ( scores[i_batch][i][swap_index_score], scores[i_batch][i][tag_id.item()], ) - elif type(tag_id) == int and tag_id != np.argmax(tag_scores): + elif isinstance(tag_id, int) and tag_id != np.argmax(tag_scores): swap_index_score = np.argmax(tag_scores) scores[i_batch][i][tag_id], scores[i_batch][i][swap_index_score] = ( scores[i_batch][i][swap_index_score], From ae5ca66da0aa39fa70aff234925aa4c3f8d4ba28 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Sat, 14 Oct 2023 12:20:25 +0200 Subject: [PATCH 116/124] fix typing errors & simplify logic --- flair/models/sequence_tagger_utils/viterbi.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/flair/models/sequence_tagger_utils/viterbi.py b/flair/models/sequence_tagger_utils/viterbi.py index 06f5e236b..52b59d1f2 100644 --- a/flair/models/sequence_tagger_utils/viterbi.py +++ b/flair/models/sequence_tagger_utils/viterbi.py @@ -232,17 +232,16 @@ def _all_scores_for_token( scores = scores.numpy() for i_batch, batch in enumerate(scores): for i, (tag_id, tag_scores) in enumerate(zip(tag_seq, batch)): - if not isinstance(tag_id, int) and tag_id.item() != np.argmax(tag_scores): - swap_index_score = np.argmax(tag_scores) 
- scores[i_batch][i][tag_id.item()], scores[i_batch][i][swap_index_score] = ( + if isinstance(tag_id, int): + tag_id_int = tag_id + else: + tag_id_int = int(tag_id.item()) + + if tag_id_int != np.argmax(tag_scores): + swap_index_score = int(np.argmax(tag_scores)) + scores[i_batch][i][tag_id_int], scores[i_batch][i][swap_index_score] = ( scores[i_batch][i][swap_index_score], - scores[i_batch][i][tag_id.item()], - ) - elif isinstance(tag_id, int) and tag_id != np.argmax(tag_scores): - swap_index_score = np.argmax(tag_scores) - scores[i_batch][i][tag_id], scores[i_batch][i][swap_index_score] = ( - scores[i_batch][i][swap_index_score], - scores[i_batch][i][tag_id], + scores[i_batch][i][tag_id_int], ) prob_tags_per_sentence = [] for scores_sentence, length, sentence in zip(scores, lengths, sentences): From 04856ef8e3a5a087ba6380e2a15f7da65dabae3e Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Sat, 14 Oct 2023 13:09:32 +0200 Subject: [PATCH 117/124] fix ruff tenary error --- flair/models/sequence_tagger_utils/viterbi.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/flair/models/sequence_tagger_utils/viterbi.py b/flair/models/sequence_tagger_utils/viterbi.py index 52b59d1f2..1c1936267 100644 --- a/flair/models/sequence_tagger_utils/viterbi.py +++ b/flair/models/sequence_tagger_utils/viterbi.py @@ -232,10 +232,7 @@ def _all_scores_for_token( scores = scores.numpy() for i_batch, batch in enumerate(scores): for i, (tag_id, tag_scores) in enumerate(zip(tag_seq, batch)): - if isinstance(tag_id, int): - tag_id_int = tag_id - else: - tag_id_int = int(tag_id.item()) + tag_id_int = tag_id if isinstance(tag_id, int) else int(tag_id.item()) if tag_id_int != np.argmax(tag_scores): swap_index_score = int(np.argmax(tag_scores)) From 05cd797dbae3357e712d11c3c852f2cfd48ebca8 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 16 Oct 2023 12:15:25 +0200 Subject: [PATCH 118/124] add documentation for transformersembedding --- 
flair/embeddings/transformer.py | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/flair/embeddings/transformer.py b/flair/embeddings/transformer.py index ba90333ea..416f5a742 100644 --- a/flair/embeddings/transformer.py +++ b/flair/embeddings/transformer.py @@ -8,7 +8,7 @@ from abc import abstractmethod from io import BytesIO from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast +from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast, Literal import torch import transformers @@ -982,8 +982,8 @@ def __init__( fine_tune: bool = True, layers: str = "-1", layer_mean: bool = True, - subtoken_pooling: str = "first", - cls_pooling: str = "cls", + subtoken_pooling: Literal["first", "last", "first_last", "mean"] = "first", + cls_pooling: Literal["cls", "max", "mean"] = "cls", is_token_embedding: bool = True, is_document_embedding: bool = True, allow_long_sentences: bool = False, @@ -999,6 +999,32 @@ def __init__( use_context_separator: bool = True, **kwargs, ) -> None: + """Instantiate transformers embeddings. + + Allows using transformers as TokenEmbeddings and DocumentEmbeddings or both. + + Args: + model: name of transformer model (see https://huggingface.co/transformers/pretrained_models.html for options) + fine_tune: If True, the weights of the transformers embedding will be updated during training. + layers: Specify which layers should be extracted for the embeddings. Expects either "all" to extract all layers or a comma separated list of indices (e.g. "-1,-2,-3,-4" for the last 4 layers) + layer_mean: If True, the extracted layers will be averaged. Otherwise, they will be concatenated. + subtoken_pooling: Specify how multiple sub-tokens will be aggregated for a token-embedding. + cls_pooling: Specify how the document-embeddings will be extracted. + is_token_embedding: If True, this embeddings can be handled as token-embeddings. 
+ is_document_embedding: If True, this embeddings can be handled document-embeddings. + allow_long_sentences: If True, too long sentences will be patched and strided and afterwards combined. + use_context: If True, predicting multiple sentences at once, will use the previous and next sentences for context. + respect_document_boundaries: If True, the context calculation will stop if a sentence represents a context boundary. + context_dropout: Integer percentage (0-100) to specify how often the context won't be used during training. + saved_config: Pretrained config used when loading embeddings. Always use None. + tokenizer_data: Tokenizer data used when loading embeddings. Always use None. + feature_extractor_data: Feature extractor data used when loading embeddings. Always use None. + name: The name for the embeddings. Per default the name will be used from the used transformers model. + force_max_length: If True, the tokenizer will always pad the sequences to maximum length. + needs_manual_ocr: If True, bounding boxes will be calculated manually. This is used for models like [layoutlm](https://huggingface.co/docs/transformers/model_doc/layoutlm) where the tokenizer doesn't compute the bounding boxes itself. + use_context_separator: If True, the embedding will hold an additional token to allow the model to distingulish between context and prediction. 
+ **kwargs: Further values forwarded to the transformers config + """ self.instance_parameters = self.get_instance_parameters(locals=locals()) del self.instance_parameters["saved_config"] del self.instance_parameters["tokenizer_data"] From 9f8f328bf2670bcccc9346e294bb25e526437405 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 16 Oct 2023 13:07:12 +0200 Subject: [PATCH 119/124] fix layoutlm url --- flair/embeddings/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/embeddings/transformer.py b/flair/embeddings/transformer.py index 416f5a742..fb6b3ddd4 100644 --- a/flair/embeddings/transformer.py +++ b/flair/embeddings/transformer.py @@ -1021,7 +1021,7 @@ def __init__( feature_extractor_data: Feature extractor data used when loading embeddings. Always use None. name: The name for the embeddings. Per default the name will be used from the used transformers model. force_max_length: If True, the tokenizer will always pad the sequences to maximum length. - needs_manual_ocr: If True, bounding boxes will be calculated manually. This is used for models like [layoutlm](https://huggingface.co/docs/transformers/model_doc/layoutlm) where the tokenizer doesn't compute the bounding boxes itself. + needs_manual_ocr: If True, bounding boxes will be calculated manually. This is used for models like `layoutlm `_ where the tokenizer doesn't compute the bounding boxes itself. use_context_separator: If True, the embedding will hold an additional token to allow the model to distingulish between context and prediction. 
**kwargs: Further values forwarded to the transformers config """ From e3099e6b43eea73b87c3b52348d92777d93252e8 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 16 Oct 2023 13:11:35 +0200 Subject: [PATCH 120/124] fix hf hub url --- flair/embeddings/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/embeddings/transformer.py b/flair/embeddings/transformer.py index fb6b3ddd4..348470f54 100644 --- a/flair/embeddings/transformer.py +++ b/flair/embeddings/transformer.py @@ -1004,7 +1004,7 @@ def __init__( Allows using transformers as TokenEmbeddings and DocumentEmbeddings or both. Args: - model: name of transformer model (see https://huggingface.co/transformers/pretrained_models.html for options) + model: name of transformer model (see `huggingface hub `_ for options) fine_tune: If True, the weights of the transformers embedding will be updated during training. layers: Specify which layers should be extracted for the embeddings. Expects either "all" to extract all layers or a comma separated list of indices (e.g. "-1,-2,-3,-4" for the last 4 layers) layer_mean: If True, the extracted layers will be averaged. Otherwise, they will be concatenated. From 3c02b537d960ace27977aa76a44ef75ea325308e Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 16 Oct 2023 13:41:54 +0200 Subject: [PATCH 121/124] fix langdetect link --- flair/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/data.py b/flair/data.py index 24d8cef05..9d0646b94 100644 --- a/flair/data.py +++ b/flair/data.py @@ -720,7 +720,7 @@ def __init__( :class:`flair.tokenization.SegTokTokenizer`. If `use_tokenizer` is set to False, :class:`flair.tokenization.SpaceTokenizer` will be used instead. The tokenizer will be ignored, if `text` refers to pretokenized tokens. - language_code: Language of the sentence. If not provided, [langdetect](https://pypi.org/project/langdetect/) + language_code: Language of the sentence. 
If not provided, `langdetect `_ will be called when the language_code is accessed for the first time. start_position: Start char offset of the sentence in the superordinate document. """ From 573cc01644a451af7200b40734e2a402aa43c9af Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 16 Oct 2023 13:48:54 +0200 Subject: [PATCH 122/124] add empty glossary --- docs/_static/glossary.svg | 8 ++++++++ docs/glossary/index.rst | 3 +++ docs/index.rst | 26 ++++++++++++++++++++++---- 3 files changed, 33 insertions(+), 4 deletions(-) create mode 100644 docs/_static/glossary.svg create mode 100644 docs/glossary/index.rst diff --git a/docs/_static/glossary.svg b/docs/_static/glossary.svg new file mode 100644 index 000000000..ea3f8ad63 --- /dev/null +++ b/docs/_static/glossary.svg @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/docs/glossary/index.rst b/docs/glossary/index.rst new file mode 100644 index 000000000..5725d0d44 --- /dev/null +++ b/docs/glossary/index.rst @@ -0,0 +1,3 @@ +Glossary +======== + diff --git a/docs/index.rst b/docs/index.rst index 0e37ddc70..3cff76911 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,7 +7,7 @@ flair **Version**: |version| **Useful links**: -`Getting started `_ | +`Getting started `_ | `Source Repository `_ | `Issue Tracker `_ | @@ -55,9 +55,9 @@ Flair is a very simple framework for state-of-the-art Natural Language Processin Contributor's Guide ^^^^^^^^^^^^^^^^^^^ - Want to add to the codebase? Can help add translation or a flowchart to the + Want to add to the codebase? Can help add to the documentation? The contributing guidelines will guide you through the - process of improving NumPy. + process of improving Flair. +++ @@ -68,10 +68,28 @@ Flair is a very simple framework for state-of-the-art Natural Language Processin To the contributor's guide + .. grid-item-card:: + :img-top: ./_static/glossary.svg + + Glossary + ^^^^^^^^ + + Not sure what the exact meaning of certain terms is? 
Find their definition in the Glossary. + + +++ + + .. button-ref:: glossary/index + :expand: + :color: secondary + :click-parent: + + To the glossary + .. toctree:: :maxdepth: 3 :hidden: Tutorials API reference - Contributing \ No newline at end of file + Contributing + Glossary \ No newline at end of file From 82dc352cd0ad1f9e036dd40099148ca259121db1 Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 16 Oct 2023 14:52:44 +0200 Subject: [PATCH 123/124] add glossary for sentence --- docs/glossary/index.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/glossary/index.rst b/docs/glossary/index.rst index 5725d0d44..c732a1a12 100644 --- a/docs/glossary/index.rst +++ b/docs/glossary/index.rst @@ -1,3 +1,7 @@ Glossary ======== +.. glossary:: + + Sentence + a sentence is a text-unit consisting of tokens, labels and possibly metadata. Notice that a sentence is not limited in size, hence a Sentence itself could hold either a full document, a paragraph, a simple phrase or a linguistic \ No newline at end of file From 1a862659d609cadea0309a35e826e201a762499f Mon Sep 17 00:00:00 2001 From: Benedikt Fuchs Date: Mon, 23 Oct 2023 11:00:07 +0200 Subject: [PATCH 124/124] fix ruff error --- flair/embeddings/transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flair/embeddings/transformer.py b/flair/embeddings/transformer.py index 348470f54..8635825ee 100644 --- a/flair/embeddings/transformer.py +++ b/flair/embeddings/transformer.py @@ -8,7 +8,7 @@ from abc import abstractmethod from io import BytesIO from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast, Literal +from typing import Any, Dict, List, Literal, Optional, Tuple, Type, Union, cast import torch import transformers