flairNLP · mariosaenger · Mar 14, 2023 · Mar 14, 2023 · Mar 15, 2023 · Mar 22, 2023
diff --git a/flair/data.py b/flair/data.py
@@ -6,10 +6,10 @@
 from collections import Counter, defaultdict, namedtuple
 from operator import itemgetter
 from pathlib import Path
-from typing import Dict, Iterable, List, Optional, Union, cast
+from typing import Dict, Iterable, List, Optional, Tuple, Union, cast
 
 import torch
-from deprecated import deprecated
+from deprecated import deprecated  # type: ignore
 from torch.utils.data import Dataset, IterableDataset
 from torch.utils.data.dataset import ConcatDataset, Subset
 
@@ -326,11 +326,13 @@ def get_metadata(self, key: str) -> typing.Any:
     def has_metadata(self, key: str) -> bool:
         return key in self._metadata
 
-    def add_label(self, typename: str, value: str, score: float = 1.0):
+    def add_label(self, typename: str, value_or_label: Union[str, Label], score: float = 1.0):
+        label = value_or_label if isinstance(value_or_label, Label) else Label(self, value_or_label, score)
+        # A given Label is stored as-is (score is then unused); a plain value is wrapped in a new Label.
         if typename not in self.annotation_layers:
-            self.annotation_layers[typename] = [Label(self, value, score)]
+            self.annotation_layers[typename] = [label]
         else:
-            self.annotation_layers[typename].append(Label(self, value, score))
+            self.annotation_layers[typename].append(label)
 
         return self
 
@@ -421,6 +423,100 @@ def __len__(self) -> int:
         raise NotImplementedError
 
 
+class EntityLinkingCandidate:
+    """A single knowledge base / ontology entry returned by a CandidateGenerator for an entity mention"""
+
+    def __init__(
+        self,
+        concept_id: str,
+        concept_name: str,
+        database_name: str,
+        score: float = 1.0,
+        additional_ids: Optional[Union[List[str], str]] = None,
+    ):
+        """
+        :param concept_id: Identifier of the entity / concept in the knowledge base / ontology
+        :param concept_name: (Canonical) name of the entity / concept in the knowledge base / ontology
+        :param database_name: Name of the knowledge base / ontology
+        :param score: Matching score of the entity / concept with respect to the entity mention
+        :param additional_ids: Additional identifier(s) of the concept / entity in the KB / ontology
+        """
+        self.concept_id, self.concept_name = concept_id, concept_name
+        self.database_name = database_name
+        self.score = score
+        self.additional_ids = additional_ids
+
+    def __str__(self) -> str:
+        # The additional identifiers are only appended when they were provided.
+        suffix = "" if self.additional_ids is None else f" - {self.additional_ids}"
+        head = f"EntityLinkingCandidate: {self.database_name}:{self.concept_id}"
+        return f"{head} - {self.concept_name} - {self.score}{suffix}"
+
+    def __repr__(self) -> str:
+        # The debug representation is the same text as the string form.
+        return str(self)
+
+
+class EntityLinkingLabel(Label):
+    """
+    Label class models entity linking annotations. Each entity linking label has a data point it refers
+    to as well as the identifier and name of the concept / entity from a knowledge base or ontology.
+    Value, score, concept name and database name of the label are taken from the first candidate.
+    """
+
+    def __init__(self, data_point: DataPoint, candidates: List[EntityLinkingCandidate]):
+        """
+        Initializes the label instance.
+        :param data_point: Data point / span the label refers to
+        :param candidates: non-empty, **sorted** list of candidates from candidate generator
+        :raises ValueError: if the list of candidates is empty
+        """
+        if not candidates:
+            raise ValueError("List of candidates must not be empty!")
+
+        # Candidates must be sorted, regardless if higher is better or not. Equal neighbouring
+        # scores are fine (e.g. several candidates with the default score), only a change of direction is not.
+        scores = [candidate.score for candidate in candidates]
+        pairs = list(zip(scores, scores[1:]))
+        assert not any(prev < nxt for prev, nxt in pairs) or not any(
+            prev > nxt for prev, nxt in pairs
+        ), "List of candidates must be sorted!"
+
+        super().__init__(data_point, candidates[0].concept_id, candidates[0].score)
+        self.candidates = candidates
+        self.concept_name = self.candidates[0].concept_name
+        self.database_name = self.candidates[0].database_name
+
+    def __str__(self):
+        return (
+            f"{self.data_point.unlabeled_identifier}{flair._arrow} "
+            f"{self.concept_name} - {self.database_name}:{self._value} ({round(self._score, 4)})"
+        )
+
+    def __repr__(self):
+        return str(self)
+
+    def __len__(self):
+        return len(self.data_point)
+
+    def __eq__(self, other):
+        return (
+            self.value == other.value
+            and self.data_point == other.data_point
+            and self.concept_name == other.concept_name
+            and self.identifier == other.identifier
+            and self.database_name == other.database_name
+            and self.score == other.score
+        )
+
+    def __hash__(self):
+        # A class that defines __eq__ without __hash__ becomes unhashable; keep labels usable in sets / dicts.
+        return hash(repr(self))
+
+    @property
+    def identifier(self):
+        return f"{self.value}"
+
 DT = typing.TypeVar("DT", bound=DataPoint)
 DT2 = typing.TypeVar("DT2", bound=DataPoint)
 

diff --git a/flair/datasets/__init__.py b/flair/datasets/__init__.py
@@ -37,6 +37,8 @@
     CLL,
     CRAFT,
     CRAFT_V4,
+    CTD_CHEMICALS_DICTIONARY,
+    CTD_DISEASES_DICTIONARY,
     DECA,
     FSU,
     GELLUS,
@@ -90,6 +92,8 @@
     LOCTEXT,
     MIRNA,
     NCBI_DISEASE,
+    NCBI_GENE_HUMAN_DICTIONARY,
+    NCBI_TAXONOMY_DICTIONARY,
     OSIRIS,
     PDR,
     S800,
@@ -386,6 +390,10 @@
     "LINNEAUS",
     "LOCTEXT",
     "MIRNA",
+    "NCBI_GENE_HUMAN_DICTIONARY",
+    "NCBI_TAXONOMY_DICTIONARY",
+    "CTD_DISEASES_DICTIONARY",
+    "CTD_CHEMICALS_DICTIONARY",
     "NCBI_DISEASE",
     "ONTONOTES",
     "OSIRIS",