Merge pull request #3410 from flairNLP/hunflair2-release

Update documentation for Hunflair2 release
flairNLP · Apr 5, 2024 · 223f346 · 223f346
2 parents 4fbc403 + 189c6e2
commit 223f346
Show file tree

Hide file tree

Showing 13 changed files with 779 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -14,7 +14,7 @@ Flair is:
 
 * **A powerful NLP library.** Flair allows you to apply our state-of-the-art natural language processing (NLP)
 models to your text, such as named entity recognition (NER), sentiment analysis, part-of-speech tagging (PoS),
-  special support for [biomedical data](/resources/docs/HUNFLAIR.md),
+  special support for [biomedical texts](/resources/docs/HUNFLAIR2.md),
  sense disambiguation and classification, with support for a rapidly growing number of languages.
 
 * **A text embedding library.** Flair has simple interfaces that allow you to use and combine different word and

diff --git a/flair/embeddings/token.py b/flair/embeddings/token.py
@@ -204,7 +204,7 @@ def __init__(
 
             self.__embedding_length: int = precomputed_word_embeddings.vector_size
 
-            vectors = np.row_stack(
+            vectors = np.vstack(
                 (
                     precomputed_word_embeddings.vectors,
                     np.zeros(self.__embedding_length, dtype="float"),
@@ -399,7 +399,7 @@ def __setstate__(self, state: Dict[str, Any]):
         state.setdefault("field", None)
         if "precomputed_word_embeddings" in state:
             precomputed_word_embeddings: KeyedVectors = state.pop("precomputed_word_embeddings")
-            vectors = np.row_stack(
+            vectors = np.vstack(
                 (
                     precomputed_word_embeddings.vectors,
                     np.zeros(precomputed_word_embeddings.vector_size, dtype="float"),

diff --git a/flair/models/__init__.py b/flair/models/__init__.py
@@ -6,6 +6,7 @@
 from .multitask_model import MultitaskModel
 from .pairwise_classification_model import TextPairClassifier
 from .pairwise_regression_model import TextPairRegressor
+from .prefixed_tagger import PrefixedSequenceTagger  # This import has to be after SequenceTagger!
 from .regexp_tagger import RegexpTagger
 from .relation_classifier_model import RelationClassifier
 from .relation_extractor_model import RelationExtractor
@@ -26,6 +27,7 @@
     "RelationExtractor",
     "RegexpTagger",
     "SequenceTagger",
+    "PrefixedSequenceTagger",
     "TokenClassifier",
     "WordTagger",
     "FewshotClassifier",

diff --git a/flair/models/entity_mention_linking.py b/flair/models/entity_mention_linking.py
@@ -1,6 +1,7 @@
 import inspect
 import logging
 import os
+import platform
 import re
 import stat
 import string
@@ -648,6 +649,8 @@ def p(text: str) -> str:
                         emb = emb / torch.norm(emb)
                     dense_embeddings.append(emb.cpu().numpy())
                     sent.clear_embeddings()
+
+                # empty cuda cache if device is a cuda device
                 if flair.device.type == "cuda":
                     torch.cuda.empty_cache()
 
@@ -681,6 +684,11 @@ def embed(self, entity_mentions: List[str]) -> Dict[str, np.ndarray]:
                         emb = emb / torch.norm(emb)
                     query_embeddings["dense"].append(emb.cpu().numpy())
                     sent.clear_embeddings(self.embeddings["dense"].get_names())
+
+                # Sanity conversion: if flair.device was set as a string, convert to torch.device
+                if isinstance(flair.device, str):
+                    flair.device = torch.device(flair.device)
+
                 if flair.device.type == "cuda":
                     torch.cuda.empty_cache()
 
@@ -836,9 +844,13 @@ def extract_entities_mentions(self, sentence: Sentence, entity_label_types: Dict
         if any(label in ["diseases", "genes", "species", "chemical"] for label in sentence.annotation_layers):
             if not self._warned_legacy_sequence_tagger:
                 logger.warning(
-                    "The tagger `Classifier.load('hunflair') is deprecated. Please update to: `Classifier.load('hunflair2')`."
+                    "It appears that the sentences have been annotated with HunFlair (version 1). "
+                    "Consider using HunFlair2 for improved extraction performance: Classifier.load('hunflair2'). "
+                    "See https://github.com/flairNLP/flair/blob/master/resources/docs/HUNFLAIR2.md for further "
+                    "information."
                 )
                 self._warned_legacy_sequence_tagger = True
+
             entity_types = {e for sublist in entity_label_types.values() for e in sublist}
             entities_mentions = [
                 label for label in sentence.get_labels() if normalize_entity_type(label.value) in entity_types
@@ -939,6 +951,14 @@ def _fetch_model(model_name: str) -> str:
         if model_name in hf_model_map:
             model_name = hf_model_map[model_name]
 
+            if platform.system() == "Windows":
+                logger.warning(
+                    "You seem to run your application on a Windows system. Unfortunately, the abbreviation "
+                    "resolution of HunFlair2 is only available on Linux/Mac systems. Therefore, a model "
+                    "without abbreviation resolution is loaded."
+                )
+                model_name += "-no-ab3p"
+
         return hf_download(model_name)
 
     @classmethod

diff --git a/flair/models/multitask_model.py b/flair/models/multitask_model.py
@@ -260,6 +260,14 @@ def _fetch_model(model_name) -> str:
 
         cache_dir = Path("models")
         if model_name in model_map:
+            if model_name in ["hunflair", "hunflair-paper", "bioner"]:
+                log.warning(
+                    "HunFlair (version 1) is deprecated. Consider using HunFlair2 for improved extraction performance: "
+                    "Classifier.load('hunflair2'). "
+                    "See https://github.com/flairNLP/flair/blob/master/resources/docs/HUNFLAIR2.md for further "
+                    "information."
+                )
+
             model_name = cached_path(model_map[model_name], cache_dir=cache_dir)
 
         return model_name

diff --git a/flair/models/prefixed_tagger.py b/flair/models/prefixed_tagger.py
@@ -9,7 +9,8 @@
 import flair.data
 from flair.data import Corpus, Sentence, Token
 from flair.datasets import DataLoader, FlairDatapointDataset
-from flair.models import SequenceTagger
+from flair.file_utils import hf_download
+from flair.models.sequence_tagger_model import SequenceTagger
 
 
 class PrefixedSentence(Sentence):
@@ -317,3 +318,21 @@ def augment_sentences(
             sentences = [sentences]
 
         return [self.augmentation_strategy.augment_sentence(sentence, annotation_layers) for sentence in sentences]
+
+    @staticmethod
+    def _fetch_model(model_name) -> str:
+        huggingface_model_map = {"hunflair2": "hunflair/hunflair2-ner"}
+
+        # check if model name is a valid local file
+        if Path(model_name).exists():
+            model_path = model_name
+
+        # check if model name is a pre-configured hf model
+        elif model_name in huggingface_model_map:
+            hf_model_name = huggingface_model_map[model_name]
+            return hf_download(hf_model_name)
+
+        else:
+            model_path = hf_download(model_name)
+
+        return model_path
diff --git a/flair/models/sequence_tagger_model.py b/flair/models/sequence_tagger_model.py
@@ -781,6 +781,14 @@ def _fetch_model(model_name) -> str:
         elif model_name in hu_model_map:
             model_path = cached_path(hu_model_map[model_name], cache_dir=cache_dir)
 
+            if model_name.startswith("hunflair-"):
+                log.warning(
+                    "HunFlair (version 1) is deprecated. Consider using HunFlair2 for improved extraction performance: "
+                    "Classifier.load('hunflair2'). "
+                    "See https://github.com/flairNLP/flair/blob/master/resources/docs/HUNFLAIR2.md for further "
+                    "information."
+                )
+
         # special handling for the taggers by the @redewiegergabe project (TODO: move to model hub)
         elif model_name == "de-historic-indirect":
             model_file = flair.cache_root / cache_dir / "indirect" / "final-model.pt"

diff --git a/resources/docs/HUNFLAIR.md b/resources/docs/HUNFLAIR.md
@@ -8,6 +8,9 @@ NER data sets](HUNFLAIR_CORPORA.md) and comes with a Flair language model ("pubm
 FastText embeddings ("pubmed") that were trained on roughly 3 million full texts and about
 25 million abstracts from the biomedical domain.
 
+**<span style="color:red">Using HunFlair (version 1) is deprecated, please refer to [HunFlair2](HUNFLAIR2.md)
+for an updated and improved version.</span>**
+
 <b>Content:</b>
 [Quick Start](#quick-start) |
 [BioNER-Tool Comparison](#comparison-to-other-biomedical-ner-tools) |

diff --git a/resources/docs/HUNFLAIR2.md b/resources/docs/HUNFLAIR2.md
@@ -0,0 +1,137 @@
+# HunFlair2
+
+*HunFlair2* is a state-of-the-art named entity tagger and linker for biomedical texts. It comes with
+models for genes/proteins, chemicals, diseases, species and cell lines. *HunFlair2*
+builds on pretrained domain-specific language models and outperforms other biomedical
+NER tools on unseen corpora.
+
+<b>Content:</b>
+[Quick Start](#quick-start) |
+[Tool Comparison](#comparison-to-other-biomedical-entity-extraction-tools) |
+[Tutorials](#tutorials) |
+[Citing HunFlair2](#citing-hunflair2)
+
+## Quick Start
+
+#### Requirements and Installation
+*HunFlair2* is based on Flair 0.13+ and Python 3.8+. If you do not have Python 3.8, install it first.
+Then, in your favorite virtual environment, simply do:
+```
+pip install flair
+```
+
+#### Example 1: Biomedical NER 
+Let's run named entity recognition (NER) over an example sentence. All you need to do is
+make a Sentence, load a pre-trained model and use it to predict tags for the sentence:
+```python
+from flair.data import Sentence
+from flair.nn import Classifier
+
+# make a sentence 
+sentence = Sentence("Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome")
+
+# load biomedical NER tagger
+tagger = Classifier.load("hunflair2")
+
+# tag sentence
+tagger.predict(sentence)
+```
+Done! The Sentence now has entity annotations. Let's print the entities found by the tagger:
+```python
+for entity in sentence.get_labels():
+    print(entity)
+```
+This should print:
+```console
+Span[0:2]: "Behavioral abnormalities" → Disease (1.0)
+Span[4:5]: "Fmr1" → Gene (1.0)
+Span[6:7]: "Mouse" → Species (1.0)
+Span[9:12]: "Fragile X Syndrome" → Disease (1.0)
+```
+
+#### Example 2: Biomedical NEN
+For improved integration and aggregation of information from multiple documents, linking / normalizing the
+entities to standardized ontologies or knowledge bases is required. Let's perform entity normalization by using
+specialized models per entity type:
+```python
+from flair.data import Sentence
+from flair.models import EntityMentionLinker
+from flair.nn import Classifier
+
+# make a sentence
+sentence = Sentence("Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome")
+
+# load biomedical NER tagger + predict entities
+tagger = Classifier.load("hunflair2")
+tagger.predict(sentence)
+
+# load gene linker and perform normalization
+gene_linker = EntityMentionLinker.load("gene-linker")
+gene_linker.predict(sentence)
+
+# load disease linker and perform normalization
+disease_linker = EntityMentionLinker.load("disease-linker")
+disease_linker.predict(sentence)
+
+# load species linker and perform normalization
+species_linker = EntityMentionLinker.load("species-linker")
+species_linker.predict(sentence)
+```
+**Note:** The ontologies and knowledge bases used are pre-processed the first time the normalization is executed,
+which might take a certain amount of time. All further calls are then based on this pre-processing and run
+much faster.
+
+Done! The Sentence now has entity normalizations. Let's print the entity identifiers found by the linkers:
+```python
+for entity in sentence.get_labels("link"):
+    print(entity)
+```
+This should print:
+```console
+Span[0:2]: "Behavioral abnormalities" → MESH:D001523/name=Mental Disorders (197.9467010498047)
+Span[4:5]: "Fmr1" → 108684022/name=FRAXA (219.9510040283203)
+Span[6:7]: "Mouse" → 10090/name=Mus musculus (213.6201934814453)
+Span[9:12]: "Fragile X Syndrome" → MESH:D005600/name=Fragile X Syndrome (193.7115020751953)
+```
+
+## Comparison to other biomedical entity extraction tools
+Tools for biomedical entity extraction are typically trained and evaluated on single, rather small gold standard 
+data sets.  However, they are applied "in the wild" to a much larger collection of texts, often varying in
+topic, entity distribution, genre (e.g. patents vs. scientific articles) and text type (e.g. abstract
+vs. full text), which can lead to severe drops in performance.
+
+*HunFlair2* outperforms other biomedical entity extraction tools on corpora that were used for training neither
+*HunFlair2* nor any of the competitor tools.
+
+| Corpus                                                                                       | Entity Type | BENT  | BERN2 | PubTator Central | SciSpacy | HunFlair2   |
+|----------------------------------------------------------------------------------------------|-------------|-------|-------|------------------|----------|-------------|
+| [MedMentions](https://github.com/chanzuckerberg/MedMentions)                                 | Chemical    | 40.90 | 41.79 | 31.28            | 34.95    | *__51.17__* |
+|                                                                                              | Disease     | 45.94 | 47.33 | 41.11            | 40.78    | *__57.27__* |
+| [tmVar (v3)](https://github.com/ncbi/tmVar3?tab=readme-ov-file)                              | Gene        | 0.54  | 43.96 | *__86.02__*      | -        | 76.75       |
+| [BioID](https://biocreative.bioinformatics.udel.edu/media/store/files/2018/BC6_track1_1.pdf) | Species     | 10.35 | 14.35 | *__58.90__*      | 37.14    | 49.66       |
+|||||
+| Average                                                                                      | All         | 24.43 | 36.86 | 54.33            | 37.61    | *__58.79__* |
+
+<sub>All results are F1 scores highlighting end-to-end performance, i.e., named entity recognition and normalization,
+using partial matching of predicted text offsets with the original char offsets of the gold standard data. 
+We allow a shift by max one character.</sub>
+
+You can find detailed evaluations and discussions in [our paper](https://arxiv.org/abs/2402.12372).
+
+## Tutorials
+We provide a set of quick tutorials to get you started with *HunFlair2*:
+* [Tutorial 1: Tagging biomedical named entities](HUNFLAIR2_TUTORIAL_1_TAGGING.md)
+* [Tutorial 2: Linking biomedical named entities](HUNFLAIR2_TUTORIAL_2_LINKING.md)
+* [Tutorial 3: Training NER models](HUNFLAIR2_TUTORIAL_3_TRAINING_NER.md)
+* [Tutorial 4: Customizing linking](HUNFLAIR2_TUTORIAL_4_CUSTOMIZE_LINKING.md)
+
+## Citing HunFlair2
+Please cite the following paper when using *HunFlair2*:
+~~~
+@article{sanger2024hunflair2,
+  title={HunFlair2 in a cross-corpus evaluation of biomedical named entity recognition and normalization tools},
+  author={S{\"a}nger, Mario and Garda, Samuele and Wang, Xing David and Weber-Genzel, Leon and Droop, Pia and Fuchs, Benedikt and Akbik, Alan and Leser, Ulf},
+  journal={arXiv preprint arXiv:2402.12372},
+  year={2024}
+}
+~~~