Skip to content

Commit

Permalink
Merge pull request #3410 from flairNLP/hunflair2-release
Browse files Browse the repository at this point in the history
Update documentation for Hunflair2 release
  • Loading branch information
alanakbik authored Apr 5, 2024
2 parents 4fbc403 + 189c6e2 commit 223f346
Show file tree
Hide file tree
Showing 13 changed files with 779 additions and 5 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Flair is:

* **A powerful NLP library.** Flair allows you to apply our state-of-the-art natural language processing (NLP)
models to your text, such as named entity recognition (NER), sentiment analysis, part-of-speech tagging (PoS),
special support for [biomedical data](/resources/docs/HUNFLAIR.md),
special support for [biomedical texts](/resources/docs/HUNFLAIR2.md),
sense disambiguation and classification, with support for a rapidly growing number of languages.

* **A text embedding library.** Flair has simple interfaces that allow you to use and combine different word and
Expand Down
4 changes: 2 additions & 2 deletions flair/embeddings/token.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ def __init__(

self.__embedding_length: int = precomputed_word_embeddings.vector_size

vectors = np.row_stack(
vectors = np.vstack(
(
precomputed_word_embeddings.vectors,
np.zeros(self.__embedding_length, dtype="float"),
Expand Down Expand Up @@ -399,7 +399,7 @@ def __setstate__(self, state: Dict[str, Any]):
state.setdefault("field", None)
if "precomputed_word_embeddings" in state:
precomputed_word_embeddings: KeyedVectors = state.pop("precomputed_word_embeddings")
vectors = np.row_stack(
vectors = np.vstack(
(
precomputed_word_embeddings.vectors,
np.zeros(precomputed_word_embeddings.vector_size, dtype="float"),
Expand Down
2 changes: 2 additions & 0 deletions flair/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .multitask_model import MultitaskModel
from .pairwise_classification_model import TextPairClassifier
from .pairwise_regression_model import TextPairRegressor
from .prefixed_tagger import PrefixedSequenceTagger # This import has to be after SequenceTagger!
from .regexp_tagger import RegexpTagger
from .relation_classifier_model import RelationClassifier
from .relation_extractor_model import RelationExtractor
Expand All @@ -26,6 +27,7 @@
"RelationExtractor",
"RegexpTagger",
"SequenceTagger",
"PrefixedSequenceTagger",
"TokenClassifier",
"WordTagger",
"FewshotClassifier",
Expand Down
22 changes: 21 additions & 1 deletion flair/models/entity_mention_linking.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import inspect
import logging
import os
import platform
import re
import stat
import string
Expand Down Expand Up @@ -648,6 +649,8 @@ def p(text: str) -> str:
emb = emb / torch.norm(emb)
dense_embeddings.append(emb.cpu().numpy())
sent.clear_embeddings()

# empty cuda cache if device is a cuda device
if flair.device.type == "cuda":
torch.cuda.empty_cache()

Expand Down Expand Up @@ -681,6 +684,11 @@ def embed(self, entity_mentions: List[str]) -> Dict[str, np.ndarray]:
emb = emb / torch.norm(emb)
query_embeddings["dense"].append(emb.cpu().numpy())
sent.clear_embeddings(self.embeddings["dense"].get_names())

# Sanity conversion: if flair.device was set as a string, convert to torch.device
if isinstance(flair.device, str):
flair.device = torch.device(flair.device)

if flair.device.type == "cuda":
torch.cuda.empty_cache()

Expand Down Expand Up @@ -836,9 +844,13 @@ def extract_entities_mentions(self, sentence: Sentence, entity_label_types: Dict
if any(label in ["diseases", "genes", "species", "chemical"] for label in sentence.annotation_layers):
if not self._warned_legacy_sequence_tagger:
logger.warning(
"The tagger `Classifier.load('hunflair') is deprecated. Please update to: `Classifier.load('hunflair2')`."
"It appears that the sentences have been annotated with HunFlair (version 1). "
"Consider using HunFlair2 for improved extraction performance: Classifier.load('hunflair2')."
"See https://github.com/flairNLP/flair/blob/master/resources/docs/HUNFLAIR2.md for further "
"information."
)
self._warned_legacy_sequence_tagger = True

entity_types = {e for sublist in entity_label_types.values() for e in sublist}
entities_mentions = [
label for label in sentence.get_labels() if normalize_entity_type(label.value) in entity_types
Expand Down Expand Up @@ -939,6 +951,14 @@ def _fetch_model(model_name: str) -> str:
if model_name in hf_model_map:
model_name = hf_model_map[model_name]

if platform.system() == "Windows":
logger.warning(
"You seem to run your application on a Windows system. Unfortunately, the abbreviation "
"resolution of HunFlair2 is only available on Linux/Mac systems. Therefore, a model "
"without abbreviation resolution is therefore loaded"
)
model_name += "-no-ab3p"

return hf_download(model_name)

@classmethod
Expand Down
8 changes: 8 additions & 0 deletions flair/models/multitask_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,14 @@ def _fetch_model(model_name) -> str:

cache_dir = Path("models")
if model_name in model_map:
if model_name in ["hunflair", "hunflair-paper", "bioner"]:
log.warning(
"HunFlair (version 1) is deprecated. Consider using HunFlair2 for improved extraction performance: "
"Classifier.load('hunflair2')."
"See https://github.com/flairNLP/flair/blob/master/resources/docs/HUNFLAIR2.md for further "
"information."
)

model_name = cached_path(model_map[model_name], cache_dir=cache_dir)

return model_name
Expand Down
21 changes: 20 additions & 1 deletion flair/models/prefixed_tagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
import flair.data
from flair.data import Corpus, Sentence, Token
from flair.datasets import DataLoader, FlairDatapointDataset
from flair.models import SequenceTagger
from flair.file_utils import hf_download
from flair.models.sequence_tagger_model import SequenceTagger


class PrefixedSentence(Sentence):
Expand Down Expand Up @@ -317,3 +318,21 @@ def augment_sentences(
sentences = [sentences]

return [self.augmentation_strategy.augment_sentence(sentence, annotation_layers) for sentence in sentences]

@staticmethod
def _fetch_model(model_name) -> str:
    """Resolve ``model_name`` to the location of a loadable model.

    Resolution order:
      1. If ``model_name`` is an existing local path, it is returned unchanged.
      2. If ``model_name`` is a pre-configured alias, the Hugging Face model it
         maps to is fetched via ``hf_download``.
      3. Otherwise ``model_name`` itself is passed to ``hf_download``.

    Args:
        model_name: A local path, a known alias (e.g. ``"hunflair2"``), or a
            model identifier understood by ``hf_download``.

    Returns:
        ``model_name`` for local paths, otherwise the result of ``hf_download``.
    """
    # A path that exists locally takes precedence over any alias lookup.
    if Path(model_name).exists():
        return model_name

    # Short aliases for pre-configured Hugging Face models; names that are not
    # aliases are handed to hf_download as they are.
    known_aliases = {"hunflair2": "hunflair/hunflair2-ner"}
    return hf_download(known_aliases.get(model_name, model_name))
8 changes: 8 additions & 0 deletions flair/models/sequence_tagger_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -781,6 +781,14 @@ def _fetch_model(model_name) -> str:
elif model_name in hu_model_map:
model_path = cached_path(hu_model_map[model_name], cache_dir=cache_dir)

if model_name.startswith("hunflair-"):
log.warning(
"HunFlair (version 1) is deprecated. Consider using HunFlair2 for improved extraction performance: "
"Classifier.load('hunflair2')."
"See https://github.com/flairNLP/flair/blob/master/resources/docs/HUNFLAIR2.md for further "
"information."
)

# special handling for the taggers by the @redewiegergabe project (TODO: move to model hub)
elif model_name == "de-historic-indirect":
model_file = flair.cache_root / cache_dir / "indirect" / "final-model.pt"
Expand Down
3 changes: 3 additions & 0 deletions resources/docs/HUNFLAIR.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ NER data sets](HUNFLAIR_CORPORA.md) and comes with a Flair language model ("pubm
FastText embeddings ("pubmed") that were trained on roughly 3 million full texts and about
25 million abstracts from the biomedical domain.

**<span style="color:red">HunFlair (version 1) is deprecated. Please refer to [HunFlair2](HUNFLAIR2.md)
for an updated and improved version.</span>**

<b>Content:</b>
[Quick Start](#quick-start) |
[BioNER-Tool Comparison](#comparison-to-other-biomedical-ner-tools) |
Expand Down
137 changes: 137 additions & 0 deletions resources/docs/HUNFLAIR2.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# HunFlair2

*HunFlair2* is a state-of-the-art named entity tagger and linker for biomedical texts. It comes with
models for genes/proteins, chemicals, diseases, species and cell lines. *HunFlair2*
builds on pretrained domain-specific language models and outperforms other biomedical
NER tools on unseen corpora.

<b>Content:</b>
[Quick Start](#quick-start) |
[Tool Comparison](#comparison-to-other-biomedical-entity-extraction-tools) |
[Tutorials](#tutorials) |
[Citing HunFlair2](#citing-hunflair2)

## Quick Start

#### Requirements and Installation
*HunFlair2* is based on Flair 0.13+ and Python 3.8+. If you do not have Python 3.8 or later, install it first.
Then, in your favorite virtual environment, simply do:
```
pip install flair
```

#### Example 1: Biomedical NER
Let's run named entity recognition (NER) over an example sentence. All you need to do is
make a Sentence, load a pre-trained model and use it to predict tags for the sentence:
```python
from flair.data import Sentence
from flair.nn import Classifier

# make a sentence
sentence = Sentence("Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome")

# load biomedical NER tagger
tagger = Classifier.load("hunflair2")

# tag sentence
tagger.predict(sentence)
```
Done! The Sentence now has entity annotations. Let's print the entities found by the tagger:
```python
for entity in sentence.get_labels():
print(entity)
```
This should print:
```console
Span[0:2]: "Behavioral abnormalities" → Disease (1.0)
Span[4:5]: "Fmr1" → Gene (1.0)
Span[6:7]: "Mouse" → Species (1.0)
Span[9:12]: "Fragile X Syndrome" → Disease (1.0)
```

#### Example 2: Biomedical NEN
To integrate and aggregate information from multiple documents, the entities need to be linked / normalized to
standardized ontologies or knowledge bases. Let's perform entity normalization by using
specialized models per entity type:
```python
from flair.data import Sentence
from flair.models import EntityMentionLinker
from flair.nn import Classifier

# make a sentence
sentence = Sentence("Behavioral abnormalities in the Fmr1 KO2 Mouse Model of Fragile X Syndrome")

# load biomedical NER tagger + predict entities
tagger = Classifier.load("hunflair2")
tagger.predict(sentence)

# load gene linker and perform normalization
gene_linker = EntityMentionLinker.load("gene-linker")
gene_linker.predict(sentence)

# load disease linker and perform normalization
disease_linker = EntityMentionLinker.load("disease-linker")
disease_linker.predict(sentence)

# load species linker and perform normalization
species_linker = EntityMentionLinker.load("species-linker")
species_linker.predict(sentence)
```
**Note**: the ontologies and knowledge bases used are pre-processed the first time the normalization is executed,
which might take some time. All further calls reuse this pre-processing and run
much faster.

Done! The Sentence now has entity normalizations. Let's print the entity identifiers found by the linkers:
```python
for entity in sentence.get_labels("link"):
print(entity)
```
This should print:
```console
Span[0:2]: "Behavioral abnormalities" → MESH:D001523/name=Mental Disorders (197.9467010498047)
Span[4:5]: "Fmr1" → 108684022/name=FRAXA (219.9510040283203)
Span[6:7]: "Mouse" → 10090/name=Mus musculus (213.6201934814453)
Span[9:12]: "Fragile X Syndrome" → MESH:D005600/name=Fragile X Syndrome (193.7115020751953)
```

## Comparison to other biomedical entity extraction tools
Tools for biomedical entity extraction are typically trained and evaluated on single, rather small gold standard
data sets. However, they are applied "in the wild" to a much larger collection of texts, often varying in
topic, entity distribution, genre (e.g. patents vs. scientific articles) and text type (e.g. abstract
vs. full text), which can lead to severe drops in performance.

*HunFlair2* outperforms other biomedical entity extraction tools on corpora that were used to train neither
*HunFlair2* nor any of the competitor tools.

| Corpus                                                                                       | Entity Type | BENT  | BERN2 | PubTator Central | SciSpacy | HunFlair2   |
|----------------------------------------------------------------------------------------------|-------------|-------|-------|------------------|----------|-------------|
| [MedMentions](https://github.com/chanzuckerberg/MedMentions) | Chemical | 40.90 | 41.79 | 31.28 | 34.95 | *__51.17__* |
| | Disease | 45.94 | 47.33 | 41.11 | 40.78 | *__57.27__* |
| [tmVar (v3)](https://github.com/ncbi/tmVar3?tab=readme-ov-file) | Gene | 0.54 | 43.96 | *__86.02__* | - | 76.75 |
| [BioID](https://biocreative.bioinformatics.udel.edu/media/store/files/2018/BC6_track1_1.pdf) | Species | 10.35 | 14.35 | *__58.90__* | 37.14 | 49.66 |
|||||
| Average | All | 24.43 | 36.86 | 54.33 | 37.61 | *__58.79__* |

<sub>All results are F1 scores highlighting end-to-end performance, i.e., named entity recognition and normalization,
using partial matching of predicted text offsets with the original char offsets of the gold standard data.
We allow a shift by max one character.</sub>

You can find detailed evaluations and discussions in [our paper](https://arxiv.org/abs/2402.12372).

## Tutorials
We provide a set of quick tutorials to get you started with *HunFlair2*:
* [Tutorial 1: Tagging biomedical named entities](HUNFLAIR2_TUTORIAL_1_TAGGING.md)
* [Tutorial 2: Linking biomedical named entities](HUNFLAIR2_TUTORIAL_2_LINKING.md)
* [Tutorial 3: Training NER models](HUNFLAIR2_TUTORIAL_3_TRAINING_NER.md)
* [Tutorial 4: Customizing linking](HUNFLAIR2_TUTORIAL_4_CUSTOMIZE_LINKING.md)

## Citing HunFlair2
Please cite the following paper when using *HunFlair2*:
~~~
@article{sanger2024hunflair2,
title={HunFlair2 in a cross-corpus evaluation of biomedical named entity recognition and normalization tools},
author={S{\"a}nger, Mario and Garda, Samuele and Wang, Xing David and Weber-Genzel, Leon and Droop, Pia and Fuchs, Benedikt and Akbik, Alan and Leser, Ulf},
journal={arXiv preprint arXiv:2402.12372},
year={2024}
}
~~~
Loading

0 comments on commit 223f346

Please sign in to comment.