Commit
Merge branch 'master' into variable_grad_norm_clipping
alanakbik committed Aug 8, 2023
2 parents 00d9d49 + c96660c commit 0de88ce
Showing 27 changed files with 238 additions and 3,615 deletions.
47 changes: 8 additions & 39 deletions .github/workflows/ci.yml
@@ -11,55 +11,24 @@ jobs:
TRANSFORMERS_CACHE: ./cache/transformers
FLAIR_CACHE_ROOT: ./cache/flair
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- name: Set up Python 3.7
id: setup-python
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: 3.7
#----------------------------------------------
# Try to load cached poetry installation
#----------------------------------------------
- name: Load cached Poetry installation
id: cached-poetry
uses: actions/cache@v3
with:
path: |
~/.local
~/.cache/pypoetry
key: poetry-0 # increment to reset cache
#----------------------------------------------
# Install poetry if not using cached
#----------------------------------------------
- name: Install Poetry
if: steps.cached-poetry.outputs.cache-hit != 'true'
uses: snok/install-poetry@v1
with:
virtualenvs-in-project: true
#----------------------------------------------
# Lock dependencies to get fresh versions
#----------------------------------------------
- name: Run poetry lock
run: |
poetry config virtualenvs.in-project true
poetry lock
#----------------------------------------------
# install dependencies if cache does not exist
#----------------------------------------------
- name: Install dependencies
run: |
poetry config virtualenvs.in-project true
poetry install --no-interaction --no-root
#----------------------------------------------
# run test suite
#----------------------------------------------
- name: Install Flair dependencies
run: pip install -e .
- name: Install unittest dependencies
run: pip install -r requirements-dev.txt
- name: Show installed dependencies
run: pip freeze
- name: Cache downloaded models/datasets
uses: actions/cache@v3
with:
path: ./cache
key: cache-v1.1
- name: Run tests
run: |
source .venv/bin/activate
python -c 'import flair'
pytest --runintegration --durations=0 -vv
19 changes: 4 additions & 15 deletions CONTRIBUTING.md
@@ -3,14 +3,11 @@
We are happy to accept your contributions to make `flair` better and more awesome! To avoid unnecessary work on either
side, please stick to the following process:

1. Check if there is already [an issue](https://github.com/zalandoresearch/flair/issues) for your concern.
1. Check if there is already [an issue](https://github.com/flairNLP/flair/issues) for your concern.
2. If there is not, open a new one to start a discussion. We hate to close finished PRs!
3. If we decide your concern needs code changes, we would be happy to accept a pull request. Please consider the
commit guidelines below.

In case you just want to help out and don't know where to start,
[issues with "help wanted" label](https://github.com/zalandoresearch/flair/labels/help%20wanted) are good for
first-time contributors.

## Git Commit Guidelines

@@ -30,18 +27,10 @@ the code should hopefully be easy.
Flair requires python-3.7 or higher. To make sure your code also runs on the oldest supported
python version, it is recommended to use python-3.7.x for flair development.

We use [poetry](https://python-poetry.org) for dependency and virtual environment management.
Install poetry following the official guide at https://python-poetry.org/docs/#installation - using the official installer
or via [pipx](https://pypa.github.io/pipx/).

After the pre-conditions are met, switch into the checked out flair folder.

Create a python environment of your preference and run:
```bash
# Install dependencies (including dev dependencies)
poetry install

# You can turn on the installed environment
poetry shell
pip install -r requirements-dev.txt
pip install -e .
```

### Tests
4 changes: 4 additions & 0 deletions flair/__init__.py
@@ -10,6 +10,10 @@

cache_root = Path(os.getenv("FLAIR_CACHE_ROOT", Path(Path.home(), ".flair")))

device: torch.device
"""Flair is using a single device for everything. You can set this device by overwriting this variable."""


# global variable: device
if torch.cuda.is_available():
device_id = os.environ.get("FLAIR_DEVICE")
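Note: the annotation added above documents that Flair routes everything through one global `device`. A minimal usage sketch of overwriting that variable (not part of this commit):

```python
import torch

import flair

# Overwrite the global device before creating models or embeddings,
# e.g. to force CPU even when CUDA is available.
flair.device = torch.device("cpu")
```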
4 changes: 2 additions & 2 deletions flair/data.py
@@ -143,7 +143,7 @@ def save(self, savefile):
def __setstate__(self, d):
self.__dict__ = d
# set 'add_unk' if the dictionary was created with a version of Flair older than 0.9
if "add_unk" not in self.__dict__.keys():
if "add_unk" not in self.__dict__:
self.__dict__["add_unk"] = b"<unk>" in self.__dict__["idx2item"]

@classmethod
@@ -1675,7 +1675,7 @@ def __str__(self) -> str:
f"{len(self.dev) if self.dev else 0} dev + "
f"{len(self.test) if self.test else 0} test sentences\n - "
)
output += "\n - ".join([f"{type(corpus).__name__} {str(corpus)} - {corpus.name}" for corpus in self.corpora])
output += "\n - ".join([f"{type(corpus).__name__} {corpus!s} - {corpus.name}" for corpus in self.corpora])
return output


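Note: `{corpus!s}` in the second hunk is just the f-string conversion flag for `str()`, so the rewritten line renders exactly what the old `str(corpus)` call did. A tiny standalone check (not part of this commit):

```python
value = 3.14

# "!s" applies str() inside the f-string, so both forms produce the same text.
assert f"{value!s}" == f"{str(value)}" == "3.14"
```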
8 changes: 5 additions & 3 deletions flair/datasets/sequence_labeling.py
@@ -560,9 +560,11 @@ def _identify_span_columns(self, column_name_map, skip_first_line):
# check the first 5 sentences
probe = []
for _i in range(5):
sentence = self._convert_lines_to_sentence(
self._read_next_sentence(file), word_level_tag_columns=column_name_map
)
next_sentence = self._read_next_sentence(file)
if len(next_sentence) == 0:
break

sentence = self._convert_lines_to_sentence(next_sentence, word_level_tag_columns=column_name_map)
if sentence:
probe.append(sentence)
else:
2 changes: 1 addition & 1 deletion flair/embeddings/base.py
@@ -194,7 +194,7 @@ def embedding_type(self) -> str:
def _everything_embedded(self, data_points: Sequence[Sentence]) -> bool:
for sentence in data_points:
for token in sentence.tokens:
if self.name not in token._embeddings.keys():
if self.name not in token._embeddings:
return False
return True

4 changes: 2 additions & 2 deletions flair/embeddings/document.py
@@ -333,7 +333,7 @@ def _add_embeddings_internal(self, sentences: List[Sentence]):

pre_allocated_zero_tensor = torch.zeros(
self.embeddings.embedding_length * longest_token_sequence_in_batch,
dtype=torch.float,
dtype=self.rnn.all_weights[0][0].dtype,
device=flair.device,
)

@@ -691,7 +691,7 @@ def _add_embeddings_internal(self, sentences: List[Sentence]):

pre_allocated_zero_tensor = torch.zeros(
self.embeddings.embedding_length * padding_length,
dtype=torch.float,
dtype=self.convs[0].weight.dtype,
device=flair.device,
)

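Note: both hunks above replace a hard-coded `torch.float` with the dtype of the layer's own weights, which matters once a model is cast to half precision. A minimal sketch of the mismatch being avoided (not part of this commit):

```python
import torch
from torch import nn

# A module cast to half precision carries float16 weights.
rnn = nn.LSTM(input_size=4, hidden_size=4).half()

# Allocating the padding buffer from the weight dtype (as the diff does via
# self.rnn.all_weights[0][0].dtype) keeps later copies and concatenations
# consistent; a hard-coded torch.float buffer would stay float32.
pre_allocated = torch.zeros(16, dtype=rnn.all_weights[0][0].dtype)
assert pre_allocated.dtype == torch.float16
```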
2 changes: 1 addition & 1 deletion flair/embeddings/image.py
@@ -73,7 +73,7 @@ class PrecomputedImageEmbeddings(ImageEmbeddings):
def __init__(self, url2tensor_dict, name) -> None:
self.url2tensor_dict = url2tensor_dict
self.name = name
self.__embedding_length = len(list(self.url2tensor_dict.values())[0])
self.__embedding_length = len(next(iter(self.url2tensor_dict.values())))
self.static_embeddings = True
super().__init__()

2 changes: 1 addition & 1 deletion flair/embeddings/legacy.py
@@ -1330,7 +1330,7 @@ def embed(self, sentences: Union[List[Sentence], Sentence]):
sentences = [sentences]

for sentence in sentences:
if self.name not in sentence._embeddings.keys():
if self.name not in sentence._embeddings:
everything_embedded = False

if not everything_embedded:
12 changes: 6 additions & 6 deletions flair/embeddings/token.py
@@ -77,8 +77,8 @@ def __init__(self, embeddings: List[TokenEmbeddings], overwrite_names: bool = Tr
# IMPORTANT: add embeddings as torch modules
for i, embedding in enumerate(embeddings):
if overwrite_names:
embedding.name = f"{str(i)}-{embedding.name}"
self.add_module(f"list_embedding_{str(i)}", embedding)
embedding.name = f"{i!s}-{embedding.name}"
self.add_module(f"list_embedding_{i!s}", embedding)

self.name: str = "Stack"
self.__names = [name for embedding in self.embeddings for name in embedding.get_names()]
@@ -428,14 +428,14 @@ def to_params(self) -> Dict[str, Any]:
"embedding_length": self.__embedding_length,
}

def state_dict(self, *args, destination=None, prefix="", keep_vars=False):
def state_dict(self, *args, **kwargs):
# when loading the old versions from pickle, the embeddings might not be added as pytorch module.
# we do this delayed, when the weights are collected (e.g. for saving), as doing this earlier might
# lead to issues while loading (trying to load weights that weren't stored as python weights and therefore
# not finding them)
if list(self.modules()) == [self]:
self.embedding = self.embedding
return super().state_dict(*args, destination=destination, prefix=prefix, keep_vars=keep_vars)
return super().state_dict(*args, **kwargs)


@register_embeddings
@@ -525,7 +525,7 @@ def _add_embeddings_internal(self, sentences: List[Sentence]):
outputs = outputs.transpose(0, 1)
chars_embeds_temp = torch.zeros(
(outputs.size(0), outputs.size(2)),
dtype=torch.float,
dtype=outputs.dtype,
device=flair.device,
)
for i, index in enumerate(output_lengths):
@@ -1086,7 +1086,7 @@ def to_params(self):
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
out_path = temp_path / "fasttext.model"
self.precomputed_word_embeddings.save(str(out_path))
self.precomputed_word_embeddings.save(str(out_path), separately=[])
return {"name": self.name, "field": self.field, "fasttext_binary": out_path.read_bytes()}


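Note on `separately=[]` in the `to_params` hunk above: assuming gensim's save semantics, an explicit empty list disables writing large arrays to side `.npy` files, so the single `fasttext.model` file is self-contained and its raw bytes can be embedded in the returned params. A hedged sketch of that round trip with an illustrative toy vector store (not part of this commit):

```python
import tempfile
from pathlib import Path

import numpy as np
from gensim.models import KeyedVectors

# Toy stand-in for the FastText vectors used by Flair.
vectors = KeyedVectors(vector_size=3)
vectors.add_vectors(["hello", "world"], np.ones((2, 3), dtype=np.float32))

with tempfile.TemporaryDirectory() as temp_dir:
    out_path = Path(temp_dir) / "fasttext.model"
    # separately=[] keeps every array inside this one pickle file instead of
    # splitting large arrays out, so read_bytes() captures the whole model.
    vectors.save(str(out_path), separately=[])
    blob = out_path.read_bytes()  # safe to stash in a params dict
```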
30 changes: 26 additions & 4 deletions flair/embeddings/transformer.py
@@ -11,6 +11,8 @@
from typing import Any, Dict, List, Optional, Tuple, Type, Union, cast

import torch
import transformers
from semver import Version
from torch.jit import ScriptModule
from transformers import (
CONFIG_MAPPING,
@@ -49,7 +51,7 @@ def pad_sequence_embeddings(all_hidden_states: List[torch.Tensor]) -> torch.Tens
longest_token_sequence_in_batch = hidden_states.shape[0]
pre_allocated_zero_tensor = torch.zeros(
embedding_length * longest_token_sequence_in_batch,
dtype=torch.float,
dtype=all_hidden_states[0].dtype,
device=flair.device,
)
all_embs = []
@@ -179,15 +181,19 @@ def fill_mean_token_embeddings(

@torch.jit.script_if_tracing
def document_mean_pooling(sentence_hidden_states: torch.Tensor, sentence_lengths: torch.Tensor):
result = torch.zeros(sentence_hidden_states.shape[0], sentence_hidden_states.shape[2])
result = torch.zeros(
sentence_hidden_states.shape[0], sentence_hidden_states.shape[2], dtype=sentence_hidden_states.dtype
)

for i in torch.arange(sentence_hidden_states.shape[0]):
result[i] = sentence_hidden_states[i, : sentence_lengths[i]].mean(dim=0)


@torch.jit.script_if_tracing
def document_max_pooling(sentence_hidden_states: torch.Tensor, sentence_lengths: torch.Tensor):
result = torch.zeros(sentence_hidden_states.shape[0], sentence_hidden_states.shape[2])
result = torch.zeros(
sentence_hidden_states.shape[0], sentence_hidden_states.shape[2], dtype=sentence_hidden_states.dtype
)

for i in torch.arange(sentence_hidden_states.shape[0]):
result[i], _ = sentence_hidden_states[i, : sentence_lengths[i]].max(dim=0)
@@ -1105,6 +1111,16 @@ def embedding_length(self) -> int:

return self.embedding_length_internal

def _load_from_state_dict(
self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
):
if transformers.__version__ >= Version(4, 31, 0):
assert isinstance(state_dict, dict)
state_dict.pop(f"{prefix}model.embeddings.position_ids", None)
super()._load_from_state_dict(
state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
)

def _has_initial_cls_token(self) -> bool:
# most models have CLS token as last token (GPT-1, GPT-2, TransfoXL, XLNet, XLM), but BERT is initial
if self.tokenizer_needs_ocr_boxes:
@@ -1191,6 +1207,8 @@ def __setstate__(self, state):
self.__dict__[key] = embedding.__dict__[key]

if model_state_dict:
if transformers.__version__ >= Version(4, 31, 0):
model_state_dict.pop("embeddings.position_ids", None)
self.model.load_state_dict(model_state_dict)

@classmethod
@@ -1314,7 +1332,11 @@ def forward(
assert word_ids is not None
assert token_lengths is not None
all_token_embeddings = torch.zeros( # type: ignore[call-overload]
word_ids.shape[0], token_lengths.max(), self.embedding_length_internal, device=flair.device
word_ids.shape[0],
token_lengths.max(),
self.embedding_length_internal,
device=flair.device,
dtype=sentence_hidden_states.dtype,
)
true_tensor = torch.ones_like(word_ids[:, :1], dtype=torch.bool)
if self.subtoken_pooling == "first":
8 changes: 4 additions & 4 deletions flair/inference_utils.py
@@ -75,7 +75,7 @@ def __init__(self, embedding: WordEmbeddings, backend="sqlite", verbose=True) ->
self.name = embedding.name
self.store_path: Path = WordEmbeddingsStore._get_store_path(embedding, backend)
if verbose:
logger.info(f"store filename: {str(self.store_path)}")
logger.info(f"store filename: {self.store_path!s}")
self.backend: Union[WordEmbeddings, WordEmbeddingsStoreBackend]
if backend == "sqlite":
self.backend = SqliteWordEmbeddingsStoreBackend(embedding, verbose)
@@ -143,7 +143,7 @@ def delete_stores(model, backend="sqlite"):
"""Deletes the db versions of all word embeddings."""
for embedding in WordEmbeddingsStore._word_embeddings(model):
store_path: Path = WordEmbeddingsStore._get_store_path(embedding)
logger.info(f"delete store: {str(store_path)}")
logger.info(f"delete store: {store_path!s}")
if store_path.is_file():
store_path.unlink()
elif store_path.is_dir():
@@ -177,7 +177,7 @@ def __init__(self, embedding, verbose) -> None:
self.k = len(result[0]) - 1
return
except sqlite3.Error as err:
logger.exception(f"Fail to open sqlite database {str(self.store_path)}: {str(err)}")
logger.exception(f"Fail to open sqlite database {self.store_path!s}: {err!s}")
# otherwise, push embedding to database
if hasattr(embedding, "precomputed_word_embeddings"):
self.db = sqlite3.connect(str(self.store_path))
@@ -239,7 +239,7 @@ def __init__(self, embedding, verbose) -> None:
cursor.close()
return
except lmdb.Error as err:
logger.exception(f"Fail to open lmdb database {str(self.store_path)}: {str(err)}")
logger.exception(f"Fail to open lmdb database {self.store_path!s}: {err!s}")
# create and load the database in write mode
if hasattr(embedding, "precomputed_word_embeddings"):
pwe = embedding.precomputed_word_embeddings
2 changes: 1 addition & 1 deletion flair/models/sequence_tagger_model.py
@@ -340,7 +340,7 @@ def _make_padded_tensor_for_batch(self, sentences: List[Sentence]) -> Tuple[torc
longest_token_sequence_in_batch: int = max(lengths)
pre_allocated_zero_tensor = torch.zeros(
self.embeddings.embedding_length * longest_token_sequence_in_batch,
dtype=torch.float,
dtype=self.linear.weight.dtype,
device=flair.device,
)
all_embs = []
4 changes: 2 additions & 2 deletions flair/models/sequence_tagger_utils/viterbi.py
@@ -53,7 +53,7 @@ def forward(self, features_tuple: tuple, targets: torch.Tensor) -> torch.Tensor:
]
gold_score = scores_at_targets.sum() + transitions_to_stop.sum()

scores_upto_t = torch.zeros(batch_size, self.tagset_size, device=flair.device)
scores_upto_t = torch.zeros(batch_size, self.tagset_size, device=flair.device, dtype=features.dtype)

for t in range(max(lengths)):
batch_size_t = sum(
@@ -151,7 +151,7 @@ def decode(
seq_len = features.size(1)

# Create a tensor to hold accumulated sequence scores at each current tag
scores_upto_t = torch.zeros(batch_size, seq_len + 1, self.tagset_size).to(flair.device)
scores_upto_t = torch.zeros(batch_size, seq_len + 1, self.tagset_size, dtype=features.dtype).to(flair.device)
# Create a tensor to hold back-pointers
# i.e., indices of the previous_tag that corresponds to maximum accumulated score at current tag
# Let pads be the <end> tag index, since that was the last tag in the decoded sequence
4 changes: 2 additions & 2 deletions flair/splitter.py
@@ -64,8 +64,8 @@ def split(self, text: str) -> List[Sentence]:
sentence_offset = text.index(sentence, sentence_offset)
except ValueError as error:
raise AssertionError(
f"Can't find the sentence offset for sentence {repr(sentence)} "
f"starting from position {repr(sentence_offset)}"
f"Can't find the sentence offset for sentence {sentence} "
f"starting from position {sentence_offset}"
) from error
sentences.append(
Sentence(
