
Commit 1f97be2

wip: feat: new llm extractor pipe
1 parent a40a2a2 commit 1f97be2

12 files changed, +429 -146 lines

docs/pipes/index.md

Lines changed: 6 additions & 0 deletions

@@ -36,6 +36,12 @@ EDS-NLP provides easy-to-use pipeline components (aka pipes).

    --8<-- "docs/pipes/trainable/index.md:components"

+=== "LLM-based"
+
+    See the [LLM-based components overview](/pipes/llm/) for more information.
+
+    --8<-- "docs/pipes/llm/index.md:components"
+
<!-- --8<-- [end:components] -->

You can add them to your pipeline by simply calling `add_pipe`, for instance:

docs/references.bib

Lines changed: 15 additions & 0 deletions

@@ -184,3 +184,18 @@ @inproceedings{grobol:hal-03223424
  hal_id = {hal-03223424},
  hal_version = {v1},
}
+
+@inproceedings{naguib_2024,
+  title = "Few-shot clinical entity recognition in {E}nglish, {F}rench and {S}panish: masked language models outperform generative model prompting",
+  author = "Naguib, Marco and Tannier, Xavier and Névéol, Aurélie",
+  editor = "Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung",
+  booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
+  month = nov,
+  year = "2024",
+  address = "Miami, Florida, USA",
+  publisher = "Association for Computational Linguistics",
+  url = "https://aclanthology.org/2024.findings-emnlp.400/",
+  doi = "10.18653/v1/2024.findings-emnlp.400",
+  pages = "6829--6852",
+  abstract = "Large language models (LLMs) have become the preferred solution for many natural language processing tasks. In low-resource environments such as specialized domains, their few-shot capabilities are expected to deliver high performance. Named Entity Recognition (NER) is a critical task in information extraction that is not covered in recent LLM benchmarks. There is a need for better understanding the performance of LLMs for NER in a variety of settings including languages other than English. This study aims to evaluate generative LLMs, employed through prompt engineering, for few-shot clinical NER. We compare 13 auto-regressive models using prompting and 16 masked models using fine-tuning on 14 NER datasets covering English, French and Spanish. While prompt-based auto-regressive models achieve competitive F1 for general NER, they are outperformed within the clinical domain by lighter biLSTM-CRF taggers based on masked models. Additionally, masked models exhibit lower environmental impact compared to auto-regressive models. Findings are consistent across the three languages studied, which suggests that LLM prompting is not yet suited for NER production in the clinical domain."
+}

edsnlp/data/converters.py

Lines changed: 4 additions & 4 deletions

@@ -793,11 +793,11 @@ class MarkupToDocConverter:
    PRESETS = {
        "md": {
            "opener": r"(?P<opener>\[)",
-            "closer": r"(?P<closer>\]\(\s*(?P<closer_label>[a-zA-Z0-9_]+)\s*(?P<closer_attrs>.*?)\))",  # noqa: E501
+            "closer": r"(?P<closer>\]\(\s*(?P<closer_label>[A-Za-z0-9_À-ÿ]+)\s*(?P<closer_attrs>.*?)\))",  # noqa: E501
        },
        "xml": {
-            "opener": r"(?P<opener><(?P<opener_label>[a-zA-Z0-9_]+)(?P<opener_attrs>.*?)>)",  # noqa: E501
-            "closer": r"(?P<closer></(?P<closer_label>[a-zA-Z0-9_]+)>)",
+            "opener": r"(?P<opener><(?P<opener_label>[A-Za-z0-9_À-ÿ]+)(?P<opener_attrs>.*?)>)",  # noqa: E501
+            "closer": r"(?P<closer></(?P<closer_label>[A-Za-z0-9_À-ÿ]+)>)",
        },
    }

@@ -885,7 +885,7 @@ def _parse(self, inline_text: str):
        text += inline_text[last_inline_offset:]
        if starts:
            warnings.warn(
-                f"Unmatched opening tags at indices {', '.join(s[1] for s in starts)}"
+                f"Unmatched opening tags at {', '.join(str(s[0]) for s in starts)}"
            )
        entities = sorted(entities)
        return text, entities
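
The widened character class ([A-Za-z0-9_À-ÿ] instead of [a-zA-Z0-9_]) lets the markup presets capture accented label names. A minimal sketch with the standard re module, using a hypothetical French tag <problème> purely for illustration:

import re

# Old preset: the label group only accepts ASCII word characters.
old_opener = re.compile(
    r"(?P<opener><(?P<opener_label>[a-zA-Z0-9_]+)(?P<opener_attrs>.*?)>)"
)
# New preset: the label group also covers Latin-1 accented letters (À-ÿ).
new_opener = re.compile(
    r"(?P<opener><(?P<opener_label>[A-Za-z0-9_À-ÿ]+)(?P<opener_attrs>.*?)>)"
)

text = "<problème>fièvre persistante</problème>"

print(old_opener.search(text).group("opener_label"))  # "probl": "ème" spills into the attrs group
print(new_opener.search(text).group("opener_label"))  # "problème"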

edsnlp/pipes/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -84,3 +84,4 @@
from .trainable.embeddings.text_cnn.factory import create_component as text_cnn
from .misc.split import Split as split
from .misc.explode import Explode as explode
+from .llm.llm_markup_extraction import LlmMarkupExtraction as llm_markup_extraction

edsnlp/pipes/llm/async_worker.py

Lines changed: 58 additions & 0 deletions

@@ -0,0 +1,58 @@
+import asyncio
+import threading
+from typing import Any, Coroutine, Dict, Iterable, Optional, Tuple
+
+
+class AsyncRequestWorker:
+    _instance = None
+
+    def __init__(self):
+        self.loop = asyncio.new_event_loop()
+        self.thread = threading.Thread(target=self._run, daemon=True)
+        self.thread.start()
+        self._lock = threading.Lock()
+        self._cv = threading.Condition(self._lock)
+        self._next_id = 0
+        self._results: Dict[int, Tuple[Any, Optional[BaseException]]] = {}
+
+    def _run(self):
+        asyncio.set_event_loop(self.loop)
+        self.loop.run_forever()
+
+    @classmethod
+    def instance(cls) -> "AsyncRequestWorker":
+        if cls._instance is None:
+            cls._instance = AsyncRequestWorker()
+        return cls._instance
+
+    def submit(self, coro: Coroutine[Any, Any, Any]) -> int:
+        with self._lock:
+            task_id = self._next_id
+            self._next_id += 1
+
+        async def _wrap():
+            try:
+                res = await coro
+                exc = None
+            except BaseException as e:  # noqa: BLE001
+                res = None
+                exc = e
+            with self._cv:
+                self._results[task_id] = (res, exc)
+                self._cv.notify_all()
+
+        asyncio.run_coroutine_threadsafe(_wrap(), self.loop)
+        return task_id
+
+    def pop_result(self, task_id: int) -> Optional[Tuple[Any, Optional[BaseException]]]:
+        with self._cv:
+            return self._results.pop(task_id, None)
+
+    def wait_for_any(self, task_ids: Iterable[int]) -> int:
+        task_ids = set(task_ids)
+        with self._cv:
+            while True:
+                for tid in task_ids:
+                    if tid in self._results:
+                        return tid
+                self._cv.wait()
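
For reference, a sketch of how this worker could be driven from synchronous pipeline code, using only the methods added above; fake_llm_call is a placeholder coroutine and is not part of the commit:

import asyncio

from edsnlp.pipes.llm.async_worker import AsyncRequestWorker


async def fake_llm_call(text: str) -> str:
    # Placeholder for a real asynchronous LLM request.
    await asyncio.sleep(0.1)
    return text.upper()


worker = AsyncRequestWorker.instance()  # shared event loop running in a daemon thread
pending = {worker.submit(fake_llm_call(t)): t for t in ["fièvre", "toux"]}

while pending:
    tid = worker.wait_for_any(pending)   # block until any submitted task has a result
    result, exc = worker.pop_result(tid)
    text = pending.pop(tid)
    if exc is not None:
        raise exc
    print(text, "->", result)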
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+from .llm_markup_extraction import LlmMarkupExtraction
Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+from edsnlp import registry
+
+from .llm_markup_extraction import LlmMarkupExtraction
+
+create_component = registry.factory.register(
+    "eds.llm_markup_extraction",
+    assigns=["doc.ents", "doc.spans"],
+)(LlmMarkupExtraction)
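
With the factory registered, the component should be addable like any other EDS-NLP pipe. A minimal sketch, assuming it can be instantiated without extra arguments; its real parameters (model endpoint, prompt, labels, etc.) are not visible in this diff and would be passed through config=:

import edsnlp

nlp = edsnlp.blank("eds")
# Name registered by the factory above; real arguments, if any,
# would go through config={...} once the component's signature is known.
nlp.add_pipe("eds.llm_markup_extraction")

doc = nlp("Le patient présente une fièvre persistante.")
print(doc.ents, doc.spans)  # the factory declares assigns=["doc.ents", "doc.spans"]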

0 commit comments