
Commit 1f97be2

wip: feat: new llm extractor pipe
1 parent a40a2a2 commit 1f97be2

12 files changed, +429 -146 lines

docs/pipes/index.md

Lines changed: 6 additions & 0 deletions

@@ -36,6 +36,12 @@ EDS-NLP provides easy-to-use pipeline components (aka pipes).

    --8<-- "docs/pipes/trainable/index.md:components"

+=== "LLM-based"
+
+    See the [LLM-based components overview](/pipes/llm/) for more information.
+
+    --8<-- "docs/pipes/llm/index.md:components"
+
<!-- --8<-- [end:components] -->

You can add them to your pipeline by simply calling `add_pipe`, for instance:

docs/references.bib

Lines changed: 15 additions & 0 deletions

@@ -184,3 +184,18 @@ @inproceedings{grobol:hal-03223424
  hal_id = {hal-03223424},
  hal_version = {v1},
}
+
+@inproceedings{naguib_2024,
+  title = "Few-shot clinical entity recognition in {E}nglish, {F}rench and {S}panish: masked language models outperform generative model prompting",
+  author = "Naguib, Marco and Tannier, Xavier and Névéol, Aurélie",
+  editor = "Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung",
+  booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
+  month = nov,
+  year = "2024",
+  address = "Miami, Florida, USA",
+  publisher = "Association for Computational Linguistics",
+  url = "https://aclanthology.org/2024.findings-emnlp.400/",
+  doi = "10.18653/v1/2024.findings-emnlp.400",
+  pages = "6829--6852",
+  abstract = "Large language models (LLMs) have become the preferred solution for many natural language processing tasks. In low-resource environments such as specialized domains, their few-shot capabilities are expected to deliver high performance. Named Entity Recognition (NER) is a critical task in information extraction that is not covered in recent LLM benchmarks. There is a need for better understanding the performance of LLMs for NER in a variety of settings including languages other than English. This study aims to evaluate generative LLMs, employed through prompt engineering, for few-shot clinical NER. We compare 13 auto-regressive models using prompting and 16 masked models using fine-tuning on 14 NER datasets covering English, French and Spanish. While prompt-based auto-regressive models achieve competitive F1 for general NER, they are outperformed within the clinical domain by lighter biLSTM-CRF taggers based on masked models. Additionally, masked models exhibit lower environmental impact compared to auto-regressive models. Findings are consistent across the three languages studied, which suggests that LLM prompting is not yet suited for NER production in the clinical domain."
+}

edsnlp/data/converters.py

Lines changed: 4 additions & 4 deletions

@@ -793,11 +793,11 @@ class MarkupToDocConverter:
    PRESETS = {
        "md": {
            "opener": r"(?P<opener>\[)",
-            "closer": r"(?P<closer>\]\(\s*(?P<closer_label>[a-zA-Z0-9_]+)\s*(?P<closer_attrs>.*?)\))",  # noqa: E501
+            "closer": r"(?P<closer>\]\(\s*(?P<closer_label>[A-Za-z0-9_À-ÿ]+)\s*(?P<closer_attrs>.*?)\))",  # noqa: E501
        },
        "xml": {
-            "opener": r"(?P<opener><(?P<opener_label>[a-zA-Z0-9_]+)(?P<opener_attrs>.*?)>)",  # noqa: E501
-            "closer": r"(?P<closer></(?P<closer_label>[a-zA-Z0-9_]+)>)",
+            "opener": r"(?P<opener><(?P<opener_label>[A-Za-z0-9_À-ÿ]+)(?P<opener_attrs>.*?)>)",  # noqa: E501
+            "closer": r"(?P<closer></(?P<closer_label>[A-Za-z0-9_À-ÿ]+)>)",
        },
    }

@@ -885,7 +885,7 @@ def _parse(self, inline_text: str):
        text += inline_text[last_inline_offset:]
        if starts:
            warnings.warn(
-                f"Unmatched opening tags at indices {', '.join(s[1] for s in starts)}"
+                f"Unmatched opening tags at {', '.join(str(s[0]) for s in starts)}"
            )
        entities = sorted(entities)
        return text, entities
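
The widened character class ([A-Za-z0-9_À-ÿ] instead of [a-zA-Z0-9_]) lets the markup presets capture accented label names. A minimal sketch with the standard re module, using a hypothetical French tag <problème> purely for illustration:

import re

# Old preset: the label group only accepts ASCII word characters.
old_opener = re.compile(
    r"(?P<opener><(?P<opener_label>[a-zA-Z0-9_]+)(?P<opener_attrs>.*?)>)"
)
# New preset: the label group also covers Latin-1 accented letters (À-ÿ).
new_opener = re.compile(
    r"(?P<opener><(?P<opener_label>[A-Za-z0-9_À-ÿ]+)(?P<opener_attrs>.*?)>)"
)

text = "<problème>fièvre persistante</problème>"

print(old_opener.search(text).group("opener_label"))  # "probl": "ème" spills into the attrs group
print(new_opener.search(text).group("opener_label"))  # "problème"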

edsnlp/pipes/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -84,3 +84,4 @@
from .trainable.embeddings.text_cnn.factory import create_component as text_cnn
from .misc.split import Split as split
from .misc.explode import Explode as explode
+from .llm.llm_markup_extraction import LlmMarkupExtraction as llm_markup_extraction

edsnlp/pipes/llm/async_worker.py

Lines changed: 58 additions & 0 deletions

@@ -0,0 +1,58 @@
+import asyncio
+import threading
+from typing import Any, Coroutine, Dict, Iterable, Optional, Tuple
+
+
+class AsyncRequestWorker:
+    _instance = None
+
+    def __init__(self):
+        self.loop = asyncio.new_event_loop()
+        self.thread = threading.Thread(target=self._run, daemon=True)
+        self.thread.start()
+        self._lock = threading.Lock()
+        self._cv = threading.Condition(self._lock)
+        self._next_id = 0
+        self._results: Dict[int, Tuple[Any, Optional[BaseException]]] = {}
+
+    def _run(self):
+        asyncio.set_event_loop(self.loop)
+        self.loop.run_forever()
+
+    @classmethod
+    def instance(cls) -> "AsyncRequestWorker":
+        if cls._instance is None:
+            cls._instance = AsyncRequestWorker()
+        return cls._instance
+
+    def submit(self, coro: Coroutine[Any, Any, Any]) -> int:
+        with self._lock:
+            task_id = self._next_id
+            self._next_id += 1
+
+        async def _wrap():
+            try:
+                res = await coro
+                exc = None
+            except BaseException as e:  # noqa: BLE001
+                res = None
+                exc = e
+            with self._cv:
+                self._results[task_id] = (res, exc)
+                self._cv.notify_all()
+
+        asyncio.run_coroutine_threadsafe(_wrap(), self.loop)
+        return task_id
+
+    def pop_result(self, task_id: int) -> Optional[Tuple[Any, Optional[BaseException]]]:
+        with self._cv:
+            return self._results.pop(task_id, None)
+
+    def wait_for_any(self, task_ids: Iterable[int]) -> int:
+        task_ids = set(task_ids)
+        with self._cv:
+            while True:
+                for tid in task_ids:
+                    if tid in self._results:
+                        return tid
+                self._cv.wait()
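
For reference, a sketch of how this worker could be driven from synchronous pipeline code, using only the methods added above; fake_llm_call is a placeholder coroutine and is not part of the commit:

import asyncio

from edsnlp.pipes.llm.async_worker import AsyncRequestWorker


async def fake_llm_call(text: str) -> str:
    # Placeholder for a real asynchronous LLM request.
    await asyncio.sleep(0.1)
    return text.upper()


worker = AsyncRequestWorker.instance()  # shared event loop running in a daemon thread
pending = {worker.submit(fake_llm_call(t)): t for t in ["fièvre", "toux"]}

while pending:
    tid = worker.wait_for_any(pending)   # block until any submitted task has a result
    result, exc = worker.pop_result(tid)
    text = pending.pop(tid)
    if exc is not None:
        raise exc
    print(text, "->", result)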
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+from .llm_markup_extraction import LlmMarkupExtraction
Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+from edsnlp import registry
+
+from .llm_markup_extraction import LlmMarkupExtraction
+
+create_component = registry.factory.register(
+    "eds.llm_markup_extraction",
+    assigns=["doc.ents", "doc.spans"],
+)(LlmMarkupExtraction)
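
With the factory registered, the component should be addable like any other EDS-NLP pipe. A minimal sketch, assuming it can be instantiated without extra arguments; its real parameters (model endpoint, prompt, labels, etc.) are not visible in this diff and would be passed through config=:

import edsnlp

nlp = edsnlp.blank("eds")
# Name registered by the factory above; real arguments, if any,
# would go through config={...} once the component's signature is known.
nlp.add_pipe("eds.llm_markup_extraction")

doc = nlp("Le patient présente une fièvre persistante.")
print(doc.ents, doc.spans)  # the factory declares assigns=["doc.ents", "doc.spans"]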

0 commit comments