
Commit ea21a0e

feat: new llm extractor pipe

1 parent 57950a0 · commit ea21a0e

16 files changed: +893, -8 lines
changelog.md
Lines changed: 1 addition & 0 deletions

@@ -7,6 +7,7 @@
 - New `DocToMarkupConverter` to convert documents to markdown and improved `MarkupToDocConverter` to allow overlapping markup annotations (e.g., `This is a <a>text <b>with</a> overlapping</b> tags`).
 - New helper `edsnlp.utils.fuzzy_alignment.align` to map the entities of an annotated document to another document with similar but not identical text (e.g., after some text normalization or minor edits).
 - We now support `span_getter="sents"` to apply various pipes on sentences instead of entities or spans.
+- New generic LLM extractor pipe `eds.llm_markup_extractor`, which can be used to extract entities using a large language model served through an *OpenAI-compatible* API.
 
 ## v0.18.0 (2025-09-02)
 
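A minimal usage sketch for the new pipe. The configuration keys below (`api_url`, `model`) are illustrative assumptions, not the factory's documented parameters, which are defined by `edsnlp.pipes.llm.llm_markup_extractor.factory.create_component`:

```python
import edsnlp

nlp = edsnlp.blank("eds")
# Hypothetical configuration keys, shown only to illustrate the idea of
# pointing the pipe at an OpenAI-compatible LLM server.
nlp.add_pipe(
    "eds.llm_markup_extractor",
    config={
        "api_url": "http://localhost:8000/v1",  # assumed endpoint
        "model": "my-clinical-llm",             # assumed served model name
    },
)
doc = nlp("Le patient présente une pneumopathie sévère.")
print(doc.ents)
```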

docs/pipes/index.md
Lines changed: 6 additions & 0 deletions

@@ -36,6 +36,12 @@ EDS-NLP provides easy-to-use pipeline components (aka pipes).
 
     --8<-- "docs/pipes/trainable/index.md:components"
 
+=== "LLM-based"
+
+    See the [LLM-based components overview](/pipes/llm/) for more information.
+
+    --8<-- "docs/pipes/llm/index.md:components"
+
 <!-- --8<-- [end:components] -->
 
 You can add them to your pipeline by simply calling `add_pipe`, for instance:

docs/pipes/llm/index.md
Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+# LLM-based components {: #edsnlp.pipes.llm }
+
+This section regroups components that rely on a large language model (LLM) to extract structured information from documents.
+
+For instance, `eds.llm_markup_extractor` prompts an LLM served through an OpenAI-compatible API to annotate the text with markup tags, which are then converted back into document spans.
+
+## Available components
+
+<!-- --8<-- [start:components] -->
+
+| Component                  | Description                                                |
+|----------------------------|------------------------------------------------------------|
+| `eds.llm_markup_extractor` | Extract structured information using LLMs through markup.  |
+
+<!-- --8<-- [end:components] -->
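As an illustration of the markup approach (a conceptual sketch only, not the component's actual parsing code, which relies on EDS-NLP's markup converters and handles nesting, overlaps and imperfect LLM output): the LLM re-emits the input text with inline tags, which are stripped and turned into character spans.

```python
import re

def markup_to_spans(marked: str):
    """Strip <label>...</label> tags and return (text, spans).

    Toy sketch: assumes well-nested tags, unlike the real converters.
    """
    text, spans, stack = "", [], []
    pos = 0
    for m in re.finditer(r"</?([a-zA-Z_]+)>", marked):
        # Copy the text between tags, then record tag boundaries.
        text += marked[pos : m.start()]
        pos = m.end()
        if m.group(0).startswith("</"):
            label, start = stack.pop()
            spans.append((start, len(text), label))
        else:
            stack.append((m.group(1), len(text)))
    text += marked[pos:]
    return text, spans

text, spans = markup_to_spans("Patient has <disease>severe pneumonia</disease>.")
print(text)   # Patient has severe pneumonia.
print(spans)  # [(12, 28, 'disease')]
```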
Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+# LLM Markup Extraction {: #edsnlp.pipes.llm.llm_markup_extractor.factory.create_component }
+
+::: edsnlp.pipes.llm.llm_markup_extractor.factory.create_component
+    options:
+        heading_level: 2
+        show_bases: false
+        show_source: false
+        only_class_level: true

docs/references.bib
Lines changed: 15 additions & 0 deletions

@@ -184,3 +184,18 @@ @inproceedings{grobol:hal-03223424
   hal_id      = {hal-03223424},
   hal_version = {v1},
 }
+
+@inproceedings{naguib_2024,
+    title     = "Few-shot clinical entity recognition in {E}nglish, {F}rench and {S}panish: masked language models outperform generative model prompting",
+    author    = "Naguib, Marco and Tannier, Xavier and Névéol, Aurélie",
+    editor    = "Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung",
+    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
+    month     = nov,
+    year      = "2024",
+    address   = "Miami, Florida, USA",
+    publisher = "Association for Computational Linguistics",
+    url       = "https://aclanthology.org/2024.findings-emnlp.400/",
+    doi       = "10.18653/v1/2024.findings-emnlp.400",
+    pages     = "6829--6852",
+    abstract  = "Large language models (LLMs) have become the preferred solution for many natural language processing tasks. In low-resource environments such as specialized domains, their few-shot capabilities are expected to deliver high performance. Named Entity Recognition (NER) is a critical task in information extraction that is not covered in recent LLM benchmarks. There is a need for better understanding the performance of LLMs for NER in a variety of settings including languages other than English. This study aims to evaluate generative LLMs, employed through prompt engineering, for few-shot clinical NER. We compare 13 auto-regressive models using prompting and 16 masked models using fine-tuning on 14 NER datasets covering English, French and Spanish. While prompt-based auto-regressive models achieve competitive F1 for general NER, they are outperformed within the clinical domain by lighter biLSTM-CRF taggers based on masked models. Additionally, masked models exhibit lower environmental impact compared to auto-regressive models. Findings are consistent across the three languages studied, which suggests that LLM prompting is not yet suited for NER production in the clinical domain."
+}

edsnlp/core/stream.py
Lines changed: 31 additions & 8 deletions

@@ -137,11 +137,24 @@ class MapOp(Op):
     def __init__(self, pipe, kwargs, context=None):
         self.pipe = pipe
         self.kwargs = kwargs
-        self.is_generator = deep_isgeneratorfunction(pipe)
+        self.has_pipe_method = hasattr(pipe, "pipe") and callable(pipe.pipe)
+        self.is_generator = self.has_pipe_method or deep_isgeneratorfunction(pipe)
         self.elementwise = not self.is_generator
         self.context = context or {}
 
     def __call__(self, items):
+        if self.has_pipe_method:
+            CONTEXT[0], old = self.context, CONTEXT[0]
+            res = self.pipe.pipe(
+                (x for x in items if not isinstance(x, StreamSentinel)),
+                **self.kwargs,
+            )
+            CONTEXT[0] = old
+            if self.is_generator:
+                yield from res
+            else:
+                yield res
+            return
         for item in items:
             if isinstance(item, StreamSentinel):
                 yield item
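A simplified rendering of this new dispatch (the `CONTEXT` swap and the generator/elementwise distinction are omitted; `apply_map_op` and `UpperPipe` are illustrative names, not part of the codebase):

```python
from typing import Any, Iterable

class StreamSentinel:
    """Simplified stand-in for edsnlp's stream sentinel markers."""

def apply_map_op(pipe: Any, items: Iterable[Any]) -> Iterable[Any]:
    # Components exposing a `pipe` method receive the whole (sentinel-free)
    # iterable at once, e.g. to keep several LLM requests in flight.
    if hasattr(pipe, "pipe") and callable(pipe.pipe):
        yield from pipe.pipe(
            x for x in items if not isinstance(x, StreamSentinel)
        )
        return
    # Plain callables keep the element-wise path; sentinels pass through.
    for item in items:
        if isinstance(item, StreamSentinel):
            yield item
        else:
            yield pipe(item)

# E.g. a toy component whose `pipe` method transforms a whole stream:
class UpperPipe:
    def pipe(self, texts):
        for t in texts:
            yield t.upper()

print(list(apply_map_op(UpperPipe(), ["a", "b"])))  # ['A', 'B']
```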
@@ -714,17 +727,27 @@ def map_pipeline(
             if isinstance(op, (MapOp, MapBatchesOp)):
                 op.context["tokenizer"] = tokenizer
             new_ops.append(op)
+        has_batches = batch_by is not None or any(
+            hasattr(p, "batch_process") for n, p in model.pipeline
+        )
         new_ops.append(MapOp(model._ensure_doc, {}))
-        batch_size, batch_by = self.validate_batching(batch_size, batch_by)
-        batch_by = batchify_fns.get(batch_by, batch_by)
-        new_ops.append(BatchifyOp(batch_size, batch_by))
+        if has_batches:
+            batch_size, batch_by = self.validate_batching(batch_size, batch_by)
+            batch_by = batchify_fns.get(batch_by, batch_by)
+            new_ops.append(BatchifyOp(batch_size, batch_by))
+
         for name, pipe in model.pipeline:
             if name not in model._disabled:
-                op = MapBatchesOp(
-                    pipe, {}, elementwise=not deep_isgeneratorfunction(pipe)
+                op = (
+                    MapBatchesOp(
+                        pipe, {}, elementwise=not deep_isgeneratorfunction(pipe)
+                    )
+                    if has_batches
+                    else MapOp(pipe, {})
                 )
                 new_ops.append(op)
-        new_ops.append(UnbatchifyOp())
+        if has_batches:
+            new_ops.append(UnbatchifyOp())
         config = (
             {**self.config, "batch_size": model.batch_size}
             if self.batch_size is None

@@ -999,7 +1022,7 @@ def validate_ops(self, ops, update: bool = False):
         ):
             requires_sentinels.add(self.writer.batch_fn.requires_sentinel)
 
-        for op in reversed(ops):
+        for op in list(reversed(ops)):
             if isinstance(op, BatchifyOp):
                 if op.batch_fn is None and op.size is None:
                     batch_size = self_batch_size
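The effect of the `has_batches` guard: batchify/unbatchify ops are only inserted when the caller set a batching policy or some component processes documents batch-wise, so a purely element-wise pipeline streams documents through `MapOp` without artificial batching. A toy rendering of the test, with an assumed helper name mirroring the diff:

```python
from typing import Any, List, Optional, Tuple

def needs_batching(batch_by: Optional[str], pipeline: List[Tuple[str, Any]]) -> bool:
    # Mirrors the new `has_batches` expression in map_pipeline above.
    return batch_by is not None or any(
        hasattr(pipe, "batch_process") for _, pipe in pipeline
    )

print(needs_batching(None, [("lower", str.lower)]))    # False: plain MapOp path
print(needs_batching("docs", [("lower", str.lower)]))  # True: Batchify path
```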

edsnlp/pipes/__init__.py
Lines changed: 1 addition & 0 deletions

@@ -84,3 +84,4 @@
 from .trainable.embeddings.text_cnn.factory import create_component as text_cnn
 from .misc.split import Split as split
 from .misc.explode import Explode as explode
+from .llm.llm_markup_extractor import LlmMarkupExtractor as llm_markup_extractor

edsnlp/pipes/llm/__init__.py

Whitespace-only changes.

edsnlp/pipes/llm/async_worker.py
Lines changed: 58 additions & 0 deletions

@@ -0,0 +1,58 @@
+import asyncio
+import threading
+from typing import Any, Coroutine, Dict, Iterable, Optional, Tuple
+
+
+class AsyncRequestWorker:
+    _instance = None
+
+    def __init__(self):
+        self.loop = asyncio.new_event_loop()
+        self.thread = threading.Thread(target=self._run, daemon=True)
+        self.thread.start()
+        self._lock = threading.Lock()
+        self._cv = threading.Condition(self._lock)
+        self._next_id = 0
+        self._results: Dict[int, Tuple[Any, Optional[BaseException]]] = {}
+
+    def _run(self):
+        asyncio.set_event_loop(self.loop)
+        self.loop.run_forever()
+
+    @classmethod
+    def instance(cls) -> "AsyncRequestWorker":
+        if cls._instance is None:
+            cls._instance = AsyncRequestWorker()
+        return cls._instance
+
+    def submit(self, coro: Coroutine[Any, Any, Any]) -> int:
+        with self._lock:
+            task_id = self._next_id
+            self._next_id += 1
+
+        async def _wrap():
+            try:
+                res = await coro
+                exc = None
+            except BaseException as e:  # noqa: BLE001
+                res = None
+                exc = e
+            with self._cv:
+                self._results[task_id] = (res, exc)
+                self._cv.notify_all()
+
+        asyncio.run_coroutine_threadsafe(_wrap(), self.loop)
+        return task_id
+
+    def pop_result(self, task_id: int) -> Optional[Tuple[Any, Optional[BaseException]]]:
+        with self._cv:
+            return self._results.pop(task_id, None)
+
+    def wait_for_any(self, task_ids: Iterable[int]) -> int:
+        task_ids = set(task_ids)
+        with self._cv:
+            while True:
+                for tid in task_ids:
+                    if tid in self._results:
+                        return tid
+                self._cv.wait()
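A usage sketch for this worker, using only the API defined above (the `fake_request` coroutine is a stand-in for a real LLM API call): coroutines run on the background event loop while the calling thread blocks until any of them completes.

```python
import asyncio

from edsnlp.pipes.llm.async_worker import AsyncRequestWorker

async def fake_request(i: int) -> str:
    # Stand-in for an actual HTTP request to an LLM server.
    await asyncio.sleep(0.05 * i)
    return f"response {i}"

worker = AsyncRequestWorker.instance()
pending = {worker.submit(fake_request(i)): i for i in range(3)}

while pending:
    # Block until at least one submitted coroutine has a result ...
    tid = worker.wait_for_any(pending)
    res, exc = worker.pop_result(tid)
    pending.pop(tid)
    # ... then re-raise its exception or consume its result.
    if exc is not None:
        raise exc
    print(res)
```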
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+from .llm_markup_extractor import LlmMarkupExtractor
