aphp
diff --git a/‎changelog.md‎
Lines changed: 1 addition & 0 deletions b/‎changelog.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/pipes/index.md‎
Lines changed: 6 additions & 0 deletions b/‎docs/pipes/index.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎docs/pipes/llm/index.md‎
Lines changed: 15 additions & 0 deletions b/‎docs/pipes/llm/index.md‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎docs/pipes/llm/llm-markup-extraction.md‎
Lines changed: 8 additions & 0 deletions b/‎docs/pipes/llm/llm-markup-extraction.md‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎docs/references.bib‎
Lines changed: 15 additions & 0 deletions b/‎docs/references.bib‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎edsnlp/core/stream.py‎
Lines changed: 31 additions & 8 deletions b/‎edsnlp/core/stream.py‎
Lines changed: 31 additions & 8 deletions
diff --git a/‎edsnlp/pipes/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎edsnlp/pipes/__init__.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎edsnlp/pipes/llm/__init__.py‎ b/‎edsnlp/pipes/llm/__init__.py‎
diff --git a/‎edsnlp/pipes/llm/async_worker.py‎
Lines changed: 58 additions & 0 deletions b/‎edsnlp/pipes/llm/async_worker.py‎
Lines changed: 58 additions & 0 deletions
diff --git a/‎edsnlp/pipes/llm/llm_markup_extractor/__init__.py‎
Lines changed: 1 addition & 0 deletions b/‎edsnlp/pipes/llm/llm_markup_extractor/__init__.py‎
Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@
 - New `DocToMarkupConverter` to convert documents to markdown and improved `MarkupToDocConverter` to allow overlapping markup annotations (e.g., `This is a <a>text <b>with</a> overlapping</b> tags`).
 - New helper `edsnlp.utils.fuzzy_alignment.align` to map the entities of an annotated document to another document with similar but not identical text (e.g., after some text normalization or minor edits).
 - We now support `span_getter="sents"` to apply various pipes on sentences instead of entities or spans.
+- New generic LLM extractor pipe `eds.llm_markup_extractor`, which can be used to extract entities using a large language model served through an *OpenAPI-style* API.
 
 ## v0.18.0 (2025-09-02)
 
 
@@ -36,6 +36,12 @@ EDS-NLP provides easy-to-use pipeline components (aka pipes).
 
     --8<-- "docs/pipes/trainable/index.md:components"
 
+=== "LLM-based"
+
+    See the [LLM-based components overview](/pipes/llm/) for more information.
+
+    --8<-- "docs/pipes/llm/index.md:components"
+
 <!-- --8<-- [end:components] -->
 
 You can add them to your pipeline by simply calling `add_pipe`, for instance:
 
@@ -0,0 +1,15 @@
+# LLM-based components {: #edsnlp.pipes.llm }
+
+This section regroups components that rely on large language models (LLMs) to extract information from documents.
+
+For instance, the `eds.llm_markup_extractor` component extracts structured information using LLMs through markup.
+
+## Available components
+
+<!-- --8<-- [start:components] -->
+
+| Component                   | Description                                               |
+|-----------------------------|-----------------------------------------------------------|
+| `eds.llm_markup_extractor` | Extract structured information using LLMs through markup. |
+
+<!-- --8<-- [end:components] -->
@@ -0,0 +1,8 @@
+# LLM Markup Extraction {: #edsnlp.pipes.llm.llm_markup_extractor.factory.create_component }
+
+::: edsnlp.pipes.llm.llm_markup_extractor.factory.create_component
+    options:
+        heading_level: 2
+        show_bases: false
+        show_source: false
+        only_class_level: true
@@ -184,3 +184,18 @@ @inproceedings{grobol:hal-03223424
     hal_id = {hal-03223424},
     hal_version = {v1},
 }
+
+@inproceedings{naguib_2024,
+    title = "Few-shot clinical entity recognition in {E}nglish, {F}rench and {S}panish: masked language models outperform generative model prompting",
+    author = "Naguib, Marco and Tannier, Xavier and Névéol, Aurélie",
+    editor = "Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung",
+    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
+    month = nov,
+    year = "2024",
+    address = "Miami, Florida, USA",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2024.findings-emnlp.400/",
+    doi = "10.18653/v1/2024.findings-emnlp.400",
+    pages = "6829--6852",
+    abstract = "Large language models (LLMs) have become the preferred solution for many natural language processing tasks. In low-resource environments such as specialized domains, their few-shot capabilities are expected to deliver high performance. Named Entity Recognition (NER) is a critical task in information extraction that is not covered in recent LLM benchmarks. There is a need for better understanding the performance of LLMs for NER in a variety of settings including languages other than English. This study aims to evaluate generative LLMs, employed through prompt engineering, for few-shot clinical NER. We compare 13 auto-regressive models using prompting and 16 masked models using fine-tuning on 14 NER datasets covering English, French and Spanish. While prompt-based auto-regressive models achieve competitive F1 for general NER, they are outperformed within the clinical domain by lighter biLSTM-CRF taggers based on masked models. Additionally, masked models exhibit lower environmental impact compared to auto-regressive models. Findings are consistent across the three languages studied, which suggests that LLM prompting is not yet suited for NER production in the clinical domain."
+}
@@ -137,11 +137,24 @@ class MapOp(Op):
     def __init__(self, pipe, kwargs, context=None):
         self.pipe = pipe
         self.kwargs = kwargs
-        self.is_generator = deep_isgeneratorfunction(pipe)
+        self.has_pipe_method = hasattr(pipe, "pipe") and callable(pipe.pipe)
+        self.is_generator = self.has_pipe_method or deep_isgeneratorfunction(pipe)
         self.elementwise = not self.is_generator
         self.context = context or {}
 
     def __call__(self, items):
+        if self.has_pipe_method:
+            CONTEXT[0], old = self.context, CONTEXT[0]
+            res = self.pipe.pipe(
+                (x for x in items if not isinstance(x, StreamSentinel)),
+                **self.kwargs,
+            )
+            CONTEXT[0] = old
+            if self.is_generator:
+                yield from res
+            else:
+                yield res
+            return
         for item in items:
             if isinstance(item, StreamSentinel):
                 yield item
@@ -714,17 +727,27 @@ def map_pipeline(
             if isinstance(op, (MapOp, MapBatchesOp)):
                 op.context["tokenizer"] = tokenizer
             new_ops.append(op)
+        has_batches = batch_by is not None or any(
+            hasattr(p, "batch_process") for n, p in model.pipeline
+        )
         new_ops.append(MapOp(model._ensure_doc, {}))
-        batch_size, batch_by = self.validate_batching(batch_size, batch_by)
-        batch_by = batchify_fns.get(batch_by, batch_by)
-        new_ops.append(BatchifyOp(batch_size, batch_by))
+        if has_batches:
+            batch_size, batch_by = self.validate_batching(batch_size, batch_by)
+            batch_by = batchify_fns.get(batch_by, batch_by)
+            new_ops.append(BatchifyOp(batch_size, batch_by))
+
         for name, pipe in model.pipeline:
             if name not in model._disabled:
-                op = MapBatchesOp(
-                    pipe, {}, elementwise=not deep_isgeneratorfunction(pipe)
+                op = (
+                    MapBatchesOp(
+                        pipe, {}, elementwise=not deep_isgeneratorfunction(pipe)
+                    )
+                    if has_batches
+                    else MapOp(pipe, {})
                 )
                 new_ops.append(op)
-        new_ops.append(UnbatchifyOp())
+        if has_batches:
+            new_ops.append(UnbatchifyOp())
         config = (
             {**self.config, "batch_size": model.batch_size}
             if self.batch_size is None
@@ -999,7 +1022,7 @@ def validate_ops(self, ops, update: bool = False):
         ):
             requires_sentinels.add(self.writer.batch_fn.requires_sentinel)
 
-        for op in reversed(ops):
+        for op in list(reversed(ops)):
             if isinstance(op, BatchifyOp):
                 if op.batch_fn is None and op.size is None:
                     batch_size = self_batch_size
 
@@ -84,3 +84,4 @@
     from .trainable.embeddings.text_cnn.factory import create_component as text_cnn
     from .misc.split import Split as split
     from .misc.explode import Explode as explode
+    from .llm.llm_markup_extractor import LlmMarkupExtractor as llm_markup_extractor
@@ -0,0 +1,58 @@
+import asyncio
+import threading
+from typing import Any, Coroutine, Dict, Iterable, Optional, Tuple
+
+
+class AsyncRequestWorker:
+    """Run coroutines on a background event loop and collect their results.
+
+    The worker owns a dedicated asyncio event loop that runs forever in a
+    daemon thread. Synchronous callers hand coroutines to `submit`, which
+    returns an integer task id, then use `wait_for_any` to block until one
+    of their tasks has finished and `pop_result` to retrieve its outcome.
+
+    Outcomes are stored as `(result, exception)` tuples: exactly one of the
+    two is meaningful, and exceptions are returned to the caller rather than
+    raised in the loop thread.
+    """
+
+    # Shared worker lazily created by `instance()`.
+    _instance = None
+    # Guards the lazy creation above so that concurrent first calls to
+    # `instance()` cannot start two event loops / threads.
+    _instance_lock = threading.Lock()
+
+    def __init__(self):
+        # Build all the shared state *before* starting the loop thread, so
+        # that nothing running on the loop can observe a half-built worker.
+        self._lock = threading.Lock()
+        # The condition shares `_lock`: holding either one protects
+        # `_next_id` and `_results`.
+        self._cv = threading.Condition(self._lock)
+        self._next_id = 0
+        # task id -> (result, exception) for finished, not yet popped tasks.
+        self._results: Dict[int, Tuple[Any, Optional[BaseException]]] = {}
+        self.loop = asyncio.new_event_loop()
+        self.thread = threading.Thread(target=self._run, daemon=True)
+        self.thread.start()
+
+    def _run(self):
+        """Thread target: install the worker's loop and run it forever."""
+        asyncio.set_event_loop(self.loop)
+        self.loop.run_forever()
+
+    @classmethod
+    def instance(cls) -> "AsyncRequestWorker":
+        """Return the shared worker, creating it on first use."""
+        with cls._instance_lock:
+            if cls._instance is None:
+                cls._instance = AsyncRequestWorker()
+            return cls._instance
+
+    def submit(self, coro: Coroutine[Any, Any, Any]) -> int:
+        """Schedule `coro` on the background loop.
+
+        Parameters
+        ----------
+        coro : Coroutine[Any, Any, Any]
+            The coroutine to run.
+
+        Returns
+        -------
+        int
+            The task id to pass to `wait_for_any` and `pop_result`.
+        """
+        with self._lock:
+            task_id = self._next_id
+            self._next_id += 1
+
+        async def _wrap():
+            # Capture any failure (BaseException included) so that it is
+            # handed back to the waiting caller instead of being lost in
+            # the loop thread.
+            try:
+                res = await coro
+                exc = None
+            except BaseException as e:  # noqa: BLE001
+                res = None
+                exc = e
+            with self._cv:
+                self._results[task_id] = (res, exc)
+                # Several callers may be waiting on different ids.
+                self._cv.notify_all()
+
+        asyncio.run_coroutine_threadsafe(_wrap(), self.loop)
+        return task_id
+
+    def pop_result(self, task_id: int) -> Optional[Tuple[Any, Optional[BaseException]]]:
+        """Remove and return the stored outcome of `task_id`.
+
+        Returns
+        -------
+        Optional[Tuple[Any, Optional[BaseException]]]
+            The `(result, exception)` tuple, or `None` if no outcome is
+            currently stored for this id (not finished yet, already popped,
+            or unknown id).
+        """
+        with self._cv:
+            return self._results.pop(task_id, None)
+
+    def wait_for_any(self, task_ids: Iterable[int]) -> int:
+        """Block until one of `task_ids` has a stored outcome.
+
+        The outcome is left in place: call `pop_result` to retrieve it.
+        Ids that were already popped or never submitted never become ready,
+        so waiting only on such ids blocks indefinitely.
+
+        Parameters
+        ----------
+        task_ids : Iterable[int]
+            Ids returned by `submit`. Must contain at least one id.
+
+        Returns
+        -------
+        int
+            One of the given ids whose outcome is available.
+
+        Raises
+        ------
+        ValueError
+            If `task_ids` is empty, since the call could never return.
+        """
+        task_ids = set(task_ids)
+        if not task_ids:
+            raise ValueError("wait_for_any() requires at least one task id")
+        with self._cv:
+            while True:
+                for tid in task_ids:
+                    if tid in self._results:
+                        return tid
+                self._cv.wait()
@@ -0,0 +1 @@
+from .llm_markup_extractor import LlmMarkupExtractor
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+from .llm_markup_extractor import LlmMarkupExtractor`