From c71b7fec3973ac1c2ce45ee5e5d59fba3e672119 Mon Sep 17 00:00:00 2001 From: nnethercott Date: Fri, 2 May 2025 14:22:42 +0200 Subject: [PATCH 1/8] feat: add pooling to HuggingFaceEmbedder, add CompositeEmbedder --- meilisearch/index.py | 7 +++++++ meilisearch/models/index.py | 32 +++++++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/meilisearch/index.py b/meilisearch/index.py index 9f6e176d..c5f224ec 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ -25,6 +25,7 @@ from meilisearch.errors import version_error_hint_message from meilisearch.models.document import Document, DocumentsResults from meilisearch.models.index import ( + CompositeEmbedder, Embedders, Faceting, HuggingFaceEmbedder, @@ -961,6 +962,7 @@ def get_settings(self) -> Dict[str, Any]: | HuggingFaceEmbedder | OllamaEmbedder | RestEmbedder + | CompositeEmbedder | UserProvidedEmbedder, ] = {} for k, v in settings["embedders"].items(): @@ -972,6 +974,8 @@ def get_settings(self) -> Dict[str, Any]: embedders[k] = HuggingFaceEmbedder(**v) elif v.get("source") == "rest": embedders[k] = RestEmbedder(**v) + elif v.get("source") == "composite": + embedders[k] = CompositeEmbedder(**v) else: embedders[k] = UserProvidedEmbedder(**v) @@ -1900,6 +1904,7 @@ def get_embedders(self) -> Embedders | None: | HuggingFaceEmbedder | OllamaEmbedder | RestEmbedder + | CompositeEmbedder | UserProvidedEmbedder, ] = {} for k, v in response.items(): @@ -1911,6 +1916,8 @@ def get_embedders(self) -> Embedders | None: embedders[k] = HuggingFaceEmbedder(**v) elif v.get("source") == "rest": embedders[k] = RestEmbedder(**v) + elif v.get("source") == "composite": + embedders[k] = CompositeEmbedder(**v) else: embedders[k] = UserProvidedEmbedder(**v) diff --git a/meilisearch/models/index.py b/meilisearch/models/index.py index 5457be56..f9959770 100644 --- a/meilisearch/models/index.py +++ b/meilisearch/models/index.py @@ -64,6 +64,12 @@ class LocalizedAttributes(CamelBase): locales: List[str] +class PoolingOpt(str, Enum): + USE_MODEL = "useModel" + FORCE_MEAN = "forceMean" + FORCE_CLS = "forceCls" + + class OpenAiEmbedder(CamelBase): source: str = "openAi" url: Optional[str] = None @@ -84,6 +90,7 @@ class HuggingFaceEmbedder(CamelBase): document_template_max_bytes: Optional[int] = None # Default to 400 distribution: Optional[EmbedderDistribution] = None binary_quantized: Optional[bool] = None + pooling: Optional[PoolingOpt] = None class OllamaEmbedder(CamelBase): @@ -117,10 +124,33 @@ class UserProvidedEmbedder(CamelBase): binary_quantized: Optional[bool] = None +class CompositeEmbedder(CamelBase): + source: str = "composite" + search_embedder: Union[ + OpenAiEmbedder, + HuggingFaceEmbedder, + OllamaEmbedder, + RestEmbedder, + UserProvidedEmbedder + ] + indexing_embedder: Union[ + OpenAiEmbedder, + HuggingFaceEmbedder, + OllamaEmbedder, + RestEmbedder, + UserProvidedEmbedder + ] + + class Embedders(CamelBase): embedders: Dict[ str, Union[ - OpenAiEmbedder, HuggingFaceEmbedder, OllamaEmbedder, RestEmbedder, UserProvidedEmbedder + OpenAiEmbedder, + HuggingFaceEmbedder, + OllamaEmbedder, + RestEmbedder, + UserProvidedEmbedder, + CompositeEmbedder, ], ] From 567f800f6d047325268e86b7930c350720fa0683 Mon Sep 17 00:00:00 2001 From: nnethercott Date: Fri, 2 May 2025 16:39:13 +0200 Subject: [PATCH 2/8] tests: add fixture for composite embeds, test --- tests/conftest.py | 17 +++++++++++++ tests/settings/test_settings_embedders.py | 30 ++++++++++++++++++++++- 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 7a814ff7..726f704b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -274,3 +274,20 @@ def new_embedders(): "default": UserProvidedEmbedder(dimensions=1).model_dump(by_alias=True), "open_ai": OpenAiEmbedder().model_dump(by_alias=True), } + + +@fixture +def enable_composite_embedders(): + requests.patch( + f"{common.BASE_URL}/experimental-features", + headers={"Authorization": f"Bearer {common.MASTER_KEY}"}, + json={"compositeEmbedders": True}, + timeout=10, + ) + yield + requests.patch( + f"{common.BASE_URL}/experimental-features", + headers={"Authorization": f"Bearer {common.MASTER_KEY}"}, + json={"compositeEmbedders": False}, + timeout=10, + ) diff --git a/tests/settings/test_settings_embedders.py b/tests/settings/test_settings_embedders.py index f932bbae..17e40f12 100644 --- a/tests/settings/test_settings_embedders.py +++ b/tests/settings/test_settings_embedders.py @@ -1,6 +1,7 @@ # pylint: disable=redefined-outer-name -from meilisearch.models.index import OpenAiEmbedder, UserProvidedEmbedder +from meilisearch.models.index import CompositeEmbedder, HuggingFaceEmbedder, OpenAiEmbedder, UserProvidedEmbedder +import pytest def test_get_default_embedders(empty_index): @@ -42,3 +43,30 @@ def test_reset_embedders(new_embedders, empty_index): assert isinstance(response_get.embedders["open_ai"], OpenAiEmbedder) response_last = index.get_embedders() assert response_last is None + + +@pytest.mark.usefixtures("enable_composite_embedders") +def test_composite_embedders_configurable(empty_index): + """Tests composite embedders.""" + index = empty_index() + + hf_model = HuggingFaceEmbedder().model_dump(by_alias=True, exclude_none=True) + + # create composite embedder + composite_spec = { + "default": { + "source": "composite", + "searchEmbedder": hf_model, + "indexingEmbedder": hf_model, + } + } + + response_update = index.update_embedders(composite_spec) + update = index.wait_for_task(response_update.task_uid) + response_get = index.get_embedders() + assert update.status == "succeeded" + + embedder = response_get.embedders["default"] + assert isinstance(embedder, CompositeEmbedder) + assert isinstance(embedder.search_embedder, HuggingFaceEmbedder) + assert isinstance(embedder.indexing_embedder, HuggingFaceEmbedder) From f75e2dc1010cae32bb0be4f47bfb7d0401ec8c41 Mon Sep 17 00:00:00 2001 From: nnethercott Date: Fri, 2 May 2025 16:46:47 +0200 Subject: [PATCH 3/8] style: black, isort --- meilisearch/models/index.py | 12 ++---------- tests/settings/test_settings_embedders.py | 8 +++++++- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/meilisearch/models/index.py b/meilisearch/models/index.py index f9959770..9b0e6eca 100644 --- a/meilisearch/models/index.py +++ b/meilisearch/models/index.py @@ -127,18 +127,10 @@ class UserProvidedEmbedder(CamelBase): class CompositeEmbedder(CamelBase): source: str = "composite" search_embedder: Union[ - OpenAiEmbedder, - HuggingFaceEmbedder, - OllamaEmbedder, - RestEmbedder, - UserProvidedEmbedder + OpenAiEmbedder, HuggingFaceEmbedder, OllamaEmbedder, RestEmbedder, UserProvidedEmbedder ] indexing_embedder: Union[ - OpenAiEmbedder, - HuggingFaceEmbedder, - OllamaEmbedder, - RestEmbedder, - UserProvidedEmbedder + OpenAiEmbedder, HuggingFaceEmbedder, OllamaEmbedder, RestEmbedder, UserProvidedEmbedder ] diff --git a/tests/settings/test_settings_embedders.py b/tests/settings/test_settings_embedders.py index 17e40f12..b0830366 100644 --- a/tests/settings/test_settings_embedders.py +++ b/tests/settings/test_settings_embedders.py @@ -1,8 +1,14 @@ # pylint: disable=redefined-outer-name -from meilisearch.models.index import CompositeEmbedder, HuggingFaceEmbedder, OpenAiEmbedder, UserProvidedEmbedder import pytest +from meilisearch.models.index import ( + CompositeEmbedder, + HuggingFaceEmbedder, + OpenAiEmbedder, + UserProvidedEmbedder, +) + def test_get_default_embedders(empty_index): """Tests getting default embedders.""" From 9903f95b6e8951a258636155cee20e03de0ea3a4 Mon Sep 17 00:00:00 2001 From: nnethercott Date: Tue, 20 May 2025 12:03:32 +0200 Subject: [PATCH 4/8] docs --- meilisearch/models/embedders.py | 36 ++++++++++++++------ tests/settings/test_settings_embedders.py | 40 ++++++++++++++++++++++- 2 files changed, 65 insertions(+), 11 deletions(-) diff --git a/meilisearch/models/embedders.py b/meilisearch/models/embedders.py index 4062e902..715226da 100644 --- a/meilisearch/models/embedders.py +++ b/meilisearch/models/embedders.py @@ -21,15 +21,17 @@ class Distribution(CamelBase): sigma: float -class PoolingOpt(str, Enum): - """Pooling options for HuggingFaceEmbedder. +class PoolingType(str, Enum): + """Pooling strategies for HuggingFaceEmbedder. - Parameters + Attributes ---------- - mean: float - Mean value between 0 and 1 - sigma: float - Sigma value between 0 and 1 + USE_MODEL : str + Use the model's default pooling strategy. + FORCE_MEAN : str + Force mean pooling over the token embeddings. + FORCE_CLS : str + Use the [CLS] token embedding as the sentence representation. """ USE_MODEL = "useModel" @@ -96,6 +98,8 @@ class HuggingFaceEmbedder(CamelBase): Describes the natural distribution of search results binary_quantized: Optional[bool] Once set to true, irreversibly converts all vector dimensions to 1-bit values + pooling: Optional[PoolingType] + Configures how individual tokens are merged into a single embedding """ source: str = "huggingFace" @@ -107,7 +111,7 @@ class HuggingFaceEmbedder(CamelBase): document_template_max_bytes: Optional[int] = None # Default to 400 distribution: Optional[Distribution] = None binary_quantized: Optional[bool] = None - pooling: Optional[PoolingOpt] = None + pooling: Optional[PoolingType] = None class OllamaEmbedder(CamelBase): @@ -210,15 +214,27 @@ class UserProvidedEmbedder(CamelBase): class CompositeEmbedder(CamelBase): + """Composite embedder configuration. + + Parameters + ---------- + source: str + The embedder source, must be "userProvided" + indexing_embedder: Union[OpenAiEmbedder, HuggingFaceEmbedder, OllamaEmbedder, RestEmbedder, UserProvidedEmbedder] + The embedder used for indexing queries remotely + search_embedder: Union[OpenAiEmbedder, HuggingFaceEmbedder, OllamaEmbedder, RestEmbedder, UserProvidedEmbedder] + The embedder used for processing queries locally + """ + source: str = "composite" - search_embedder: Union[ + indexing_embedder: Union[ OpenAiEmbedder, HuggingFaceEmbedder, OllamaEmbedder, RestEmbedder, UserProvidedEmbedder, ] - indexing_embedder: Union[ + search_embedder: Union[ OpenAiEmbedder, HuggingFaceEmbedder, OllamaEmbedder, diff --git a/tests/settings/test_settings_embedders.py b/tests/settings/test_settings_embedders.py index 5baf2e09..f42f87c7 100644 --- a/tests/settings/test_settings_embedders.py +++ b/tests/settings/test_settings_embedders.py @@ -1,6 +1,12 @@ # pylint: disable=redefined-outer-name -from meilisearch.models.embedders import OpenAiEmbedder, UserProvidedEmbedder +import pytest +from meilisearch.models.embedders import ( + HuggingFaceEmbedder, + OpenAiEmbedder, + UserProvidedEmbedder, + CompositeEmbedder, +) def test_get_default_embedders(empty_index): @@ -183,3 +189,35 @@ def test_user_provided_embedder_format(empty_index): assert embedders.embedders["user_provided"].distribution.mean == 0.5 assert embedders.embedders["user_provided"].distribution.sigma == 0.1 assert embedders.embedders["user_provided"].binary_quantized is False + + +@pytest.mark.usefixtures("enable_composite_embedders") +def test_composite_embedder_format(empty_index): + """Tests that CompositeEmbedder embedder has the required fields and proper format.""" + index = empty_index() + + hf_default = HuggingFaceEmbedder().model_dump(by_alias=True, exclude_none=True) + + # create composite embedder + composite_embedder = { + "default": { + "source": "composite", + "searchEmbedder": hf_default, + "indexingEmbedder": hf_default, + } + } + + response = index.update_embedders(composite_embedder) + index.wait_for_task(response.task_uid) + embedders = index.get_embedders() + print(embedders) + assert embedders.embedders["composite"].source == "composite" + + assert isinstance(embedders.embedders["composite"], CompositeEmbedder) + assert isinstance(embedders.embedders["composite"].search_embedder, HuggingFaceEmbedder) + assert isinstance(embedders.embedders["composite"].indexing_embedder, HuggingFaceEmbedder) + + assert not hasattr(embedders.embedders["composite"].search_embedder, "document_template") + assert not hasattr(embedders.embedders["composite"].search_embedder, "document_template_max_bytes") + assert hasattr(embedders.embedders["composite"].indexing_embedder, "document_template") + From fcb0b1aecf7eaecd872731d32589c6ad1c223cd7 Mon Sep 17 00:00:00 2001 From: nnethercott Date: Tue, 20 May 2025 14:41:46 +0200 Subject: [PATCH 5/8] update tests --- meilisearch/index.py | 2 +- tests/settings/test_settings_embedders.py | 32 +++++++++++++++-------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/meilisearch/index.py b/meilisearch/index.py index c3e4955f..a3a79f6b 100644 --- a/meilisearch/index.py +++ b/meilisearch/index.py @@ -25,6 +25,7 @@ from meilisearch.errors import version_error_hint_message from meilisearch.models.document import Document, DocumentsResults from meilisearch.models.embedders import ( + CompositeEmbedder, Embedders, EmbedderType, HuggingFaceEmbedder, @@ -32,7 +33,6 @@ OpenAiEmbedder, RestEmbedder, UserProvidedEmbedder, - CompositeEmbedder, ) from meilisearch.models.index import ( Faceting, diff --git a/tests/settings/test_settings_embedders.py b/tests/settings/test_settings_embedders.py index f42f87c7..23c3e4b6 100644 --- a/tests/settings/test_settings_embedders.py +++ b/tests/settings/test_settings_embedders.py @@ -1,11 +1,12 @@ # pylint: disable=redefined-outer-name import pytest + from meilisearch.models.embedders import ( + CompositeEmbedder, HuggingFaceEmbedder, OpenAiEmbedder, UserProvidedEmbedder, - CompositeEmbedder, ) @@ -103,6 +104,7 @@ def test_huggingface_embedder_format(empty_index): assert embedders.embedders["huggingface"].distribution.mean == 0.5 assert embedders.embedders["huggingface"].distribution.sigma == 0.1 assert embedders.embedders["huggingface"].binary_quantized is False + assert embedders.embedders["huggingface"].pooling is False def test_ollama_embedder_format(empty_index): @@ -196,28 +198,36 @@ def test_composite_embedder_format(empty_index): """Tests that CompositeEmbedder embedder has the required fields and proper format.""" index = empty_index() - hf_default = HuggingFaceEmbedder().model_dump(by_alias=True, exclude_none=True) + embedder = HuggingFaceEmbedder().model_dump(by_alias=True, exclude_none=True) # create composite embedder composite_embedder = { - "default": { + "composite": { "source": "composite", - "searchEmbedder": hf_default, - "indexingEmbedder": hf_default, + "searchEmbedder": embedder, + "indexingEmbedder": embedder, } } response = index.update_embedders(composite_embedder) - index.wait_for_task(response.task_uid) + update = index.wait_for_task(response.task_uid) embedders = index.get_embedders() - print(embedders) + assert update.status == "succeeded" + assert embedders.embedders["composite"].source == "composite" + # ensure serialization roundtrips nicely assert isinstance(embedders.embedders["composite"], CompositeEmbedder) assert isinstance(embedders.embedders["composite"].search_embedder, HuggingFaceEmbedder) assert isinstance(embedders.embedders["composite"].indexing_embedder, HuggingFaceEmbedder) - assert not hasattr(embedders.embedders["composite"].search_embedder, "document_template") - assert not hasattr(embedders.embedders["composite"].search_embedder, "document_template_max_bytes") - assert hasattr(embedders.embedders["composite"].indexing_embedder, "document_template") - + # ensure search_embedder has no document_template + assert getattr(embedders.embedders["composite"].search_embedder, "document_template") is None + assert ( + getattr( + embedders.embedders["composite"].search_embedder, + "document_template_max_bytes", + ) + is None + ) + assert getattr(embedders.embedders["composite"].indexing_embedder, "document_template") From 8c8dbe519d11f9002f1917ad6b895390b908c985 Mon Sep 17 00:00:00 2001 From: Nate Nethercott <53127799+nnethercott@users.noreply.github.com> Date: Tue, 20 May 2025 14:56:29 +0200 Subject: [PATCH 6/8] Update meilisearch/models/embedders.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- meilisearch/models/embedders.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/meilisearch/models/embedders.py b/meilisearch/models/embedders.py index 715226da..9bd487f2 100644 --- a/meilisearch/models/embedders.py +++ b/meilisearch/models/embedders.py @@ -219,14 +219,7 @@ class CompositeEmbedder(CamelBase): Parameters ---------- source: str - The embedder source, must be "userProvided" - indexing_embedder: Union[OpenAiEmbedder, HuggingFaceEmbedder, OllamaEmbedder, RestEmbedder, UserProvidedEmbedder] - The embedder used for indexing queries remotely - search_embedder: Union[OpenAiEmbedder, HuggingFaceEmbedder, OllamaEmbedder, RestEmbedder, UserProvidedEmbedder] - The embedder used for processing queries locally - """ - - source: str = "composite" + The embedder source, must be "composite" indexing_embedder: Union[ OpenAiEmbedder, HuggingFaceEmbedder, @@ -242,7 +235,6 @@ class CompositeEmbedder(CamelBase): UserProvidedEmbedder, ] - # Type alias for the embedder union type EmbedderType = Union[ OpenAiEmbedder, From 89a90834c16b4e0131544fddcc13d2131f4958fc Mon Sep 17 00:00:00 2001 From: nnethercott Date: Mon, 26 May 2025 20:19:27 +0200 Subject: [PATCH 7/8] revert coderabbit --- meilisearch/models/embedders.py | 17 +++++++++++++++++ tests/settings/test_settings_embedders.py | 3 ++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/meilisearch/models/embedders.py b/meilisearch/models/embedders.py index 9bd487f2..ccbddc84 100644 --- a/meilisearch/models/embedders.py +++ b/meilisearch/models/embedders.py @@ -227,6 +227,15 @@ class CompositeEmbedder(CamelBase): RestEmbedder, UserProvidedEmbedder, ] + search_embedder: Union[ + OpenAiEmbedder, + HuggingFaceEmbedder, + OllamaEmbedder, + RestEmbedder, + UserProvidedEmbedder, + ]""" + + source: str = "composite" search_embedder: Union[ OpenAiEmbedder, HuggingFaceEmbedder, @@ -234,6 +243,14 @@ class CompositeEmbedder(CamelBase): RestEmbedder, UserProvidedEmbedder, ] + indexing_embedder: Union[ + OpenAiEmbedder, + HuggingFaceEmbedder, + OllamaEmbedder, + RestEmbedder, + UserProvidedEmbedder, + ] + # Type alias for the embedder union type EmbedderType = Union[ diff --git a/tests/settings/test_settings_embedders.py b/tests/settings/test_settings_embedders.py index 23c3e4b6..333678bc 100644 --- a/tests/settings/test_settings_embedders.py +++ b/tests/settings/test_settings_embedders.py @@ -6,6 +6,7 @@ CompositeEmbedder, HuggingFaceEmbedder, OpenAiEmbedder, + PoolingType, UserProvidedEmbedder, ) @@ -104,7 +105,7 @@ def test_huggingface_embedder_format(empty_index): assert embedders.embedders["huggingface"].distribution.mean == 0.5 assert embedders.embedders["huggingface"].distribution.sigma == 0.1 assert embedders.embedders["huggingface"].binary_quantized is False - assert embedders.embedders["huggingface"].pooling is False + assert embedders.embedders["huggingface"].pooling is PoolingType.USE_MODEL def test_ollama_embedder_format(empty_index): From fd0d2856037f26ad41a046b00653c57eaa462846 Mon Sep 17 00:00:00 2001 From: nnethercott Date: Mon, 26 May 2025 20:44:25 +0200 Subject: [PATCH 8/8] make default pooling match docs --- meilisearch/models/embedders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/meilisearch/models/embedders.py b/meilisearch/models/embedders.py index ccbddc84..9dcd5d00 100644 --- a/meilisearch/models/embedders.py +++ b/meilisearch/models/embedders.py @@ -111,7 +111,7 @@ class HuggingFaceEmbedder(CamelBase): document_template_max_bytes: Optional[int] = None # Default to 400 distribution: Optional[Distribution] = None binary_quantized: Optional[bool] = None - pooling: Optional[PoolingType] = None + pooling: Optional[PoolingType] = PoolingType.USE_MODEL class OllamaEmbedder(CamelBase):