Skip to content

Commit 47ea851

Browse files
authored
hybrid search improvements (#2796)
1 parent ef7effd commit 47ea851

File tree

4 files changed

+42
-33
lines changed

4 files changed

+42
-33
lines changed

learning_resources_search/api.py

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
DEPARTMENT_QUERY_FIELDS,
2222
HYBRID_COMBINED_INDEX,
2323
HYBRID_SEARCH_MODE,
24+
HYBRID_SEARCH_PIPELINE_NAME,
2425
LEARNING_RESOURCE,
2526
LEARNING_RESOURCE_QUERY_FIELDS,
2627
LEARNING_RESOURCE_SEARCH_SORTBY_OPTIONS,
@@ -55,21 +56,6 @@
5556
]
5657

5758
HYBRID_SEARCH_KNN_K_VALUE = 5
58-
HYBRID_SEARCH_PAGINATION_DEPTH = 10
59-
HYBRID_SEARCH_POST_PROCESSOR = {
60-
"description": "Post processor for hybrid search",
61-
"phase_results_processors": [
62-
{
63-
"normalization-processor": {
64-
"normalization": {"technique": "min_max"},
65-
"combination": {
66-
"technique": "arithmetic_mean",
67-
"parameters": {"weights": [0.8, 0.2]},
68-
},
69-
}
70-
}
71-
],
72-
}
7359

7460

7561
def gen_content_file_id(content_file_id):
@@ -679,10 +665,11 @@ def add_text_query_to_search(
679665
}
680666
}
681667

668+
pagination_depth = search_params.get("limit") * 3
682669
search = search.extra(
683670
query={
684671
"hybrid": {
685-
"pagination_depth": HYBRID_SEARCH_PAGINATION_DEPTH,
672+
"pagination_depth": pagination_depth,
686673
"queries": [text_query, vector_query],
687674
}
688675
}
@@ -800,7 +787,7 @@ def execute_learn_search(search_params):
800787
search = construct_search(search_params)
801788

802789
if search_params.get("search_mode") == HYBRID_SEARCH_MODE:
803-
search = search.extra(search_pipeline=HYBRID_SEARCH_POST_PROCESSOR)
790+
search = search.extra(search_pipeline=HYBRID_SEARCH_PIPELINE_NAME)
804791

805792
results = search.execute().to_dict()
806793
if results.get("_shards", {}).get("failures"):

learning_resources_search/api_test.py

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2470,7 +2470,7 @@ def test_execute_learn_search_with_hybrid_search(mocker, settings, opensearch):
24702470
"size": 1,
24712471
"query": {
24722472
"hybrid": {
2473-
"pagination_depth": 10,
2473+
"pagination_depth": 3,
24742474
"queries": [
24752475
{
24762476
"bool": {
@@ -2776,20 +2776,7 @@ def test_execute_learn_search_with_hybrid_search(mocker, settings, opensearch):
27762776
},
27772777
}
27782778
},
2779-
"search_pipeline": {
2780-
"description": "Post processor for hybrid search",
2781-
"phase_results_processors": [
2782-
{
2783-
"normalization-processor": {
2784-
"normalization": {"technique": "min_max"},
2785-
"combination": {
2786-
"technique": "arithmetic_mean",
2787-
"parameters": {"weights": [0.8, 0.2]},
2788-
},
2789-
}
2790-
}
2791-
],
2792-
},
2779+
"search_pipeline": "hybrid_search_pipeline",
27932780
"_source": {
27942781
"excludes": [
27952782
"created_on",

learning_resources_search/constants.py

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,22 @@
2525
HYBRID_COMBINED_INDEX = "combined_hybrid"
2626
LEARNING_RESOURCE = "learning_resource"
2727
HYBRID_SEARCH_MODE = "hybrid"
28+
HYBRID_SEARCH_PIPELINE_NAME = "hybrid_search_pipeline"
29+
30+
HYBRID_SEARCH_PIPELINE_BODY = {
31+
"description": "Post processor for hybrid search",
32+
"phase_results_processors": [
33+
{
34+
"normalization-processor": {
35+
"normalization": {"technique": "min_max"},
36+
"combination": {
37+
"technique": "arithmetic_mean",
38+
"parameters": {"weights": [0.8, 0.2]},
39+
},
40+
}
41+
}
42+
],
43+
}
2844

2945

3046
class IndexestoUpdate(Enum):
@@ -322,7 +338,13 @@ class FilterConfig:
322338
"max_weekly_hours": {"type": "integer"},
323339
}
324340

325-
EMBEDDING_FIELDS = {"vector_embedding": {"type": "knn_vector"}}
341+
EMBEDDING_FIELDS = {
342+
"vector_embedding": {
343+
"type": "knn_vector",
344+
"space_type": "cosinesimil",
345+
"method": {"name": "hnsw", "space_type": "cosinesimil", "engine": "faiss"},
346+
}
347+
}
326348

327349

328350
CONTENT_FILE_MAP = {

learning_resources_search/indexing_api.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
COURSE_TYPE,
2727
EMBEDDING_FIELDS,
2828
HYBRID_COMBINED_INDEX,
29+
HYBRID_SEARCH_PIPELINE_BODY,
30+
HYBRID_SEARCH_PIPELINE_NAME,
2931
LEARNING_RESOURCE_MAP,
3032
MAPPING,
3133
PERCOLATE_INDEX_TYPE,
@@ -196,6 +198,17 @@ def clear_and_create_index(*, index_name=None, skip_mapping=False, object_type=N
196198
"properties": (LEARNING_RESOURCE_MAP | vector_map)
197199
}
198200
index_create_data["settings"]["index.knn"] = True
201+
202+
try:
203+
conn.transport.perform_request(
204+
"GET", f"/_search/pipeline/{HYBRID_SEARCH_PIPELINE_NAME}"
205+
)
206+
except NotFoundError:
207+
conn.transport.perform_request(
208+
"PUT",
209+
f"/_search/pipeline/{HYBRID_SEARCH_PIPELINE_NAME}",
210+
body=HYBRID_SEARCH_PIPELINE_BODY,
211+
)
199212
elif not skip_mapping:
200213
index_create_data["mappings"] = {"properties": MAPPING[object_type]}
201214

0 commit comments

Comments
 (0)