From 7e868c5bc852bab6b4f7a4f42c7f87b4401fcefb Mon Sep 17 00:00:00 2001 From: Darren Edge Date: Fri, 8 Nov 2024 11:53:59 +0000 Subject: [PATCH] Fix QTD bugs --- intelligence_toolkit/query_text_data/answer_builder.py | 10 +++++++--- .../query_text_data/relevance_assessor.py | 3 ++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/intelligence_toolkit/query_text_data/answer_builder.py b/intelligence_toolkit/query_text_data/answer_builder.py index cd31d273..d264f3be 100644 --- a/intelligence_toolkit/query_text_data/answer_builder.py +++ b/intelligence_toolkit/query_text_data/answer_builder.py @@ -337,7 +337,7 @@ def build_report_markdown( matched_chunks = { f"{text['title']} ({text['chunk_id']})": text for text in text_jsons } - home_link = "#" + content_structure["report_title"].replace(" ", "-").lower() + home_link = "#report" report = f'# Report\n\n## Query\n\n*{query}*\n\n## Expanded Query\n\n*{expanded_query}*\n\n## Answer\n\n{content_structure["answer"]}\n\n## Analysis\n\n### {content_structure["report_title"]}\n\n{content_structure["report_summary"]}\n\n' for theme in content_structure["theme_order"]: report += f'#### Theme: {theme["theme_title"]}\n\n{theme["theme_summary"]}\n\n' @@ -378,8 +378,12 @@ def cluster_cids(relevant_cids, cid_to_vector, target_clusters): clustered_cids = {} if len(relevant_cids) > 0: # use k-means clustering to group relevant cids into target_clusters clusters - cids = [cid for cid in relevant_cids] - vectors = [cid_to_vector[cid] for cid in cids] + cids = [] + vectors = [] + for relevant_cid in relevant_cids: + if relevant_cid in cid_to_vector: + cids.append(relevant_cid) + vectors.append(cid_to_vector[relevant_cid]) kmeans = cluster.KMeans(n_clusters=target_clusters) kmeans.fit(vectors) cluster_assignments = kmeans.predict(vectors) diff --git a/intelligence_toolkit/query_text_data/relevance_assessor.py b/intelligence_toolkit/query_text_data/relevance_assessor.py index 0ba77abd..0e51e82b 100644 --- a/intelligence_toolkit/query_text_data/relevance_assessor.py +++ b/intelligence_toolkit/query_text_data/relevance_assessor.py @@ -174,8 +174,9 @@ async def detect_relevant_chunks( community_mean_rank = [] for community, cids in level_to_community_to_candidate_cids[level].items(): + filtered_cids = [c for c in cids if c in semantic_search_cids] mean_rank = np.mean( - sorted([semantic_search_cids.index(c) for c in cids])[ + sorted([semantic_search_cids.index(c) for c in filtered_cids])[ : chunk_search_config.community_ranking_chunks ] )