Skip to content

Commit b52fb90

Browse files
authored
Generate summaries for new video transcripts (#2428)
* adding check for related contents having summaries * adding custom model (ollama) capability * update docstring * adding test * adding required param * adding default settings to contentfile tests
1 parent 6d72b9d commit b52fb90

File tree

5 files changed

+59
-1
lines changed

5 files changed

+59
-1
lines changed

learning_resources/content_summarizer.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,11 +215,15 @@ def _get_llm(self, model=None, temperature=0.0, max_tokens=1000) -> ChatLiteLLM:
215215
if not settings.LITELLM_CUSTOM_PROVIDER:
216216
raise ValueError("The 'LITELLM_CUSTOM_PROVIDER' setting must be set.") # noqa: EM101, TRY003
217217

218+
if not settings.LITELLM_API_BASE:
219+
raise ValueError("The 'LITELLM_API_BASE' setting must be set.") # noqa: EM101, TRY003
220+
218221
return ChatLiteLLM(
219222
model=model,
220223
temperature=temperature,
221224
max_tokens=max_tokens,
222225
custom_llm_provider=settings.LITELLM_CUSTOM_PROVIDER,
226+
api_base=settings.LITELLM_API_BASE,
223227
)
224228

225229
def _generate_summary(self, content: str, llm_model: str) -> str:

learning_resources/content_summarizer_test.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,11 @@
2121
pytestmark = pytest.mark.django_db
2222

2323

24+
@pytest.fixture(autouse=True)
def setup_settings(settings):
    """Give every test in this module a LiteLLM API base URL setting."""
    # Placeholder URL only; it is assigned on the injected ``settings``
    # fixture rather than on a module-level settings object.
    settings.LITELLM_API_BASE = "https://test/api/"
27+
28+
2429
@pytest.fixture
2530
def mock_summarize_single_content_file(mocker):
2631
"""Fixture for mocking the process single file method"""

vector_search/conftest.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ def _use_dummy_encoder(settings):
3131
def _use_test_qdrant_settings(settings, mocker):
3232
settings.QDRANT_HOST = "https://test"
3333
settings.QDRANT_BASE_COLLECTION_NAME = "test"
34+
settings.LITELLM_API_BASE = "https://test/api/"
3435
settings.CONTENT_FILE_EMBEDDING_CHUNK_OVERLAP = 0
3536
settings.CONTENT_FILE_EMBEDDING_SEMANTIC_CHUNKING_ENABLED = False
3637
mock_qdrant = mocker.patch("qdrant_client.QdrantClient")

vector_search/utils.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -524,11 +524,15 @@ def embed_learning_resources(ids, resource_type, overwrite):
524524
else:
525525
serialized_resources = list(serialize_bulk_content_files(ids))
526526
# TODO: Pass actual Ids when we want scheduled content file summarization # noqa: FIX002, TD002, TD003 E501
527-
# Currently we only want to summarize content that already has a summary
527+
# Currently we only want to summarize content that either already has a summary
528+
# OR belongs to a run where at least one other content file has a summary
528529
existing_summary_content_ids = [
529530
resource["id"]
530531
for resource in serialized_resources
531532
if resource.get("summary")
533+
or ContentFile.objects.filter(run__id=resource.get("run_id"))
534+
.exclude(summary="")
535+
.exists()
532536
]
533537
ContentSummarizer().summarize_content_files_by_ids(
534538
existing_summary_content_ids, overwrite

vector_search/utils_test.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313
)
1414
from learning_resources.models import LearningResource
1515
from learning_resources.serializers import LearningResourceMetadataDisplaySerializer
16+
from learning_resources_search.constants import (
17+
CONTENT_FILE_TYPE,
18+
)
1619
from learning_resources_search.serializers import (
1720
serialize_bulk_content_files,
1821
serialize_bulk_learning_resources,
@@ -806,3 +809,44 @@ def test_update_content_file_payload_only_includes_existing_keys(
806809
)
807810
else:
808811
mock_retrieve.assert_not_called()
812+
813+
814+
@pytest.mark.django_db
def test_embed_learning_resources_contentfile_summarization_filter(mocker, settings):
    """
    Test that content files without a summary are still passed to the
    summarizer when another content file in the same run has a summary.
    """
    # Request the pytest-django ``settings`` fixture so these overrides are
    # rolled back after the test instead of leaking into other tests.
    settings.OPENAI_API_KEY = "test"
    settings.QDRANT_ENABLE_INDEXING_PLUGIN_HOOKS = True

    # Patch the summarizer entry point so the IDs it receives can be inspected.
    mock_content_summarizer = mocker.patch(
        "learning_resources.content_summarizer.ContentSummarizer.summarize_content_files_by_ids"
    )

    # Also stub the LLM client class; presumably a safety net so that no real
    # LLM request can be made even if the patched summarizer is bypassed.
    mock_chat_llm = mocker.patch(
        "learning_resources.content_summarizer.ChatLiteLLM", autospec=True
    )
    mock_instance = mock_chat_llm.return_value
    mock_summary_response = mocker.MagicMock()
    mock_summary_response.content = "mocked summary"
    mock_instance.invoke.return_value = mock_summary_response
    mock_instance.with_structured_output.return_value.invoke.return_value = {
        "flashcards": [
            {
                "question": "Generated Question",
                "answer": "Generated Answer",
            }
        ]
    }

    # One run holding two already-summarized files and two new files whose
    # summary is empty; only the new files are submitted for embedding.
    run = LearningResourceRunFactory.create(published=True)
    ContentFileFactory.create_batch(
        2, content="test content", summary="summary text", run=run
    )
    new_content_files = ContentFileFactory.create_batch(
        2, content="new content", summary="", run=run
    )
    cf_ids = [cf.id for cf in new_content_files]
    embed_learning_resources(cf_ids, resource_type=CONTENT_FILE_TYPE, overwrite=False)

    # Assert that the summarizer was called with the correct content file IDs
    assert sorted(mock_content_summarizer.mock_calls[0].args[0]) == sorted(cf_ids)

0 commit comments

Comments
 (0)