Generate summaries for new video transcripts (#2428)

shanbady · web-flow · commit b52fb906b2eb · 2025-08-14T14:08:44.000-04:00
* adding check for related contents having summaries

* adding custom model (ollama) capability

* update docstring

* adding test

* adding required param

* adding default settings to contentfile tests
diff --git a/learning_resources/content_summarizer.py b/learning_resources/content_summarizer.py
@@ -215,11 +215,15 @@ def _get_llm(self, model=None, temperature=0.0, max_tokens=1000) -> ChatLiteLLM:
         if not settings.LITELLM_CUSTOM_PROVIDER:
             raise ValueError("The 'LITELLM_CUSTOM_PROVIDER' setting must be set.")  # noqa: EM101, TRY003
 
+        if not settings.LITELLM_API_BASE:
+            raise ValueError("The 'LITELLM_API_BASE' setting must be set.")  # noqa: EM101, TRY003
+
         return ChatLiteLLM(
             model=model,
             temperature=temperature,
             max_tokens=max_tokens,
             custom_llm_provider=settings.LITELLM_CUSTOM_PROVIDER,
+            api_base=settings.LITELLM_API_BASE,
         )
 
     def _generate_summary(self, content: str, llm_model: str) -> str:
diff --git a/learning_resources/content_summarizer_test.py b/learning_resources/content_summarizer_test.py
@@ -21,6 +21,11 @@
 pytestmark = pytest.mark.django_db
 
 
+@pytest.fixture(autouse=True)
+def setup_settings(settings):
+    settings.LITELLM_API_BASE = "https://test/api/"  # _get_llm raises if unset
+
+
 @pytest.fixture
 def mock_summarize_single_content_file(mocker):
     """Fixture for mocking the process single file method"""
diff --git a/vector_search/conftest.py b/vector_search/conftest.py
@@ -31,6 +31,7 @@ def _use_dummy_encoder(settings):
 def _use_test_qdrant_settings(settings, mocker):
     settings.QDRANT_HOST = "https://test"
     settings.QDRANT_BASE_COLLECTION_NAME = "test"
+    settings.LITELLM_API_BASE = "https://test/api/"
     settings.CONTENT_FILE_EMBEDDING_CHUNK_OVERLAP = 0
     settings.CONTENT_FILE_EMBEDDING_SEMANTIC_CHUNKING_ENABLED = False
     mock_qdrant = mocker.patch("qdrant_client.QdrantClient")
diff --git a/vector_search/utils.py b/vector_search/utils.py
@@ -524,11 +524,15 @@ def embed_learning_resources(ids, resource_type, overwrite):
     else:
         serialized_resources = list(serialize_bulk_content_files(ids))
         # TODO: Pass actual Ids when we want scheduled content file summarization  # noqa: FIX002, TD002, TD003 E501
-        # Currently we only want to summarize content that already has a summary
+        # Currently we only want to summarize content that either already has a summary
+        # OR is in a course run where at least one other content file has a summary
         existing_summary_content_ids = [
             resource["id"]
             for resource in serialized_resources
             if resource.get("summary")
+            or ContentFile.objects.filter(run__id=resource.get("run_id"))
+            .exclude(summary="")
+            .exists()
         ]
         ContentSummarizer().summarize_content_files_by_ids(
             existing_summary_content_ids, overwrite
diff --git a/vector_search/utils_test.py b/vector_search/utils_test.py
@@ -13,6 +13,9 @@
 )
 from learning_resources.models import LearningResource
 from learning_resources.serializers import LearningResourceMetadataDisplaySerializer
+from learning_resources_search.constants import (
+    CONTENT_FILE_TYPE,
+)
 from learning_resources_search.serializers import (
     serialize_bulk_content_files,
     serialize_bulk_learning_resources,
@@ -806,3 +809,44 @@ def test_update_content_file_payload_only_includes_existing_keys(
         )
     else:
         mock_retrieve.assert_not_called()
+
+
+@pytest.mark.django_db
+def test_embed_learning_resources_contentfile_summarization_filter(settings, mocker):
+    """Summarize summary-less content files whose run has a summarized sibling.
+
+    Takes the ``settings`` fixture so the overrides below are undone at teardown.
+    """
+    settings.OPENAI_API_KEY = "test"
+    settings.QDRANT_ENABLE_INDEXING_PLUGIN_HOOKS = True
+    mock_content_summarizer = mocker.patch(
+        "learning_resources.content_summarizer.ContentSummarizer.summarize_content_files_by_ids"
+    )
+    mock_chat_llm = mocker.patch(
+        "learning_resources.content_summarizer.ChatLiteLLM", autospec=True
+    )
+    mock_instance = mock_chat_llm.return_value
+    mock_summary_response = mocker.MagicMock()
+    mock_summary_response.content = "mocked summary"
+    mock_instance.invoke.return_value = mock_summary_response
+    mock_instance.with_structured_output.return_value.invoke.return_value = {
+        "flashcards": [
+            {
+                "question": "Generated Question",
+                "answer": "Generated Answer",
+            }
+        ]
+    }
+
+    run = LearningResourceRunFactory.create(published=True)
+    ContentFileFactory.create_batch(
+        2, content="test content", summary="summary text", run=run
+    )
+    new_content_files = ContentFileFactory.create_batch(
+        2, content="new content", summary="", run=run
+    )
+    cf_ids = [cf.id for cf in new_content_files]
+    embed_learning_resources(cf_ids, resource_type=CONTENT_FILE_TYPE, overwrite=False)
+
+    # Assert that the summarizer was called with the correct content file IDs
+    assert sorted(mock_content_summarizer.mock_calls[0].args[0]) == sorted(cf_ids)