Merge pull request #1 from emre570/emre570-yt-summarizer

cobanov · web-flow · commit add9d7da95d5 · 2024-05-01T19:29:15.000+03:00
YouTube Summarizer
diff --git a/summarizer.py b/summarizer.py
@@ -45,7 +45,7 @@ def setup_summarization_chain():
         input_variables=["text"],
     )
 
-    llm = ChatOllama(model="llama3", base_url="http://0.0.0.0:11434")
+    llm = ChatOllama(model="llama3", base_url="http://127.0.0.1:11434")
     llm_chain = LLMChain(llm=llm, prompt=prompt_template)
     return llm_chain
 
diff --git a/webui.py b/webui.py
@@ -1,32 +1,34 @@
 import gradio as gr
 
 from summarizer import load_document, setup_summarization_chain
+from yt_summarizer import summarize_video, check_link
 from translator import setup_translator_chain
 
-
 def summarize(url):
-    docs = load_document(url)
-    llm_chain = setup_summarization_chain()
-    result = llm_chain.run(docs)
+    if check_link(url):
+        result = summarize_video(url)
+    else:
+        docs = load_document(url)
+        llm_chain = setup_summarization_chain()
+        result = llm_chain.run(docs)
 
     return [result, gr.Button("🇹🇷 Translate ", visible=True)]
 
-
 def translate(text):
     llm_chain = setup_translator_chain()
     result = llm_chain.run(text)
     return result
 
-
 with gr.Blocks() as demo:
     gr.Markdown(
-        """# Cobanov Web Summarizer
-    Easily summarize any web page with a single click."""
+        """# Cobanov Web and Video Summarizer
+    Easily summarize any web page or YouTube video with a single click."""
     )
 
     with gr.Row():
         with gr.Column():
             url = gr.Text(label="URL", placeholder="Enter URL here")
+
             btn_generate = gr.Button("Generate")
 
             summary = gr.Markdown(label="Summary")
@@ -36,6 +38,7 @@ def translate(text):
         [
             "https://cobanov.dev/haftalik-bulten/hafta-13",
             "https://bawolf.substack.com/p/embeddings-are-a-good-starting-point",
+            "https://www.youtube.com/watch?v=4pOpQwiUVXc",
         ],
         inputs=[url],
     )
@@ -51,5 +54,4 @@ def translate(text):
     btn_generate.click(summarize, inputs=[url], outputs=[summary, btn_translate])
     btn_translate.click(translate, inputs=[summary], outputs=[summary])
 
-
-demo.launch()
+demo.launch()
diff --git a/yt_summarizer.py b/yt_summarizer.py
@@ -0,0 +1,58 @@
+from langchain_community.document_loaders import YoutubeLoader
+from langchain.text_splitter import TokenTextSplitter
+from langchain_community.chat_models import ChatOllama
+from langchain.chains.summarize import load_summarize_chain
+from langchain_core.prompts import PromptTemplate
+import re
+
+def check_link(link):
+    yt_regex = r"(https?://)?(www\.)?(youtube\.com/watch\?v=|youtu\.be/)[\w-]+"
+    return re.match(yt_regex, link) is not None
+
+def get_transcript(video_link):
+    # Get video transcript
+    if check_link(video_link):
+        loader = YoutubeLoader.from_youtube_url(video_link, language=["en", "en-US"])
+        transcript = loader.load()
+        return transcript
+    return "Invalid YouTube URL."
+
+def split_chunks(transcript):
+    # Split the transcript into chunks
+    # Llama 3 accepts at most 8192 input tokens, so the chunk size is set to 7500 to leave the model some headroom.
+    splitter = TokenTextSplitter(chunk_size = 7500, chunk_overlap = 100)
+    chunks = splitter.split_documents(transcript)
+    return chunks
+
+def yt_summarization_chain():
+    prompt_template = PromptTemplate(
+        template="""As a professional summarizer specialized in video content, create a detailed and comprehensive summary of the YouTube video transcript provided. While crafting your summary, adhere to these guidelines:
+            1. Capture the essence of the video, focusing on main ideas and key details. Ensure the summary is in-depth and insightful, reflecting any narrative or instructional elements present in the video.
+
+            2. Exclude any redundant expressions and non-critical details to enhance the clarity and conciseness of the summary.
+
+            3. Base the summary strictly on the transcript provided, avoiding assumptions or additions from external sources.
+
+            4. Present the summary in a well-structured paragraph form, making it easy to read and understand.
+
+            5. Conclude with "[End of Notes, Message #X]", where "X" is the sequence number of the summarizing request, to indicate the completion of the task.
+
+        By adhering to this optimized prompt, you are expected to produce a clear, detailed, and audience-friendly summary that effectively conveys the core content and themes of the YouTube video.
+
+        "{text}"
+
+        DETAILED SUMMARY:""",
+        input_variables=["text"],
+    )
+    llm = ChatOllama(model="llama3")
+    summarize_chain = load_summarize_chain(llm=llm, prompt=prompt_template, verbose=True)
+    return summarize_chain
+
+def summarize_video(video_link):
+    transcript = get_transcript(video_link)
+    chunks = split_chunks(transcript)
+
+    sum_chain = yt_summarization_chain()
+    result = sum_chain.run(chunks)
+
+    return result

Original file line number	Diff line number	Diff line change
`@@ -45,7 +45,7 @@ def setup_summarization_chain():`
`45`	`45`	`input_variables=["text"],`
`46`	`46`	`)`
`47`	`47`
`48`		`- llm = ChatOllama(model="llama3", base_url="http://0.0.0.0:11434")`
	`48`	`+ llm = ChatOllama(model="llama3", base_url="http://127.0.0.1:11434")`
`49`	`49`	`llm_chain = LLMChain(llm=llm, prompt=prompt_template)`
`50`	`50`	`return llm_chain`
`51`	`51`