forked from karpathy/reader3
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsummarizer.py
More file actions
155 lines (119 loc) · 4.22 KB
/
summarizer.py
File metadata and controls
155 lines (119 loc) · 4.22 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
"""
Page summarizer using Qwen 3:4B via Ollama and LangGraph.
Provides an AI-powered summary generation for book pages.
"""
import html
import re
from typing import TypedDict

from langchain_core.prompts import PromptTemplate
from langchain_ollama import ChatOllama
from langgraph.graph import StateGraph, START, END
class SummaryState(TypedDict):
    """State for the summary workflow."""
    # Plain-text content of the page to summarize (set by the caller).
    page_content: str
    # Model-generated summary; empty string until the summarize node runs
    # or when summarization fails.
    summary: str
    # Error message from the summarize node, or None on success.
    error: str | None
def create_summary_workflow():
    """
    Build and compile the LangGraph workflow that summarizes page content.

    The graph contains a single "summarize" node which sends the page text
    to the qwen3:4b model served by a local Ollama instance and stores the
    model's reply (or any error) back into the workflow state.
    """
    # LLM backend: qwen3:4b running on the default local Ollama endpoint.
    llm = ChatOllama(
        model="qwen3:4b",
        base_url="http://localhost:11434",
        temperature=0.7,
    )

    # Prompt that frames the task for the model.
    prompt = PromptTemplate(
        input_variables=["content"],
        template="""You are an expert book summarizer. Your task is to create a concise,
informative summary of the given page content. Focus on the main ideas, key events,
and important details. Keep the summary clear and easy to understand.
Page Content:
{content}
Please provide a summary of this page content in 2-3 sentences:"""
    )

    def summarize_node(state: SummaryState) -> SummaryState:
        """Invoke the LLM on the page content and record the outcome in state."""
        try:
            prompt_text = prompt.format(content=state["page_content"])
            reply = llm.invoke(prompt_text)
            # On success: store the trimmed reply and clear any prior error.
            return {**state, "summary": reply.content.strip(), "error": None}
        except Exception as exc:
            # On failure: surface the error message and leave the summary empty.
            return {**state, "summary": "", "error": str(exc)}

    # Single-node graph: START -> summarize -> END.
    graph = StateGraph(SummaryState)
    graph.add_node("summarize", summarize_node)
    graph.add_edge(START, "summarize")
    graph.add_edge("summarize", END)
    return graph.compile()
# Module-level cache for the compiled workflow; built on first request.
_summary_workflow = None


def get_summary_workflow():
    """Return the shared summary workflow, creating and caching it lazily."""
    global _summary_workflow
    if _summary_workflow is None:
        # First call: compile the graph once and reuse it afterwards.
        _summary_workflow = create_summary_workflow()
    return _summary_workflow
def summarize_page(page_content: str) -> tuple[str, str | None]:
    """
    Summarize a page's content using the LangGraph workflow.

    Args:
        page_content: The HTML or text content of the page to summarize.

    Returns:
        A tuple of (summary, error_message); error_message is None on success.
    """
    # Work on plain text so markup doesn't pollute the prompt.
    text = strip_html_tags(page_content)

    # Truncate long pages to roughly stay within the model's token budget.
    if len(text) > 2000:
        text = text[:2000] + "..."

    initial_state = {
        "page_content": text,
        "summary": "",
        "error": None,
    }
    result = get_summary_workflow().invoke(initial_state)
    return result["summary"], result["error"]
def strip_html_tags(html_content: str) -> str:
    """
    Remove HTML markup from content for cleaner text processing.

    Args:
        html_content: HTML content to clean.

    Returns:
        Plain text with tags removed, HTML entities decoded, and runs of
        whitespace collapsed to single spaces.
    """
    # Drop script/style elements entirely -- their text is not page content.
    # IGNORECASE also catches <SCRIPT>/<STYLE> variants.
    clean = re.sub(
        r"<script[^>]*>.*?</script>", "", html_content, flags=re.DOTALL | re.IGNORECASE
    )
    clean = re.sub(r"<style[^>]*>.*?</style>", "", clean, flags=re.DOTALL | re.IGNORECASE)
    # Strip all remaining tags.
    clean = re.sub(r"<[^>]+>", "", clean)
    # Decode entities (&amp;, &lt;, &nbsp;, numeric refs, ...) in one pass.
    # BUG FIX: the previous hand-rolled .replace() chain had already-decoded
    # (no-op) arguments -- one of them even broke the file's syntax -- and
    # missed numeric/named entities; html.unescape handles them all.
    clean = html.unescape(clean)
    # Collapse whitespace (including the \xa0 produced by &nbsp;) to spaces.
    return re.sub(r"\s+", " ", clean).strip()