Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
283 changes: 283 additions & 0 deletions week1/Project - Web Summarizer.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,283 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 60,
"id": "237afb07-41ea-402d-b4d4-23d6fa6990a3",
"metadata": {},
"outputs": [],
"source": [
"import os \n",
"import requests \n",
"from urllib.request import urlopen\n",
"from openai import OpenAI\n",
"from bs4 import BeautifulSoup\n",
"from IPython.display import Markdown, display\n",
"from dotenv import load_dotenv"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "c157906e-eb2c-4db5-a6fd-bf113abede23",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"API key found and looks good so far!\n"
]
}
],
"source": [
"load_dotenv()\n",
"api_key=os.getenv(\"OPENAI_API_KEY\")\n",
"\n",
"if not api_key:\n",
" print(\"No API key was found - Please try to identify & fix!\")\n",
"elif not api_key.startswith !=\"sk-proj-\":\n",
" print(\"An API key was found, but it doesn't start sk-proj-; please check you're using the right key.\")\n",
"elif not api_key.split()!=api_key:\n",
" print(\"An API key was found, but it looks like it might have space or tab characters at the start or end - please remove them.\")\n",
"else:\n",
" print(\"API key found and looks good so far!\")"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "3e86115c-1ba1-4e6d-9386-407515a80ece",
"metadata": {},
"outputs": [],
"source": [
"openai = OpenAI()"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "a3d1978d-eae4-44de-a898-f6975577fbe3",
"metadata": {},
"outputs": [],
"source": [
"headers={\n",
" \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36\"\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "2e5bf684-cd44-4a18-af78-3d81e79765c9",
"metadata": {},
"outputs": [],
"source": [
"re_sess=requests.Session()"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "3c1f079f-4e37-46b6-a079-08cc81b68acb",
"metadata": {},
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Enter a website URL: https://cnn.com\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"200\n"
]
}
],
"source": [
"myurl = input(\"Enter a website URL: \")\n",
"\n",
"try:\n",
" response = requests.get(myurl,headers=headers,timeout=10)\n",
" print(response.status_code)\n",
"except requests.exceptions.RequestException as e:\n",
" print(f\"❌ Failed to fetch website: {e}\")\n",
" myurl=None"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "60ce01f3-b3a6-4542-9f28-6de6517d4c92",
"metadata": {},
"outputs": [],
"source": [
"soup=BeautifulSoup(response.content,'html.parser')"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "8e4bb40d-d22f-4984-a54a-e139a897cd5f",
"metadata": {},
"outputs": [],
"source": [
"title = soup.title.string if soup.title else \"No title found.\""
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "bf2b0b66-99c2-4c0c-a525-5213b6ea444a",
"metadata": {},
"outputs": [],
"source": [
"for irrelevant in soup.body([\"script\", \"style\", \"img\", \"input\"]):\n",
" irrelevant.decompose()\n",
"text = soup.body.get_text(separator=\"\\n\", strip=True)"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "b8ec0d45-7cfe-48ac-9de3-0b014913f8b8",
"metadata": {},
"outputs": [],
"source": [
"system_prompt = \"\"\"You are an assistant that summarizes website content.\n",
"- Provide a short, clear summary (3–5 sentences max).\n",
"- Focus only on the main ideas, purpose, or offerings of the website.\n",
"- Ignore navigation links, ads, boilerplate text, and repetitive content.\n",
"- If the text looks like a blog/article, summarize it as an article.\n",
"- If the text is company/product info, summarize it as a business overview.\n",
"- Respond in clean markdown format with no extra commentary.\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "0405ecf4-b13b-4e5e-88a3-3076b740db85",
"metadata": {},
"outputs": [],
"source": [
"def user_prompt_for(myurl):\n",
" user_prompt = \"\" \n",
" user_prompt += f\"You are currently on the website titled {title}\\n\"\n",
" user_prompt += \"\\nThe contents of this website is as follows; \\\n",
"please provide a short summary of this website in markdown. \\\n",
"If it includes news or announcements or some keynotes, then summarize these too.\\n\\n\"\n",
" user_prompt += text\n",
" return user_prompt"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "d5981fd5-cfe1-4426-bb88-bdf6b4fe7fc8",
"metadata": {},
"outputs": [],
"source": [
"def messages_for(myurl):\n",
" return [\n",
" {'role':'system','content':system_prompt},\n",
" {'role':'user','content':user_prompt_for(myurl)}\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 79,
"id": "28c19c30-da9d-4e87-a307-394cb239d84f",
"metadata": {},
"outputs": [],
"source": [
"def summarize(myurl):\n",
" response = openai.chat.completions.create(\n",
" model = \"gpt-5-nano\",\n",
" messages = messages_for(myurl)\n",
" )\n",
" return response.choices[0].message.content"
]
},
{
"cell_type": "code",
"execution_count": 80,
"id": "aa2edc19-b002-4315-9e46-b1be7a7504ac",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"CNN's Breaking News, Latest News and Videos site provides global and US news coverage across categories such as World, Politics, Business, Health, Entertainment, Style, Travel, Sports, Science, and Climate. The site features articles, in-depth analyses, investigations, and multimedia content—including video and live TV—covering major events like conflicts and elections. Readers can watch CNN videos, listen to podcasts, and sign in to personalize feeds and newsletters. The homepage also spotlights special reports and programs such as CNN Heroes and Call to Earth, highlighting a mix of hard news and long-form storytelling.\""
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"summarize(myurl)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "c716b094-3231-4ac2-8fff-efc55e26301a",
"metadata": {},
"outputs": [],
"source": [
"def display_better(myurl):\n",
" summary = summarize(myurl)\n",
" display(Markdown(summary))"
]
},
{
"cell_type": "code",
"execution_count": 78,
"id": "300b4b1f-06bf-455c-b2c5-e01bb032a2c9",
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"CNN's Breaking News, Latest News and Videos page is a comprehensive news portal offering around-the-clock global coverage across US and world politics, business, health, entertainment, science, climate, and more. It provides live TV and a wide range of video and audio content, including CNN10, CNN Fast, podcasts, and video reports. The site highlights major stories and in-depth analysis on conflicts (Gaza, Ukraine-Russia, Israel-Hamas), elections, technology and other emerging topics, with continually updated headlines and features like “Best of CNN” and “Call to Earth.” It also promotes personalized content through user accounts, newsletters, topics you follow, and ML-powered content recommendations to surface relevant articles and videos."
],
"text/plain": [
"<IPython.core.display.Markdown object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"display_better(myurl)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}