# -*- coding: utf-8 -*-
"""
Created on Sat Oct 7 11:31:30 2023
@author: mysticmarks
"""
import requests
from bs4 import BeautifulSoup
import tkinter as tk
from tkinter import ttk, messagebox
import threading
import os
from urllib.parse import urlparse, urljoin
# --- Utility Layer ---
class URLManager:
    """Tracks visited URLs and resolves relative links against a base URL."""

    def __init__(self):
        self.visited_urls = set()

    def normalize_url(self, url, base_url=None):
        # Resolve relative URLs against the base, then round-trip through
        # urlparse to produce a canonical form.
        if base_url:
            url = urljoin(base_url, url)
        parts = urlparse(url)
        return parts.geturl()

    def mark_visited(self, url):
        self.visited_urls.add(self.normalize_url(url))

    def is_visited(self, url):
        return self.normalize_url(url) in self.visited_urls
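
# Illustrative example (not part of the scraper's control flow): a relative
# link found on a page is resolved via urljoin, e.g.
#
#   um = URLManager()
#   um.normalize_url("/about", base_url="https://example.com/blog/post")
#   # -> "https://example.com/about"
#
# "example.com" is a placeholder host used here purely for illustration.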
class Logger:
    def log(self, message, level='INFO'):
        print(f"{level}: {message}")
# --- Scraping Layer ---
class HttpClient:
    """Thin wrapper around requests.get with a timeout and error translation."""

    def __init__(self, timeout=10):
        self.timeout = timeout

    def send_request(self, url):
        try:
            response = requests.get(url, timeout=self.timeout)
            response.raise_for_status()  # treat 4xx/5xx responses as errors
            return response.text
        except requests.RequestException as e:
            raise ConnectionError(f"Error fetching {url}: {str(e)}") from e
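
# Minimal usage sketch (illustrative; the URL is a placeholder):
#
#   client = HttpClient(timeout=5)
#   html = client.send_request("https://example.com")
#
# Any network or HTTP error surfaces as a single ConnectionError, so callers
# only need one except clause.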
class Scraper:
    def __init__(self):
        self.http_client = HttpClient()
        self.url_manager = URLManager()
        self.logger = Logger()

    def scrape_url(self, url, folder_path):
        if self.url_manager.is_visited(url):
            self.logger.log(f"Skipping already visited URL: {url}")
            return
        self.url_manager.mark_visited(url)
        try:
            html = self.http_client.send_request(url)
            self.parse_and_store_data(html, folder_path, url)
        except ConnectionError as e:
            self.logger.log(str(e), level='ERROR')

    def parse_and_store_data(self, data, folder_path, base_url):
        # Parse HTML with BeautifulSoup
        soup = BeautifulSoup(data, 'html.parser')
        # Example: extract and log all links on the page
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href:  # skip anchors without an href attribute
                continue
            absolute_url = self.url_manager.normalize_url(href, base_url)
            self.logger.log(f"Found link: {absolute_url}")
        # Store the original HTML data; note that every scraped URL writes to
        # the same data.html, so repeated calls overwrite earlier output.
        os.makedirs(folder_path, exist_ok=True)
        with open(os.path.join(folder_path, "data.html"), "w", encoding="utf-8") as file:
            file.write(data)
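
# If per-URL output is wanted instead of a single overwritten data.html, one
# option (a sketch, not used above) is to derive a filename from the URL:
#
#   from hashlib import sha1
#   name = sha1(base_url.encode("utf-8")).hexdigest() + ".html"
#   path = os.path.join(folder_path, name)
#
# sha1 here is just a convenient way to get a filesystem-safe unique name.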
# --- Application Layer ---
class AppController:
    def __init__(self):
        self.scraper = Scraper()

    def start_scraping(self, url, folder_path):
        # Run the scrape on a daemon thread so a slow request cannot freeze
        # the Tkinter event loop or keep the process alive after the window closes.
        threading.Thread(
            target=self.scraper.scrape_url,
            args=(url, folder_path),
            daemon=True,
        ).start()
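
# The controller also works without the GUI; an illustrative headless call
# (placeholder URL; the output folder is created on demand by the Scraper):
#
#   AppController().start_scraping("https://example.com", "output_folder")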
class GUI(tk.Tk):
    def __init__(self, controller):
        super().__init__()
        self.controller = controller
        self.title("Web Scraper")
        self.setup_ui()

    def setup_ui(self):
        self.url_label = ttk.Label(self, text="Enter URL:")
        self.url_label.pack(padx=10, pady=5)
        self.url_entry = ttk.Entry(self, width=50)
        self.url_entry.pack(padx=10, pady=5)
        self.scrape_button = ttk.Button(self, text="Scrape Data", command=self.on_start_scraping)
        self.scrape_button.pack(padx=10, pady=5)

    def on_start_scraping(self):
        url = self.url_entry.get().strip()
        if not url:
            messagebox.showerror("Error", "Please enter a URL.")
            return
        self.controller.start_scraping(url, 'output_folder')
        messagebox.showinfo("Started", "The scraping has started!")
# --- Main Application Logic ---
if __name__ == "__main__":
    controller = AppController()
    gui = GUI(controller)
    gui.mainloop()