SPAB/DataScraping.py at main · Riskerman21/SPAB · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import requests
import re
import pandas as pd

DAYS_TO_PREDICT = ['1', '2', '3', '4', '5', '6', '7']

def scrape_weather(df):
    base_url = 'https://www.accuweather.com/en/th/bangkok/318849/evening-weather-forecast/318849?day='
    start = 0

    url_weather = f"{base_url}{DAYS_TO_PREDICT[start]}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    # Patterns for catching information from the system
    humidity_pattern = r'Humidity\s*<span class="value">(\d+)%</span>'
    precipitation_pattern = r'Probability of Precipitation\s*<span class="value">(\d+)%</span>'
    temperature_pattern = r'<div class=\"temperature\">(\d+)[^\d]*0;</div>'

    while url_weather:

        response = requests.get(url_weather, headers=headers)
        if response.status_code == 200:
            html_content = (response.text)

            humidity_match = int(re.search(humidity_pattern, html_content).group(1))
            precipitation_match = int(re.search(precipitation_pattern, html_content).group(1))

            temperature_match = re.findall(temperature_pattern, html_content)
            temp_max = temperature_match[0]
            temp_min = temperature_match[-1]
            temp_avg = temperature_match[2]

            # Update DataFrame at row
            df.at[start, 'humidity'] = humidity_match
            df.at[start, 'precipprob'] = precipitation_match
            df.at[start, 'tempmax'] = temp_max
            df.at[start, 'tempmin'] = temp_min
            df.at[start, 'temp'] = temp_avg

            start += 1

        if start >= len(DAYS_TO_PREDICT):
            url_weather = None
            break
        url_weather = f"{base_url}{DAYS_TO_PREDICT[start]}"

    return df

def scrape_pressure(df):
    url_pressure = 'https://tides4fishing.com/th/thailand/bangkok/forecast/atmospheric-pressure'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    response = requests.get(url_pressure, headers=headers)

    max_pressure_pattern = r'<td class=\"f_datos_temp_text\">MAX\. PRESSURE<\/td>\s*<td class=\"f_dato_color\"><div class=\"f_dato_color1 f_tam1 f_gris_1 f_dato_font_small\">\s*(\d+) hPa\s*<\/div><\/td>'
    min_pressure_pattern = r'<td class=\"f_datos_temp_text\">MAX\. PRESSURE<\/td>\s*<td class=\"f_dato_color\"><div class=\"f_dato_color1 f_tam1 f_gris_1 f_dato_font_small\">\s*(\d+) hPa\s*<\/div><\/td>'

    if response.status_code == 200:
        html_content = (response.text)
        max_pressure_match = re.findall(max_pressure_pattern, html_content)
        min_pressure_match = re.findall(min_pressure_pattern, html_content)
        avg_pressures = [
            (int(max_pressure_match[i]) + int(min_pressure_match[i])) // 2
            for i in range(len(max_pressure_match))
        ]

        for i in range(0, len(min_pressure_match)):
            df.at[i, 'sealevelpressure'] = avg_pressures[i]
    return df

def main():
    columns = ["tempmax", "tempmin", "temp", "sealevelpressure", "precipprob", "humidity"]
    df = pd.DataFrame(columns=columns)
    df = scrape_weather(df)
    df = scrape_pressure(df)

    df.to_excel('ForecastedWeather.xlsx', index=False)

if __name__ == '__main__':
    main()