Commit 8076f39

Added the export file functionality.

Parent: 3e5a83b
7 files changed: +8644 -280 lines

config.py (+18 -4)
@@ -7,7 +7,21 @@
 Description: Config file.
 """
 
-LOGIN = {'url': 'http://www.inmet.gov.br/projetos/rede/pesquisa/inicio.php',
-         'username': '<USER>',
-         'password': '<PASS>',
-         }
+LOGIN = {
+    'url': 'http://www.inmet.gov.br/projetos/rede/pesquisa/inicio.php',
+    'username': '<USER>',
+    'password': '<PASS>',
+}
+
+URL_TEMPLATE = {
+    'HOUR': 'http://www.inmet.gov.br/projetos/rede/pesquisa/gera_serie_txt.php?&mRelEstacao={omm_code}'
+            '&btnProcesso=serie&mRelDtInicio={start_date}&mRelDtFim={end_date}&mAtributos=1,1,,,1,1,,1,1,,,1,,,,,',
+    'DAY': 'http://www.inmet.gov.br/projetos/rede/pesquisa/gera_serie_txt.php?&mRelEstacao={omm_code}'
+           '&btnProcesso=serie&mRelDtInicio={start_date}&mRelDtFim={end_date}&mAtributos=,,1,1,,,,,,1,1,,1,1,1,1,',
+    'DAYFULL': 'http://www.inmet.gov.br/projetos/rede/pesquisa/gera_serie_txt.php?&mRelEstacao={omm_code}'
+               '&btnProcesso=serie&mRelDtInicio={start_date}&mRelDtFim={end_date}'
+               '&mAtributos=1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1',
+    'MONTH': 'http://www.inmet.gov.br/projetos/rede/pesquisa/gera_serie_txt_mensal.php?&mRelEstacao={omm_code}'
+             '&btnProcesso=serie&mRelDtInicio={start_date}&mRelDtFim={end_date}'
+             '&mAtributos=1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1',
+}
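
Each URL_TEMPLATE entry is a plain Python format string with omm_code, start_date and end_date placeholders. A minimal sketch of expanding one (the station code and dates below are illustrative values only; 82098 is borrowed from the deleted example file name):

# Sketch: expanding a URL_TEMPLATE entry with example values.
from config import URL_TEMPLATE

url = URL_TEMPLATE['MONTH'].format(
    omm_code='82098',          # OMM station code (illustrative)
    start_date='01/01/2018',   # BDMEP expects dd/mm/yyyy
    end_date='31/12/2019',
)
print(url)  # full gera_serie_txt_mensal.php query URL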

examples/DAYFULL_3stations_2018-2019.csv (+4,634): large diff not rendered by default.

examples/MONTH_2018-2019.csv (+3,848): large diff not rendered by default.

examples/csv_82098.csv (-193): file deleted.

examples/todos10.csv (-25): file deleted.

extract_data.py (+79 -49)
@@ -7,13 +7,19 @@
 Description: Web Crawler to extract information from BDMEP (INMET) database.
 """
 
+import time
+import sys
+import csv
 from selenium import webdriver
 from bs4 import BeautifulSoup
-import time
-from config import LOGIN
+from config import LOGIN, URL_TEMPLATE
 from datetime import datetime
 from station import Station
-# import csv
+
+
+class InputError(Exception):
+    """Exception raised for errors in the input."""
+    pass
 
 
 def init_webdriver_config(impl_delay=30):
@@ -30,7 +36,7 @@ def init_webdriver_config(impl_delay=30):
 def get_page(_driver, _url):
     """Helper function that navigates and returns an BeautifulSoup page."""
     _driver.get(_url)
-    time.sleep(3)
+    time.sleep(5)
     return BeautifulSoup(_driver.page_source, 'html.parser')
 
 
@@ -44,73 +50,97 @@ def inmet_login(_driver):
     _driver.find_element_by_name("btnProcesso").click()
 
 
+def get_url_pattern():
+    """This function returns the URL pattern accordingly to the Template passed as system parameter."""
+
+    if len(sys.argv) > 1:
+        if sys.argv[1].upper() in URL_TEMPLATE.keys():
+            print('TEMPLATE: ', sys.argv[1].upper())
+            return URL_TEMPLATE[sys.argv[1].upper()]
+        else:
+            raise InputError('The template {0} don´t exist.'.format(sys.argv[1].upper()))
+
+    print('TEMPLATE Default: MONTH')
+    return URL_TEMPLATE['MONTH']
+
+
 def load_station_numbers(_path):
     """It returns the list of stations."""
-    _f = open(_path)
-    _station_numbers = _f.readlines()
-    _station_numbers = list(map(lambda x: x.replace("\n", ""), _station_numbers))
-    _f.close()
+
+    with open(_path) as _file:
+        _station_numbers = _file.readlines()
+        _station_numbers = list(map(lambda x: x.replace("\n", ""), _station_numbers))
+
     return _station_numbers
 
 
-def weather_station_parser(_data):
+if __name__ == '__main__':
 
-    ds = _data.split('\n')
-    s = Station(ds[1][20:], ds[2][20:], ds[3][20:], ds[4][20:], ds[5], ds[6][20:])
-    print(s)
-    return s
+    print(">> WebCrawler Started <<")
 
+    count_success = 0
+    count_error = 0
+    station_list = []
 
-def weather_observation_parser(_data):
-    ds = _data.split('\n')
-    ds = list(filter(None, ds))
-    # header is ds[0]
-    print(ds[0])
-    print(*ds[1:], sep='\n')
+    driver = init_webdriver_config(30)
 
+    try:
+        inmet_login(driver)
 
-# if __name__ == '__main__':
+        url_pattern = get_url_pattern()
 
-print(">> WebCrawler Started <<")
+        station_numbers = load_station_numbers("./data/station_numbers.txt")
 
-driver = init_webdriver_config(30)
+        start_date = "01/01/2018"
+        end_date = datetime.now().strftime("%d/%m/%Y")
 
-# -- Login --
-inmet_login(driver)
+        for omm_code in station_numbers:
+            url = url_pattern.format(omm_code=omm_code, start_date=start_date, end_date=end_date)
+            soup = get_page(driver, url)
 
+            soup_pre = soup.select('pre')
 
-# -- Extract --
-url_pattern = "http://www.inmet.gov.br/projetos/rede/pesquisa/gera_serie_txt_mensal.php?&mRelEstacao={omm_code}" \
-              "&btnProcesso=serie&mRelDtInicio={start_date}&mRelDtFim={end_date}&mAtributos={attributes}"
+            if len(soup_pre) == 0:  # In case of an exception go to the next station
+                continue
 
-station_numbers = load_station_numbers("./data/station_numbers.txt")
+            if 'Não existem dados disponiveis' in soup_pre[0].text:
+                count_error += 1
+            else:
+                content = soup_pre[0].text.split('--------------------')
+                station = Station.parser(content[2])
+                station.set_observation(Station.observation_parser(content[4]))
+                station_list.append(station)
+                count_success += 1
 
-start_date = "01/07/2000"
-end_date = datetime.now().strftime("%d/%m/%Y")
-# attributes = "1,,,,,,,,,,,,1,1,1,1,"
-attributes = "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1"
+    except InputError as e:
+        print(e)
+    except Exception as e:
+        print(e)
+    finally:
+        driver.quit()
+        print(">> WebCrawler Finished <<")
+        print("SUCCESS: {0}\nERROR: {1}\nTOTAL: {2}".format(count_success, count_error, count_success + count_error))
 
-count_success = 0
-count_error = 0
+    if len(station_list) == 0:
+        print('No data collected. Exiting...')
+        exit()
 
-for omm_code in station_numbers[:4]:
-    url = url_pattern.format(omm_code=omm_code, start_date=start_date, end_date=end_date, attributes=attributes)
-    soup = get_page(driver, url)
+    print(">> Data Processing Started <<")
 
-    soup_pre = soup.select('pre')
+    data_header = list(filter(None, station_list[0].weather_observation_header.split(';')))
+    header = Station.get_station_header() + data_header
 
-    if 'Não existem dados disponiveis' in soup_pre[0].text:
-        count_error += 1
-    else:
-        count_success += 1
-        content = soup_pre[0].text.split('--------------------')
+    file_path = 'data/output_data.csv'
+    with open(file_path, 'w', newline='\n', encoding='utf-8') as f:
+        writer = csv.writer(f, delimiter=';', quotechar="'", quoting=csv.QUOTE_MINIMAL)
 
-        station = weather_station_parser(content[2])
-        weather_observation_parser(content[4])
+        writer.writerow(header)
 
+        for station in station_list:
+            for ob in station.weather_observation:
+                row = station.get_station_information() + ob.split(';')
+                writer.writerow(row)
 
-# driver.quit()
+        print('Saving the file {0} [...]'.format(file_path))
 
-print(">> WebCrawler Finished <<")
-print("SUCCESS: {0}\nERROR: {1}\nTOTAL: {2}".format(count_success, count_error,
-                                                    count_success + count_error))
+    print(">> Data Processing Finished <<")

station.py (+65 -9)
@@ -5,20 +5,76 @@
 
 
 class Station:
+    """This class keep the information related for each one of the stations in station list. And also their
+    observations"""
 
-    def __init__(self, name_complete, lat, lng, alt, situation, op_date):
+    def __init__(self, name_complete, lat, lng, alt, status, op_date):
+        """Constructor of the Station class: name_complete, lat, lng, alt, status, op_date."""
 
-        self.name = name_complete
-        self.state_initials = 'SP'
-        self.state_name = State.find_state_by_key('SP')['name']
-        self.state_region = State.find_state_by_key('SP')['region']
-        self.omm_code = -1
+        self.name_complete = name_complete
         self.lat = lat
         self.lng = lng
         self.alt = alt
-        self.situation = situation
+        self.status = status
         self.op_start_date = op_date
+        self.weather_observation = []
+        self.weather_observation_header = ''
+
+        name_splitted = self.name_complete_parser(name_complete)
+        self.name = name_splitted['name']
+        self.state_initials = name_splitted['state_initials']
+        state = State.find_state_by_key(self.state_initials)
+        self.state_name = state['name']
+        self.state_region = state['region']
+        self.omm_code = name_splitted['omm_code']
+
+    @staticmethod
+    def name_complete_parser(name_complete):
+        """(Static Method) Parser that given the station name extract from the observation page, it returns a Dict with
+        the name, state_initials and omm_code"""
+
+        s = name_complete.split('-')
+        name = s[0].strip()
+        s = s[1].split('(')
+        state_initials = s[0].strip()
+        omm_code = s[1].replace('OMM: ', '').replace(')', '')
+
+        return {'name': name, 'state_initials': state_initials, 'omm_code': omm_code}
+
+    @staticmethod
+    def parser(data):
+        """(Static Method) Parser that given the Station information it returns a Station object."""
+        ds = data.split('\n')
+        return Station(ds[1][20:], ds[2][20:], ds[3][20:], ds[4][20:], ds[5], ds[6][20:])
+
+    @staticmethod
+    def observation_parser(data):
+        """(Static Method) Parser that given a observation data it returns a Dict with header and data."""
+        ds = data.split('\n')
+        ds = list(filter(None, ds))
+        return {'header': ds[0], 'data': ds[1:], }
+
+    def set_observation(self, observation):
+        """Given an observation parser object, it sets the observation and observation_header attributes for the Station
+        class."""
+        self.weather_observation_header = observation['header']
+        self.weather_observation = observation['data']
+
+    @staticmethod
+    def get_station_header():
+        """(Static Method) It returns the fixed header fields for the Station. This is used to create the CSV header"""
+        return [
+            'CodigoOMM', 'NomeEstacao', 'Estado', 'EstadoDesc', 'EstadoRegiao', 'Latitude', 'Longitude', 'Altitude',
+            'EstacaoStituacao', 'OperanteDesde', ]
+
+    def get_station_information(self):
+        """It returns the Station information as a list"""
+        return [
+            self.omm_code, self.name, self.state_initials, self.state_name, self.state_region, self.lat, self.lng,
+            self.alt, self.status, self.op_start_date, ]
 
     def __str__(self):
-        return "Estação: {0}\nLatitude: {1}\nLongitude: {2}\nAltitude: {3}\nSituação: {4}\nEm operação desde {5}"\
-            .format(self.name, self.lat, self.lng, self.alt, self.situation, self.op_start_date)
+        return "{0}|{1}/{2}/{3}/{4})| Lat: {5}| Lng: {6}| Alt: {7}| Situação: {8}| Em operação desde {9}"\
+            .format(
+                self.omm_code, self.name, self.state_initials, self.state_name, self.state_region, self.lat, self.lng,
+                self.alt, self.status, self.op_start_date)
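
For reference, name_complete_parser assumes the station name arrives shaped like 'NAME - UF (OMM: code)'. A quick sketch with a made-up name in that shape:

# Sketch: parsing a station name; the input string is illustrative only.
from station import Station

parts = Station.name_complete_parser('ALTAMIRA - PA (OMM: 82353)')
print(parts)  # {'name': 'ALTAMIRA', 'state_initials': 'PA', 'omm_code': '82353'}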
