Description: Web Crawler to extract information from BDMEP (INMET) database.
"""

+import time
+import sys
+import csv
from selenium import webdriver
from bs4 import BeautifulSoup
-import time
-from config import LOGIN
+from config import LOGIN, URL_TEMPLATE
from datetime import datetime
from station import Station
-# import csv
+
+
+class InputError(Exception):
+    """Exception raised for errors in the input."""
+    pass


def init_webdriver_config(impl_delay=30):
@@ -30,7 +36,7 @@ def init_webdriver_config(impl_delay=30):
def get_page(_driver, _url):
    """Helper function that navigates and returns a BeautifulSoup page."""
    _driver.get(_url)
-    time.sleep(3)
+    time.sleep(5)
    return BeautifulSoup(_driver.page_source, 'html.parser')


@@ -44,73 +50,97 @@ def inmet_login(_driver):
    _driver.find_element_by_name("btnProcesso").click()


+def get_url_pattern():
+    """Return the URL pattern that matches the template passed as a command-line argument."""
+
+    if len(sys.argv) > 1:
+        if sys.argv[1].upper() in URL_TEMPLATE.keys():
+            print('TEMPLATE: ', sys.argv[1].upper())
+            return URL_TEMPLATE[sys.argv[1].upper()]
+        else:
+            raise InputError("The template {0} doesn't exist.".format(sys.argv[1].upper()))
+
+    print('TEMPLATE Default: MONTH')
+    return URL_TEMPLATE['MONTH']
+
+
def load_station_numbers(_path):
    """It returns the list of stations."""
-    _f = open(_path)
-    _station_numbers = _f.readlines()
-    _station_numbers = list(map(lambda x: x.replace("\n", ""), _station_numbers))
-    _f.close()
+
+    with open(_path) as _file:
+        _station_numbers = _file.readlines()
+        _station_numbers = list(map(lambda x: x.replace("\n", ""), _station_numbers))
+
    return _station_numbers


-def weather_station_parser(_data):
+if __name__ == '__main__':

-    ds = _data.split('\n')
-    s = Station(ds[1][20:], ds[2][20:], ds[3][20:], ds[4][20:], ds[5], ds[6][20:])
-    print(s)
-    return s
+    print(">> WebCrawler Started <<")

+    count_success = 0
+    count_error = 0
+    station_list = []

-def weather_observation_parser(_data):
-    ds = _data.split('\n')
-    ds = list(filter(None, ds))
-    # header is ds[0]
-    print(ds[0])
-    print(*ds[1:], sep='\n')
+    driver = init_webdriver_config(30)

+    try:
+        inmet_login(driver)

-# if __name__ == '__main__':
+        url_pattern = get_url_pattern()

-print(">> WebCrawler Started <<")
+        station_numbers = load_station_numbers("./data/station_numbers.txt")

-driver = init_webdriver_config(30)
+        start_date = "01/01/2018"
+        end_date = datetime.now().strftime("%d/%m/%Y")

-# -- Login --
-inmet_login(driver)
+        for omm_code in station_numbers:
+            url = url_pattern.format(omm_code=omm_code, start_date=start_date, end_date=end_date)
+            soup = get_page(driver, url)

+            soup_pre = soup.select('pre')

-# -- Extract --
-url_pattern = "http://www.inmet.gov.br/projetos/rede/pesquisa/gera_serie_txt_mensal.php?&mRelEstacao={omm_code}" \
-              "&btnProcesso=serie&mRelDtInicio={start_date}&mRelDtFim={end_date}&mAtributos={attributes}"
+            if len(soup_pre) == 0:  # Page returned no <pre> block; skip to the next station
+                continue

-station_numbers = load_station_numbers("./data/station_numbers.txt")
+            if 'Não existem dados disponiveis' in soup_pre[0].text:
+                count_error += 1
+            else:
+                content = soup_pre[0].text.split('--------------------')
+                station = Station.parser(content[2])
+                station.set_observation(Station.observation_parser(content[4]))
+                station_list.append(station)
+                count_success += 1

-start_date = "01/07/2000"
-end_date = datetime.now().strftime("%d/%m/%Y")
-# attributes = "1,,,,,,,,,,,,1,1,1,1,"
-attributes = "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1"
+    except InputError as e:
+        print(e)
+    except Exception as e:
+        print(e)
+    finally:
+        driver.quit()
+        print(">> WebCrawler Finished <<")
+        print("SUCCESS: {0}\nERROR: {1}\nTOTAL: {2}".format(count_success, count_error, count_success + count_error))

-count_success = 0
-count_error = 0
+    if len(station_list) == 0:
+        print('No data collected. Exiting...')
+        exit()

-for omm_code in station_numbers[:4]:
-    url = url_pattern.format(omm_code=omm_code, start_date=start_date, end_date=end_date, attributes=attributes)
-    soup = get_page(driver, url)
+    print(">> Data Processing Started <<")

-    soup_pre = soup.select('pre')
+    data_header = list(filter(None, station_list[0].weather_observation_header.split(';')))
+    header = Station.get_station_header() + data_header

-    if 'Não existem dados disponiveis' in soup_pre[0].text:
-        count_error += 1
-    else:
-        count_success += 1
-        content = soup_pre[0].text.split('--------------------')
+    file_path = 'data/output_data.csv'
+    with open(file_path, 'w', newline='\n', encoding='utf-8') as f:
+        writer = csv.writer(f, delimiter=';', quotechar="'", quoting=csv.QUOTE_MINIMAL)

-        station = weather_station_parser(content[2])
-        weather_observation_parser(content[4])
+        writer.writerow(header)

+        for station in station_list:
+            for ob in station.weather_observation:
+                row = station.get_station_information() + ob.split(';')
+                writer.writerow(row)

-# driver.quit()
+    print('Saving the file {0} [...]'.format(file_path))

-print(">> WebCrawler Finished <<")
-print("SUCCESS: {0}\nERROR: {1}\nTOTAL: {2}".format(count_success, count_error,
-                                                    count_success + count_error))
+    print(">> Data Processing Finished <<")
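For context, a minimal sketch of the config.py values the new imports assume. config.py is not part of this diff, so the LOGIN keys and the exact URL_TEMPLATE entries are illustrative; the MONTH entry simply reuses the monthly-series URL removed above, with the mAtributos flags folded into the template.

# config.py -- illustrative sketch only; the real file is not part of this diff.

# Assumed credential fields consumed by inmet_login(); the key names are guesses.
LOGIN = {
    'email': 'user@example.com',
    'password': 'change-me',
}

# Assumed mapping consumed by get_url_pattern(); keys are selected via sys.argv[1].
# The MONTH entry reuses the monthly-series URL that this diff removes from the
# crawler, with the attribute flags baked directly into the template.
URL_TEMPLATE = {
    'MONTH': (
        "http://www.inmet.gov.br/projetos/rede/pesquisa/gera_serie_txt_mensal.php?"
        "&mRelEstacao={omm_code}&btnProcesso=serie"
        "&mRelDtInicio={start_date}&mRelDtFim={end_date}"
        "&mAtributos=1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1"
    ),
}

With a config along those lines the crawler would be started as, for example, python crawler.py MONTH; with no argument it falls back to the MONTH template (the script name here is an assumption).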
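The refactored __main__ block also assumes that the parsing helpers removed here (weather_station_parser and weather_observation_parser) now live on the Station class in station.py, which is outside this diff. Below is a rough sketch of the interface it calls, reconstructed from the removed functions and from how the class is used above; the metadata field names and header labels are placeholders, and the real station.py may differ.

# station.py -- rough sketch of the interface assumed by the new __main__ block.

class Station:
    """One BDMEP station plus its raw observation rows."""

    def __init__(self, name, code, latitude, longitude, altitude, situation):
        # Field names are placeholders; the real constructor may differ.
        self.name = name
        self.code = code
        self.latitude = latitude
        self.longitude = longitude
        self.altitude = altitude
        self.situation = situation
        self.weather_observation_header = ''
        self.weather_observation = []

    @classmethod
    def parser(cls, _data):
        # Same extraction as the removed weather_station_parser(): each metadata
        # line is "label...: value" with the value starting at column 20.
        ds = _data.split('\n')
        return cls(ds[1][20:], ds[2][20:], ds[3][20:], ds[4][20:], ds[5], ds[6][20:])

    @staticmethod
    def observation_parser(_data):
        # Same splitting as the removed weather_observation_parser(): the first
        # non-empty line is the ';'-separated header, the rest are data rows.
        return list(filter(None, _data.split('\n')))

    def set_observation(self, parsed):
        self.weather_observation_header = parsed[0]
        self.weather_observation = parsed[1:]

    @staticmethod
    def get_station_header():
        # Placeholder column names for the station metadata.
        return ['Nome', 'Codigo', 'Latitude', 'Longitude', 'Altitude', 'Situacao']

    def get_station_information(self):
        return [self.name, self.code, self.latitude, self.longitude,
                self.altitude, self.situation]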