Skip to content

Commit 3e5a83b

Browse files
committed
Added the parsers
1 parent 95ddf73 commit 3e5a83b

File tree

4 files changed

+63
-23
lines changed

4 files changed

+63
-23
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,9 @@ A formula usada para o cálculo da Temperatura média compensada (TC) é:
9696

9797
## TO DO
9898
* [ ] Lista das estações: http://www.inmet.gov.br/projetos/rede/pesquisa/lista_estacao.php -> Pegar o id delas para usar como parâmetro de consulta -> Há estações que não estão nessa lista. Existem estações de aeroportos, como a de ID 82022. Checar depois em um intervalo de 82000 até 84000. Precisa modificar um pouco para pegar os dados de aeroporto porque tem mais 1 hífen.
99-
99+
* [ ] Tratamento para login com usuário errado.
100+
* [ ] Exportar para arquivo
101+
* [ ] Criar classe observation
100102

101103

102104
## Saiba mais

config.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/usr/bin/python
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
Author: Fabio Rodrigues Jorge
6+
7+
Description: Config file.
8+
"""
9+
10+
# Credentials and entry URL for the INMET research-network login page.
# NOTE(review): placeholders '<USER>'/'<PASS>' must be replaced with real
# credentials before running the crawler.
LOGIN = {
    'url': 'http://www.inmet.gov.br/projetos/rede/pesquisa/inicio.php',
    'username': '<USER>',
    'password': '<PASS>',
}

extract_data.py

Lines changed: 44 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
from selenium import webdriver
1111
from bs4 import BeautifulSoup
1212
import time
13-
# import datetime
13+
from config import LOGIN
14+
from datetime import datetime
15+
from station import Station
1416
# import csv
1517

1618

@@ -32,59 +34,82 @@ def get_page(_driver, _url):
3234
return BeautifulSoup(_driver.page_source, 'html.parser')
3335

3436

37+
def inmet_login(_driver):
    """Navigate to the INMET login page and submit the credentials from config.LOGIN."""
    get_page(_driver, LOGIN['url'])

    # Hoist the element lookup; the three form fields are filled and submitted
    # exactly as on the site's login form.
    find = _driver.find_element_by_name
    find("mCod").send_keys(LOGIN['username'])
    find("mSenha").send_keys(LOGIN['password'])
    find("btnProcesso").click()
45+
46+
3547
def load_station_numbers(_path):
    """Return the list of station numbers read from the file at *_path*.

    Each line of the file is one station number; trailing newlines are
    stripped.  Fix: the original opened the file without a context manager,
    leaking the handle if an exception occurred between open() and close().
    """
    with open(_path) as f:
        return [line.replace("\n", "") for line in f]
4154

4255

56+
def weather_station_parser(_data):
    """Parse a raw station-description text block into a Station, print and return it.

    NOTE(review): each field value is taken from column 20 of its line,
    except the line at index 5, which is passed whole — confirm that this
    asymmetry is intended.
    """
    rows = _data.split('\n')
    station = Station(rows[1][20:], rows[2][20:], rows[3][20:],
                      rows[4][20:], rows[5], rows[6][20:])
    print(station)
    return station
62+
63+
64+
def weather_observation_parser(_data):
    """Print the header line and the observation rows of a raw data block.

    Blank lines in the block are discarded; the first non-empty line is the
    header, the remainder are the observation rows.
    """
    rows = [line for line in _data.split('\n') if line]
    header, observations = rows[0], rows[1:]
    print(header)
    print(*observations, sep='\n')
70+
71+
4372
# if __name__ == '__main__':
4473

4574
print(">> WebCrawler Started <<")
4675

4776
driver = init_webdriver_config(30)
4877

49-
5078
# -- Login --
51-
login_url = "http://www.inmet.gov.br/projetos/rede/pesquisa/inicio.php"
52-
53-
soup = get_page(driver, login_url)
54-
55-
username = "<USER>"
56-
password = "<PASS>"
79+
inmet_login(driver)
5780

58-
driver.find_element_by_name("mCod").send_keys(username)
59-
driver.find_element_by_name("mSenha").send_keys(password)
60-
driver.find_element_by_name("btnProcesso").click()
6181

6282
# -- Extract --
6383
url_pattern = "http://www.inmet.gov.br/projetos/rede/pesquisa/gera_serie_txt_mensal.php?&mRelEstacao={omm_code}" \
6484
"&btnProcesso=serie&mRelDtInicio={start_date}&mRelDtFim={end_date}&mAtributos={attributes}"
6585

66-
6786
station_numbers = load_station_numbers("./data/station_numbers.txt")
6887

69-
start_date = "14/07/2014"
70-
end_date = "14/08/2019"
71-
attributes = "1,,,,,,,,,,,,1,1,1,1,"
88+
start_date = "01/07/2000"
89+
end_date = datetime.now().strftime("%d/%m/%Y")
90+
# attributes = "1,,,,,,,,,,,,1,1,1,1,"
91+
attributes = "1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1"
7292

7393
count_success = 0
7494
count_error = 0
95+
7596
for omm_code in station_numbers[:4]:
7697
url = url_pattern.format(omm_code=omm_code, start_date=start_date, end_date=end_date, attributes=attributes)
7798
soup = get_page(driver, url)
7899

79-
data = soup.select('pre')
100+
soup_pre = soup.select('pre')
80101

81-
if 'Não existem dados disponiveis' in data[0].text:
102+
if 'Não existem dados disponiveis' in soup_pre[0].text:
82103
count_error += 1
83104
else:
84105
count_success += 1
85-
print(data[0].text)
106+
content = soup_pre[0].text.split('--------------------')
107+
108+
station = weather_station_parser(content[2])
109+
weather_observation_parser(content[4])
110+
86111

87-
driver.quit()
112+
# driver.quit()
88113

89114
print(">> WebCrawler Finished <<")
90115
print("SUCCESS: {0}\nERROR: {1}\nTOTAL: {2}".format(count_success, count_error,

station.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env python
22
# -*- coding: utf-8 -*-
33

4-
from .state import State
4+
from state import State
55

66

77
class Station:
@@ -20,5 +20,5 @@ def __init__(self, name_complete, lat, lng, alt, situation, op_date):
2020
self.op_start_date = op_date
2121

2222
def __str__(self):
23-
return "Estação: {0}\nLatitude: {1}\nLongitude: {2}\nAltitude: {3}\nSituação: {4}\nEm operação desde {0}"\
24-
.format(self.name, self.omm_code, self.lat, self.lng, self.alt, self.situation, self.op_start_date)
23+
return "Estação: {0}\nLatitude: {1}\nLongitude: {2}\nAltitude: {3}\nSituação: {4}\nEm operação desde {5}"\
24+
.format(self.name, self.lat, self.lng, self.alt, self.situation, self.op_start_date)

0 commit comments

Comments
 (0)