
Commit f515d47

Authored Apr 18, 2025
Add files via upload
1 parent dffe716 commit f515d47

File tree

4 files changed: +368 -0 lines changed
 
Lines changed: 273 additions & 0 deletions
@@ -0,0 +1,273 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c5a3099c-85f4-4894-bf34-a311f3aa3c8f",
   "metadata": {},
   "source": [
    "### Install Selenium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "b3982d80-89df-48c2-9544-a4fd283fba66",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Selenium and WebDriver Manager libraries (as used in class)\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.chrome.service import Service\n",
    "from selenium.webdriver.support.ui import WebDriverWait\n",
    "from selenium.webdriver.support import expected_conditions as EC\n",
    "from webdriver_manager.chrome import ChromeDriverManager\n",
    "import time  # for pauses when needed"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fd18adea-a474-460d-b3da-3a99237e3a6f",
   "metadata": {},
   "source": [
    "### Start the browser (Chrome)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "6d55575a-8002-45f6-b9dd-f946b1cddc29",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the Chrome browser and maximize the window\n",
    "driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))\n",
    "driver.maximize_window()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c8ff444d-6687-4eb2-80bf-7a5c6621b3bc",
   "metadata": {},
   "source": [
    "### Main page for the job search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "8c4448e1-3632-4270-8caa-572768672be3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the main page\n",
    "driver.get(\"https://www.bumeran.com.pe\")\n",
    "time.sleep(2)  # wait 2 seconds for the page to load"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "54b3ec17-a4da-4bd0-a794-451812833618",
   "metadata": {},
   "source": [
    "### Data Science in the search box"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "56a92580-3380-4bd1-80d8-34fe93c7c456",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 13 <input> fields\n",
      "0: placeholder = \n",
      "1: placeholder = \n",
      "2: placeholder = \n",
      "3: placeholder = \n",
      "4: placeholder = \n",
      "5: placeholder = \n",
      "6: placeholder = \n",
      "7: placeholder = Buscar por puesto o palabra clave\n",
      "8: placeholder = \n",
      "9: placeholder = \n",
      "10: placeholder = Buscar por puesto o palabra clave\n",
      "11: placeholder = \n",
      "12: placeholder = \n"
     ]
    }
   ],
   "source": [
    "# Load the page and wait 5 seconds so it renders completely\n",
    "driver.get(\"https://www.bumeran.com.pe\")\n",
    "time.sleep(5)  # wait for the page to load completely\n",
    "\n",
    "# See how many <input> elements there are in the DOM\n",
    "inputs = driver.find_elements(By.TAG_NAME, \"input\")\n",
    "print(f\"Found {len(inputs)} <input> fields\")\n",
    "for i, inp in enumerate(inputs):\n",
    "    print(f\"{i}: placeholder = {inp.get_attribute('placeholder')}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bccab618-c7a6-4282-b6e7-2a9d9d594974",
   "metadata": {},
   "source": [
    "### Search for Data Science in Lima"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "131d3ce0-7c95-41f6-9f61-b6dfd5682b97",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the results page with the filters already applied\n",
    "url = \"https://www.bumeran.com.pe/en-lima/empleos-busqueda-data-science.html\"\n",
    "driver.get(url)\n",
    "time.sleep(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "99ca2e33-5bfa-427b-832c-283632c1a484",
   "metadata": {},
   "source": [
    "### General info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "17c5f586-a576-4220-a4f5-2fb34e1438b9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total <a> found: 29\n",
      "https://www.bumeran.com.pe/empleos-seniority-junior.html?landing-jovenes-profesionales=true\n",
      "https://www.bumeran.com.pe/empleos-seniority-gerencia-alta-gerencia-direccion.html?landing-puestos-ejecutivos=true\n",
      "https://www.bumeran.com.pe/empleos.html\n",
      "https://www.bumeran.com.pe/en-lima/empleos-busqueda-data-science.html?relevantes=true\n",
      "https://www.bumeran.com.pe/en-lima/empleos-busqueda-data-science.html?recientes=true\n",
      "https://www.bumeran.com.pe/empleos/docente-de-data-science-machine-learning-visiva-1116743186.html\n",
      "https://www.bumeran.com.pe/empleos/analista-data-science-20607325163-1116660540.html\n",
      "https://www.bumeran.com.pe/empleos/practicante-data-science-rimac-cia-de-seguros-y-reaseguros-s.a.-1116658516.html\n",
      "https://www.bumeran.com.pe/empleos/coordinador-de-data-science-industrias-san-miguel-1116658349.html\n",
      "https://www.bumeran.com.pe/empleos/practicante-pre-profesional-en-data-science-e-ai-1116609896.html\n",
      "https://www.bumeran.com.pe/en-lima/empleos-busqueda-data-science.html#\n",
      "https://www.bumeran.com.pe/en-lima/empleos-busqueda-data-science.html\n",
      "https://www.bumeran.com.pe/en-lima/empleos-busqueda-data-science.html#\n"
     ]
    }
   ],
   "source": [
    "# General info\n",
    "import time\n",
    "time.sleep(5)\n",
    "\n",
    "# Test: print every job-related href on the page\n",
    "anchors = driver.find_elements(By.TAG_NAME, \"a\")\n",
    "\n",
    "print(\"Total <a> found:\", len(anchors))\n",
    "for a in anchors:\n",
    "    href = a.get_attribute(\"href\")\n",
    "    if href and (\"data-science\" in href or \"empleo\" in href.lower()):\n",
    "        print(href)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8cae3929-fd4b-464e-9208-b026c50d09f1",
   "metadata": {},
   "source": [
    "### Filter (2nd stage)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "a7eb93f0-b538-41d7-85d8-88dc6bc47913",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ File saved as: bumeran_data_science_jobs.csv\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from selenium.webdriver.common.by import By\n",
    "import time\n",
    "\n",
    "def get_job_details(driver, link):\n",
    "    driver.get(link)\n",
    "    time.sleep(2)\n",
    "\n",
    "    def try_xpath(xpath):\n",
    "        try:\n",
    "            return driver.find_element(By.XPATH, xpath).text\n",
    "        except Exception:\n",
    "            return \"N/A\"\n",
    "\n",
    "    distrito = try_xpath(\"//h2[contains(text(), 'Distrito')]/following-sibling::p\")\n",
    "    modalidad = try_xpath(\"//h2[contains(text(), 'Modalidad de trabajo')]/following-sibling::p\")\n",
    "    nivel = try_xpath(\"//h2[contains(text(), 'Nivel laboral')]/following-sibling::p\")\n",
    "    discapacidad = try_xpath(\"//h2[contains(text(), 'Postulantes con discapacidad')]/following-sibling::p\")\n",
    "\n",
    "    return {\n",
    "        \"URL\": link,\n",
    "        \"Distrito\": distrito,\n",
    "        \"Modalidad de trabajo\": modalidad,\n",
    "        \"Nivel laboral\": nivel,\n",
    "        \"Discapacidad\": discapacidad\n",
    "    }\n",
    "\n",
    "# job_links is never defined in the uploaded files; it is assumed to hold the\n",
    "# job-posting URLs found on the results page, collected for example like this:\n",
    "job_links = []\n",
    "for a in driver.find_elements(By.TAG_NAME, \"a\"):\n",
    "    href = a.get_attribute(\"href\")\n",
    "    if href and \"/empleos/\" in href:\n",
    "        job_links.append(href)\n",
    "\n",
    "# Visit every link and collect the details\n",
    "data = []\n",
    "for i, link in enumerate(job_links):\n",
    "    print(f\"[{i+1}/{len(job_links)}] Scraping: {link}\")\n",
    "    details = get_job_details(driver, link)\n",
    "    data.append(details)\n",
    "\n",
    "# Save to CSV\n",
    "df = pd.DataFrame(data)\n",
    "df.to_csv(\"bumeran_data_science_jobs.csv\", index=False)\n",
    "print(\"✅ File saved as: bumeran_data_science_jobs.csv\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
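Note on waits: the notebook imports WebDriverWait and expected_conditions but ultimately relies on fixed time.sleep() pauses. Below is a minimal sketch of an explicit wait; it assumes the driver created in the cells above, and the CSS selector is an assumption built from the placeholder text reported by the input scan ("Buscar por puesto o palabra clave"), not a selector taken from the original code.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the search box instead of sleeping a fixed time.
# The selector is a guess based on the placeholder printed by the input scan.
wait = WebDriverWait(driver, 10)
search_box = wait.until(
    EC.visibility_of_element_located(
        (By.CSS_SELECTOR, "input[placeholder='Buscar por puesto o palabra clave']")
    )
)
search_box.send_keys("Data Science")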
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
### Install Selenium
# Selenium and WebDriver Manager libraries (as used in class)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time  # for pauses when needed

### Start the browser (Chrome)
# Create the Chrome browser and maximize the window
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()

### Main page for the job search
# Open the main page
driver.get("https://www.bumeran.com.pe")
time.sleep(2)  # wait 2 seconds for the page to load

### Data Science in the search box
# Load the page and wait 5 seconds so it renders completely
driver.get("https://www.bumeran.com.pe")
time.sleep(5)  # wait for the page to load completely

# See how many <input> elements there are in the DOM
inputs = driver.find_elements(By.TAG_NAME, "input")
print(f"Found {len(inputs)} <input> fields")
for i, inp in enumerate(inputs):
    print(f"{i}: placeholder = {inp.get_attribute('placeholder')}")

### Search for Data Science in Lima
# Load the results page with the filters already applied
url = "https://www.bumeran.com.pe/en-lima/empleos-busqueda-data-science.html"
driver.get(url)
time.sleep(3)

# General info
import time
time.sleep(5)

# Test: print every job-related href on the page
anchors = driver.find_elements(By.TAG_NAME, "a")

print("Total <a> found:", len(anchors))
for a in anchors:
    href = a.get_attribute("href")
    if href and ("data-science" in href or "empleo" in href.lower()):
        print(href)

# 2nd stage

import pandas as pd
from selenium.webdriver.common.by import By
import time

def get_job_details(driver, link):
    driver.get(link)
    time.sleep(2)

    def try_xpath(xpath):
        try:
            return driver.find_element(By.XPATH, xpath).text
        except Exception:
            return "N/A"

    distrito = try_xpath("//h2[contains(text(), 'Distrito')]/following-sibling::p")
    modalidad = try_xpath("//h2[contains(text(), 'Modalidad de trabajo')]/following-sibling::p")
    nivel = try_xpath("//h2[contains(text(), 'Nivel laboral')]/following-sibling::p")
    discapacidad = try_xpath("//h2[contains(text(), 'Postulantes con discapacidad')]/following-sibling::p")

    return {
        "URL": link,
        "Distrito": distrito,
        "Modalidad de trabajo": modalidad,
        "Nivel laboral": nivel,
        "Discapacidad": discapacidad
    }

# job_links is never defined in the uploaded files; it is assumed to hold the
# job-posting URLs found on the results page, collected for example like this:
job_links = []
for a in driver.find_elements(By.TAG_NAME, "a"):
    href = a.get_attribute("href")
    if href and "/empleos/" in href:
        job_links.append(href)

# Visit every link and collect the details
data = []
for i, link in enumerate(job_links):
    print(f"[{i+1}/{len(job_links)}] Scraping: {link}")
    details = get_job_details(driver, link)
    data.append(details)

# Save to CSV
df = pd.DataFrame(data)
df.to_csv("bumeran_data_science_jobs.csv", index=False)
print("✅ File saved as: bumeran_data_science_jobs.csv")
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
selenium
webdriver-manager
4+
