Skip to content

Commit c432571

Browse files
committed
intento de subida 2
1 parent c76573d commit c432571

File tree

4 files changed

+1150
-0
lines changed

4 files changed

+1150
-0
lines changed
Lines changed: 280 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,280 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": null,
6+
"id": "5ef22780-d8af-4dac-9fc2-7014ae48ea64",
7+
"metadata": {},
8+
"outputs": [],
9+
"source": [
10+
"#Para crear el environment ----------------------------\n",
11+
"#--------------Creamos el environment\n",
12+
"# conda create -n Tarea1 python=3.11\n",
13+
"#-------------Lo activamos\n",
14+
"# conda activate Tarea1 \n",
15+
"#------------Vamos a la ruta del txt\n",
16+
"# cd Documents/GitHub/Data-Science-Python/homework/hw1/259359_hw1_2025_1\n",
17+
"#------------Instalamos\n",
18+
"# pip install -r requirements.txt "
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": 5,
24+
"id": "fbf681bd-2fb1-42f1-baeb-91c5821757bc",
25+
"metadata": {},
26+
"outputs": [
27+
{
28+
"name": "stdout",
29+
"output_type": "stream",
30+
"text": [
31+
"Collecting webdriver-manager\n",
32+
" Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)\n",
33+
"Requirement already satisfied: requests in c:\\users\\fabrizio\\anaconda3\\envs\\tarea1\\lib\\site-packages (from webdriver-manager) (2.32.3)\n",
34+
"Collecting python-dotenv (from webdriver-manager)\n",
35+
" Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)\n",
36+
"Requirement already satisfied: packaging in c:\\users\\fabrizio\\anaconda3\\envs\\tarea1\\lib\\site-packages (from webdriver-manager) (24.2)\n",
37+
"Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\fabrizio\\anaconda3\\envs\\tarea1\\lib\\site-packages (from requests->webdriver-manager) (3.4.1)\n",
38+
"Requirement already satisfied: idna<4,>=2.5 in c:\\users\\fabrizio\\anaconda3\\envs\\tarea1\\lib\\site-packages (from requests->webdriver-manager) (3.10)\n",
39+
"Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\fabrizio\\anaconda3\\envs\\tarea1\\lib\\site-packages (from requests->webdriver-manager) (2.3.0)\n",
40+
"Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\fabrizio\\anaconda3\\envs\\tarea1\\lib\\site-packages (from requests->webdriver-manager) (2025.1.31)\n",
41+
"Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)\n",
42+
"Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)\n",
43+
"Installing collected packages: python-dotenv, webdriver-manager\n",
44+
"Successfully installed python-dotenv-1.1.0 webdriver-manager-4.0.2\n"
45+
]
46+
}
47+
],
48+
"source": [
49+
"!pip install webdriver-manager #instalo el driver"
50+
]
51+
},
52+
{
53+
"cell_type": "code",
54+
"execution_count": 9,
55+
"id": "f11af34a-9478-4b18-8913-4bde935da04f",
56+
"metadata": {},
57+
"outputs": [],
58+
"source": [
59+
"#importo las librerías que se instalaron arriba\n",
60+
"from selenium import webdriver\n",
61+
"from selenium.webdriver.chrome.service import Service\n",
62+
"from webdriver_manager.chrome import ChromeDriverManager\n",
63+
"from selenium.webdriver.common.by import By\n",
64+
"from selenium.webdriver.support.ui import WebDriverWait\n",
65+
"from selenium.webdriver.support import expected_conditions as EC\n",
66+
"import time\n",
67+
"import re\n",
68+
"import pandas as pd"
69+
]
70+
},
71+
{
72+
"cell_type": "code",
73+
"execution_count": 53,
74+
"id": "b76f4fb9-d540-416c-9e67-d3447e871032",
75+
"metadata": {},
76+
"outputs": [],
77+
"source": [
78+
"# Start a Chrome WebDriver session and open the Bumeran job listings page\n",
79+
"driver = webdriver.Chrome()\n",
80+
"url = \"https://www.bumeran.com.pe/empleos.html\"\n",
81+
"driver.get(url)\n",
82+
"driver.maximize_window()\n",
83+
"time.sleep(5) # wait for the page to open\n",
84+
"driver.execute_script(\"document.body.style.zoom='100%'\") # set the page zoom to 100%"
85+
]
86+
},
87+
{
88+
"cell_type": "markdown",
89+
"id": "9e5051a4-ed60-489b-867d-adb48384d820",
90+
"metadata": {},
91+
"source": [
92+
"----------FILTROS------------------------"
93+
]
94+
},
95+
{
96+
"cell_type": "code",
97+
"execution_count": null,
98+
"id": "b911f333-cdc5-43a8-a31f-6c84a942538d",
99+
"metadata": {},
100+
"outputs": [],
101+
"source": [
102+
"# Open the publication-date filter and select \"Menor a 15 días\". NOTE(review): the fixed time.sleep pauses presumably let the page refresh between clicks — confirm\n",
103+
"driver.find_element(By.NAME, 'icon-light-calendar').click()\n",
104+
"time.sleep(3)\n",
105+
"\n",
106+
"fecha_15_dias = driver.find_element(By.XPATH, \"//button[contains(text(), 'Menor a 15 días')]\")\n",
107+
"fecha_15_dias.click()\n",
108+
"time.sleep(3)\n",
109+
"\n",
110+
"# Open the job-area menu and choose \"Tecnología, Sistemas y Telecomunicaciones\"\n",
111+
"driver.find_element(By.NAME, 'icon-light-cube').click()\n",
112+
"time.sleep(3)\n",
113+
"\n",
114+
"area_tecnologia = driver.find_element(By.XPATH, \"//button[contains(text(), 'Tecnología, Sistemas y Telecomunicaciones')]\")\n",
115+
"area_tecnologia.click()\n",
116+
"time.sleep(3)\n",
117+
"\n",
118+
"# Open the sub-area menu and select \"Programación\"\n",
119+
"driver.find_element(By.NAME, 'icon-light-layers').click()\n",
120+
"time.sleep(3)\n",
121+
"\n",
122+
"subarea_programacion = driver.find_element(By.XPATH, \"//button[contains(text(), 'Programación')]\")\n",
123+
"subarea_programacion.click()\n",
124+
"time.sleep(7)\n",
125+
"\n",
126+
"# Set the location filter to \"Lima\" (the click goes to the parent of the location-pin icon)\n",
127+
"driver.find_element(By.XPATH, \"//button//i[@name='icon-light-location-pin']/..\").click()\n",
128+
"time.sleep(9)\n",
129+
"\n",
130+
"lima = driver.find_element(By.XPATH, \"//button[contains(text(), 'Lima')]\")\n",
131+
"lima.click()\n",
132+
"time.sleep(10)\n",
133+
"\n",
134+
"# Set the working-hours filter to \"Full-time\"\n",
135+
"driver.find_element(By.NAME, 'icon-light-bookmark').click()\n",
136+
"time.sleep(10)\n",
137+
"\n",
138+
"full_time = driver.find_element(By.XPATH, \"//button[contains(text(), 'Full-time')]\")\n",
139+
"full_time.click()\n"
140+
]
141+
},
142+
{
143+
"cell_type": "code",
144+
"execution_count": null,
145+
"id": "e02a58e5-efb8-41f1-bac2-5abeb10f1614",
146+
"metadata": {},
147+
"outputs": [],
148+
"source": [
149+
"# Esperar a que los botones con enlaces de empleo carguen en la página\n",
150+
"WebDriverWait(driver, 8).until(EC.presence_of_all_elements_located((By.CLASS_NAME, \"sc-zDqdV\")))\n",
151+
"# Buscar todos los botones que contienen los enlaces de las ofertas\n",
152+
"botones_enlaces = driver.find_elements(By.CLASS_NAME, \"sc-zDqdV\")\n",
153+
"# Extraer los enlaces de cada botón y guardarlos en una lista\n",
154+
"enlaces_empleos = [boton.get_attribute(\"href\") for boton in botones_enlaces]"
155+
]
156+
},
157+
{
158+
"cell_type": "code",
159+
"execution_count": null,
160+
"id": "e2bf51cf-c22d-4384-b085-7da2eecf4a68",
161+
"metadata": {},
162+
"outputs": [],
163+
"source": [
164+
"enlaces_empleos # comprobamos"
165+
]
166+
},
167+
{
168+
"cell_type": "code",
169+
"execution_count": null,
170+
"id": "b49ad877-824e-49af-950f-36a0aab6ee07",
171+
"metadata": {},
172+
"outputs": [],
173+
"source": [
174+
"#✅ Stage 2: Scrape Job Details\n",
175+
"# Start a new Chrome session and maximize it. NOTE(review): this rebinds driver; the session opened earlier in the notebook is not quit first\n",
176+
"driver = webdriver.Chrome()\n",
177+
"driver.maximize_window()\n",
178+
"\n",
179+
"# Accumulates one row per job posting: [title, description, district, modality, link]\n",
180+
"datos_empleos = []\n",
181+
"\n",
182+
"# Función para obtener el distrito\n",
183+
"def obtener_distrito(driver):\n",
184+
" xpaths_distrito = [\n",
185+
" \"//div[contains(@class, 'ubicacion')]//h2\",\n",
186+
" \"//li[contains(@class, 'ubicacion')]//a\"\n",
187+
" ]\n",
188+
" for xpath in xpaths_distrito:\n",
189+
" elementos = driver.find_elements(By.XPATH, xpath)\n",
190+
" if elementos:\n",
191+
" return elementos[0].text.split(\",\")[0].strip() # Extraemos solo el distrito\n",
192+
" return \"Distrito no encontrado\"\n",
193+
"\n",
194+
"# Función para obtener la modalidad de trabajo\n",
195+
"def obtener_modalidad(driver):\n",
196+
" xpaths_modalidad = [\n",
197+
" \"//ul[contains(@class, 'modalidad')]//li/a\",\n",
198+
" \"//div[contains(@class, 'detalles')]//ul/li\"\n",
199+
" ]\n",
200+
" for xpath in xpaths_modalidad:\n",
201+
" elementos = driver.find_elements(By.XPATH, xpath)\n",
202+
" if elementos:\n",
203+
" return elementos[0].text.strip()\n",
204+
" return \"No especificado\"\n",
205+
"\n",
206+
"# Función para obtener la descripción del empleo\n",
207+
"def obtener_descripcion(driver):\n",
208+
" try:\n",
209+
" descripcion_completa = driver.find_element(By.ID, \"ficha-detalle\").text\n",
210+
" descripcion, _, _ = descripcion_completa.partition(\"Beneficios\") # Toma solo lo anterior a \"Beneficios\"\n",
211+
" return descripcion.strip()\n",
212+
" except:\n",
213+
" return \"Descripción no disponible\"\n",
214+
"\n",
215+
"# Recorremos los enlaces y extraemos la información\n",
216+
"for enlace in lista_enlaces:\n",
217+
" try:\n",
218+
" driver.get(enlace)\n",
219+
"\n",
220+
" # Extraer título\n",
221+
" titulo = WebDriverWait(driver, 8).until(\n",
222+
" EC.presence_of_element_located((By.TAG_NAME, 'h1'))\n",
223+
" ).text\n",
224+
"\n",
225+
" # Extraer información usando funciones\n",
226+
" distrito = obtener_distrito(driver)\n",
227+
" modalidad = obtener_modalidad(driver)\n",
228+
" descripcion = obtener_descripcion(driver)\n",
229+
"\n",
230+
" # Guardamos los datos\n",
231+
" datos_empleos.append([titulo, descripcion, distrito, modalidad, enlace])\n",
232+
"\n",
233+
" print(f\"Datos obtenidos de: {enlace}\")\n",
234+
"\n",
235+
" except Exception as e:\n",
236+
" print(f\"Error al procesar {enlace}: {e}\")\n",
237+
"\n",
238+
"# Cerrar navegador\n",
239+
"driver.quit()\n",
240+
"\n",
241+
"# Guardar en un archivo Excel\n",
242+
"df = pd.DataFrame(datos_empleos, columns=[\"Título\", \"Descripción\", \"Distrito\", \"Modalidad\", \"Enlace\"])\n",
243+
"df.to_csv(\"Trabajos_Bumeran.csv\", index=False, encoding=\"utf-8\", sep=\"|\")\n",
244+
"\n",
245+
"print(\"Extracción completada. Datos guardados en 'Trabajos_Bumeran.xlsx'.\")"
246+
]
247+
},
248+
{
249+
"cell_type": "code",
250+
"execution_count": null,
251+
"id": "4b5c1235-1191-4c1b-b035-a5cfbfb5cd5c",
252+
"metadata": {},
253+
"outputs": [],
254+
"source": [
255+
"#Gracias :)))))"
256+
]
257+
}
258+
],
259+
"metadata": {
260+
"kernelspec": {
261+
"display_name": "Python [conda env:Tarea1]",
262+
"language": "python",
263+
"name": "conda-env-Tarea1-py"
264+
},
265+
"language_info": {
266+
"codemirror_mode": {
267+
"name": "ipython",
268+
"version": 3
269+
},
270+
"file_extension": ".py",
271+
"mimetype": "text/x-python",
272+
"name": "python",
273+
"nbconvert_exporter": "python",
274+
"pygments_lexer": "ipython3",
275+
"version": "3.11.11"
276+
}
277+
},
278+
"nbformat": 4,
279+
"nbformat_minor": 5
280+
}

0 commit comments

Comments
 (0)