|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": null, |
| 6 | + "id": "5ef22780-d8af-4dac-9fc2-7014ae48ea64", |
| 7 | + "metadata": {}, |
| 8 | + "outputs": [], |
| 9 | + "source": [ |
| 10 | + "#Para crear el environment ----------------------------\n", |
| 11 | + "#--------------Creamos el environment\n", |
| 12 | + "# conda create -n Tarea1 python=3.11\n", |
| 13 | + "#-------------Lo activamos\n", |
| 14 | + "# conda activate Tarea1 \n", |
| 15 | + "#------------Vamos a la ruta del txt\n", |
| 16 | + "# cd Documents/GitHub/Data-Science-Python/homework/hw1/259359_hw1_2025_1\n", |
| 17 | + "#------------Instalamos\n", |
| 18 | + "# pip install -r requirements.txt " |
| 19 | + ] |
| 20 | + }, |
| 21 | + { |
| 22 | + "cell_type": "code", |
| 23 | + "execution_count": 5, |
| 24 | + "id": "fbf681bd-2fb1-42f1-baeb-91c5821757bc", |
| 25 | + "metadata": {}, |
| 26 | + "outputs": [ |
| 27 | + { |
| 28 | + "name": "stdout", |
| 29 | + "output_type": "stream", |
| 30 | + "text": [ |
| 31 | + "Collecting webdriver-manager\n", |
| 32 | + " Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)\n", |
| 33 | + "Requirement already satisfied: requests in c:\\users\\fabrizio\\anaconda3\\envs\\tarea1\\lib\\site-packages (from webdriver-manager) (2.32.3)\n", |
| 34 | + "Collecting python-dotenv (from webdriver-manager)\n", |
| 35 | + " Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)\n", |
| 36 | + "Requirement already satisfied: packaging in c:\\users\\fabrizio\\anaconda3\\envs\\tarea1\\lib\\site-packages (from webdriver-manager) (24.2)\n", |
| 37 | + "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\fabrizio\\anaconda3\\envs\\tarea1\\lib\\site-packages (from requests->webdriver-manager) (3.4.1)\n", |
| 38 | + "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\fabrizio\\anaconda3\\envs\\tarea1\\lib\\site-packages (from requests->webdriver-manager) (3.10)\n", |
| 39 | + "Requirement already satisfied: urllib3<3,>=1.21.1 in c:\\users\\fabrizio\\anaconda3\\envs\\tarea1\\lib\\site-packages (from requests->webdriver-manager) (2.3.0)\n", |
| 40 | + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\fabrizio\\anaconda3\\envs\\tarea1\\lib\\site-packages (from requests->webdriver-manager) (2025.1.31)\n", |
| 41 | + "Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)\n", |
| 42 | + "Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)\n", |
| 43 | + "Installing collected packages: python-dotenv, webdriver-manager\n", |
| 44 | + "Successfully installed python-dotenv-1.1.0 webdriver-manager-4.0.2\n" |
| 45 | + ] |
| 46 | + } |
| 47 | + ], |
| 48 | + "source": [ |
| 49 | + "!pip install webdriver-manager #instalo el driver" |
| 50 | + ] |
| 51 | + }, |
| 52 | + { |
| 53 | + "cell_type": "code", |
| 54 | + "execution_count": 9, |
| 55 | + "id": "f11af34a-9478-4b18-8913-4bde935da04f", |
| 56 | + "metadata": {}, |
| 57 | + "outputs": [], |
| 58 | + "source": [ |
| 59 | + "#importo las librerías que se instalaron arriba\n", |
| 60 | + "from selenium import webdriver\n", |
| 61 | + "from selenium.webdriver.chrome.service import Service\n", |
| 62 | + "from webdriver_manager.chrome import ChromeDriverManager\n", |
| 63 | + "from selenium.webdriver.common.by import By\n", |
| 64 | + "from selenium.webdriver.support.ui import WebDriverWait\n", |
| 65 | + "from selenium.webdriver.support import expected_conditions as EC\n", |
| 66 | + "import time\n", |
| 67 | + "import re\n", |
| 68 | + "import pandas as pd" |
| 69 | + ] |
| 70 | + }, |
| 71 | + { |
| 72 | + "cell_type": "code", |
| 73 | + "execution_count": 53, |
| 74 | + "id": "b76f4fb9-d540-416c-9e67-d3447e871032", |
| 75 | + "metadata": {}, |
| 76 | + "outputs": [], |
| 77 | + "source": [ |
| 78 | + "## Launch the Chrome driver and open the Bumeran job listings page\n", |
| 79 | + "driver = webdriver.Chrome()\n", |
| 80 | + "url = \"https://www.bumeran.com.pe/empleos.html\"\n", |
| 81 | + "driver.get(url)\n", |
| 82 | + "driver.maximize_window()\n", |
| 83 | + "time.sleep(5) # fixed pause to give the page time to load\n", |
| 84 | + "driver.execute_script(\"document.body.style.zoom='100%'\") # set the page zoom to 100%" |
| 85 | + ] |
| 86 | + }, |
| 87 | + { |
| 88 | + "cell_type": "markdown", |
| 89 | + "id": "9e5051a4-ed60-489b-867d-adb48384d820", |
| 90 | + "metadata": {}, |
| 91 | + "source": [ |
| 92 | + "## Filtros" |
| 93 | + ] |
| 94 | + }, |
| 95 | + { |
| 96 | + "cell_type": "code", |
| 97 | + "execution_count": null, |
| 98 | + "id": "b911f333-cdc5-43a8-a31f-6c84a942538d", |
| 99 | + "metadata": {}, |
| 100 | + "outputs": [], |
| 101 | + "source": [ |
| 102 | + "# Abriendo el filtro de fecha de publicación y seleccionando los últimos 15 días\n", |
| 103 | + "driver.find_element(By.NAME, 'icon-light-calendar').click()\n", |
| 104 | + "time.sleep(3)\n", |
| 105 | + "\n", |
| 106 | + "fecha_15_dias = driver.find_element(By.XPATH, \"//button[contains(text(), 'Menor a 15 días')]\")\n", |
| 107 | + "fecha_15_dias.click()\n", |
| 108 | + "time.sleep(3)\n", |
| 109 | + "\n", |
| 110 | + "# Entrando al menú de áreas laborales y eligiendo \"Tecnología, Sistemas y Telecomunicaciones\"\n", |
| 111 | + "driver.find_element(By.NAME, 'icon-light-cube').click()\n", |
| 112 | + "time.sleep(3)\n", |
| 113 | + "\n", |
| 114 | + "area_tecnologia = driver.find_element(By.XPATH, \"//button[contains(text(), 'Tecnología, Sistemas y Telecomunicaciones')]\")\n", |
| 115 | + "area_tecnologia.click()\n", |
| 116 | + "time.sleep(3)\n", |
| 117 | + "\n", |
| 118 | + "# Accediendo al menú de subáreas y seleccionando \"Programación\"\n", |
| 119 | + "driver.find_element(By.NAME, 'icon-light-layers').click()\n", |
| 120 | + "time.sleep(3)\n", |
| 121 | + "\n", |
| 122 | + "subarea_programacion = driver.find_element(By.XPATH, \"//button[contains(text(), 'Programación')]\")\n", |
| 123 | + "subarea_programacion.click()\n", |
| 124 | + "time.sleep(7)\n", |
| 125 | + "\n", |
| 126 | + "# Configurando el filtro de ubicación en \"Lima\"\n", |
| 127 | + "driver.find_element(By.XPATH, \"//button//i[@name='icon-light-location-pin']/..\").click()\n", |
| 128 | + "time.sleep(9)\n", |
| 129 | + "\n", |
| 130 | + "lima = driver.find_element(By.XPATH, \"//button[contains(text(), 'Lima')]\")\n", |
| 131 | + "lima.click()\n", |
| 132 | + "time.sleep(10)\n", |
| 133 | + "\n", |
| 134 | + "# Ajustando la preferencia de jornada laboral a \"Full-time\"\n", |
| 135 | + "driver.find_element(By.NAME, 'icon-light-bookmark').click()\n", |
| 136 | + "time.sleep(10)\n", |
| 137 | + "\n", |
| 138 | + "full_time = driver.find_element(By.XPATH, \"//button[contains(text(), 'Full-time')]\")\n", |
| 139 | + "full_time.click()\n" |
| 140 | + ] |
| 141 | + }, |
| 142 | + { |
| 143 | + "cell_type": "code", |
| 144 | + "execution_count": null, |
| 145 | + "id": "e02a58e5-efb8-41f1-bac2-5abeb10f1614", |
| 146 | + "metadata": {}, |
| 147 | + "outputs": [], |
| 148 | + "source": [ |
| 149 | + "# Esperar a que los botones con enlaces de empleo carguen en la página\n", |
| 150 | + "WebDriverWait(driver, 8).until(EC.presence_of_all_elements_located((By.CLASS_NAME, \"sc-zDqdV\")))\n", |
| 151 | + "# Buscar todos los botones que contienen los enlaces de las ofertas\n", |
| 152 | + "botones_enlaces = driver.find_elements(By.CLASS_NAME, \"sc-zDqdV\")\n", |
| 153 | + "# Extraer los enlaces de cada botón y guardarlos en una lista\n", |
| 154 | + "enlaces_empleos = [boton.get_attribute(\"href\") for boton in botones_enlaces]" |
| 155 | + ] |
| 156 | + }, |
| 157 | + { |
| 158 | + "cell_type": "code", |
| 159 | + "execution_count": null, |
| 160 | + "id": "e2bf51cf-c22d-4384-b085-7da2eecf4a68", |
| 161 | + "metadata": {}, |
| 162 | + "outputs": [], |
| 163 | + "source": [ |
| 164 | + "enlaces_empleos # display the collected links as a sanity check" |
| 165 | + ] |
| 166 | + }, |
| 167 | + { |
| 168 | + "cell_type": "code", |
| 169 | + "execution_count": null, |
| 170 | + "id": "b49ad877-824e-49af-950f-36a0aab6ee07", |
| 171 | + "metadata": {}, |
| 172 | + "outputs": [], |
| 173 | + "source": [ |
| 174 | + "#✅ Stage 2: Scrape Job Details\n", |
| 175 | + "# Iniciamos el navegador y lo maximizamos\n", |
| 176 | + "driver = webdriver.Chrome()\n", |
| 177 | + "driver.maximize_window()\n", |
| 178 | + "\n", |
| 179 | + "# Lista para almacenar la información\n", |
| 180 | + "datos_empleos = []\n", |
| 181 | + "\n", |
| 182 | + "# Función para obtener el distrito\n", |
| 183 | + "def obtener_distrito(driver):\n", |
| 184 | + " xpaths_distrito = [\n", |
| 185 | + " \"//div[contains(@class, 'ubicacion')]//h2\",\n", |
| 186 | + " \"//li[contains(@class, 'ubicacion')]//a\"\n", |
| 187 | + " ]\n", |
| 188 | + " for xpath in xpaths_distrito:\n", |
| 189 | + " elementos = driver.find_elements(By.XPATH, xpath)\n", |
| 190 | + " if elementos:\n", |
| 191 | + " return elementos[0].text.split(\",\")[0].strip() # Extraemos solo el distrito\n", |
| 192 | + " return \"Distrito no encontrado\"\n", |
| 193 | + "\n", |
| 194 | + "# Función para obtener la modalidad de trabajo\n", |
| 195 | + "def obtener_modalidad(driver):\n", |
| 196 | + " xpaths_modalidad = [\n", |
| 197 | + " \"//ul[contains(@class, 'modalidad')]//li/a\",\n", |
| 198 | + " \"//div[contains(@class, 'detalles')]//ul/li\"\n", |
| 199 | + " ]\n", |
| 200 | + " for xpath in xpaths_modalidad:\n", |
| 201 | + " elementos = driver.find_elements(By.XPATH, xpath)\n", |
| 202 | + " if elementos:\n", |
| 203 | + " return elementos[0].text.strip()\n", |
| 204 | + " return \"No especificado\"\n", |
| 205 | + "\n", |
| 206 | + "# Función para obtener la descripción del empleo\n", |
| 207 | + "def obtener_descripcion(driver):\n", |
| 208 | + " try:\n", |
| 209 | + " descripcion_completa = driver.find_element(By.ID, \"ficha-detalle\").text\n", |
| 210 | + " descripcion, _, _ = descripcion_completa.partition(\"Beneficios\") # Toma solo lo anterior a \"Beneficios\"\n", |
| 211 | + " return descripcion.strip()\n", |
| 212 | + " except:\n", |
| 213 | + " return \"Descripción no disponible\"\n", |
| 214 | + "\n", |
| 215 | + "# Recorremos los enlaces y extraemos la información\n", |
| 216 | + "for enlace in lista_enlaces:\n", |
| 217 | + " try:\n", |
| 218 | + " driver.get(enlace)\n", |
| 219 | + "\n", |
| 220 | + " # Extraer título\n", |
| 221 | + " titulo = WebDriverWait(driver, 8).until(\n", |
| 222 | + " EC.presence_of_element_located((By.TAG_NAME, 'h1'))\n", |
| 223 | + " ).text\n", |
| 224 | + "\n", |
| 225 | + " # Extraer información usando funciones\n", |
| 226 | + " distrito = obtener_distrito(driver)\n", |
| 227 | + " modalidad = obtener_modalidad(driver)\n", |
| 228 | + " descripcion = obtener_descripcion(driver)\n", |
| 229 | + "\n", |
| 230 | + " # Guardamos los datos\n", |
| 231 | + " datos_empleos.append([titulo, descripcion, distrito, modalidad, enlace])\n", |
| 232 | + "\n", |
| 233 | + " print(f\"Datos obtenidos de: {enlace}\")\n", |
| 234 | + "\n", |
| 235 | + " except Exception as e:\n", |
| 236 | + " print(f\"Error al procesar {enlace}: {e}\")\n", |
| 237 | + "\n", |
| 238 | + "# Cerrar navegador\n", |
| 239 | + "driver.quit()\n", |
| 240 | + "\n", |
| 241 | + "# Guardar en un archivo Excel\n", |
| 242 | + "df = pd.DataFrame(datos_empleos, columns=[\"Título\", \"Descripción\", \"Distrito\", \"Modalidad\", \"Enlace\"])\n", |
| 243 | + "df.to_csv(\"Trabajos_Bumeran.csv\", index=False, encoding=\"utf-8\", sep=\"|\")\n", |
| 244 | + "\n", |
| 245 | + "print(\"Extracción completada. Datos guardados en 'Trabajos_Bumeran.xlsx'.\")" |
| 246 | + ] |
| 247 | + }, |
| 248 | + { |
| 249 | + "cell_type": "code", |
| 250 | + "execution_count": null, |
| 251 | + "id": "4b5c1235-1191-4c1b-b035-a5cfbfb5cd5c", |
| 252 | + "metadata": {}, |
| 253 | + "outputs": [], |
| 254 | + "source": [ |
| 255 | + "#Gracias :)))))" |
| 256 | + ] |
| 257 | + } |
| 258 | + ], |
| 259 | + "metadata": { |
| 260 | + "kernelspec": { |
| 261 | + "display_name": "Python [conda env:Tarea1]", |
| 262 | + "language": "python", |
| 263 | + "name": "conda-env-Tarea1-py" |
| 264 | + }, |
| 265 | + "language_info": { |
| 266 | + "codemirror_mode": { |
| 267 | + "name": "ipython", |
| 268 | + "version": 3 |
| 269 | + }, |
| 270 | + "file_extension": ".py", |
| 271 | + "mimetype": "text/x-python", |
| 272 | + "name": "python", |
| 273 | + "nbconvert_exporter": "python", |
| 274 | + "pygments_lexer": "ipython3", |
| 275 | + "version": "3.11.11" |
| 276 | + } |
| 277 | + }, |
| 278 | + "nbformat": 4, |
| 279 | + "nbformat_minor": 5 |
| 280 | +} |
0 commit comments