|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 17, |
| 6 | + "id": "d2cebc35-168e-46b4-8a37-85250b2f5586", |
| 7 | + "metadata": {}, |
| 8 | + "outputs": [ |
| 9 | + { |
| 10 | + "ename": "InvalidSessionIdException", |
| 11 | + "evalue": "Message: invalid session id: session deleted as the browser has closed the connection\nfrom disconnected: not connected to DevTools\n (Session info: chrome=134.0.6998.178)\nStacktrace:\n\tGetHandleVerifier [0x00007FF7E5994C25+3179557]\n\t(No symbol) [0x00007FF7E55F88A0]\n\t(No symbol) [0x00007FF7E54891CA]\n\t(No symbol) [0x00007FF7E5474F85]\n\t(No symbol) [0x00007FF7E5499F94]\n\t(No symbol) [0x00007FF7E550F9DF]\n\t(No symbol) [0x00007FF7E552FBE2]\n\t(No symbol) [0x00007FF7E5507A03]\n\t(No symbol) [0x00007FF7E54D06D0]\n\t(No symbol) [0x00007FF7E54D1983]\n\tGetHandleVerifier [0x00007FF7E59F67CD+3579853]\n\tGetHandleVerifier [0x00007FF7E5A0D1D2+3672530]\n\tGetHandleVerifier [0x00007FF7E5A02153+3627347]\n\tGetHandleVerifier [0x00007FF7E576092A+868650]\n\t(No symbol) [0x00007FF7E5602FFF]\n\t(No symbol) [0x00007FF7E55FF4A4]\n\t(No symbol) [0x00007FF7E55FF646]\n\t(No symbol) [0x00007FF7E55EEAA9]\n\tBaseThreadInitThunk [0x00007FFDB604E8D7+23]\n\tRtlUserThreadStart [0x00007FFDB80914FC+44]\n", |
| 12 | + "output_type": "error", |
| 13 | + "traceback": [ |
| 14 | + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", |
| 15 | + "\u001b[1;31mInvalidSessionIdException\u001b[0m Traceback (most recent call last)", |
| 16 | + "Cell \u001b[1;32mIn[17], line 21\u001b[0m\n\u001b[0;32m 17\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(\u001b[38;5;241m3\u001b[39m)\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m#Aplicar filtros\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m#Fecha\u001b[39;00m\n\u001b[1;32m---> 21\u001b[0m \u001b[43mdriver\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfind_element\u001b[49m\u001b[43m(\u001b[49m\u001b[43mBy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mXPATH\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m//button[contains(text(), \u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mFecha\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m)]\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mclick()\n\u001b[0;32m 22\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(\u001b[38;5;241m3\u001b[39m)\n\u001b[0;32m 23\u001b[0m driver\u001b[38;5;241m.\u001b[39mfind_element(By\u001b[38;5;241m.\u001b[39mXPATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m//button[contains(text(), \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMenor a 15 días\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m)]\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mclick()\n", |
| 17 | + "File \u001b[1;32m~\\anaconda3\\envs\\web-scrapping-env\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:888\u001b[0m, in \u001b[0;36mWebDriver.find_element\u001b[1;34m(self, by, value)\u001b[0m\n\u001b[0;32m 885\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m NoSuchElementException(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot locate relative element with: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mby\u001b[38;5;241m.\u001b[39mroot\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 886\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m elements[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m--> 888\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43mCommand\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mFIND_ELEMENT\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43musing\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mby\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mvalue\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalue\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n", |
| 18 | + "File \u001b[1;32m~\\anaconda3\\envs\\web-scrapping-env\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:429\u001b[0m, in \u001b[0;36mWebDriver.execute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m 427\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_executor\u001b[38;5;241m.\u001b[39mexecute(driver_command, params)\n\u001b[0;32m 428\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response:\n\u001b[1;32m--> 429\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43merror_handler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcheck_response\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresponse\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 430\u001b[0m response[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalue\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_unwrap_value(response\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalue\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[0;32m 431\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m response\n", |
| 19 | + "File \u001b[1;32m~\\anaconda3\\envs\\web-scrapping-env\\lib\\site-packages\\selenium\\webdriver\\remote\\errorhandler.py:232\u001b[0m, in \u001b[0;36mErrorHandler.check_response\u001b[1;34m(self, response)\u001b[0m\n\u001b[0;32m 230\u001b[0m alert_text \u001b[38;5;241m=\u001b[39m value[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124malert\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 231\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exception_class(message, screen, stacktrace, alert_text) \u001b[38;5;66;03m# type: ignore[call-arg] # mypy is not smart enough here\u001b[39;00m\n\u001b[1;32m--> 232\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exception_class(message, screen, stacktrace)\n", |
| 20 | + "\u001b[1;31mInvalidSessionIdException\u001b[0m: Message: invalid session id: session deleted as the browser has closed the connection\nfrom disconnected: not connected to DevTools\n (Session info: chrome=134.0.6998.178)\nStacktrace:\n\tGetHandleVerifier [0x00007FF7E5994C25+3179557]\n\t(No symbol) [0x00007FF7E55F88A0]\n\t(No symbol) [0x00007FF7E54891CA]\n\t(No symbol) [0x00007FF7E5474F85]\n\t(No symbol) [0x00007FF7E5499F94]\n\t(No symbol) [0x00007FF7E550F9DF]\n\t(No symbol) [0x00007FF7E552FBE2]\n\t(No symbol) [0x00007FF7E5507A03]\n\t(No symbol) [0x00007FF7E54D06D0]\n\t(No symbol) [0x00007FF7E54D1983]\n\tGetHandleVerifier [0x00007FF7E59F67CD+3579853]\n\tGetHandleVerifier [0x00007FF7E5A0D1D2+3672530]\n\tGetHandleVerifier [0x00007FF7E5A02153+3627347]\n\tGetHandleVerifier [0x00007FF7E576092A+868650]\n\t(No symbol) [0x00007FF7E5602FFF]\n\t(No symbol) [0x00007FF7E55FF4A4]\n\t(No symbol) [0x00007FF7E55FF646]\n\t(No symbol) [0x00007FF7E55EEAA9]\n\tBaseThreadInitThunk [0x00007FFDB604E8D7+23]\n\tRtlUserThreadStart [0x00007FFDB80914FC+44]\n" |
| 21 | + ] |
| 22 | + } |
| 23 | + ], |
| 24 | + "source": [ |
| 25 | + "from selenium import webdriver\n", |
| 26 | + "import re\n", |
| 27 | + "import time\n", |
| 28 | + "from selenium.webdriver.chrome.service import Service\n", |
| 29 | + "from selenium.webdriver.common.by import By\n", |
| 30 | + "\n", |
| 31 | + "#Driver\n", |
| 32 | + "driver = webdriver.Chrome()\n", |
| 33 | + "#Ingresar pagina web, ajustar pantalla y zoom\n", |
| 34 | + "url = 'https://www.bumeran.com.pe/empleos.html'\n", |
| 35 | + "driver.get(url)\n", |
| 36 | + "\n", |
| 37 | + "options = webdriver.ChromeOptions()\n", |
| 38 | + "driver = webdriver.Chrome(options=options)\n", |
| 39 | + "driver.maximize_window()\n", |
| 40 | + "driver.execute_script(\"document.body.style.zoom='100%'\")\n", |
| 41 | + "time.sleep(3)\n", |
| 42 | + "\n", |
| 43 | + "#Aplicar filtros\n", |
| 44 | + "#Fecha\n", |
| 45 | + "driver.find_element(By.XPATH, \"//button[contains(text(), 'Fecha')]\").click()\n", |
| 46 | + "time.sleep(3)\n", |
| 47 | + "driver.find_element(By.XPATH, \"//button[contains(text(), 'Menor a 15 días')]\").click()\n", |
| 48 | + "time.sleep(3)\n", |
| 49 | + "#Area\n", |
| 50 | + "driver.find_element(By.XPATH, \"//button[contains(text(), 'Área')]\").click()\n", |
| 51 | + "time.sleep(3)\n", |
| 52 | + "driver.find_element(By.XPATH, \"//button[contains(text(), 'Tecnología')]\").click()\n", |
| 53 | + "time.sleep(3)\n", |
| 54 | + "#Subarea\n", |
| 55 | + "driver.find_element(By.XPATH, \"//button[contains(text(), 'Subárea')]\").click()\n", |
| 56 | + "time.sleep(3)\n", |
| 57 | + "driver.find_element(By.XPATH, \"//button[contains(text(), 'Programación')]\").click()\n", |
| 58 | + "time.sleep(3)\n", |
| 59 | + "#Dpto\n", |
| 60 | + "driver.find_element(By.XPATH, \"//button[contains(text(), 'Departamento')]\").click()\n", |
| 61 | + "time.sleep(3)\n", |
| 62 | + "driver.find_element(By.XPATH, \"//button[contains(text(), 'Lima')]\").click()\n", |
| 63 | + "time.sleep(3)\n", |
| 64 | + "#Horario\n", |
| 65 | + "driver.find_element(By.XPATH, \"//button[contains(text(), 'Carga horaria')]\").click()\n", |
| 66 | + "time.sleep(3)\n", |
| 67 | + "driver.find_element(By.XPATH, \"//button[contains(text(), 'Full-time')]\").click()\n", |
| 68 | + "time.sleep(3)\n", |
| 69 | + "\n", |
| 70 | + "#Encontrar todos los urls de ofertas con href\n", |
| 71 | + "ofertas = driver.find_elements(By.XPATH, \"//a[contains(@href, '/empleos/')]\")\n", |
| 72 | + "#Extraer urls y guardarlos en lista urls_ofertas\n", |
| 73 | + "urls_ofertas = [oferta.get_attribute(\"href\") for oferta in ofertas]\n", |
| 74 | + "#Mostrar todos los urls\n", |
| 75 | + "print(urls_ofertas)\n", |
| 76 | + "\n", |
| 77 | + "import csv\n", |
| 78 | + "\n", |
| 79 | + "#Lista para almacenar los datos\n", |
| 80 | + "datos_ofertas = []\n", |
| 81 | + "\n", |
| 82 | + "# Extraer datos de cada oferta\n", |
| 83 | + "for url in urls_ofertas:\n", |
| 84 | + " driver.get(url)\n", |
| 85 | + " time.sleep(5)\n", |
| 86 | + " try:\n", |
| 87 | + " #Extraer título\n", |
| 88 | + " titulo = driver.find_element(By.TAG_NAME, \"h1\").text\n", |
| 89 | + " \n", |
| 90 | + " #Extraer descripción hasta Beneficios\n", |
| 91 | + " descripcion_completa = driver.find_element(By.ID, \"ficha-detalle\").text\n", |
| 92 | + " descripcion = descripcion_completa.split(\"Beneficios\")[0] if \"Beneficios\" in descripcion_completa else descripcion_completa\n", |
| 93 | + " \n", |
| 94 | + " #Extraer distrito\n", |
| 95 | + " distrito = driver.find_element(By.XPATH, \"//div[contains(@class, 'ubicacion')]//h2\").text\n", |
| 96 | + " \n", |
| 97 | + " #Extraer modalidad\n", |
| 98 | + " if \"presencial\" in descripcion_completa or \"Presencial\" in descripcion_completa: modalidad = \"Presencial\",\n", |
| 99 | + " elif \"híbrido\" in descripcion_completa or \"hibrido\" in descripcion_completa or \"Híbrido\" in descripcion_completa or \"Hibrido\" in descripcion_completa: modalidad = \"Híbrido\",\n", |
| 100 | + " elif \"remoto\" in descripcion_completa or \"virtual\" in descripcion_completa or \"Remoto\" in descripcion_completa or \"Virtual\" in descripcion_completa: modalidad = \"Remoto\",\n", |
| 101 | + " else: modalidad = \"No especificado\"\n", |
| 102 | + " \n", |
| 103 | + " #Agregar datos a la lista\n", |
| 104 | + " datos_ofertas.append([titulo, descripcion, distrito, modalidad, url])\n", |
| 105 | + " \n", |
| 106 | + " except Exception as e:\n", |
| 107 | + " print(f\"Error en {url}: {e}\")\n", |
| 108 | + "\n", |
| 109 | + "# Guardar en CSV\n", |
| 110 | + "with open(\"ofertas_trabajo.csv\", \"w\", newline=\"\", encoding=\"utf-8\") as archivo:\n", |
| 111 | + " escritor = csv.writer(archivo)\n", |
| 112 | + " escritor.writerow([\"Título\", \"Descripción\", \"Distrito\", \"Modalidad\", \"URL\"])\n", |
| 113 | + " escritor.writerows(datos_ofertas)\n", |
| 114 | + "\n", |
| 115 | + "print(\"Datos guardados en ofertas_trabajo.csv\")\n", |
| 116 | + "driver.quit()\n" |
| 117 | + ] |
| 118 | + } |
| 119 | + ], |
| 120 | + "metadata": { |
| 121 | + "kernelspec": { |
| 122 | + "display_name": "Python [conda env:web-scrapping-env]", |
| 123 | + "language": "python", |
| 124 | + "name": "conda-env-web-scrapping-env-py" |
| 125 | + }, |
| 126 | + "language_info": { |
| 127 | + "codemirror_mode": { |
| 128 | + "name": "ipython", |
| 129 | + "version": 3 |
| 130 | + }, |
| 131 | + "file_extension": ".py", |
| 132 | + "mimetype": "text/x-python", |
| 133 | + "name": "python", |
| 134 | + "nbconvert_exporter": "python", |
| 135 | + "pygments_lexer": "ipython3", |
| 136 | + "version": "3.10.16" |
| 137 | + } |
| 138 | + }, |
| 139 | + "nbformat": 4, |
| 140 | + "nbformat_minor": 5 |
| 141 | +} |
0 commit comments