Skip to content

Commit cc966dc

Browse files
authored
Merge pull request #59 from d2cml-ai/233531_hw1_2025_1
#13 _2
2 parents 268ee9c + eadb407 commit cc966dc

4 files changed

Lines changed: 1542 additions & 0 deletions

File tree

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 17,
6+
"id": "d2cebc35-168e-46b4-8a37-85250b2f5586",
7+
"metadata": {},
8+
"outputs": [
9+
{
10+
"ename": "InvalidSessionIdException",
11+
"evalue": "Message: invalid session id: session deleted as the browser has closed the connection\nfrom disconnected: not connected to DevTools\n (Session info: chrome=134.0.6998.178)\nStacktrace:\n\tGetHandleVerifier [0x00007FF7E5994C25+3179557]\n\t(No symbol) [0x00007FF7E55F88A0]\n\t(No symbol) [0x00007FF7E54891CA]\n\t(No symbol) [0x00007FF7E5474F85]\n\t(No symbol) [0x00007FF7E5499F94]\n\t(No symbol) [0x00007FF7E550F9DF]\n\t(No symbol) [0x00007FF7E552FBE2]\n\t(No symbol) [0x00007FF7E5507A03]\n\t(No symbol) [0x00007FF7E54D06D0]\n\t(No symbol) [0x00007FF7E54D1983]\n\tGetHandleVerifier [0x00007FF7E59F67CD+3579853]\n\tGetHandleVerifier [0x00007FF7E5A0D1D2+3672530]\n\tGetHandleVerifier [0x00007FF7E5A02153+3627347]\n\tGetHandleVerifier [0x00007FF7E576092A+868650]\n\t(No symbol) [0x00007FF7E5602FFF]\n\t(No symbol) [0x00007FF7E55FF4A4]\n\t(No symbol) [0x00007FF7E55FF646]\n\t(No symbol) [0x00007FF7E55EEAA9]\n\tBaseThreadInitThunk [0x00007FFDB604E8D7+23]\n\tRtlUserThreadStart [0x00007FFDB80914FC+44]\n",
12+
"output_type": "error",
13+
"traceback": [
14+
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
15+
"\u001b[1;31mInvalidSessionIdException\u001b[0m Traceback (most recent call last)",
16+
"Cell \u001b[1;32mIn[17], line 21\u001b[0m\n\u001b[0;32m 17\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(\u001b[38;5;241m3\u001b[39m)\n\u001b[0;32m 19\u001b[0m \u001b[38;5;66;03m#Aplicar filtros\u001b[39;00m\n\u001b[0;32m 20\u001b[0m \u001b[38;5;66;03m#Fecha\u001b[39;00m\n\u001b[1;32m---> 21\u001b[0m \u001b[43mdriver\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfind_element\u001b[49m\u001b[43m(\u001b[49m\u001b[43mBy\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mXPATH\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m//button[contains(text(), \u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mFecha\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m)]\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39mclick()\n\u001b[0;32m 22\u001b[0m time\u001b[38;5;241m.\u001b[39msleep(\u001b[38;5;241m3\u001b[39m)\n\u001b[0;32m 23\u001b[0m driver\u001b[38;5;241m.\u001b[39mfind_element(By\u001b[38;5;241m.\u001b[39mXPATH, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m//button[contains(text(), \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mMenor a 15 días\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m)]\u001b[39m\u001b[38;5;124m\"\u001b[39m)\u001b[38;5;241m.\u001b[39mclick()\n",
17+
"File \u001b[1;32m~\\anaconda3\\envs\\web-scrapping-env\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:888\u001b[0m, in \u001b[0;36mWebDriver.find_element\u001b[1;34m(self, by, value)\u001b[0m\n\u001b[0;32m 885\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m NoSuchElementException(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mCannot locate relative element with: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mby\u001b[38;5;241m.\u001b[39mroot\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 886\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m elements[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m--> 888\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43mCommand\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mFIND_ELEMENT\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43musing\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mby\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mvalue\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[43mvalue\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalue\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n",
18+
"File \u001b[1;32m~\\anaconda3\\envs\\web-scrapping-env\\lib\\site-packages\\selenium\\webdriver\\remote\\webdriver.py:429\u001b[0m, in \u001b[0;36mWebDriver.execute\u001b[1;34m(self, driver_command, params)\u001b[0m\n\u001b[0;32m 427\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_executor\u001b[38;5;241m.\u001b[39mexecute(driver_command, params)\n\u001b[0;32m 428\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m response:\n\u001b[1;32m--> 429\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43merror_handler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcheck_response\u001b[49m\u001b[43m(\u001b[49m\u001b[43mresponse\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 430\u001b[0m response[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalue\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_unwrap_value(response\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mvalue\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[0;32m 431\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m response\n",
19+
"File \u001b[1;32m~\\anaconda3\\envs\\web-scrapping-env\\lib\\site-packages\\selenium\\webdriver\\remote\\errorhandler.py:232\u001b[0m, in \u001b[0;36mErrorHandler.check_response\u001b[1;34m(self, response)\u001b[0m\n\u001b[0;32m 230\u001b[0m alert_text \u001b[38;5;241m=\u001b[39m value[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124malert\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 231\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exception_class(message, screen, stacktrace, alert_text) \u001b[38;5;66;03m# type: ignore[call-arg] # mypy is not smart enough here\u001b[39;00m\n\u001b[1;32m--> 232\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m exception_class(message, screen, stacktrace)\n",
20+
"\u001b[1;31mInvalidSessionIdException\u001b[0m: Message: invalid session id: session deleted as the browser has closed the connection\nfrom disconnected: not connected to DevTools\n (Session info: chrome=134.0.6998.178)\nStacktrace:\n\tGetHandleVerifier [0x00007FF7E5994C25+3179557]\n\t(No symbol) [0x00007FF7E55F88A0]\n\t(No symbol) [0x00007FF7E54891CA]\n\t(No symbol) [0x00007FF7E5474F85]\n\t(No symbol) [0x00007FF7E5499F94]\n\t(No symbol) [0x00007FF7E550F9DF]\n\t(No symbol) [0x00007FF7E552FBE2]\n\t(No symbol) [0x00007FF7E5507A03]\n\t(No symbol) [0x00007FF7E54D06D0]\n\t(No symbol) [0x00007FF7E54D1983]\n\tGetHandleVerifier [0x00007FF7E59F67CD+3579853]\n\tGetHandleVerifier [0x00007FF7E5A0D1D2+3672530]\n\tGetHandleVerifier [0x00007FF7E5A02153+3627347]\n\tGetHandleVerifier [0x00007FF7E576092A+868650]\n\t(No symbol) [0x00007FF7E5602FFF]\n\t(No symbol) [0x00007FF7E55FF4A4]\n\t(No symbol) [0x00007FF7E55FF646]\n\t(No symbol) [0x00007FF7E55EEAA9]\n\tBaseThreadInitThunk [0x00007FFDB604E8D7+23]\n\tRtlUserThreadStart [0x00007FFDB80914FC+44]\n"
21+
]
22+
}
23+
],
24+
"source": [
25+
"from selenium import webdriver\n",
26+
"import re\n",
27+
"import time\n",
28+
"from selenium.webdriver.chrome.service import Service\n",
29+
"from selenium.webdriver.common.by import By\n",
30+
"\n",
31+
"#Driver\n",
32+
"driver = webdriver.Chrome()\n",
33+
"#Ingresar pagina web, ajustar pantalla y zoom\n",
34+
"url = 'https://www.bumeran.com.pe/empleos.html'\n",
35+
"driver.get(url)\n",
36+
"\n",
37+
"options = webdriver.ChromeOptions()\n",
38+
"driver = webdriver.Chrome(options=options)\n",
39+
"driver.maximize_window()\n",
40+
"driver.execute_script(\"document.body.style.zoom='100%'\")\n",
41+
"time.sleep(3)\n",
42+
"\n",
43+
"#Aplicar filtros\n",
44+
"#Fecha\n",
45+
"driver.find_element(By.XPATH, \"//button[contains(text(), 'Fecha')]\").click()\n",
46+
"time.sleep(3)\n",
47+
"driver.find_element(By.XPATH, \"//button[contains(text(), 'Menor a 15 días')]\").click()\n",
48+
"time.sleep(3)\n",
49+
"#Area\n",
50+
"driver.find_element(By.XPATH, \"//button[contains(text(), 'Área')]\").click()\n",
51+
"time.sleep(3)\n",
52+
"driver.find_element(By.XPATH, \"//button[contains(text(), 'Tecnología')]\").click()\n",
53+
"time.sleep(3)\n",
54+
"#Subarea\n",
55+
"driver.find_element(By.XPATH, \"//button[contains(text(), 'Subárea')]\").click()\n",
56+
"time.sleep(3)\n",
57+
"driver.find_element(By.XPATH, \"//button[contains(text(), 'Programación')]\").click()\n",
58+
"time.sleep(3)\n",
59+
"#Dpto\n",
60+
"driver.find_element(By.XPATH, \"//button[contains(text(), 'Departamento')]\").click()\n",
61+
"time.sleep(3)\n",
62+
"driver.find_element(By.XPATH, \"//button[contains(text(), 'Lima')]\").click()\n",
63+
"time.sleep(3)\n",
64+
"#Horario\n",
65+
"driver.find_element(By.XPATH, \"//button[contains(text(), 'Carga horaria')]\").click()\n",
66+
"time.sleep(3)\n",
67+
"driver.find_element(By.XPATH, \"//button[contains(text(), 'Full-time')]\").click()\n",
68+
"time.sleep(3)\n",
69+
"\n",
70+
"#Encontrar todos los urls de ofertas con href\n",
71+
"ofertas = driver.find_elements(By.XPATH, \"//a[contains(@href, '/empleos/')]\")\n",
72+
"#Extraer urls y guardarlos en lista urls_ofertas\n",
73+
"urls_ofertas = [oferta.get_attribute(\"href\") for oferta in ofertas]\n",
74+
"#Mostrar todos los urls\n",
75+
"print(urls_ofertas)\n",
76+
"\n",
77+
"import csv\n",
78+
"\n",
79+
"#Lista para almacenar los datos\n",
80+
"datos_ofertas = []\n",
81+
"\n",
82+
"# Extraer datos de cada oferta\n",
83+
"for url in urls_ofertas:\n",
84+
" driver.get(url)\n",
85+
" time.sleep(5)\n",
86+
" try:\n",
87+
" #Extraer título\n",
88+
" titulo = driver.find_element(By.TAG_NAME, \"h1\").text\n",
89+
" \n",
90+
" #Extraer descripción hasta Beneficios\n",
91+
" descripcion_completa = driver.find_element(By.ID, \"ficha-detalle\").text\n",
92+
" descripcion = descripcion_completa.split(\"Beneficios\")[0] if \"Beneficios\" in descripcion_completa else descripcion_completa\n",
93+
" \n",
94+
" #Extraer distrito\n",
95+
" distrito = driver.find_element(By.XPATH, \"//div[contains(@class, 'ubicacion')]//h2\").text\n",
96+
" \n",
97+
" #Extraer modalidad\n",
98+
" if \"presencial\" in descripcion_completa or \"Presencial\" in descripcion_completa: modalidad = \"Presencial\",\n",
99+
" elif \"híbrido\" in descripcion_completa or \"hibrido\" in descripcion_completa or \"Híbrido\" in descripcion_completa or \"Hibrido\" in descripcion_completa: modalidad = \"Híbrido\",\n",
100+
" elif \"remoto\" in descripcion_completa or \"virtual\" in descripcion_completa or \"Remoto\" in descripcion_completa or \"Virtual\" in descripcion_completa: modalidad = \"Remoto\",\n",
101+
" else: modalidad = \"No especificado\"\n",
102+
" \n",
103+
" #Agregar datos a la lista\n",
104+
" datos_ofertas.append([titulo, descripcion, distrito, modalidad, url])\n",
105+
" \n",
106+
" except Exception as e:\n",
107+
" print(f\"Error en {url}: {e}\")\n",
108+
"\n",
109+
"# Guardar en CSV\n",
110+
"with open(\"ofertas_trabajo.csv\", \"w\", newline=\"\", encoding=\"utf-8\") as archivo:\n",
111+
" escritor = csv.writer(archivo)\n",
112+
" escritor.writerow([\"Título\", \"Descripción\", \"Distrito\", \"Modalidad\", \"URL\"])\n",
113+
" escritor.writerows(datos_ofertas)\n",
114+
"\n",
115+
"print(\"Datos guardados en ofertas_trabajo.csv\")\n",
116+
"driver.quit()\n"
117+
]
118+
}
119+
],
120+
"metadata": {
121+
"kernelspec": {
122+
"display_name": "Python [conda env:web-scrapping-env]",
123+
"language": "python",
124+
"name": "conda-env-web-scrapping-env-py"
125+
},
126+
"language_info": {
127+
"codemirror_mode": {
128+
"name": "ipython",
129+
"version": 3
130+
},
131+
"file_extension": ".py",
132+
"mimetype": "text/x-python",
133+
"name": "python",
134+
"nbconvert_exporter": "python",
135+
"pygments_lexer": "ipython3",
136+
"version": "3.10.16"
137+
}
138+
},
139+
"nbformat": 4,
140+
"nbformat_minor": 5
141+
}

0 commit comments

Comments
 (0)