
Commit f515d47

Authored Apr 18, 2025
Add files via upload
1 parent dffe716 commit f515d47

File tree

4 files changed: +368 -0 lines changed
 
Lines changed: 273 additions & 0 deletions
@@ -0,0 +1,273 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c5a3099c-85f4-4894-bf34-a311f3aa3c8f",
   "metadata": {},
   "source": [
    "### Install Selenium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "b3982d80-89df-48c2-9544-a4fd283fba66",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Selenium and WebDriver Manager libraries (as used in class)\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.chrome.service import Service\n",
    "from selenium.webdriver.support.ui import WebDriverWait\n",
    "from selenium.webdriver.support import expected_conditions as EC\n",
    "from webdriver_manager.chrome import ChromeDriverManager\n",
    "import time  # for pauses when needed"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fd18adea-a474-460d-b3da-3a99237e3a6f",
   "metadata": {},
   "source": [
    "### Start the browser (Chrome)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "6d55575a-8002-45f6-b9dd-f946b1cddc29",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the Chrome browser and maximize the window\n",
    "driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))\n",
    "driver.maximize_window()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c8ff444d-6687-4eb2-80bf-7a5c6621b3bc",
   "metadata": {},
   "source": [
    "### Main page for the job search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "8c4448e1-3632-4270-8caa-572768672be3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the main page\n",
    "driver.get(\"https://www.bumeran.com.pe\")\n",
    "time.sleep(2)  # wait 2 seconds for the page to load"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "54b3ec17-a4da-4bd0-a794-451812833618",
   "metadata": {},
   "source": [
    "### Data Science in the search box"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "56a92580-3380-4bd1-80d8-34fe93c7c456",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 13 <input> fields\n",
      "0: placeholder = \n",
      "1: placeholder = \n",
      "2: placeholder = \n",
      "3: placeholder = \n",
      "4: placeholder = \n",
      "5: placeholder = \n",
      "6: placeholder = \n",
      "7: placeholder = Buscar por puesto o palabra clave\n",
      "8: placeholder = \n",
      "9: placeholder = \n",
      "10: placeholder = Buscar por puesto o palabra clave\n",
      "11: placeholder = \n",
      "12: placeholder = \n"
     ]
    }
   ],
   "source": [
    "# Load the page and wait 5 seconds so it renders completely\n",
    "driver.get(\"https://www.bumeran.com.pe\")\n",
    "time.sleep(5)  # wait for the page to load completely\n",
    "\n",
    "# See how many <input> elements there are in the DOM\n",
    "inputs = driver.find_elements(By.TAG_NAME, \"input\")\n",
    "print(f\"Found {len(inputs)} <input> fields\")\n",
    "for i, inp in enumerate(inputs):\n",
    "    print(f\"{i}: placeholder = {inp.get_attribute('placeholder')}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bccab618-c7a6-4282-b6e7-2a9d9d594974",
   "metadata": {},
   "source": [
    "### Search for Data Science in Lima"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "131d3ce0-7c95-41f6-9f61-b6dfd5682b97",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the results page with the filters already applied\n",
    "url = \"https://www.bumeran.com.pe/en-lima/empleos-busqueda-data-science.html\"\n",
    "driver.get(url)\n",
    "time.sleep(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "99ca2e33-5bfa-427b-832c-283632c1a484",
   "metadata": {},
   "source": [
    "### General info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "17c5f586-a576-4220-a4f5-2fb34e1438b9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total <a> found: 29\n",
      "https://www.bumeran.com.pe/empleos-seniority-junior.html?landing-jovenes-profesionales=true\n",
      "https://www.bumeran.com.pe/empleos-seniority-gerencia-alta-gerencia-direccion.html?landing-puestos-ejecutivos=true\n",
      "https://www.bumeran.com.pe/empleos.html\n",
      "https://www.bumeran.com.pe/en-lima/empleos-busqueda-data-science.html?relevantes=true\n",
      "https://www.bumeran.com.pe/en-lima/empleos-busqueda-data-science.html?recientes=true\n",
      "https://www.bumeran.com.pe/empleos/docente-de-data-science-machine-learning-visiva-1116743186.html\n",
      "https://www.bumeran.com.pe/empleos/analista-data-science-20607325163-1116660540.html\n",
      "https://www.bumeran.com.pe/empleos/practicante-data-science-rimac-cia-de-seguros-y-reaseguros-s.a.-1116658516.html\n",
      "https://www.bumeran.com.pe/empleos/coordinador-de-data-science-industrias-san-miguel-1116658349.html\n",
      "https://www.bumeran.com.pe/empleos/practicante-pre-profesional-en-data-science-e-ai-1116609896.html\n",
      "https://www.bumeran.com.pe/en-lima/empleos-busqueda-data-science.html#\n",
      "https://www.bumeran.com.pe/en-lima/empleos-busqueda-data-science.html\n",
      "https://www.bumeran.com.pe/en-lima/empleos-busqueda-data-science.html#\n"
     ]
    }
   ],
   "source": [
    "# General info\n",
    "import time\n",
    "time.sleep(5)\n",
    "\n",
    "# Test: print every job-related href on the page\n",
    "anchors = driver.find_elements(By.TAG_NAME, \"a\")\n",
    "\n",
    "print(\"Total <a> found:\", len(anchors))\n",
    "for a in anchors:\n",
    "    href = a.get_attribute(\"href\")\n",
    "    if href and (\"data-science\" in href or \"empleo\" in href.lower()):\n",
    "        print(href)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8cae3929-fd4b-464e-9208-b026c50d09f1",
   "metadata": {},
   "source": [
    "### Filter (2nd stage)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "a7eb93f0-b538-41d7-85d8-88dc6bc47913",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ File saved as: bumeran_data_science_jobs.csv\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "from selenium.webdriver.common.by import By\n",
    "import time\n",
    "\n",
    "def get_job_details(driver, link):\n",
    "    driver.get(link)\n",
    "    time.sleep(2)\n",
    "\n",
    "    def try_xpath(xpath):\n",
    "        try:\n",
    "            return driver.find_element(By.XPATH, xpath).text\n",
    "        except Exception:\n",
    "            return \"N/A\"\n",
    "\n",
    "    distrito = try_xpath(\"//h2[contains(text(), 'Distrito')]/following-sibling::p\")\n",
    "    modalidad = try_xpath(\"//h2[contains(text(), 'Modalidad de trabajo')]/following-sibling::p\")\n",
    "    nivel = try_xpath(\"//h2[contains(text(), 'Nivel laboral')]/following-sibling::p\")\n",
    "    discapacidad = try_xpath(\"//h2[contains(text(), 'Postulantes con discapacidad')]/following-sibling::p\")\n",
    "\n",
    "    return {\n",
    "        \"URL\": link,\n",
    "        \"Distrito\": distrito,\n",
    "        \"Modalidad de trabajo\": modalidad,\n",
    "        \"Nivel laboral\": nivel,\n",
    "        \"Discapacidad\": discapacidad\n",
    "    }\n",
    "\n",
    "# job_links is never defined in the uploaded files; it is assumed to hold the\n",
    "# job-posting URLs found on the results page, collected for example like this:\n",
    "job_links = []\n",
    "for a in driver.find_elements(By.TAG_NAME, \"a\"):\n",
    "    href = a.get_attribute(\"href\")\n",
    "    if href and \"/empleos/\" in href:\n",
    "        job_links.append(href)\n",
    "\n",
    "# Visit every link and collect the details\n",
    "data = []\n",
    "for i, link in enumerate(job_links):\n",
    "    print(f\"[{i+1}/{len(job_links)}] Scraping: {link}\")\n",
    "    details = get_job_details(driver, link)\n",
    "    data.append(details)\n",
    "\n",
    "# Save to CSV\n",
    "df = pd.DataFrame(data)\n",
    "df.to_csv(\"bumeran_data_science_jobs.csv\", index=False)\n",
    "print(\"✅ File saved as: bumeran_data_science_jobs.csv\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
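Note on waits: the notebook imports WebDriverWait and expected_conditions but ultimately relies on fixed time.sleep() pauses. Below is a minimal sketch of an explicit wait; it assumes the driver created in the cells above, and the CSS selector is an assumption built from the placeholder text reported by the input scan ("Buscar por puesto o palabra clave"), not a selector taken from the original code.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 seconds for the search box instead of sleeping a fixed time.
# The selector is a guess based on the placeholder printed by the input scan.
wait = WebDriverWait(driver, 10)
search_box = wait.until(
    EC.visibility_of_element_located(
        (By.CSS_SELECTOR, "input[placeholder='Buscar por puesto o palabra clave']")
    )
)
search_box.send_keys("Data Science")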
Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
### Install Selenium
# Selenium and WebDriver Manager libraries (as used in class)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time  # for pauses when needed

### Start the browser (Chrome)
# Create the Chrome browser and maximize the window
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.maximize_window()

### Main page for the job search
# Open the main page
driver.get("https://www.bumeran.com.pe")
time.sleep(2)  # wait 2 seconds for the page to load

### Data Science in the search box
# Load the page and wait 5 seconds so it renders completely
driver.get("https://www.bumeran.com.pe")
time.sleep(5)  # wait for the page to load completely

# See how many <input> elements there are in the DOM
inputs = driver.find_elements(By.TAG_NAME, "input")
print(f"Found {len(inputs)} <input> fields")
for i, inp in enumerate(inputs):
    print(f"{i}: placeholder = {inp.get_attribute('placeholder')}")

### Search for Data Science in Lima
# Load the results page with the filters already applied
url = "https://www.bumeran.com.pe/en-lima/empleos-busqueda-data-science.html"
driver.get(url)
time.sleep(3)

# General info
import time
time.sleep(5)

# Test: print every job-related href on the page
anchors = driver.find_elements(By.TAG_NAME, "a")

print("Total <a> found:", len(anchors))
for a in anchors:
    href = a.get_attribute("href")
    if href and ("data-science" in href or "empleo" in href.lower()):
        print(href)

# 2nd stage

import pandas as pd
from selenium.webdriver.common.by import By
import time

def get_job_details(driver, link):
    driver.get(link)
    time.sleep(2)

    def try_xpath(xpath):
        try:
            return driver.find_element(By.XPATH, xpath).text
        except Exception:
            return "N/A"

    distrito = try_xpath("//h2[contains(text(), 'Distrito')]/following-sibling::p")
    modalidad = try_xpath("//h2[contains(text(), 'Modalidad de trabajo')]/following-sibling::p")
    nivel = try_xpath("//h2[contains(text(), 'Nivel laboral')]/following-sibling::p")
    discapacidad = try_xpath("//h2[contains(text(), 'Postulantes con discapacidad')]/following-sibling::p")

    return {
        "URL": link,
        "Distrito": distrito,
        "Modalidad de trabajo": modalidad,
        "Nivel laboral": nivel,
        "Discapacidad": discapacidad
    }

# job_links is never defined in the uploaded files; it is assumed to hold the
# job-posting URLs found on the results page, collected for example like this:
job_links = []
for a in driver.find_elements(By.TAG_NAME, "a"):
    href = a.get_attribute("href")
    if href and "/empleos/" in href:
        job_links.append(href)

# Visit every link and collect the details
data = []
for i, link in enumerate(job_links):
    print(f"[{i+1}/{len(job_links)}] Scraping: {link}")
    details = get_job_details(driver, link)
    data.append(details)

# Save to CSV
df = pd.DataFrame(data)
df.to_csv("bumeran_data_science_jobs.csv", index=False)
print("✅ File saved as: bumeran_data_science_jobs.csv")
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
selenium
webdriver-manager
4+
