Commit bd5f14d: Restructure repository (#26)
* restructure

* 5-inference added

* 5-inference added

* added .gitignore

* refactoring

* Solved embedding insertion into the Qdrant vector index (see the sketch below)

* Added remaining Pulumi resources

---------

Co-authored-by: Vesa Alexandru <[email protected]>
rsergiuistoc and alexandruvesa committed Jul 5, 2024
1 parent 8590e4e commit bd5f14d
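Note: the "embedding insertion into the Qdrant vector index" bullet above refers to code outside the files shown below. As a hedged illustration only, inserting embeddings with the qdrant-client package typically looks like the sketch that follows; the collection name, vector size, and payload fields are assumptions for this example and are not taken from the commit.

import uuid

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, PointStruct, VectorParams

# Assumed connection details and collection settings, for illustration only.
client = QdrantClient(host="localhost", port=6333)
client.recreate_collection(
    collection_name="vector_posts",
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),
)

# Upsert one embedded chunk; the point id must be an unsigned integer or a UUID string.
embedding = [0.1] * 384  # placeholder vector produced by an embedding model
client.upsert(
    collection_name="vector_posts",
    points=[
        PointStruct(
            id=str(uuid.uuid4()),
            vector=embedding,
            payload={"content": "example chunk", "author_id": "example-user"},
        )
    ],
)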
Showing 132 changed files with 12,172 additions and 352 deletions.
48 changes: 48 additions & 0 deletions .docker/Dockerfile.bytewax
@@ -0,0 +1,48 @@
# Use an official Python runtime as a parent image
FROM python:3.11-slim-bullseye

ENV WORKSPACE_ROOT=/usr/src/app \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    POETRY_HOME="/opt/poetry" \
    POETRY_NO_INTERACTION=1

RUN mkdir -p $WORKSPACE_ROOT

# Install system dependencies
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends build-essential \
    gcc \
    python3-dev \
    curl \
    build-essential \
    && apt-get clean

# Install Poetry
RUN curl -sSL https://install.python-poetry.org | python -

# Add Poetry to PATH
ENV PATH="$POETRY_HOME/bin:$PATH"

RUN apt-get remove -y curl

# Copy the pyproject.toml and poetry.lock files from the root directory
COPY ./pyproject.toml ./poetry.lock ./

# Install dependencies
RUN poetry config virtualenvs.create false && poetry install

# Set the working directory
WORKDIR $WORKSPACE_ROOT

# Copy the 3-feature-pipeline and any other necessary directories
COPY ./3-feature-pipeline .
COPY ./core ./core

# Set the PYTHONPATH environment variable
ENV PYTHONPATH=/usr/src/app

RUN chmod +x /usr/src/app/scripts/bytewax_entrypoint.sh

# Command to run the Bytewax pipeline script
CMD ["/usr/src/app/scripts/bytewax_entrypoint.sh"]
35 changes: 35 additions & 0 deletions .docker/Dockerfile.cdc
@@ -0,0 +1,35 @@
# Use an official Python runtime as a parent image
FROM python:3.11-slim

# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    python3-dev \
    curl \
    build-essential \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Install Poetry
RUN pip install poetry
# Add Poetry to PATH
ENV PATH="/etc/poetry/bin:$PATH"
# Set the working directory
WORKDIR /app
# Copy the pyproject.toml and poetry.lock files from the root directory
COPY ./pyproject.toml ./poetry.lock ./
# Install dependencies
RUN poetry install --no-root
# Copy the 2-data-ingestion and core directories
COPY ./2-data-ingestion ./2-data-ingestion
# Set the PYTHONPATH environment variable
ENV PYTHONPATH=/app
# Command to run the script
CMD poetry run python /app/2-data-ingestion/cdc.py && tail -f /dev/null
13 changes: 7 additions & 6 deletions course/module-1/Dockerfile → .docker/Dockerfile.crawlers
@@ -44,20 +44,21 @@ RUN yum install -y \
 COPY --from=build /opt/chrome-linux /opt/chrome
 COPY --from=build /opt/chromedriver /opt/
 
-COPY poetry.lock pyproject.toml ./
+COPY ./pyproject.toml ./poetry.lock ./
 
 # Install Poetry, export dependencies to requirements.txt, and install dependencies
 # in the Lambda task directory, finally cleanup manifest files.
-RUN python3 -m pip install --upgrade pip && pip3 install poetry
-RUN poetry export -f requirements.txt > requirements.txt && \
-    pip3 install --no-cache-dir -r requirements.txt --target "${LAMBDA_TASK_ROOT}" && \
+RUN python -m pip install --upgrade pip && pip install poetry
+RUN poetry export --without 3-feature-pipeline,ml -f requirements.txt > requirements.txt && \
+    pip install --no-cache-dir -r requirements.txt --target "${LAMBDA_TASK_ROOT}" && \
     rm requirements.txt pyproject.toml poetry.lock
 
 # Optional TLS CA only if you plan to store the extracted data into Document DB
 RUN wget https://truststore.pki.rds.amazonaws.com/global/global-bundle.pem -P ${LAMBDA_TASK_ROOT}
+ENV PYTHONPATH=${LAMBDA_TASK_ROOT}/1-data-crawling
 
 # Copy function code
-COPY . ${LAMBDA_TASK_ROOT}
+COPY ./1-data-crawling ${LAMBDA_TASK_ROOT}/1-data-crawling
 
 # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile)
-CMD ["main.handler"]
+CMD ["1-data-crawling.main.handler"]
4 changes: 4 additions & 0 deletions .gitignore
@@ -162,6 +162,10 @@ cython_debug/
 # Ruff
 .ruff_cache
 
+data/
+dataset/
+data
+
 # Data
 output
 .cache
1 change: 1 addition & 0 deletions .python-version
@@ -0,0 +1 @@
3.11.4
File renamed without changes.
12 changes: 12 additions & 0 deletions 1-data-crawling/config.py
@@ -0,0 +1,12 @@
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file="../.env", env_file_encoding="utf-8")

    # MongoDB configs
    MONGO_DATABASE_HOST: str = "mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set"
    MONGO_DATABASE_NAME: str = "scrabble"


settings = Settings()
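A brief usage sketch: because Settings is a pydantic BaseSettings subclass, real environment variables take precedence over both the ../.env file and the class defaults. The override value below is hypothetical, used only to illustrate the precedence.

import os

os.environ["MONGO_DATABASE_NAME"] = "production_db"  # hypothetical override

from config import settings  # Settings() is instantiated with the environment applied

print(settings.MONGO_DATABASE_NAME)  # -> "production_db"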
5 changes: 5 additions & 0 deletions 1-data-crawling/crawlers/__init__.py
@@ -0,0 +1,5 @@
from .github import GithubCrawler
from .linkedin import LinkedInCrawler
from .medium import MediumCrawler

__all__ = ["GithubCrawler", "LinkedInCrawler", "MediumCrawler"]
67 changes: 67 additions & 0 deletions 1-data-crawling/crawlers/base.py
@@ -0,0 +1,67 @@
import time
from abc import ABC, abstractmethod
from tempfile import mkdtemp

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from db.documents import BaseDocument


class BaseCrawler(ABC):
    model: type[BaseDocument]

    @abstractmethod
    def extract(self, link: str, **kwargs) -> None: ...


class BaseAbstractCrawler(BaseCrawler, ABC):
    def __init__(self, scroll_limit: int = 5) -> None:
        options = webdriver.ChromeOptions()
        options.binary_location = "/opt/chrome/chrome"
        options.add_argument("--no-sandbox")
        options.add_argument("--headless=new")
        options.add_argument("--single-process")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-gpu")
        options.add_argument("--log-level=3")
        options.add_argument("--disable-popup-blocking")
        options.add_argument("--disable-notifications")
        options.add_argument("--disable-dev-tools")
        options.add_argument("--ignore-certificate-errors")
        options.add_argument("--no-zygote")
        options.add_argument(f"--user-data-dir={mkdtemp()}")
        options.add_argument(f"--data-path={mkdtemp()}")
        options.add_argument(f"--disk-cache-dir={mkdtemp()}")
        options.add_argument("--remote-debugging-port=9222")

        self.set_extra_driver_options(options)

        self.scroll_limit = scroll_limit
        self.driver = webdriver.Chrome(
            service=webdriver.ChromeService("/opt/chromedriver"),
            options=options,
        )

    def set_extra_driver_options(self, options: Options) -> None:
        pass

    def login(self) -> None:
        pass

    def scroll_page(self) -> None:
        """Scroll through the page based on the scroll limit."""
        current_scroll = 0
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )
            time.sleep(5)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height or (
                self.scroll_limit and current_scroll >= self.scroll_limit
            ):
                break
            last_height = new_height
            current_scroll += 1
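A minimal, hypothetical subclass sketch of how BaseAbstractCrawler is meant to be extended: the class name, the PostDocument model, and the saved fields are illustrative assumptions and are not part of this commit, and the Chrome binary paths baked into the base class assume the crawler image above.

from crawlers.base import BaseAbstractCrawler
from db.documents import PostDocument  # hypothetical document model


class ExamplePageCrawler(BaseAbstractCrawler):
    model = PostDocument

    def extract(self, link: str, **kwargs) -> None:
        # Load the page, scroll until the limit or the bottom is reached, then persist it.
        self.driver.get(link)
        self.scroll_page()
        instance = self.model(
            content={"html": self.driver.page_source},
            link=link,
            owner_id=kwargs.get("user"),
        )
        instance.save()
        self.driver.quit()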
57 changes: 57 additions & 0 deletions 1-data-crawling/crawlers/github.py
@@ -0,0 +1,57 @@
import os
import shutil
import subprocess
import tempfile

from aws_lambda_powertools import Logger

from crawlers.base import BaseCrawler
from db.documents import RepositoryDocument

logger = Logger(service="llm-twin-course/crawler")


class GithubCrawler(BaseCrawler):
    model = RepositoryDocument

    def __init__(self, ignore=(".git", ".toml", ".lock", ".png")) -> None:
        super().__init__()
        self._ignore = ignore

    def extract(self, link: str, **kwargs) -> None:
        logger.info(f"Starting scraping GitHub repository: {link}")

        repo_name = link.rstrip("/").split("/")[-1]

        local_temp = tempfile.mkdtemp()

        try:
            os.chdir(local_temp)
            subprocess.run(["git", "clone", link])

            repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])

            tree = {}
            for root, dirs, files in os.walk(repo_path):
                dir = root.replace(repo_path, "").lstrip("/")
                if dir.startswith(self._ignore):
                    continue

                for file in files:
                    if file.endswith(self._ignore):
                        continue
                    file_path = os.path.join(dir, file)
                    with open(os.path.join(root, file), "r", errors="ignore") as f:
                        tree[file_path] = f.read().replace(" ", "")

            instance = self.model(
                name=repo_name, link=link, content=tree, owner_id=kwargs.get("user")
            )
            instance.save()

        except Exception:
            raise
        finally:
            shutil.rmtree(local_temp)

        logger.info(f"Finished scraping GitHub repository: {link}")