Merge pull request #11 from decodingml/module-4

Module 4
decodingml · Apr 26, 2024 · 9fa8a5d · 9fa8a5d
2 parents cc4a955 + 5f8724f
commit 9fa8a5d
Show file tree

Hide file tree

Showing 20 changed files with 1,160 additions and 675 deletions.
diff --git a/.gitignore b/.gitignore
@@ -159,4 +159,7 @@ cython_debug/
 .vscode
 
 # MacOS
-.DS_Store
+.DS_Store
+
+# Ruff
+.ruff_cache
diff --git a/course/module-1/crawlers/linkedin.py b/course/module-1/crawlers/linkedin.py
@@ -37,8 +37,10 @@ def extract(self, link: str, **kwargs):
 
         self.driver.get(link)
         time.sleep(5)
-        button = self.driver.find_element(By.CSS_SELECTOR,
-                                          ".app-aware-link.profile-creator-shared-content-view__footer-action")
+        button = self.driver.find_element(
+            By.CSS_SELECTOR,
+            ".app-aware-link.profile-creator-shared-content-view__footer-action"
+        )
         button.click()
 
         # Scrolling and scraping posts
@@ -57,12 +59,7 @@ def extract(self, link: str, **kwargs):
         self.driver.close()
 
         self.model.bulk_insert(
-            [
-                PostDocument(
-                    platform="linkedin", content=post, author_id=kwargs.get("user")
-                )
-                for post in posts
-            ]
+            [PostDocument(platform="linkedin", content=post, author_id=kwargs.get("user")) for post in posts]
         )
 
         logger.info(f"Finished scrapping data for profile: {link}")
@@ -98,9 +95,7 @@ def _get_page_content(self, url: str) -> BeautifulSoup:
         time.sleep(5)
         return BeautifulSoup(self.driver.page_source, "html.parser")
 
-    def _extract_posts(
-        self, post_elements: List[Tag], post_images: Dict[str, str]
-    ) -> Dict[str, Dict[str, str]]:
+    def _extract_posts(self, post_elements: List[Tag], post_images: Dict[str, str]) -> Dict[str, Dict[str, str]]:
         """
         Extracts post texts and combines them with their respective images.
 
@@ -139,16 +134,8 @@ def login(self):
         """Log in to LinkedIn."""
         self.driver.get("https://www.linkedin.com/login")
         if not settings.LINKEDIN_USERNAME and not settings.LINKEDIN_PASSWORD:
-            raise ImproperlyConfigured(
-                "LinkedIn scraper requires an valid account to perform extraction"
-            )
+            raise ImproperlyConfigured("LinkedIn scraper requires an valid account to perform extraction")
 
-        self.driver.find_element(By.ID, "username").send_keys(
-            settings.LINKEDIN_USERNAME
-        )
-        self.driver.find_element(By.ID, "password").send_keys(
-            settings.LINKEDIN_PASSWORD
-        )
-        self.driver.find_element(
-            By.CSS_SELECTOR, ".login__form_action_container button"
-        ).click()
+        self.driver.find_element(By.ID, "username").send_keys(settings.LINKEDIN_USERNAME)
+        self.driver.find_element(By.ID, "password").send_keys(settings.LINKEDIN_PASSWORD)
+        self.driver.find_element(By.CSS_SELECTOR, ".login__form_action_container button").click()
diff --git a/course/module-2/docker-compose.yml b/course/module-2/docker-compose.yml
@@ -42,20 +42,3 @@ services:
       - ~/rabbitmq/data/:/var/lib/rabbitmq/
       - ~/rabbitmq/log/:/var/log/rabbitmq
     restart: always
-
-  # cdc:
-  #   build:
-  #     context: .
-  #     dockerfile: .docker/Dockerfile.cdc
-  #   container_name: scrabble_cdc
-    # depends_on:
-      # - mq
-      # # - mongo1
-      # # - mongo2
-      # # - mongo3
-    # restart: always
-    # environment:
-    #   - MONGO_DATABASE_HOST=mongodb://mongo1:30001,mongo2:30002,mongo3:30003/?replicaSet=my-replica-set
-    #   - MONGO_DATABASE_NAME=scrabble
-
-
diff --git a/course/module-3/.env.example b/course/module-3/.env.example
@@ -1,3 +1,4 @@
+# Ingestion embedding model config
 EMBEDDING_MODEL_ID="sentence-transformers/all-MiniLM-L6-v2"
 EMBEDDING_MODEL_MAX_INPUT_LENGTH=256
 EMBEDDING_SIZE=384
@@ -19,3 +20,5 @@ RABBITMQ_DEFAULT_PASSWORD="guest"
 RABBITMQ_HOST="localhost"
 RABBITMQ_PORT= 5673
 
+# Retrieval config
+OPENAI_API_KEY="str"
diff --git a/course/module-3/Makefile b/course/module-3/Makefile
@@ -14,4 +14,7 @@ local-insert-data-mongo: #Insert data to mongodb
 	poetry run python insert_data_mongo.py
 
 local-bytewax: # Run bytewax pipeline
-	poetry run python -m bytewax.run data_flow/bytewax_pipeline     
+	poetry run python -m bytewax.run data_flow/bytewax_pipeline 
+
+local-test-retriever: # Test retriever
+	poetry run python retriever.py
diff --git a/course/module-3/data_logic/embedding_data_handlers.py b/course/module-3/data_logic/embedding_data_handlers.py
@@ -27,6 +27,7 @@ def embedd(self, data_model: PostChunkModel) -> PostEmbeddedChunkModel:
             entry_id=data_model.entry_id,
             platform=data_model.platform,
             chunk_id=data_model.chunk_id,
+            chunk_content=data_model.chunk_content,
             embedded_content=embedd_text(data_model.chunk_content),
             author_id=data_model.author_id,
             type=data_model.type,
@@ -39,6 +40,7 @@ def embedd(self, data_model: ArticleChunkModel) -> ArticleEmbeddedChunkModel:
             entry_id=data_model.entry_id,
             platform=data_model.platform,
             link=data_model.link,
+            chunk_content=data_model.chunk_content,
             chunk_id=data_model.chunk_id,
             embedded_content=embedd_text(data_model.chunk_content),
             author_id=data_model.author_id,
@@ -53,6 +55,7 @@ def embedd(self, data_model: RepositoryChunkModel) -> RepositoryEmbeddedChunkMod
             name=data_model.name,
             link=data_model.link,
             chunk_id=data_model.chunk_id,
+            chunk_content=data_model.chunk_content,
             embedded_content=embedd_text(data_model.chunk_content),
             owner_id=data_model.owner_id,
             type=data_model.type,

diff --git a/course/module-3/insert_data_mongo.py b/course/module-3/insert_data_mongo.py
@@ -1,7 +1,7 @@
 from db.documents import ArticleDocument, PostDocument, RepositoryDocument
 
 
-def insert_post_into_collection():
+def insert_post_into_collection_1():
     post = {
         "first post": """Excited to share some big news! 🎉 Our team has been working tirelessly on developing a groundbreaking solution designed to revolutionize the tech industry, and today, we're finally ready to unveil it to the world. Introducing InnovateX, our latest product that leverages cutting-edge AI to streamline operations and enhance productivity for businesses globally. 🚀
 
@@ -24,7 +24,106 @@ def insert_post_into_collection():
     print("Post inserted into collection")
 
 
-def insert_article_into_collection():
+def _insert_linkedin_post(text, author_id):
+    """Save one sample LinkedIn post and print a confirmation.
+
+    Shared by the numbered ``insert_post_into_collection_N`` seed functions,
+    which differ only in the post text and the author id.
+
+    Args:
+        text: Post body; stored under the ``"first post"`` key of ``content``.
+        author_id: Value passed through as the document's ``author_id``.
+    """
+    PostDocument(
+        platform="linkedin",
+        content={"first post": text},
+        author_id=author_id,
+    ).save()
+    print("Post inserted into collection")
+
+
+def insert_post_into_collection_2():
+    """Insert a sample LinkedIn post about OpenAI's research impact."""
+    _insert_linkedin_post(
+        """Join me in acknowledging the transformative impact of OpenAI's groundbreaking work! From revolutionizing natural language processing to advancing robotics and healthcare, OpenAI's research has far-reaching implications across diverse fields. By fostering collaboration and knowledge sharing, OpenAI is accelerating the pace of innovation and driving progress in AI-driven technologies. As we witness the remarkable achievements of OpenAI, let's continue to support and amplify their efforts in shaping a future where AI serves as a powerful force for innovation and societal benefit. #OpenAI #AI #Innovation #Technology""",
+        author_id="dbe92510-c33f-4ff7-9908-ee6356fe251f",
+    )
+
+
+def insert_post_into_collection_3():
+    """Insert a sample LinkedIn post introducing Retrieval Augmented Generation."""
+    _insert_linkedin_post(
+        """Excited to delve deeper into the fascinating realm of Retrieval Augmented Generation (RAG)! This cutting-edge approach combines the strengths of retrieval-based models with generative models, promising groundbreaking advancements in natural language processing (NLP). By enabling systems to retrieve and generate information dynamically, RAG holds immense potential for revolutionizing various applications, from question answering systems to content creation tools. I'm eager to explore how RAG can enhance user experiences, streamline information retrieval processes, and drive innovation across industries. Join me on this journey as we uncover the transformative capabilities of RAG in shaping the future of AI-driven technologies. #RAG #AI #NLP #Innovation""",
+        author_id="dbe92510-c33f-4ff7-9908-ee6356fe251f",
+    )
+
+
+def insert_post_into_collection_4():
+    """Insert a sample LinkedIn post on RAG and human-computer interaction."""
+    _insert_linkedin_post(
+        """Just wrapped up an intensive exploration of Retrieval Augmented Generation (RAG), and I'm truly astounded by its potential to redefine the landscape of artificial intelligence! By seamlessly integrating retrieval and generation models, RAG introduces a novel paradigm that empowers AI systems to dynamically retrieve and generate information in real-time. This innovative approach opens up new avenues for enhancing human-computer interactions, enabling more intuitive and contextually relevant responses. From personalized virtual assistants to AI-driven content creation platforms, the applications of RAG are boundless. I'm excited to witness the transformative impact of RAG across various domains and industries. #ArtificialIntelligence #RAG #Innovation #FutureTech""",
+        author_id="dbe92510-c33f-4ff7-9908-ee6356fe251f",
+    )
+
+
+def insert_post_into_collection_5():
+    """Insert a sample LinkedIn post on RAG, saved under the second author id."""
+    # NOTE(review): this author_id differs from most other seed posts (and has a
+    # 7-character first group) -- presumably a deliberate second author; confirm.
+    _insert_linkedin_post(
+        """Are you intrigued by the convergence of retrieval and generation models in natural language processing? Join me on a journey of discovery as we explore the fascinating world of Retrieval Augmented Generation (RAG). By seamlessly integrating the strengths of both approaches, RAG offers a powerful framework for enhancing information retrieval, question answering, and content generation tasks. Its ability to dynamically retrieve and generate contextually relevant responses marks a significant leap forward in AI capabilities. Let's embark on this exploration together and unlock the full potential of RAG in shaping the future of AI-driven technologies. #RAG #AI #NLP #Technology""",
+        author_id="dbe4200-c33f-4ff7-9908-ee6356fe251f",
+    )
+
+
+def insert_post_into_collection_6():
+    """Insert a sample LinkedIn post summarising an in-depth study of RAG."""
+    _insert_linkedin_post(
+        """Just completed an in-depth study of Retrieval Augmented Generation (RAG), and I'm incredibly excited about its transformative potential in natural language processing (NLP). By leveraging the complementary strengths of retrieval-based and generative models, RAG enables AI systems to dynamically retrieve and generate information, leading to more contextually relevant and coherent responses. Whether it's enhancing search engines, improving chatbots, or revolutionizing content creation tools, RAG offers a versatile framework with countless applications. I can't wait to see how this innovative approach will reshape the way we interact with information and AI systems in the years to come. #RAG #NLP #AI #Innovation""",
+        author_id="dbe92510-c33f-4ff7-9908-ee6356fe251f",
+    )
+
+
+def insert_post_into_collection_7():
+    """Insert a sample LinkedIn post summarising an in-depth study of RAG."""
+    # NOTE(review): the text is identical to insert_post_into_collection_6 --
+    # confirm the duplicate document is intended.
+    _insert_linkedin_post(
+        """Just completed an in-depth study of Retrieval Augmented Generation (RAG), and I'm incredibly excited about its transformative potential in natural language processing (NLP). By leveraging the complementary strengths of retrieval-based and generative models, RAG enables AI systems to dynamically retrieve and generate information, leading to more contextually relevant and coherent responses. Whether it's enhancing search engines, improving chatbots, or revolutionizing content creation tools, RAG offers a versatile framework with countless applications. I can't wait to see how this innovative approach will reshape the way we interact with information and AI systems in the years to come. #RAG #NLP #AI #Innovation""",
+        author_id="dbe92510-c33f-4ff7-9908-ee6356fe251f",
+    )
+
+
+def insert_post_into_collection_8():
+    """Insert a sample LinkedIn post about RAG for content creation."""
+    _insert_linkedin_post(
+        """Just attended an enlightening webinar on Retrieval Augmented Generation (RAG) and its potential to revolutionize content creation in the digital age. By seamlessly integrating retrieval-based and generative models, RAG offers a unique solution to the challenges of generating high-quality and contextually relevant content. Its ability to dynamically retrieve information from large knowledge bases while generating coherent and engaging narratives opens up exciting possibilities for marketers, writers, and content creators. Whether it's crafting personalized product descriptions or generating compelling blog posts, RAG promises to elevate the art of content creation to new heights. #RAG #ContentCreation #AI #Innovation""",
+        author_id="dbe92510-c33f-4ff7-9908-ee6356fe251f",
+    )
+
+
+def insert_post_into_collection_9():
+    """Insert a sample LinkedIn post about learning Python classes."""
+    _insert_linkedin_post(
+        """Python classes are a fundamental concept in object-oriented programming, and mastering them is essential for building robust and scalable applications. Whether you're a beginner eager to learn the basics or an experienced developer looking to deepen your understanding, delving into Python classes is a rewarding journey. From encapsulation and inheritance to polymorphism and abstraction, classes empower you to organize your code effectively and create reusable components. Join me in exploring the intricacies of Python classes as we unlock the full potential of object-oriented programming in Python. Let's level up our coding skills together! #Python #Programming #Classes #OOP""",
+        author_id="dbe92510-c33f-4ff7-9908-ee6356fe251f",
+    )
+
+
+def insert_post_into_collection_10():
+    """Insert a sample LinkedIn post on Python classes, under the second author id."""
+    # NOTE(review): same non-default author_id as insert_post_into_collection_5
+    # -- presumably deliberate; confirm.
+    _insert_linkedin_post(
+        """Let's dive into the world of Python classes! As the building blocks of object-oriented programming in Python, classes play a pivotal role in structuring and organizing code for scalability and maintainability. Whether you're developing web applications, data analysis tools, or machine learning models, understanding how to leverage classes effectively is essential. From defining attributes and methods to implementing inheritance and polymorphism, mastering Python classes empowers you to create elegant and efficient solutions to complex problems. Join me in exploring the depths of Python classes as we elevate our coding skills to new heights! #Python #OOP #Coding #Programming""",
+        author_id="dbe4200-c33f-4ff7-9908-ee6356fe251f",
+    )
+
+
+def insert_article_into_collection_1():
     content = {"test": "Test data for a article content"}
     ArticleDocument(
         platform="medium",
@@ -36,6 +135,20 @@ def insert_article_into_collection():
     print("Article inserted into collection")
 
 
+def insert_article_into_collection_2():
+    """Save one sample Medium article about RAG and print a confirmation."""
+    article_text = """Retrieval Augmented Generation (RAG) represents a cutting-edge approach in the field of natural language processing (NLP) that merges the strengths of retrieval-based models and generative models. The key concept behind RAG is its ability to dynamically retrieve relevant information from a large corpus or knowledge base and seamlessly incorporate it into generative responses.
+    In traditional generative models, outputs are generated based solely on the internal knowledge of the model. This can lead to issues with outdated information or hallucinations, where the model creates content that seems plausible but is factually incorrect. RAG mitigates these risks by integrating retrieval components that can access up-to-date and contextually appropriate information in real-time. This hybrid approach enhances accuracy and contextual relevance, making it ideal for applications like question answering, chatbots, and content creation tools.
+    RAG's impact is transformative, enabling AI systems to provide more reliable and accurate responses while adapting to changing information landscapes. This flexibility opens the door to a range of new possibilities, from personalized virtual assistants to intelligent document summarization. As RAG continues to evolve, it holds great promise for revolutionizing how we interact with AI systems and extract valuable insights from large volumes of data."""
+    # The article body is stored under the "test" key of the content mapping.
+    article = ArticleDocument(
+        platform="medium",
+        link="/htttps/alex/paul",
+        content={"test": article_text},
+        author_id="dbe92510-c33f-4ff7-9908-ee6356fe251f",
+    )
+    article.save()
+
+    print("Article inserted into collection")
+
+
 def insert_repo_into_collection():
     content = {"test": "Test data for a repo content into db"}
     RepositoryDocument(
@@ -49,6 +162,16 @@ def insert_repo_into_collection():
 
 
 if __name__ == "__main__":
-    insert_post_into_collection()
-    insert_article_into_collection()
+    # Run every seed function in order: ten posts, two articles, one repository.
+    insert_post_into_collection_1()
+    insert_post_into_collection_2()
+    insert_post_into_collection_3()
+    insert_post_into_collection_4()
+    insert_post_into_collection_5()
+    insert_post_into_collection_6()
+    insert_post_into_collection_7()
+    insert_post_into_collection_8()
+    insert_post_into_collection_9()
+    insert_post_into_collection_10()
+    insert_article_into_collection_1()
+    insert_article_into_collection_2()
     insert_repo_into_collection()
diff --git a/course/module-3/llm_components/__init__.py b/course/module-3/llm_components/__init__.py
diff --git a/course/module-3/llm_components/chain.py b/course/module-3/llm_components/chain.py
@@ -0,0 +1,10 @@
+from langchain.chains.llm import LLMChain
+from langchain.prompts import PromptTemplate
+
+
+class GeneralChain:
+    """Factory for single-prompt LangChain ``LLMChain`` objects."""
+
+    @staticmethod
+    def get_chain(llm, template: PromptTemplate, output_key: str, verbose=True):
+        """Build an ``LLMChain`` from the given model and prompt.
+
+        Args:
+            llm: Model object forwarded as ``LLMChain(llm=...)``.
+            template: Prompt forwarded as ``LLMChain(prompt=...)``.
+            output_key: Forwarded as ``LLMChain(output_key=...)``.
+            verbose: Forwarded as ``LLMChain(verbose=...)``; defaults to True.
+
+        Returns:
+            The newly constructed ``LLMChain``.
+        """
+        chain_kwargs = {
+            "llm": llm,
+            "prompt": template,
+            "output_key": output_key,
+            "verbose": verbose,
+        }
+        return LLMChain(**chain_kwargs)
diff --git a/course/module-3/llm_components/prompt_templates.py b/course/module-3/llm_components/prompt_templates.py
@@ -0,0 +1,64 @@
+from abc import ABC, abstractmethod
+
+from langchain.prompts import PromptTemplate
+from pydantic import BaseModel
+
+
+class BasePromptTemplate(ABC, BaseModel):
+    """Abstract pydantic model for objects that can build a ``PromptTemplate``."""
+
+    @abstractmethod
+    def create_template(self) -> PromptTemplate:
+        """Return the LangChain ``PromptTemplate`` for this prompt (subclass hook)."""
+
+
+class QueryExpansionTemplate(BasePromptTemplate):
+    """Prompt asking the model for five alternative versions of a user question.
+
+    The alternatives are requested one per line so the caller can split the
+    completion on newlines.
+    """
+
+    # Prompt text; ``{question}`` is the only placeholder.
+    prompt: str = """You are an AI language model assistant. Your task is to generate Five
+    different versions of the given user question to retrieve relevant documents from a vector
+    database. By generating multiple perspectives on the user question, your goal is to help
+    the user overcome some of the limitations of the distance-based similarity search.
+    Provide these alternative questions separated by newlines.
+    Original question: {question}"""
+
+    # Required constructor field, kept for backward compatibility. It is not
+    # substituted into the returned template (see create_template).
+    question: str
+
+    def create_template(self) -> PromptTemplate:
+        """Return the unfilled query-expansion ``PromptTemplate``.
+
+        The returned template still expects a ``question`` input when it is
+        formatted or run. The previous implementation called
+        ``template.format(question=...)`` and discarded the resulting string,
+        which had no effect on the returned object; that call is removed.
+        """
+        return PromptTemplate(
+            template=self.prompt, input_variables=["question"], verbose=True
+        )
+
+
+class SelfQueryTemplate(BasePromptTemplate):
+    """Prompt asking the model to extract the user id from a user question."""
+
+    # Prompt text; ``{question}`` is the only placeholder.
+    prompt: str = """You are an AI language model assistant. Your task is to extract information from a user question.
+    The required information that needs to be extracted is the user id. 
+    Your response should consist of only the extracted id (e.g. 1345256), nothing else.
+    User question: {question}"""
+
+    # Required constructor field, kept for backward compatibility. It is not
+    # substituted into the returned template (see create_template).
+    question: str
+
+    def create_template(self) -> PromptTemplate:
+        """Return the unfilled self-query ``PromptTemplate``.
+
+        The returned template still expects a ``question`` input when it is
+        formatted or run. The previous implementation called
+        ``template.format(question=...)`` and discarded the resulting string,
+        which had no effect on the returned object; that call is removed.
+        """
+        return PromptTemplate(
+            template=self.prompt, input_variables=["question"], verbose=True
+        )
+
+
+class RerankingTemplate(BasePromptTemplate):
+    """Prompt asking the model to rerank passages by relevance to a query.
+
+    The prompt tells the model to keep at most 5 passages.
+    """
+
+    # Prompt text; placeholders are ``{question}`` and ``{passages}``.
+    prompt: str = """You are an AI language model assistant. Your task is to rerank passages related to a query
+    based on their relevance. The most relevant passages should be put at the beginning and at the end. 
+    You should only pick at max 5 passages.
+    The following are passages related to this query: {question}.
+    Passages: {passages}
+    """
+
+    # Required constructor fields, kept for backward compatibility. They are
+    # not substituted into the returned template (see create_template).
+    question: str
+
+    passages: str
+
+    def create_template(self) -> PromptTemplate:
+        """Return the unfilled reranking ``PromptTemplate``.
+
+        The returned template still expects ``question`` and ``passages``
+        inputs when it is formatted or run. The previous implementation called
+        ``template.format(...)`` and discarded the resulting string, which had
+        no effect on the returned object; that call is removed.
+        """
+        return PromptTemplate(
+            template=self.prompt, input_variables=["question", "passages"], verbose=True
+        )