Skip to content

Commit

Permalink
Merge pull request #11 from decodingml/module-4
Browse files Browse the repository at this point in the history
Module 4
  • Loading branch information
iusztinpaul committed Apr 26, 2024
2 parents cc4a955 + 5f8724f commit 9fa8a5d
Show file tree
Hide file tree
Showing 20 changed files with 1,160 additions and 675 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -159,4 +159,7 @@ cython_debug/
.vscode

# MacOS
.DS_Store
.DS_Store

# Ruff
.ruff_cache
33 changes: 10 additions & 23 deletions course/module-1/crawlers/linkedin.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,10 @@ def extract(self, link: str, **kwargs):

self.driver.get(link)
time.sleep(5)
button = self.driver.find_element(By.CSS_SELECTOR,
".app-aware-link.profile-creator-shared-content-view__footer-action")
button = self.driver.find_element(
By.CSS_SELECTOR,
".app-aware-link.profile-creator-shared-content-view__footer-action"
)
button.click()

# Scrolling and scraping posts
Expand All @@ -57,12 +59,7 @@ def extract(self, link: str, **kwargs):
self.driver.close()

self.model.bulk_insert(
[
PostDocument(
platform="linkedin", content=post, author_id=kwargs.get("user")
)
for post in posts
]
[PostDocument(platform="linkedin", content=post, author_id=kwargs.get("user")) for post in posts]
)

logger.info(f"Finished scrapping data for profile: {link}")
Expand Down Expand Up @@ -98,9 +95,7 @@ def _get_page_content(self, url: str) -> BeautifulSoup:
time.sleep(5)
return BeautifulSoup(self.driver.page_source, "html.parser")

def _extract_posts(
self, post_elements: List[Tag], post_images: Dict[str, str]
) -> Dict[str, Dict[str, str]]:
def _extract_posts(self, post_elements: List[Tag], post_images: Dict[str, str]) -> Dict[str, Dict[str, str]]:
"""
Extracts post texts and combines them with their respective images.
Expand Down Expand Up @@ -139,16 +134,8 @@ def login(self):
"""Log in to LinkedIn."""
self.driver.get("https://www.linkedin.com/login")
if not settings.LINKEDIN_USERNAME and not settings.LINKEDIN_PASSWORD:
raise ImproperlyConfigured(
"LinkedIn scraper requires an valid account to perform extraction"
)
raise ImproperlyConfigured("LinkedIn scraper requires an valid account to perform extraction")

self.driver.find_element(By.ID, "username").send_keys(
settings.LINKEDIN_USERNAME
)
self.driver.find_element(By.ID, "password").send_keys(
settings.LINKEDIN_PASSWORD
)
self.driver.find_element(
By.CSS_SELECTOR, ".login__form_action_container button"
).click()
self.driver.find_element(By.ID, "username").send_keys(settings.LINKEDIN_USERNAME)
self.driver.find_element(By.ID, "password").send_keys(settings.LINKEDIN_PASSWORD)
self.driver.find_element(By.CSS_SELECTOR, ".login__form_action_container button").click()
17 changes: 0 additions & 17 deletions course/module-2/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,20 +42,3 @@ services:
- ~/rabbitmq/data/:/var/lib/rabbitmq/
- ~/rabbitmq/log/:/var/log/rabbitmq
restart: always

# cdc:
# build:
# context: .
# dockerfile: .docker/Dockerfile.cdc
# container_name: scrabble_cdc
# depends_on:
# - mq
# # - mongo1
# # - mongo2
# # - mongo3
# restart: always
# environment:
# - MONGO_DATABASE_HOST=mongodb://mongo1:30001,mongo2:30002,mongo3:30003/?replicaSet=my-replica-set
# - MONGO_DATABASE_NAME=scrabble


3 changes: 3 additions & 0 deletions course/module-3/.env.example
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# Ingestion embedding model config
EMBEDDING_MODEL_ID="sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_MODEL_MAX_INPUT_LENGTH=256
EMBEDDING_SIZE=384
Expand All @@ -19,3 +20,5 @@ RABBITMQ_DEFAULT_PASSWORD="guest"
RABBITMQ_HOST="localhost"
RABBITMQ_PORT= 5673

# Retrieval config
OPENAI_API_KEY="str"
5 changes: 4 additions & 1 deletion course/module-3/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,7 @@ local-insert-data-mongo: #Insert data to mongodb
poetry run python insert_data_mongo.py

local-bytewax: # Run bytewax pipeline
poetry run python -m bytewax.run data_flow/bytewax_pipeline
poetry run python -m bytewax.run data_flow/bytewax_pipeline

local-test-retriever: # Test retriever
poetry run python retriever.py
3 changes: 3 additions & 0 deletions course/module-3/data_logic/embedding_data_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def embedd(self, data_model: PostChunkModel) -> PostEmbeddedChunkModel:
entry_id=data_model.entry_id,
platform=data_model.platform,
chunk_id=data_model.chunk_id,
chunk_content=data_model.chunk_content,
embedded_content=embedd_text(data_model.chunk_content),
author_id=data_model.author_id,
type=data_model.type,
Expand All @@ -39,6 +40,7 @@ def embedd(self, data_model: ArticleChunkModel) -> ArticleEmbeddedChunkModel:
entry_id=data_model.entry_id,
platform=data_model.platform,
link=data_model.link,
chunk_content=data_model.chunk_content,
chunk_id=data_model.chunk_id,
embedded_content=embedd_text(data_model.chunk_content),
author_id=data_model.author_id,
Expand All @@ -53,6 +55,7 @@ def embedd(self, data_model: RepositoryChunkModel) -> RepositoryEmbeddedChunkMod
name=data_model.name,
link=data_model.link,
chunk_id=data_model.chunk_id,
chunk_content=data_model.chunk_content,
embedded_content=embedd_text(data_model.chunk_content),
owner_id=data_model.owner_id,
type=data_model.type,
Expand Down
131 changes: 127 additions & 4 deletions course/module-3/insert_data_mongo.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from db.documents import ArticleDocument, PostDocument, RepositoryDocument


def insert_post_into_collection():
def insert_post_into_collection_1():
post = {
"first post": """Excited to share some big news! 🎉 Our team has been working tirelessly on developing a groundbreaking solution designed to revolutionize the tech industry, and today, we're finally ready to unveil it to the world. Introducing InnovateX, our latest product that leverages cutting-edge AI to streamline operations and enhance productivity for businesses globally. 🚀
Expand All @@ -24,7 +24,106 @@ def insert_post_into_collection():
print("Post inserted into collection")


def insert_article_into_collection():
def insert_post_into_collection_2():
    """Build a sample LinkedIn ``PostDocument`` about OpenAI's work and save it.

    Prints a confirmation line once ``save()`` has returned.
    """
    body = {
        "first post": """Join me in acknowledging the transformative impact of OpenAI's groundbreaking work! From revolutionizing natural language processing to advancing robotics and healthcare, OpenAI's research has far-reaching implications across diverse fields. By fostering collaboration and knowledge sharing, OpenAI is accelerating the pace of innovation and driving progress in AI-driven technologies. As we witness the remarkable achievements of OpenAI, let's continue to support and amplify their efforts in shaping a future where AI serves as a powerful force for innovation and societal benefit. #OpenAI #AI #Innovation #Technology"""
    }
    document = PostDocument(
        platform="linkedin",
        content=body,
        author_id="dbe92510-c33f-4ff7-9908-ee6356fe251f",
    )
    document.save()
    print("Post inserted into collection")


def insert_post_into_collection_3():
    """Build a sample LinkedIn ``PostDocument`` introducing RAG and save it.

    Prints a confirmation line once ``save()`` has returned.
    """
    body = {
        "first post": """Excited to delve deeper into the fascinating realm of Retrieval Augmented Generation (RAG)! This cutting-edge approach combines the strengths of retrieval-based models with generative models, promising groundbreaking advancements in natural language processing (NLP). By enabling systems to retrieve and generate information dynamically, RAG holds immense potential for revolutionizing various applications, from question answering systems to content creation tools. I'm eager to explore how RAG can enhance user experiences, streamline information retrieval processes, and drive innovation across industries. Join me on this journey as we uncover the transformative capabilities of RAG in shaping the future of AI-driven technologies. #RAG #AI #NLP #Innovation"""
    }
    document = PostDocument(
        platform="linkedin",
        content=body,
        author_id="dbe92510-c33f-4ff7-9908-ee6356fe251f",
    )
    document.save()
    print("Post inserted into collection")


def insert_post_into_collection_4():
    """Build a sample LinkedIn ``PostDocument`` on RAG's potential for AI and save it.

    Prints a confirmation line once ``save()`` has returned.
    """
    body = {
        "first post": """Just wrapped up an intensive exploration of Retrieval Augmented Generation (RAG), and I'm truly astounded by its potential to redefine the landscape of artificial intelligence! By seamlessly integrating retrieval and generation models, RAG introduces a novel paradigm that empowers AI systems to dynamically retrieve and generate information in real-time. This innovative approach opens up new avenues for enhancing human-computer interactions, enabling more intuitive and contextually relevant responses. From personalized virtual assistants to AI-driven content creation platforms, the applications of RAG are boundless. I'm excited to witness the transformative impact of RAG across various domains and industries. #ArtificialIntelligence #RAG #Innovation #FutureTech"""
    }
    document = PostDocument(
        platform="linkedin",
        content=body,
        author_id="dbe92510-c33f-4ff7-9908-ee6356fe251f",
    )
    document.save()
    print("Post inserted into collection")


def insert_post_into_collection_5():
    """Build a sample LinkedIn ``PostDocument`` inviting readers to explore RAG and save it.

    Prints a confirmation line once ``save()`` has returned.
    """
    body = {
        "first post": """Are you intrigued by the convergence of retrieval and generation models in natural language processing? Join me on a journey of discovery as we explore the fascinating world of Retrieval Augmented Generation (RAG). By seamlessly integrating the strengths of both approaches, RAG offers a powerful framework for enhancing information retrieval, question answering, and content generation tasks. Its ability to dynamically retrieve and generate contextually relevant responses marks a significant leap forward in AI capabilities. Let's embark on this exploration together and unlock the full potential of RAG in shaping the future of AI-driven technologies. #RAG #AI #NLP #Technology"""
    }
    # NOTE(review): this author_id ("dbe4200-...") differs from the
    # "dbe92510-..." id used by most other sample posts in this file;
    # presumably a deliberate second author -- confirm.
    document = PostDocument(
        platform="linkedin",
        content=body,
        author_id="dbe4200-c33f-4ff7-9908-ee6356fe251f",
    )
    document.save()
    print("Post inserted into collection")


def insert_post_into_collection_6():
    """Build a sample LinkedIn ``PostDocument`` about an in-depth RAG study and save it.

    Prints a confirmation line once ``save()`` has returned.
    """
    # NOTE(review): this text is identical to the one used by
    # insert_post_into_collection_7 -- presumably an intentional duplicate; confirm.
    body = {
        "first post": """Just completed an in-depth study of Retrieval Augmented Generation (RAG), and I'm incredibly excited about its transformative potential in natural language processing (NLP). By leveraging the complementary strengths of retrieval-based and generative models, RAG enables AI systems to dynamically retrieve and generate information, leading to more contextually relevant and coherent responses. Whether it's enhancing search engines, improving chatbots, or revolutionizing content creation tools, RAG offers a versatile framework with countless applications. I can't wait to see how this innovative approach will reshape the way we interact with information and AI systems in the years to come. #RAG #NLP #AI #Innovation"""
    }
    document = PostDocument(
        platform="linkedin",
        content=body,
        author_id="dbe92510-c33f-4ff7-9908-ee6356fe251f",
    )
    document.save()
    print("Post inserted into collection")


def insert_post_into_collection_7():
    """Build a sample LinkedIn ``PostDocument`` about an in-depth RAG study and save it.

    Prints a confirmation line once ``save()`` has returned.
    """
    # NOTE(review): this text is identical to the one used by
    # insert_post_into_collection_6 -- presumably an intentional duplicate; confirm.
    body = {
        "first post": """Just completed an in-depth study of Retrieval Augmented Generation (RAG), and I'm incredibly excited about its transformative potential in natural language processing (NLP). By leveraging the complementary strengths of retrieval-based and generative models, RAG enables AI systems to dynamically retrieve and generate information, leading to more contextually relevant and coherent responses. Whether it's enhancing search engines, improving chatbots, or revolutionizing content creation tools, RAG offers a versatile framework with countless applications. I can't wait to see how this innovative approach will reshape the way we interact with information and AI systems in the years to come. #RAG #NLP #AI #Innovation"""
    }
    document = PostDocument(
        platform="linkedin",
        content=body,
        author_id="dbe92510-c33f-4ff7-9908-ee6356fe251f",
    )
    document.save()
    print("Post inserted into collection")


def insert_post_into_collection_8():
    """Build a sample LinkedIn ``PostDocument`` on RAG for content creation and save it.

    Prints a confirmation line once ``save()`` has returned.
    """
    body = {
        "first post": """Just attended an enlightening webinar on Retrieval Augmented Generation (RAG) and its potential to revolutionize content creation in the digital age. By seamlessly integrating retrieval-based and generative models, RAG offers a unique solution to the challenges of generating high-quality and contextually relevant content. Its ability to dynamically retrieve information from large knowledge bases while generating coherent and engaging narratives opens up exciting possibilities for marketers, writers, and content creators. Whether it's crafting personalized product descriptions or generating compelling blog posts, RAG promises to elevate the art of content creation to new heights. #RAG #ContentCreation #AI #Innovation"""
    }
    document = PostDocument(
        platform="linkedin",
        content=body,
        author_id="dbe92510-c33f-4ff7-9908-ee6356fe251f",
    )
    document.save()
    print("Post inserted into collection")


def insert_post_into_collection_9():
    """Build a sample LinkedIn ``PostDocument`` about Python classes and save it.

    Prints a confirmation line once ``save()`` has returned.
    """
    body = {
        "first post": """Python classes are a fundamental concept in object-oriented programming, and mastering them is essential for building robust and scalable applications. Whether you're a beginner eager to learn the basics or an experienced developer looking to deepen your understanding, delving into Python classes is a rewarding journey. From encapsulation and inheritance to polymorphism and abstraction, classes empower you to organize your code effectively and create reusable components. Join me in exploring the intricacies of Python classes as we unlock the full potential of object-oriented programming in Python. Let's level up our coding skills together! #Python #Programming #Classes #OOP"""
    }
    document = PostDocument(
        platform="linkedin",
        content=body,
        author_id="dbe92510-c33f-4ff7-9908-ee6356fe251f",
    )
    document.save()
    print("Post inserted into collection")


def insert_post_into_collection_10():
    """Build a second sample LinkedIn ``PostDocument`` about Python classes and save it.

    Prints a confirmation line once ``save()`` has returned.
    """
    body = {
        "first post": """Let's dive into the world of Python classes! As the building blocks of object-oriented programming in Python, classes play a pivotal role in structuring and organizing code for scalability and maintainability. Whether you're developing web applications, data analysis tools, or machine learning models, understanding how to leverage classes effectively is essential. From defining attributes and methods to implementing inheritance and polymorphism, mastering Python classes empowers you to create elegant and efficient solutions to complex problems. Join me in exploring the depths of Python classes as we elevate our coding skills to new heights! #Python #OOP #Coding #Programming"""
    }
    # NOTE(review): this author_id ("dbe4200-...") differs from the
    # "dbe92510-..." id used by most other sample posts in this file;
    # presumably a deliberate second author -- confirm.
    document = PostDocument(
        platform="linkedin",
        content=body,
        author_id="dbe4200-c33f-4ff7-9908-ee6356fe251f",
    )
    document.save()
    print("Post inserted into collection")


def insert_article_into_collection_1():
content = {"test": "Test data for a article content"}
ArticleDocument(
platform="medium",
Expand All @@ -36,6 +135,20 @@ def insert_article_into_collection():
print("Article inserted into collection")


def insert_article_into_collection_2():
    """Build a sample Medium ``ArticleDocument`` explaining RAG and save it.

    The article body is a three-paragraph text stored under the ``"test"`` key.
    Prints a confirmation line once ``save()`` has returned.
    """
    article_body = {
        "test": """Retrieval Augmented Generation (RAG) represents a cutting-edge approach in the field of natural language processing (NLP) that merges the strengths of retrieval-based models and generative models. The key concept behind RAG is its ability to dynamically retrieve relevant information from a large corpus or knowledge base and seamlessly incorporate it into generative responses.
In traditional generative models, outputs are generated based solely on the internal knowledge of the model. This can lead to issues with outdated information or hallucinations, where the model creates content that seems plausible but is factually incorrect. RAG mitigates these risks by integrating retrieval components that can access up-to-date and contextually appropriate information in real-time. This hybrid approach enhances accuracy and contextual relevance, making it ideal for applications like question answering, chatbots, and content creation tools.
RAG's impact is transformative, enabling AI systems to provide more reliable and accurate responses while adapting to changing information landscapes. This flexibility opens the door to a range of new possibilities, from personalized virtual assistants to intelligent document summarization. As RAG continues to evolve, it holds great promise for revolutionizing how we interact with AI systems and extract valuable insights from large volumes of data."""
    }
    document = ArticleDocument(
        platform="medium",
        link="/htttps/alex/paul",
        content=article_body,
        author_id="dbe92510-c33f-4ff7-9908-ee6356fe251f",
    )
    document.save()

    print("Article inserted into collection")


def insert_repo_into_collection():
content = {"test": "Test data for a repo content into db"}
RepositoryDocument(
Expand All @@ -49,6 +162,16 @@ def insert_repo_into_collection():


if __name__ == "__main__":
    # Run every sample-data helper once, in a fixed order: posts first, then
    # articles, then the repository entry.
    #
    # Only the numbered helpers are listed. The un-suffixed names
    # ``insert_post_into_collection`` / ``insert_article_into_collection`` were
    # renamed to their ``*_1`` variants, so calling the old names here would
    # raise NameError (or insert the first post/article twice if they still
    # existed).
    seeders = (
        insert_post_into_collection_1,
        insert_post_into_collection_2,
        insert_post_into_collection_3,
        insert_post_into_collection_4,
        insert_post_into_collection_5,
        insert_post_into_collection_6,
        insert_post_into_collection_7,
        insert_post_into_collection_8,
        insert_post_into_collection_9,
        insert_post_into_collection_10,
        insert_article_into_collection_1,
        insert_article_into_collection_2,
        insert_repo_into_collection,
    )
    for seed in seeders:
        seed()
Empty file.
10 changes: 10 additions & 0 deletions course/module-3/llm_components/chain.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate


class GeneralChain:
    """Namespace for a single factory that wires an LLM and a prompt into an ``LLMChain``."""

    @staticmethod
    def get_chain(llm, template: PromptTemplate, output_key: str, verbose=True):
        """Return a new ``LLMChain`` built from the given pieces.

        Args:
            llm: Language model object, forwarded as ``LLMChain(llm=...)``.
            template: Prompt template, forwarded as ``LLMChain(prompt=...)``.
            output_key: Forwarded unchanged as ``LLMChain(output_key=...)``.
            verbose: Forwarded unchanged as ``LLMChain(verbose=...)``;
                defaults to ``True``.
        """
        chain = LLMChain(
            llm=llm,
            prompt=template,
            output_key=output_key,
            verbose=verbose,
        )
        return chain
64 changes: 64 additions & 0 deletions course/module-3/llm_components/prompt_templates.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from abc import ABC, abstractmethod

from langchain.prompts import PromptTemplate
from pydantic import BaseModel


class BasePromptTemplate(ABC, BaseModel):
    """Abstract pydantic model for objects that can produce a ``PromptTemplate``."""

    @abstractmethod
    def create_template(self) -> PromptTemplate:
        """Return the ``PromptTemplate`` for this prompt; subclasses must override."""
        ...


class QueryExpansionTemplate(BasePromptTemplate):
    """Prompt that asks the model for five alternative phrasings of a question.

    The prompt text contains a single ``{question}`` placeholder.
    """

    # Prompt text; overridable per instance like any other model field.
    prompt: str = """You are an AI language model assistant. Your task is to generate Five
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions seperated by newlines.
Original question: {question}"""

    # The user question this template instance was created for.
    question: str

    def create_template(self) -> PromptTemplate:
        """Return a ``PromptTemplate`` over ``self.prompt`` expecting ``question``.

        The returned template is *not* pre-filled with ``self.question``; the
        value for ``{question}`` must be supplied when the template is
        formatted. (A previous ``template.format(...)`` call here discarded
        its result and has been removed as dead code.)
        """
        return PromptTemplate(
            template=self.prompt, input_variables=["question"], verbose=True
        )


class SelfQueryTemplate(BasePromptTemplate):
    """Prompt that asks the model to extract only the user id from a question.

    The prompt text contains a single ``{question}`` placeholder.
    """

    # Prompt text; overridable per instance like any other model field.
    prompt: str = """You are an AI language model assistant. Your task is to extract information from a user question.
The required information that needs to be extracted is the user id.
Your response should consists of only the extracted id (e.g. 1345256), nothing else.
User question: {question}"""

    # The user question this template instance was created for.
    question: str

    def create_template(self) -> PromptTemplate:
        """Return a ``PromptTemplate`` over ``self.prompt`` expecting ``question``.

        The returned template is *not* pre-filled with ``self.question``; the
        value for ``{question}`` must be supplied when the template is
        formatted. (A previous ``template.format(...)`` call here discarded
        its result and has been removed as dead code.)
        """
        return PromptTemplate(
            template=self.prompt, input_variables=["question"], verbose=True
        )


class RerankingTemplate(BasePromptTemplate):
    """Prompt that asks the model to rerank passages by relevance to a query.

    The prompt text contains two placeholders, ``{question}`` and
    ``{passages}``, and instructs the model to keep at most five passages.
    """

    # Prompt text; overridable per instance like any other model field.
    prompt: str = """You are an AI language model assistant. Your task is to rerank passages related to a query
based on their relevance. The most relevant passages should be put at the beginning and at the end.
You should only pick at max 5 passages.
The following are passages related to this query: {question}.
Passages: {passages}
"""

    # The query the passages are ranked against.
    question: str

    # The candidate passages, already rendered as one string.
    passages: str

    def create_template(self) -> PromptTemplate:
        """Return a ``PromptTemplate`` expecting ``question`` and ``passages``.

        The returned template is *not* pre-filled with ``self.question`` or
        ``self.passages``; both values must be supplied when the template is
        formatted. (A previous ``template.format(...)`` call here discarded
        its result and has been removed as dead code.)
        """
        return PromptTemplate(
            template=self.prompt, input_variables=["question", "passages"], verbose=True
        )
Loading

0 comments on commit 9fa8a5d

Please sign in to comment.