retrievalqa #29474
-
Hey @Anand-puthiyapurayil! 👋 I'm here to help you with any bugs, questions, or contributions you have. Feel free to ask me anything while you're waiting for a human maintainer.

I found a similar issue about token indices sequence length being longer than the specified maximum sequence length for the model. The error you're encountering suggests that the input sequence length exceeds the model's context window, which causes indexing errors. To resolve this, you can adjust the context window when initializing the model and make sure your inputs fit inside it:

```python
from langchain_community.llms import LlamaCpp

# Initialize the Llama model with a custom context window
llm = LlamaCpp(model_path="/path/to/llama/model", n_ctx=1024)

# Ensure your input sequences do not exceed the n_ctx value
input_sequence = "Your input text here"
if llm.get_num_tokens(input_sequence) > llm.n_ctx:
    # Truncate or split the input_sequence to fit within the n_ctx limit
    # (note: slicing by characters is only a rough cut, not an exact token count)
    input_sequence = input_sequence[:llm.n_ctx]

# Use the model for retrieval
result = llm.invoke(input_sequence)
```

Make sure that the text you actually send to the model stays within the configured context window.

To continue talking to Dosu, mention @dosu.
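If truncating loses too much of the retrieved context, splitting the input and querying chunk by chunk is usually the safer option. A minimal sketch, assuming `RecursiveCharacterTextSplitter` from `langchain_text_splitters` is available and reusing the `llm` and `input_sequence` from the example above (the chunk size of 800 is an arbitrary choice that leaves headroom below `n_ctx=1024`):

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split by token count so each chunk fits well inside the context window.
# chunk_size is measured in tokens because we pass the LLM's own token counter.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,                      # headroom below n_ctx=1024 for prompt + answer
    chunk_overlap=50,
    length_function=llm.get_num_tokens,  # count tokens, not characters
)

chunks = splitter.split_text(input_sequence)
partial_results = [llm.invoke(chunk) for chunk in chunks]
```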
-
```python
import os
import time
import re

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from utils.utils import load_config, initialize_embeddings, initialize_llm, load_faiss_store
from logger.logger import get_logger

# Initialize logger
logger = get_logger(__name__)

# Define Map-Reduce Prompts
map_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You have the following chunk of data (could be a product or service):
{context}
User question: {question}
""",
)

reduce_prompt = PromptTemplate(
    input_variables=["summaries", "question"],
    template="""
We have partial answers from multiple chunks:
{summaries}
Combine them into a single, cohesive answer to: "{question}"
Requirements:
""",
)


def semantic_search_tool(query: str) -> str:
    """
    Enhanced product search that utilizes the LLM's answer and extracts product IDs.
    """
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = qa_chain.invoke(query)
            llm_answer = response.get("result", "")
            source_docs = response.get("source_documents", [])


def main():
    try:
        # Configuration setup
        this_dir = os.path.dirname(os.path.abspath(__file__))
        config_path = os.path.join(this_dir, "..", "config.yaml")
        config = load_config(config_path)


if __name__ == "__main__":
    main()
```
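The snippet stops before showing how `qa_chain` is built. For reference, here is a minimal sketch of what a map-reduce `RetrievalQA` setup using these two prompts could look like; the `vector_store` and `llm` names and the `k` value are assumptions, not taken from the original code:

```python
# Hypothetical wiring for the chain used inside semantic_search_tool.
# `llm` would come from initialize_llm(...) and `vector_store` from load_faiss_store(...).
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="map_reduce",
    retriever=vector_store.as_retriever(search_kwargs={"k": 5}),
    return_source_documents=True,         # needed for response["source_documents"]
    chain_type_kwargs={
        "question_prompt": map_prompt,    # applied to each retrieved chunk
        "combine_prompt": reduce_prompt,  # merges the per-chunk answers
    },
)
```

With `chain_type="map_reduce"`, each retrieved document is processed separately and the chain checks document lengths against what it believes the model's context limit is; that check is where the "A single document was longer than the context length" error originates.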
```python
def initialize_llm(
    save_dir: str,
    max_new_tokens: int = 500,
    temperature: float = 0.1,
    repetition_penalty: float = 1.2,
    top_p: float = 0.95,
    do_sample: bool = True,
):
    """
    Initializes a retrieval-optimized LLM for product ID extraction.
    """
    try:
        logger.info(f"Initializing retrieval LLM from: {save_dir}")
```
I have two functions here: one is the retrieval function, where I retrieve with RetrievalQA, and the other initializes the Llama model. Although my model has a 131k context window, this is what I get when I run it:

```
$ python -m mlscripts.product_retrieval
2025-01-29 18:47:33,016 - utils - INFO - Configuration loaded from D:\Anand\Jstore_Ai\usecase1\mlscripts..\config.yaml
2025-01-29 18:47:33,016 - utils - INFO - Initializing retrieval LLM from: output/BGI-llama
2025-01-29 18:47:33,016 - utils - INFO - Loading tokenizer...
2025-01-29 18:47:33,391 - utils - INFO - Tokenizer loaded with pad_token set
2025-01-29 18:47:33,391 - utils - INFO - Loading model...
2025-01-29 18:47:34,694 - utils - INFO - Model context length: 131072
2025-01-29 18:47:34,694 - utils - INFO - Configuring text-generation pipeline
Device set to use cuda:0
2025-01-29 18:47:34,695 - utils - INFO - Creating HuggingFacePipeline
Actual pipeline max input: 131072
2025-01-29 18:47:35,996 - utils - INFO - Embeddings initialized using model at: output/sbert_finetuned on device: cuda
2025-01-29 18:47:35,997 - utils - INFO - Loading FAISS vector store from: output/product_vector_store
2025-01-29 18:47:36,282 - utils - INFO - FAISS vector store loaded successfully.
Enter your product search query: show me some products under 2000
Token indices sequence length is longer than the specified maximum sequence length for this model (6944 > 1024). Running this sequence through the model will result in indexing errors
2025-01-29 18:48:41,656 - __main__ - ERROR - Attempt 1 failed: A single document was longer than the context length, we cannot handle this.
Traceback (most recent call last):
File "D:\Anand\Jstore_Ai\usecase1\mlscripts\product_retrieval.py", line 48, in semantic_search_tool
Token indices sequence length is longer than the specified maximum sequence length for this model (6944 > 1024). Running this sequence through the model will result in indexing errors
```

I'm getting this error, any help?
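A quick check that can help narrow down where the 1024 in that warning comes from; the model path is taken from the log above, so treat this as a sketch rather than part of the original script:

```python
from transformers import AutoConfig, AutoTokenizer

# Compare what the tokenizer and the model config each report as the limit.
tok = AutoTokenizer.from_pretrained("output/BGI-llama")
cfg = AutoConfig.from_pretrained("output/BGI-llama")
print("tokenizer.model_max_length     =", tok.model_max_length)
print("config.max_position_embeddings =", cfg.max_position_embeddings)

# If another component in the chain counts tokens with a different tokenizer
# (LangChain's generic token counter can fall back to a GPT-2 tokenizer,
# whose limit is 1024), its count may disagree with the 131072-token window
# the model itself reports.
```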