From e98abdf96ff681341d271bfc056d741d71d4a054 Mon Sep 17 00:00:00 2001
From: motaz m alharbi
Date: Sun, 14 Dec 2025 23:26:46 +0200
Subject: [PATCH 1/2] Implement comprehensive tutorial generation framework with LLM integration

- Added `nodes.py` to define a series of nodes for fetching repository files, identifying abstractions, analyzing relationships, ordering chapters, and writing tutorial chapters.
- Introduced `FetchRepo` node to crawl GitHub repositories or local directories for source files.
- Created `IdentifyAbstractions` node to leverage an LLM for extracting key abstractions from the codebase.
- Developed `AnalyzeRelationships` node to establish connections between identified abstractions.
- Implemented `OrderChapters` node to determine the optimal sequence for tutorial chapters based on abstraction relationships.
- Added `WriteChapters` node to generate Markdown content for each chapter using an LLM.
- Created `CombineTutorial` node to compile the generated chapters and create an index file with a Mermaid diagram for visual representation of relationships.
- Established logging and caching.
---
 nodes.py                 |  48 +-
 nodes.py.backup          | 880 ++++++++++++++++++++++++++++++++++++++
 nodes.py.backup2         | 919 +++++++++++++++++++++++++++++++++++++++
 utils/call_llm.py        |  77 ++++
 utils/call_llm.py.backup | 185 ++++++++
 5 files changed, 2106 insertions(+), 3 deletions(-)
 create mode 100644 nodes.py.backup
 create mode 100644 nodes.py.backup2
 create mode 100644 utils/call_llm.py.backup

diff --git a/nodes.py b/nodes.py
index 0e3fa587..8f6692b7 100644
--- a/nodes.py
+++ b/nodes.py
@@ -113,6 +113,7 @@ def create_llm_context(files_data):
             language,
             use_cache,
             max_abstraction_num,
+            files_data,
         ) # Return all parameters

     def exec(self, prep_res):
@@ -124,6 +125,7 @@ def exec(self, prep_res):
             language,
             use_cache,
             max_abstraction_num,
+            files_data,
         ) = prep_res # Unpack all parameters
         print(f"Identifying abstractions using LLM...")

@@ -173,7 +175,25 @@ def exec(self, prep_res):
     - 5 # path/to/another.js
 # ...
up to {max_abstraction_num} abstractions ```""" - response = call_llm(prompt, use_cache=(use_cache and self.cur_retry == 0)) # Use cache only if enabled and not retrying + # Build context from most important files (first 5 files by size/relevance) + context_files = sorted( + [(path, content) for path, content in files_data], + key=lambda x: len(x[1]), + reverse=True + )[:5] + + rag_context = "\n\n".join([ + f"=== {path} ===\n{content[:2000]}" # First 2000 chars per file + for path, content in context_files + ]) + + from utils.call_llm import call_llm_with_context + response = call_llm_with_context( + prompt=prompt, + context=rag_context, + use_cache=(use_cache and self.cur_retry == 0), + include_remote_rag=True + ) # --- Validation --- yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip() @@ -344,7 +364,18 @@ def exec(self, prep_res): Now, provide the YAML output: """ - response = call_llm(prompt, use_cache=(use_cache and self.cur_retry == 0)) # Use cache only if enabled and not retrying + # Use already gathered context from prep (relevant_files_content_map has the files) + # But we can add a focused snippet for relationship analysis + from utils.call_llm import call_llm_with_context + + # Context is already built in prep via file_context_str + # We'll pass it as additional context + response = call_llm_with_context( + prompt=prompt, + context="", # Context already in prompt, no need to duplicate + use_cache=(use_cache and self.cur_retry == 0), + include_remote_rag=True + ) # --- Validation --- yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip() @@ -723,7 +754,18 @@ def exec(self, item): Now, directly provide a super beginner-friendly Markdown output (DON'T need ```markdown``` tags): """ - chapter_content = call_llm(prompt, use_cache=(use_cache and self.cur_retry == 0)) # Use cache only if enabled and not retrying + from utils.call_llm import call_llm_with_context + + # Context from related files is already in file_context_str + # Pass it explicitly as RAG context for better separation + response = call_llm_with_context( + prompt=prompt, + context=file_context_str if file_context_str else "", + use_cache=(use_cache and self.cur_retry == 0), + include_remote_rag=True + ) + + chapter_content = response # Basic validation/cleanup actual_heading = f"# Chapter {chapter_num}: {abstraction_name}" # Use potentially translated name if not chapter_content.strip().startswith(f"# Chapter {chapter_num}"): diff --git a/nodes.py.backup b/nodes.py.backup new file mode 100644 index 00000000..0e3fa587 --- /dev/null +++ b/nodes.py.backup @@ -0,0 +1,880 @@ +import os +import re +import yaml +from pocketflow import Node, BatchNode +from utils.crawl_github_files import crawl_github_files +from utils.call_llm import call_llm +from utils.crawl_local_files import crawl_local_files + + +# Helper to get content for specific file indices +def get_content_for_indices(files_data, indices): + content_map = {} + for i in indices: + if 0 <= i < len(files_data): + path, content = files_data[i] + content_map[f"{i} # {path}"] = ( + content # Use index + path as key for context + ) + return content_map + + +class FetchRepo(Node): + def prep(self, shared): + repo_url = shared.get("repo_url") + local_dir = shared.get("local_dir") + project_name = shared.get("project_name") + + if not project_name: + # Basic name derivation from URL or directory + if repo_url: + project_name = repo_url.split("/")[-1].replace(".git", "") + else: + project_name = os.path.basename(os.path.abspath(local_dir)) + 
shared["project_name"] = project_name + + # Get file patterns directly from shared + include_patterns = shared["include_patterns"] + exclude_patterns = shared["exclude_patterns"] + max_file_size = shared["max_file_size"] + + return { + "repo_url": repo_url, + "local_dir": local_dir, + "token": shared.get("github_token"), + "include_patterns": include_patterns, + "exclude_patterns": exclude_patterns, + "max_file_size": max_file_size, + "use_relative_paths": True, + } + + def exec(self, prep_res): + if prep_res["repo_url"]: + print(f"Crawling repository: {prep_res['repo_url']}...") + result = crawl_github_files( + repo_url=prep_res["repo_url"], + token=prep_res["token"], + include_patterns=prep_res["include_patterns"], + exclude_patterns=prep_res["exclude_patterns"], + max_file_size=prep_res["max_file_size"], + use_relative_paths=prep_res["use_relative_paths"], + ) + else: + print(f"Crawling directory: {prep_res['local_dir']}...") + + result = crawl_local_files( + directory=prep_res["local_dir"], + include_patterns=prep_res["include_patterns"], + exclude_patterns=prep_res["exclude_patterns"], + max_file_size=prep_res["max_file_size"], + use_relative_paths=prep_res["use_relative_paths"] + ) + + # Convert dict to list of tuples: [(path, content), ...] + files_list = list(result.get("files", {}).items()) + if len(files_list) == 0: + raise (ValueError("Failed to fetch files")) + print(f"Fetched {len(files_list)} files.") + return files_list + + def post(self, shared, prep_res, exec_res): + shared["files"] = exec_res # List of (path, content) tuples + + +class IdentifyAbstractions(Node): + def prep(self, shared): + files_data = shared["files"] + project_name = shared["project_name"] # Get project name + language = shared.get("language", "english") # Get language + use_cache = shared.get("use_cache", True) # Get use_cache flag, default to True + max_abstraction_num = shared.get("max_abstraction_num", 10) # Get max_abstraction_num, default to 10 + + # Helper to create context from files, respecting limits (basic example) + def create_llm_context(files_data): + context = "" + file_info = [] # Store tuples of (index, path) + for i, (path, content) in enumerate(files_data): + entry = f"--- File Index {i}: {path} ---\n{content}\n\n" + context += entry + file_info.append((i, path)) + + return context, file_info # file_info is list of (index, path) + + context, file_info = create_llm_context(files_data) + # Format file info for the prompt (comment is just a hint for LLM) + file_listing_for_prompt = "\n".join( + [f"- {idx} # {path}" for idx, path in file_info] + ) + return ( + context, + file_listing_for_prompt, + len(files_data), + project_name, + language, + use_cache, + max_abstraction_num, + ) # Return all parameters + + def exec(self, prep_res): + ( + context, + file_listing_for_prompt, + file_count, + project_name, + language, + use_cache, + max_abstraction_num, + ) = prep_res # Unpack all parameters + print(f"Identifying abstractions using LLM...") + + # Add language instruction and hints only if not English + language_instruction = "" + name_lang_hint = "" + desc_lang_hint = "" + if language.lower() != "english": + language_instruction = f"IMPORTANT: Generate the `name` and `description` for each abstraction in **{language.capitalize()}** language. 
Do NOT use English for these fields.\n\n" + # Keep specific hints here as name/description are primary targets + name_lang_hint = f" (value in {language.capitalize()})" + desc_lang_hint = f" (value in {language.capitalize()})" + + prompt = f""" +For the project `{project_name}`: + +Codebase Context: +{context} + +{language_instruction}Analyze the codebase context. +Identify the top 5-{max_abstraction_num} core most important abstractions to help those new to the codebase. + +For each abstraction, provide: +1. A concise `name`{name_lang_hint}. +2. A beginner-friendly `description` explaining what it is with a simple analogy, in around 100 words{desc_lang_hint}. +3. A list of relevant `file_indices` (integers) using the format `idx # path/comment`. + +List of file indices and paths present in the context: +{file_listing_for_prompt} + +Format the output as a YAML list of dictionaries: + +```yaml +- name: | + Query Processing{name_lang_hint} + description: | + Explains what the abstraction does. + It's like a central dispatcher routing requests.{desc_lang_hint} + file_indices: + - 0 # path/to/file1.py + - 3 # path/to/related.py +- name: | + Query Optimization{name_lang_hint} + description: | + Another core concept, similar to a blueprint for objects.{desc_lang_hint} + file_indices: + - 5 # path/to/another.js +# ... up to {max_abstraction_num} abstractions +```""" + response = call_llm(prompt, use_cache=(use_cache and self.cur_retry == 0)) # Use cache only if enabled and not retrying + + # --- Validation --- + yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip() + abstractions = yaml.safe_load(yaml_str) + + if not isinstance(abstractions, list): + raise ValueError("LLM Output is not a list") + + validated_abstractions = [] + for item in abstractions: + if not isinstance(item, dict) or not all( + k in item for k in ["name", "description", "file_indices"] + ): + raise ValueError(f"Missing keys in abstraction item: {item}") + if not isinstance(item["name"], str): + raise ValueError(f"Name is not a string in item: {item}") + if not isinstance(item["description"], str): + raise ValueError(f"Description is not a string in item: {item}") + if not isinstance(item["file_indices"], list): + raise ValueError(f"file_indices is not a list in item: {item}") + + # Validate indices + validated_indices = [] + for idx_entry in item["file_indices"]: + try: + if isinstance(idx_entry, int): + idx = idx_entry + elif isinstance(idx_entry, str) and "#" in idx_entry: + idx = int(idx_entry.split("#")[0].strip()) + else: + idx = int(str(idx_entry).strip()) + + if not (0 <= idx < file_count): + raise ValueError( + f"Invalid file index {idx} found in item {item['name']}. Max index is {file_count - 1}." 
+ ) + validated_indices.append(idx) + except (ValueError, TypeError): + raise ValueError( + f"Could not parse index from entry: {idx_entry} in item {item['name']}" + ) + + item["files"] = sorted(list(set(validated_indices))) + # Store only the required fields + validated_abstractions.append( + { + "name": item["name"], # Potentially translated name + "description": item[ + "description" + ], # Potentially translated description + "files": item["files"], + } + ) + + print(f"Identified {len(validated_abstractions)} abstractions.") + return validated_abstractions + + def post(self, shared, prep_res, exec_res): + shared["abstractions"] = ( + exec_res # List of {"name": str, "description": str, "files": [int]} + ) + + +class AnalyzeRelationships(Node): + def prep(self, shared): + abstractions = shared[ + "abstractions" + ] # Now contains 'files' list of indices, name/description potentially translated + files_data = shared["files"] + project_name = shared["project_name"] # Get project name + language = shared.get("language", "english") # Get language + use_cache = shared.get("use_cache", True) # Get use_cache flag, default to True + + # Get the actual number of abstractions directly + num_abstractions = len(abstractions) + + # Create context with abstraction names, indices, descriptions, and relevant file snippets + context = "Identified Abstractions:\\n" + all_relevant_indices = set() + abstraction_info_for_prompt = [] + for i, abstr in enumerate(abstractions): + # Use 'files' which contains indices directly + file_indices_str = ", ".join(map(str, abstr["files"])) + # Abstraction name and description might be translated already + info_line = f"- Index {i}: {abstr['name']} (Relevant file indices: [{file_indices_str}])\\n Description: {abstr['description']}" + context += info_line + "\\n" + abstraction_info_for_prompt.append( + f"{i} # {abstr['name']}" + ) # Use potentially translated name here too + all_relevant_indices.update(abstr["files"]) + + context += "\\nRelevant File Snippets (Referenced by Index and Path):\\n" + # Get content for relevant files using helper + relevant_files_content_map = get_content_for_indices( + files_data, sorted(list(all_relevant_indices)) + ) + # Format file content for context + file_context_str = "\\n\\n".join( + f"--- File: {idx_path} ---\\n{content}" + for idx_path, content in relevant_files_content_map.items() + ) + context += file_context_str + + return ( + context, + "\n".join(abstraction_info_for_prompt), + num_abstractions, # Pass the actual count + project_name, + language, + use_cache, + ) # Return use_cache + + def exec(self, prep_res): + ( + context, + abstraction_listing, + num_abstractions, # Receive the actual count + project_name, + language, + use_cache, + ) = prep_res # Unpack use_cache + print(f"Analyzing relationships using LLM...") + + # Add language instruction and hints only if not English + language_instruction = "" + lang_hint = "" + list_lang_note = "" + if language.lower() != "english": + language_instruction = f"IMPORTANT: Generate the `summary` and relationship `label` fields in **{language.capitalize()}** language. 
Do NOT use English for these fields.\n\n" + lang_hint = f" (in {language.capitalize()})" + list_lang_note = f" (Names might be in {language.capitalize()})" # Note for the input list + + prompt = f""" +Based on the following abstractions and relevant code snippets from the project `{project_name}`: + +List of Abstraction Indices and Names{list_lang_note}: +{abstraction_listing} + +Context (Abstractions, Descriptions, Code): +{context} + +{language_instruction}Please provide: +1. A high-level `summary` of the project's main purpose and functionality in a few beginner-friendly sentences{lang_hint}. Use markdown formatting with **bold** and *italic* text to highlight important concepts. +2. A list (`relationships`) describing the key interactions between these abstractions. For each relationship, specify: + - `from_abstraction`: Index of the source abstraction (e.g., `0 # AbstractionName1`) + - `to_abstraction`: Index of the target abstraction (e.g., `1 # AbstractionName2`) + - `label`: A brief label for the interaction **in just a few words**{lang_hint} (e.g., "Manages", "Inherits", "Uses"). + Ideally the relationship should be backed by one abstraction calling or passing parameters to another. + Simplify the relationship and exclude those non-important ones. + +IMPORTANT: Make sure EVERY abstraction is involved in at least ONE relationship (either as source or target). Each abstraction index must appear at least once across all relationships. + +Format the output as YAML: + +```yaml +summary: | + A brief, simple explanation of the project{lang_hint}. + Can span multiple lines with **bold** and *italic* for emphasis. +relationships: + - from_abstraction: 0 # AbstractionName1 + to_abstraction: 1 # AbstractionName2 + label: "Manages"{lang_hint} + - from_abstraction: 2 # AbstractionName3 + to_abstraction: 0 # AbstractionName1 + label: "Provides config"{lang_hint} + # ... other relationships +``` + +Now, provide the YAML output: +""" + response = call_llm(prompt, use_cache=(use_cache and self.cur_retry == 0)) # Use cache only if enabled and not retrying + + # --- Validation --- + yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip() + relationships_data = yaml.safe_load(yaml_str) + + if not isinstance(relationships_data, dict) or not all( + k in relationships_data for k in ["summary", "relationships"] + ): + raise ValueError( + "LLM output is not a dict or missing keys ('summary', 'relationships')" + ) + if not isinstance(relationships_data["summary"], str): + raise ValueError("summary is not a string") + if not isinstance(relationships_data["relationships"], list): + raise ValueError("relationships is not a list") + + # Validate relationships structure + validated_relationships = [] + for rel in relationships_data["relationships"]: + # Check for 'label' key + if not isinstance(rel, dict) or not all( + k in rel for k in ["from_abstraction", "to_abstraction", "label"] + ): + raise ValueError( + f"Missing keys (expected from_abstraction, to_abstraction, label) in relationship item: {rel}" + ) + # Validate 'label' is a string + if not isinstance(rel["label"], str): + raise ValueError(f"Relationship label is not a string: {rel}") + + # Validate indices + try: + from_idx = int(str(rel["from_abstraction"]).split("#")[0].strip()) + to_idx = int(str(rel["to_abstraction"]).split("#")[0].strip()) + if not ( + 0 <= from_idx < num_abstractions and 0 <= to_idx < num_abstractions + ): + raise ValueError( + f"Invalid index in relationship: from={from_idx}, to={to_idx}. 
Max index is {num_abstractions-1}." + ) + validated_relationships.append( + { + "from": from_idx, + "to": to_idx, + "label": rel["label"], # Potentially translated label + } + ) + except (ValueError, TypeError): + raise ValueError(f"Could not parse indices from relationship: {rel}") + + print("Generated project summary and relationship details.") + return { + "summary": relationships_data["summary"], # Potentially translated summary + "details": validated_relationships, # Store validated, index-based relationships with potentially translated labels + } + + def post(self, shared, prep_res, exec_res): + # Structure is now {"summary": str, "details": [{"from": int, "to": int, "label": str}]} + # Summary and label might be translated + shared["relationships"] = exec_res + + +class OrderChapters(Node): + def prep(self, shared): + abstractions = shared["abstractions"] # Name/description might be translated + relationships = shared["relationships"] # Summary/label might be translated + project_name = shared["project_name"] # Get project name + language = shared.get("language", "english") # Get language + use_cache = shared.get("use_cache", True) # Get use_cache flag, default to True + + # Prepare context for the LLM + abstraction_info_for_prompt = [] + for i, a in enumerate(abstractions): + abstraction_info_for_prompt.append( + f"- {i} # {a['name']}" + ) # Use potentially translated name + abstraction_listing = "\n".join(abstraction_info_for_prompt) + + # Use potentially translated summary and labels + summary_note = "" + if language.lower() != "english": + summary_note = ( + f" (Note: Project Summary might be in {language.capitalize()})" + ) + + context = f"Project Summary{summary_note}:\n{relationships['summary']}\n\n" + context += "Relationships (Indices refer to abstractions above):\n" + for rel in relationships["details"]: + from_name = abstractions[rel["from"]]["name"] + to_name = abstractions[rel["to"]]["name"] + # Use potentially translated 'label' + context += f"- From {rel['from']} ({from_name}) to {rel['to']} ({to_name}): {rel['label']}\n" # Label might be translated + + list_lang_note = "" + if language.lower() != "english": + list_lang_note = f" (Names might be in {language.capitalize()})" + + return ( + abstraction_listing, + context, + len(abstractions), + project_name, + list_lang_note, + use_cache, + ) # Return use_cache + + def exec(self, prep_res): + ( + abstraction_listing, + context, + num_abstractions, + project_name, + list_lang_note, + use_cache, + ) = prep_res # Unpack use_cache + print("Determining chapter order using LLM...") + # No language variation needed here in prompt instructions, just ordering based on structure + # The input names might be translated, hence the note. + prompt = f""" +Given the following project abstractions and their relationships for the project ```` {project_name} ````: + +Abstractions (Index # Name){list_lang_note}: +{abstraction_listing} + +Context about relationships and project summary: +{context} + +If you are going to make a tutorial for ```` {project_name} ````, what is the best order to explain these abstractions, from first to last? +Ideally, first explain those that are the most important or foundational, perhaps user-facing concepts or entry points. Then move to more detailed, lower-level implementation details or supporting concepts. + +Output the ordered list of abstraction indices, including the name in a comment for clarity. Use the format `idx # AbstractionName`. 
+ +```yaml +- 2 # FoundationalConcept +- 0 # CoreClassA +- 1 # CoreClassB (uses CoreClassA) +- ... +``` + +Now, provide the YAML output: +""" + response = call_llm(prompt, use_cache=(use_cache and self.cur_retry == 0)) # Use cache only if enabled and not retrying + + # --- Validation --- + yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip() + ordered_indices_raw = yaml.safe_load(yaml_str) + + if not isinstance(ordered_indices_raw, list): + raise ValueError("LLM output is not a list") + + ordered_indices = [] + seen_indices = set() + for entry in ordered_indices_raw: + try: + if isinstance(entry, int): + idx = entry + elif isinstance(entry, str) and "#" in entry: + idx = int(entry.split("#")[0].strip()) + else: + idx = int(str(entry).strip()) + + if not (0 <= idx < num_abstractions): + raise ValueError( + f"Invalid index {idx} in ordered list. Max index is {num_abstractions-1}." + ) + if idx in seen_indices: + raise ValueError(f"Duplicate index {idx} found in ordered list.") + ordered_indices.append(idx) + seen_indices.add(idx) + + except (ValueError, TypeError): + raise ValueError( + f"Could not parse index from ordered list entry: {entry}" + ) + + # Check if all abstractions are included + if len(ordered_indices) != num_abstractions: + raise ValueError( + f"Ordered list length ({len(ordered_indices)}) does not match number of abstractions ({num_abstractions}). Missing indices: {set(range(num_abstractions)) - seen_indices}" + ) + + print(f"Determined chapter order (indices): {ordered_indices}") + return ordered_indices # Return the list of indices + + def post(self, shared, prep_res, exec_res): + # exec_res is already the list of ordered indices + shared["chapter_order"] = exec_res # List of indices + + +class WriteChapters(BatchNode): + def prep(self, shared): + chapter_order = shared["chapter_order"] # List of indices + abstractions = shared[ + "abstractions" + ] # List of {"name": str, "description": str, "files": [int]} + files_data = shared["files"] # List of (path, content) tuples + project_name = shared["project_name"] + language = shared.get("language", "english") + use_cache = shared.get("use_cache", True) # Get use_cache flag, default to True + + # Get already written chapters to provide context + # We store them temporarily during the batch run, not in shared memory yet + # The 'previous_chapters_summary' will be built progressively in the exec context + self.chapters_written_so_far = ( + [] + ) # Use instance variable for temporary storage across exec calls + + # Create a complete list of all chapters + all_chapters = [] + chapter_filenames = {} # Store chapter filename mapping for linking + for i, abstraction_index in enumerate(chapter_order): + if 0 <= abstraction_index < len(abstractions): + chapter_num = i + 1 + chapter_name = abstractions[abstraction_index][ + "name" + ] # Potentially translated name + # Create safe filename (from potentially translated name) + safe_name = "".join( + c if c.isalnum() else "_" for c in chapter_name + ).lower() + filename = f"{i+1:02d}_{safe_name}.md" + # Format with link (using potentially translated name) + all_chapters.append(f"{chapter_num}. 
[{chapter_name}]({filename})") + # Store mapping of chapter index to filename for linking + chapter_filenames[abstraction_index] = { + "num": chapter_num, + "name": chapter_name, + "filename": filename, + } + + # Create a formatted string with all chapters + full_chapter_listing = "\n".join(all_chapters) + + items_to_process = [] + for i, abstraction_index in enumerate(chapter_order): + if 0 <= abstraction_index < len(abstractions): + abstraction_details = abstractions[ + abstraction_index + ] # Contains potentially translated name/desc + # Use 'files' (list of indices) directly + related_file_indices = abstraction_details.get("files", []) + # Get content using helper, passing indices + related_files_content_map = get_content_for_indices( + files_data, related_file_indices + ) + + # Get previous chapter info for transitions (uses potentially translated name) + prev_chapter = None + if i > 0: + prev_idx = chapter_order[i - 1] + prev_chapter = chapter_filenames[prev_idx] + + # Get next chapter info for transitions (uses potentially translated name) + next_chapter = None + if i < len(chapter_order) - 1: + next_idx = chapter_order[i + 1] + next_chapter = chapter_filenames[next_idx] + + items_to_process.append( + { + "chapter_num": i + 1, + "abstraction_index": abstraction_index, + "abstraction_details": abstraction_details, # Has potentially translated name/desc + "related_files_content_map": related_files_content_map, + "project_name": shared["project_name"], # Add project name + "full_chapter_listing": full_chapter_listing, # Add the full chapter listing (uses potentially translated names) + "chapter_filenames": chapter_filenames, # Add chapter filenames mapping (uses potentially translated names) + "prev_chapter": prev_chapter, # Add previous chapter info (uses potentially translated name) + "next_chapter": next_chapter, # Add next chapter info (uses potentially translated name) + "language": language, # Add language for multi-language support + "use_cache": use_cache, # Pass use_cache flag + # previous_chapters_summary will be added dynamically in exec + } + ) + else: + print( + f"Warning: Invalid abstraction index {abstraction_index} in chapter_order. Skipping." 
+ ) + + print(f"Preparing to write {len(items_to_process)} chapters...") + return items_to_process # Iterable for BatchNode + + def exec(self, item): + # This runs for each item prepared above + abstraction_name = item["abstraction_details"][ + "name" + ] # Potentially translated name + abstraction_description = item["abstraction_details"][ + "description" + ] # Potentially translated description + chapter_num = item["chapter_num"] + project_name = item.get("project_name") + language = item.get("language", "english") + use_cache = item.get("use_cache", True) # Read use_cache from item + print(f"Writing chapter {chapter_num} for: {abstraction_name} using LLM...") + + # Prepare file context string from the map + file_context_str = "\n\n".join( + f"--- File: {idx_path.split('# ')[1] if '# ' in idx_path else idx_path} ---\n{content}" + for idx_path, content in item["related_files_content_map"].items() + ) + + # Get summary of chapters written *before* this one + # Use the temporary instance variable + previous_chapters_summary = "\n---\n".join(self.chapters_written_so_far) + + # Add language instruction and context notes only if not English + language_instruction = "" + concept_details_note = "" + structure_note = "" + prev_summary_note = "" + instruction_lang_note = "" + mermaid_lang_note = "" + code_comment_note = "" + link_lang_note = "" + tone_note = "" + if language.lower() != "english": + lang_cap = language.capitalize() + language_instruction = f"IMPORTANT: Write this ENTIRE tutorial chapter in **{lang_cap}**. Some input context (like concept name, description, chapter list, previous summary) might already be in {lang_cap}, but you MUST translate ALL other generated content including explanations, examples, technical terms, and potentially code comments into {lang_cap}. DO NOT use English anywhere except in code syntax, required proper nouns, or when specified. The entire output MUST be in {lang_cap}.\n\n" + concept_details_note = f" (Note: Provided in {lang_cap})" + structure_note = f" (Note: Chapter names might be in {lang_cap})" + prev_summary_note = f" (Note: This summary might be in {lang_cap})" + instruction_lang_note = f" (in {lang_cap})" + mermaid_lang_note = f" (Use {lang_cap} for labels/text if appropriate)" + code_comment_note = f" (Translate to {lang_cap} if possible, otherwise keep minimal English for clarity)" + link_lang_note = ( + f" (Use the {lang_cap} chapter title from the structure above)" + ) + tone_note = f" (appropriate for {lang_cap} readers)" + + prompt = f""" +{language_instruction}Write a very beginner-friendly tutorial chapter (in Markdown format) for the project `{project_name}` about the concept: "{abstraction_name}". This is Chapter {chapter_num}. + +Concept Details{concept_details_note}: +- Name: {abstraction_name} +- Description: +{abstraction_description} + +Complete Tutorial Structure{structure_note}: +{item["full_chapter_listing"]} + +Context from previous chapters{prev_summary_note}: +{previous_chapters_summary if previous_chapters_summary else "This is the first chapter."} + +Relevant Code Snippets (Code itself remains unchanged): +{file_context_str if file_context_str else "No specific code snippets provided for this abstraction."} + +Instructions for the chapter (Generate content in {language.capitalize()} unless specified otherwise): +- Start with a clear heading (e.g., `# Chapter {chapter_num}: {abstraction_name}`). Use the provided concept name. 
+
+- If this is not the first chapter, begin with a brief transition from the previous chapter{instruction_lang_note}, referencing it with a proper Markdown link using its name{link_lang_note}.
+
+- Begin with a high-level motivation explaining what problem this abstraction solves{instruction_lang_note}. Start with a central use case as a concrete example. The whole chapter should guide the reader to understand how to solve this use case. Make it very minimal and friendly to beginners.
+
+- If the abstraction is complex, break it down into key concepts. Explain each concept one-by-one in a very beginner-friendly way{instruction_lang_note}.
+
+- Explain how to use this abstraction to solve the use case{instruction_lang_note}. Give example inputs and outputs for code snippets (if the output isn't values, describe at a high level what will happen{instruction_lang_note}).
+
+- Each code block should be BELOW 10 lines! If longer code blocks are needed, break them down into smaller pieces and walk through them one-by-one. Aggressively simplify the code to make it minimal. Use comments{code_comment_note} to skip non-important implementation details. Each code block should have a beginner-friendly explanation right after it{instruction_lang_note}.
+
+- Describe the internal implementation to help understand what's under the hood{instruction_lang_note}. First provide a non-code or code-light walkthrough on what happens step-by-step when the abstraction is called{instruction_lang_note}. It's recommended to use a simple sequenceDiagram with a dummy example - keep it minimal with at most 5 participants to ensure clarity. If participant name has space, use: `participant QP as Query Processing`. {mermaid_lang_note}.
+
+- Then dive deeper into code for the internal implementation with references to files. Provide example code blocks, but make them similarly simple and beginner-friendly. Explain{instruction_lang_note}.
+
+- IMPORTANT: When you need to refer to other core abstractions covered in other chapters, ALWAYS use proper Markdown links like this: [Chapter Title](filename.md). Use the Complete Tutorial Structure above to find the correct filename and the chapter title{link_lang_note}. Translate the surrounding text.
+
+- Use mermaid diagrams to illustrate complex concepts (```mermaid``` format). {mermaid_lang_note}.
+
+- Heavily use analogies and examples throughout{instruction_lang_note} to help beginners understand.
+
+- End the chapter with a brief conclusion that summarizes what was learned{instruction_lang_note} and provides a transition to the next chapter{instruction_lang_note}. If there is a next chapter, use a proper Markdown link: [Next Chapter Title](next_chapter_filename){link_lang_note}.
+
+- Ensure the tone is welcoming and easy for a newcomer to understand{tone_note}.
+
+- Output *only* the Markdown content for this chapter.
+ +Now, directly provide a super beginner-friendly Markdown output (DON'T need ```markdown``` tags): +""" + chapter_content = call_llm(prompt, use_cache=(use_cache and self.cur_retry == 0)) # Use cache only if enabled and not retrying + # Basic validation/cleanup + actual_heading = f"# Chapter {chapter_num}: {abstraction_name}" # Use potentially translated name + if not chapter_content.strip().startswith(f"# Chapter {chapter_num}"): + # Add heading if missing or incorrect, trying to preserve content + lines = chapter_content.strip().split("\n") + if lines and lines[0].strip().startswith( + "#" + ): # If there's some heading, replace it + lines[0] = actual_heading + chapter_content = "\n".join(lines) + else: # Otherwise, prepend it + chapter_content = f"{actual_heading}\n\n{chapter_content}" + + # Add the generated content to our temporary list for the next iteration's context + self.chapters_written_so_far.append(chapter_content) + + return chapter_content # Return the Markdown string (potentially translated) + + def post(self, shared, prep_res, exec_res_list): + # exec_res_list contains the generated Markdown for each chapter, in order + shared["chapters"] = exec_res_list + # Clean up the temporary instance variable + del self.chapters_written_so_far + print(f"Finished writing {len(exec_res_list)} chapters.") + + +class CombineTutorial(Node): + def prep(self, shared): + project_name = shared["project_name"] + output_base_dir = shared.get("output_dir", "output") # Default output dir + output_path = os.path.join(output_base_dir, project_name) + repo_url = shared.get("repo_url") # Get the repository URL + # language = shared.get("language", "english") # No longer needed for fixed strings + + # Get potentially translated data + relationships_data = shared[ + "relationships" + ] # {"summary": str, "details": [{"from": int, "to": int, "label": str}]} -> summary/label potentially translated + chapter_order = shared["chapter_order"] # indices + abstractions = shared[ + "abstractions" + ] # list of dicts -> name/description potentially translated + chapters_content = shared[ + "chapters" + ] # list of strings -> content potentially translated + + # --- Generate Mermaid Diagram --- + mermaid_lines = ["flowchart TD"] + # Add nodes for each abstraction using potentially translated names + for i, abstr in enumerate(abstractions): + node_id = f"A{i}" + # Use potentially translated name, sanitize for Mermaid ID and label + sanitized_name = abstr["name"].replace('"', "") + node_label = sanitized_name # Using sanitized name only + mermaid_lines.append( + f' {node_id}["{node_label}"]' + ) # Node label uses potentially translated name + # Add edges for relationships using potentially translated labels + for rel in relationships_data["details"]: + from_node_id = f"A{rel['from']}" + to_node_id = f"A{rel['to']}" + # Use potentially translated label, sanitize + edge_label = ( + rel["label"].replace('"', "").replace("\n", " ") + ) # Basic sanitization + max_label_len = 30 + if len(edge_label) > max_label_len: + edge_label = edge_label[: max_label_len - 3] + "..." 
+ mermaid_lines.append( + f' {from_node_id} -- "{edge_label}" --> {to_node_id}' + ) # Edge label uses potentially translated label + + mermaid_diagram = "\n".join(mermaid_lines) + # --- End Mermaid --- + + # --- Prepare index.md content --- + index_content = f"# Tutorial: {project_name}\n\n" + index_content += f"{relationships_data['summary']}\n\n" # Use the potentially translated summary directly + # Keep fixed strings in English + index_content += f"**Source Repository:** [{repo_url}]({repo_url})\n\n" + + # Add Mermaid diagram for relationships (diagram itself uses potentially translated names/labels) + index_content += "```mermaid\n" + index_content += mermaid_diagram + "\n" + index_content += "```\n\n" + + # Keep fixed strings in English + index_content += f"## Chapters\n\n" + + chapter_files = [] + # Generate chapter links based on the determined order, using potentially translated names + for i, abstraction_index in enumerate(chapter_order): + # Ensure index is valid and we have content for it + if 0 <= abstraction_index < len(abstractions) and i < len(chapters_content): + abstraction_name = abstractions[abstraction_index][ + "name" + ] # Potentially translated name + # Sanitize potentially translated name for filename + safe_name = "".join( + c if c.isalnum() else "_" for c in abstraction_name + ).lower() + filename = f"{i+1:02d}_{safe_name}.md" + index_content += f"{i+1}. [{abstraction_name}]({filename})\n" # Use potentially translated name in link text + + # Add attribution to chapter content (using English fixed string) + chapter_content = chapters_content[i] # Potentially translated content + if not chapter_content.endswith("\n\n"): + chapter_content += "\n\n" + # Keep fixed strings in English + chapter_content += f"---\n\nGenerated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)" + + # Store filename and corresponding content + chapter_files.append({"filename": filename, "content": chapter_content}) + else: + print( + f"Warning: Mismatch between chapter order, abstractions, or content at index {i} (abstraction index {abstraction_index}). Skipping file generation for this entry." + ) + + # Add attribution to index content (using English fixed string) + index_content += f"\n\n---\n\nGenerated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)" + + return { + "output_path": output_path, + "index_content": index_content, + "chapter_files": chapter_files, # List of {"filename": str, "content": str} + } + + def exec(self, prep_res): + output_path = prep_res["output_path"] + index_content = prep_res["index_content"] + chapter_files = prep_res["chapter_files"] + + print(f"Combining tutorial into directory: {output_path}") + # Rely on Node's built-in retry/fallback + os.makedirs(output_path, exist_ok=True) + + # Write index.md + index_filepath = os.path.join(output_path, "index.md") + with open(index_filepath, "w", encoding="utf-8") as f: + f.write(index_content) + print(f" - Wrote {index_filepath}") + + # Write chapter files + for chapter_info in chapter_files: + chapter_filepath = os.path.join(output_path, chapter_info["filename"]) + with open(chapter_filepath, "w", encoding="utf-8") as f: + f.write(chapter_info["content"]) + print(f" - Wrote {chapter_filepath}") + + return output_path # Return the final path + + def post(self, shared, prep_res, exec_res): + shared["final_output_dir"] = exec_res # Store the output path + print(f"\nTutorial generation complete! 
Files are in: {exec_res}") diff --git a/nodes.py.backup2 b/nodes.py.backup2 new file mode 100644 index 00000000..91a9adac --- /dev/null +++ b/nodes.py.backup2 @@ -0,0 +1,919 @@ +import os +import re +import yaml +from pocketflow import Node, BatchNode +from utils.crawl_github_files import crawl_github_files +from utils.call_llm import call_llm +from utils.crawl_local_files import crawl_local_files + + +# Helper to get content for specific file indices +def get_content_for_indices(files_data, indices): + content_map = {} + for i in indices: + if 0 <= i < len(files_data): + path, content = files_data[i] + content_map[f"{i} # {path}"] = ( + content # Use index + path as key for context + ) + return content_map + + +class FetchRepo(Node): + def prep(self, shared): + repo_url = shared.get("repo_url") + local_dir = shared.get("local_dir") + project_name = shared.get("project_name") + + if not project_name: + # Basic name derivation from URL or directory + if repo_url: + project_name = repo_url.split("/")[-1].replace(".git", "") + else: + project_name = os.path.basename(os.path.abspath(local_dir)) + shared["project_name"] = project_name + + # Get file patterns directly from shared + include_patterns = shared["include_patterns"] + exclude_patterns = shared["exclude_patterns"] + max_file_size = shared["max_file_size"] + + return { + "repo_url": repo_url, + "local_dir": local_dir, + "token": shared.get("github_token"), + "include_patterns": include_patterns, + "exclude_patterns": exclude_patterns, + "max_file_size": max_file_size, + "use_relative_paths": True, + } + + def exec(self, prep_res): + if prep_res["repo_url"]: + print(f"Crawling repository: {prep_res['repo_url']}...") + result = crawl_github_files( + repo_url=prep_res["repo_url"], + token=prep_res["token"], + include_patterns=prep_res["include_patterns"], + exclude_patterns=prep_res["exclude_patterns"], + max_file_size=prep_res["max_file_size"], + use_relative_paths=prep_res["use_relative_paths"], + ) + else: + print(f"Crawling directory: {prep_res['local_dir']}...") + + result = crawl_local_files( + directory=prep_res["local_dir"], + include_patterns=prep_res["include_patterns"], + exclude_patterns=prep_res["exclude_patterns"], + max_file_size=prep_res["max_file_size"], + use_relative_paths=prep_res["use_relative_paths"] + ) + + # Convert dict to list of tuples: [(path, content), ...] 
+ files_list = list(result.get("files", {}).items()) + if len(files_list) == 0: + raise (ValueError("Failed to fetch files")) + print(f"Fetched {len(files_list)} files.") + return files_list + + def post(self, shared, prep_res, exec_res): + shared["files"] = exec_res # List of (path, content) tuples + + +class IdentifyAbstractions(Node): + def prep(self, shared): + files_data = shared["files"] + project_name = shared["project_name"] # Get project name + language = shared.get("language", "english") # Get language + use_cache = shared.get("use_cache", True) # Get use_cache flag, default to True + max_abstraction_num = shared.get("max_abstraction_num", 10) # Get max_abstraction_num, default to 10 + + # Helper to create context from files, respecting limits (basic example) + def create_llm_context(files_data): + context = "" + file_info = [] # Store tuples of (index, path) + for i, (path, content) in enumerate(files_data): + entry = f"--- File Index {i}: {path} ---\n{content}\n\n" + context += entry + file_info.append((i, path)) + + return context, file_info # file_info is list of (index, path) + + context, file_info = create_llm_context(files_data) + # Format file info for the prompt (comment is just a hint for LLM) + file_listing_for_prompt = "\n".join( + [f"- {idx} # {path}" for idx, path in file_info] + ) + return ( + context, + file_listing_for_prompt, + len(files_data), + project_name, + language, + use_cache, + max_abstraction_num, + files_data, + ) # Return all parameters + + def exec(self, prep_res): + ( + context, + file_listing_for_prompt, + file_count, + project_name, + language, + use_cache, + max_abstraction_num, + files_data, + ) = prep_res # Unpack all parameters + print(f"Identifying abstractions using LLM...") + + # Add language instruction and hints only if not English + language_instruction = "" + name_lang_hint = "" + desc_lang_hint = "" + if language.lower() != "english": + language_instruction = f"IMPORTANT: Generate the `name` and `description` for each abstraction in **{language.capitalize()}** language. Do NOT use English for these fields.\n\n" + # Keep specific hints here as name/description are primary targets + name_lang_hint = f" (value in {language.capitalize()})" + desc_lang_hint = f" (value in {language.capitalize()})" + + prompt = f""" +For the project `{project_name}`: + +Codebase Context: +{context} + +{language_instruction}Analyze the codebase context. +Identify the top 5-{max_abstraction_num} core most important abstractions to help those new to the codebase. + +For each abstraction, provide: +1. A concise `name`{name_lang_hint}. +2. A beginner-friendly `description` explaining what it is with a simple analogy, in around 100 words{desc_lang_hint}. +3. A list of relevant `file_indices` (integers) using the format `idx # path/comment`. + +List of file indices and paths present in the context: +{file_listing_for_prompt} + +Format the output as a YAML list of dictionaries: + +```yaml +- name: | + Query Processing{name_lang_hint} + description: | + Explains what the abstraction does. + It's like a central dispatcher routing requests.{desc_lang_hint} + file_indices: + - 0 # path/to/file1.py + - 3 # path/to/related.py +- name: | + Query Optimization{name_lang_hint} + description: | + Another core concept, similar to a blueprint for objects.{desc_lang_hint} + file_indices: + - 5 # path/to/another.js +# ... 
up to {max_abstraction_num} abstractions +```""" + # Build context from most important files (first 5 files by size/relevance) + context_files = sorted( + [(path, content) for path, content in files_data], + key=lambda x: len(x[1]), + reverse=True + )[:5] + + rag_context = "\n\n".join([ + f"=== {path} ===\n{content[:2000]}" # First 2000 chars per file + for path, content in context_files + ]) + + from utils.call_llm import call_llm_with_context + response = call_llm_with_context( + prompt=prompt, + context=rag_context, + use_cache=(use_cache and self.cur_retry == 0) + ) + + # --- Validation --- + yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip() + abstractions = yaml.safe_load(yaml_str) + + if not isinstance(abstractions, list): + raise ValueError("LLM Output is not a list") + + validated_abstractions = [] + for item in abstractions: + if not isinstance(item, dict) or not all( + k in item for k in ["name", "description", "file_indices"] + ): + raise ValueError(f"Missing keys in abstraction item: {item}") + if not isinstance(item["name"], str): + raise ValueError(f"Name is not a string in item: {item}") + if not isinstance(item["description"], str): + raise ValueError(f"Description is not a string in item: {item}") + if not isinstance(item["file_indices"], list): + raise ValueError(f"file_indices is not a list in item: {item}") + + # Validate indices + validated_indices = [] + for idx_entry in item["file_indices"]: + try: + if isinstance(idx_entry, int): + idx = idx_entry + elif isinstance(idx_entry, str) and "#" in idx_entry: + idx = int(idx_entry.split("#")[0].strip()) + else: + idx = int(str(idx_entry).strip()) + + if not (0 <= idx < file_count): + raise ValueError( + f"Invalid file index {idx} found in item {item['name']}. Max index is {file_count - 1}." 
+ ) + validated_indices.append(idx) + except (ValueError, TypeError): + raise ValueError( + f"Could not parse index from entry: {idx_entry} in item {item['name']}" + ) + + item["files"] = sorted(list(set(validated_indices))) + # Store only the required fields + validated_abstractions.append( + { + "name": item["name"], # Potentially translated name + "description": item[ + "description" + ], # Potentially translated description + "files": item["files"], + } + ) + + print(f"Identified {len(validated_abstractions)} abstractions.") + return validated_abstractions + + def post(self, shared, prep_res, exec_res): + shared["abstractions"] = ( + exec_res # List of {"name": str, "description": str, "files": [int]} + ) + + +class AnalyzeRelationships(Node): + def prep(self, shared): + abstractions = shared[ + "abstractions" + ] # Now contains 'files' list of indices, name/description potentially translated + files_data = shared["files"] + project_name = shared["project_name"] # Get project name + language = shared.get("language", "english") # Get language + use_cache = shared.get("use_cache", True) # Get use_cache flag, default to True + + # Get the actual number of abstractions directly + num_abstractions = len(abstractions) + + # Create context with abstraction names, indices, descriptions, and relevant file snippets + context = "Identified Abstractions:\\n" + all_relevant_indices = set() + abstraction_info_for_prompt = [] + for i, abstr in enumerate(abstractions): + # Use 'files' which contains indices directly + file_indices_str = ", ".join(map(str, abstr["files"])) + # Abstraction name and description might be translated already + info_line = f"- Index {i}: {abstr['name']} (Relevant file indices: [{file_indices_str}])\\n Description: {abstr['description']}" + context += info_line + "\\n" + abstraction_info_for_prompt.append( + f"{i} # {abstr['name']}" + ) # Use potentially translated name here too + all_relevant_indices.update(abstr["files"]) + + context += "\\nRelevant File Snippets (Referenced by Index and Path):\\n" + # Get content for relevant files using helper + relevant_files_content_map = get_content_for_indices( + files_data, sorted(list(all_relevant_indices)) + ) + # Format file content for context + file_context_str = "\\n\\n".join( + f"--- File: {idx_path} ---\\n{content}" + for idx_path, content in relevant_files_content_map.items() + ) + context += file_context_str + + return ( + context, + "\n".join(abstraction_info_for_prompt), + num_abstractions, # Pass the actual count + project_name, + language, + use_cache, + ) # Return use_cache + + def exec(self, prep_res): + ( + context, + abstraction_listing, + num_abstractions, # Receive the actual count + project_name, + language, + use_cache, + ) = prep_res # Unpack use_cache + print(f"Analyzing relationships using LLM...") + + # Add language instruction and hints only if not English + language_instruction = "" + lang_hint = "" + list_lang_note = "" + if language.lower() != "english": + language_instruction = f"IMPORTANT: Generate the `summary` and relationship `label` fields in **{language.capitalize()}** language. 
Do NOT use English for these fields.\n\n" + lang_hint = f" (in {language.capitalize()})" + list_lang_note = f" (Names might be in {language.capitalize()})" # Note for the input list + + prompt = f""" +Based on the following abstractions and relevant code snippets from the project `{project_name}`: + +List of Abstraction Indices and Names{list_lang_note}: +{abstraction_listing} + +Context (Abstractions, Descriptions, Code): +{context} + +{language_instruction}Please provide: +1. A high-level `summary` of the project's main purpose and functionality in a few beginner-friendly sentences{lang_hint}. Use markdown formatting with **bold** and *italic* text to highlight important concepts. +2. A list (`relationships`) describing the key interactions between these abstractions. For each relationship, specify: + - `from_abstraction`: Index of the source abstraction (e.g., `0 # AbstractionName1`) + - `to_abstraction`: Index of the target abstraction (e.g., `1 # AbstractionName2`) + - `label`: A brief label for the interaction **in just a few words**{lang_hint} (e.g., "Manages", "Inherits", "Uses"). + Ideally the relationship should be backed by one abstraction calling or passing parameters to another. + Simplify the relationship and exclude those non-important ones. + +IMPORTANT: Make sure EVERY abstraction is involved in at least ONE relationship (either as source or target). Each abstraction index must appear at least once across all relationships. + +Format the output as YAML: + +```yaml +summary: | + A brief, simple explanation of the project{lang_hint}. + Can span multiple lines with **bold** and *italic* for emphasis. +relationships: + - from_abstraction: 0 # AbstractionName1 + to_abstraction: 1 # AbstractionName2 + label: "Manages"{lang_hint} + - from_abstraction: 2 # AbstractionName3 + to_abstraction: 0 # AbstractionName1 + label: "Provides config"{lang_hint} + # ... 
other relationships +``` + +Now, provide the YAML output: +""" + # Use already gathered context from prep (relevant_files_content_map has the files) + # But we can add a focused snippet for relationship analysis + from utils.call_llm import call_llm_with_context + + # Context is already built in prep via file_context_str + # We'll pass it as additional context + response = call_llm_with_context( + prompt=prompt, + context="", # Context already in prompt, no need to duplicate + use_cache=(use_cache and self.cur_retry == 0) + ) + + # --- Validation --- + yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip() + relationships_data = yaml.safe_load(yaml_str) + + if not isinstance(relationships_data, dict) or not all( + k in relationships_data for k in ["summary", "relationships"] + ): + raise ValueError( + "LLM output is not a dict or missing keys ('summary', 'relationships')" + ) + if not isinstance(relationships_data["summary"], str): + raise ValueError("summary is not a string") + if not isinstance(relationships_data["relationships"], list): + raise ValueError("relationships is not a list") + + # Validate relationships structure + validated_relationships = [] + for rel in relationships_data["relationships"]: + # Check for 'label' key + if not isinstance(rel, dict) or not all( + k in rel for k in ["from_abstraction", "to_abstraction", "label"] + ): + raise ValueError( + f"Missing keys (expected from_abstraction, to_abstraction, label) in relationship item: {rel}" + ) + # Validate 'label' is a string + if not isinstance(rel["label"], str): + raise ValueError(f"Relationship label is not a string: {rel}") + + # Validate indices + try: + from_idx = int(str(rel["from_abstraction"]).split("#")[0].strip()) + to_idx = int(str(rel["to_abstraction"]).split("#")[0].strip()) + if not ( + 0 <= from_idx < num_abstractions and 0 <= to_idx < num_abstractions + ): + raise ValueError( + f"Invalid index in relationship: from={from_idx}, to={to_idx}. Max index is {num_abstractions-1}." 
+ ) + validated_relationships.append( + { + "from": from_idx, + "to": to_idx, + "label": rel["label"], # Potentially translated label + } + ) + except (ValueError, TypeError): + raise ValueError(f"Could not parse indices from relationship: {rel}") + + print("Generated project summary and relationship details.") + return { + "summary": relationships_data["summary"], # Potentially translated summary + "details": validated_relationships, # Store validated, index-based relationships with potentially translated labels + } + + def post(self, shared, prep_res, exec_res): + # Structure is now {"summary": str, "details": [{"from": int, "to": int, "label": str}]} + # Summary and label might be translated + shared["relationships"] = exec_res + + +class OrderChapters(Node): + def prep(self, shared): + abstractions = shared["abstractions"] # Name/description might be translated + relationships = shared["relationships"] # Summary/label might be translated + project_name = shared["project_name"] # Get project name + language = shared.get("language", "english") # Get language + use_cache = shared.get("use_cache", True) # Get use_cache flag, default to True + + # Prepare context for the LLM + abstraction_info_for_prompt = [] + for i, a in enumerate(abstractions): + abstraction_info_for_prompt.append( + f"- {i} # {a['name']}" + ) # Use potentially translated name + abstraction_listing = "\n".join(abstraction_info_for_prompt) + + # Use potentially translated summary and labels + summary_note = "" + if language.lower() != "english": + summary_note = ( + f" (Note: Project Summary might be in {language.capitalize()})" + ) + + context = f"Project Summary{summary_note}:\n{relationships['summary']}\n\n" + context += "Relationships (Indices refer to abstractions above):\n" + for rel in relationships["details"]: + from_name = abstractions[rel["from"]]["name"] + to_name = abstractions[rel["to"]]["name"] + # Use potentially translated 'label' + context += f"- From {rel['from']} ({from_name}) to {rel['to']} ({to_name}): {rel['label']}\n" # Label might be translated + + list_lang_note = "" + if language.lower() != "english": + list_lang_note = f" (Names might be in {language.capitalize()})" + + return ( + abstraction_listing, + context, + len(abstractions), + project_name, + list_lang_note, + use_cache, + ) # Return use_cache + + def exec(self, prep_res): + ( + abstraction_listing, + context, + num_abstractions, + project_name, + list_lang_note, + use_cache, + ) = prep_res # Unpack use_cache + print("Determining chapter order using LLM...") + # No language variation needed here in prompt instructions, just ordering based on structure + # The input names might be translated, hence the note. + prompt = f""" +Given the following project abstractions and their relationships for the project ```` {project_name} ````: + +Abstractions (Index # Name){list_lang_note}: +{abstraction_listing} + +Context about relationships and project summary: +{context} + +If you are going to make a tutorial for ```` {project_name} ````, what is the best order to explain these abstractions, from first to last? +Ideally, first explain those that are the most important or foundational, perhaps user-facing concepts or entry points. Then move to more detailed, lower-level implementation details or supporting concepts. + +Output the ordered list of abstraction indices, including the name in a comment for clarity. Use the format `idx # AbstractionName`. + +```yaml +- 2 # FoundationalConcept +- 0 # CoreClassA +- 1 # CoreClassB (uses CoreClassA) +- ... 
+``` + +Now, provide the YAML output: +""" + response = call_llm(prompt, use_cache=(use_cache and self.cur_retry == 0)) # Use cache only if enabled and not retrying + + # --- Validation --- + yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip() + ordered_indices_raw = yaml.safe_load(yaml_str) + + if not isinstance(ordered_indices_raw, list): + raise ValueError("LLM output is not a list") + + ordered_indices = [] + seen_indices = set() + for entry in ordered_indices_raw: + try: + if isinstance(entry, int): + idx = entry + elif isinstance(entry, str) and "#" in entry: + idx = int(entry.split("#")[0].strip()) + else: + idx = int(str(entry).strip()) + + if not (0 <= idx < num_abstractions): + raise ValueError( + f"Invalid index {idx} in ordered list. Max index is {num_abstractions-1}." + ) + if idx in seen_indices: + raise ValueError(f"Duplicate index {idx} found in ordered list.") + ordered_indices.append(idx) + seen_indices.add(idx) + + except (ValueError, TypeError): + raise ValueError( + f"Could not parse index from ordered list entry: {entry}" + ) + + # Check if all abstractions are included + if len(ordered_indices) != num_abstractions: + raise ValueError( + f"Ordered list length ({len(ordered_indices)}) does not match number of abstractions ({num_abstractions}). Missing indices: {set(range(num_abstractions)) - seen_indices}" + ) + + print(f"Determined chapter order (indices): {ordered_indices}") + return ordered_indices # Return the list of indices + + def post(self, shared, prep_res, exec_res): + # exec_res is already the list of ordered indices + shared["chapter_order"] = exec_res # List of indices + + +class WriteChapters(BatchNode): + def prep(self, shared): + chapter_order = shared["chapter_order"] # List of indices + abstractions = shared[ + "abstractions" + ] # List of {"name": str, "description": str, "files": [int]} + files_data = shared["files"] # List of (path, content) tuples + project_name = shared["project_name"] + language = shared.get("language", "english") + use_cache = shared.get("use_cache", True) # Get use_cache flag, default to True + + # Get already written chapters to provide context + # We store them temporarily during the batch run, not in shared memory yet + # The 'previous_chapters_summary' will be built progressively in the exec context + self.chapters_written_so_far = ( + [] + ) # Use instance variable for temporary storage across exec calls + + # Create a complete list of all chapters + all_chapters = [] + chapter_filenames = {} # Store chapter filename mapping for linking + for i, abstraction_index in enumerate(chapter_order): + if 0 <= abstraction_index < len(abstractions): + chapter_num = i + 1 + chapter_name = abstractions[abstraction_index][ + "name" + ] # Potentially translated name + # Create safe filename (from potentially translated name) + safe_name = "".join( + c if c.isalnum() else "_" for c in chapter_name + ).lower() + filename = f"{i+1:02d}_{safe_name}.md" + # Format with link (using potentially translated name) + all_chapters.append(f"{chapter_num}. 
[{chapter_name}]({filename})") + # Store mapping of chapter index to filename for linking + chapter_filenames[abstraction_index] = { + "num": chapter_num, + "name": chapter_name, + "filename": filename, + } + + # Create a formatted string with all chapters + full_chapter_listing = "\n".join(all_chapters) + + items_to_process = [] + for i, abstraction_index in enumerate(chapter_order): + if 0 <= abstraction_index < len(abstractions): + abstraction_details = abstractions[ + abstraction_index + ] # Contains potentially translated name/desc + # Use 'files' (list of indices) directly + related_file_indices = abstraction_details.get("files", []) + # Get content using helper, passing indices + related_files_content_map = get_content_for_indices( + files_data, related_file_indices + ) + + # Get previous chapter info for transitions (uses potentially translated name) + prev_chapter = None + if i > 0: + prev_idx = chapter_order[i - 1] + prev_chapter = chapter_filenames[prev_idx] + + # Get next chapter info for transitions (uses potentially translated name) + next_chapter = None + if i < len(chapter_order) - 1: + next_idx = chapter_order[i + 1] + next_chapter = chapter_filenames[next_idx] + + items_to_process.append( + { + "chapter_num": i + 1, + "abstraction_index": abstraction_index, + "abstraction_details": abstraction_details, # Has potentially translated name/desc + "related_files_content_map": related_files_content_map, + "project_name": shared["project_name"], # Add project name + "full_chapter_listing": full_chapter_listing, # Add the full chapter listing (uses potentially translated names) + "chapter_filenames": chapter_filenames, # Add chapter filenames mapping (uses potentially translated names) + "prev_chapter": prev_chapter, # Add previous chapter info (uses potentially translated name) + "next_chapter": next_chapter, # Add next chapter info (uses potentially translated name) + "language": language, # Add language for multi-language support + "use_cache": use_cache, # Pass use_cache flag + # previous_chapters_summary will be added dynamically in exec + } + ) + else: + print( + f"Warning: Invalid abstraction index {abstraction_index} in chapter_order. Skipping." 
+ ) + + print(f"Preparing to write {len(items_to_process)} chapters...") + return items_to_process # Iterable for BatchNode + + def exec(self, item): + # This runs for each item prepared above + abstraction_name = item["abstraction_details"][ + "name" + ] # Potentially translated name + abstraction_description = item["abstraction_details"][ + "description" + ] # Potentially translated description + chapter_num = item["chapter_num"] + project_name = item.get("project_name") + language = item.get("language", "english") + use_cache = item.get("use_cache", True) # Read use_cache from item + print(f"Writing chapter {chapter_num} for: {abstraction_name} using LLM...") + + # Prepare file context string from the map + file_context_str = "\n\n".join( + f"--- File: {idx_path.split('# ')[1] if '# ' in idx_path else idx_path} ---\n{content}" + for idx_path, content in item["related_files_content_map"].items() + ) + + # Get summary of chapters written *before* this one + # Use the temporary instance variable + previous_chapters_summary = "\n---\n".join(self.chapters_written_so_far) + + # Add language instruction and context notes only if not English + language_instruction = "" + concept_details_note = "" + structure_note = "" + prev_summary_note = "" + instruction_lang_note = "" + mermaid_lang_note = "" + code_comment_note = "" + link_lang_note = "" + tone_note = "" + if language.lower() != "english": + lang_cap = language.capitalize() + language_instruction = f"IMPORTANT: Write this ENTIRE tutorial chapter in **{lang_cap}**. Some input context (like concept name, description, chapter list, previous summary) might already be in {lang_cap}, but you MUST translate ALL other generated content including explanations, examples, technical terms, and potentially code comments into {lang_cap}. DO NOT use English anywhere except in code syntax, required proper nouns, or when specified. The entire output MUST be in {lang_cap}.\n\n" + concept_details_note = f" (Note: Provided in {lang_cap})" + structure_note = f" (Note: Chapter names might be in {lang_cap})" + prev_summary_note = f" (Note: This summary might be in {lang_cap})" + instruction_lang_note = f" (in {lang_cap})" + mermaid_lang_note = f" (Use {lang_cap} for labels/text if appropriate)" + code_comment_note = f" (Translate to {lang_cap} if possible, otherwise keep minimal English for clarity)" + link_lang_note = ( + f" (Use the {lang_cap} chapter title from the structure above)" + ) + tone_note = f" (appropriate for {lang_cap} readers)" + + prompt = f""" +{language_instruction}Write a very beginner-friendly tutorial chapter (in Markdown format) for the project `{project_name}` about the concept: "{abstraction_name}". This is Chapter {chapter_num}. + +Concept Details{concept_details_note}: +- Name: {abstraction_name} +- Description: +{abstraction_description} + +Complete Tutorial Structure{structure_note}: +{item["full_chapter_listing"]} + +Context from previous chapters{prev_summary_note}: +{previous_chapters_summary if previous_chapters_summary else "This is the first chapter."} + +Relevant Code Snippets (Code itself remains unchanged): +{file_context_str if file_context_str else "No specific code snippets provided for this abstraction."} + +Instructions for the chapter (Generate content in {language.capitalize()} unless specified otherwise): +- Start with a clear heading (e.g., `# Chapter {chapter_num}: {abstraction_name}`). Use the provided concept name. 
+
+- If this is not the first chapter, begin with a brief transition from the previous chapter{instruction_lang_note}, referencing it with a proper Markdown link using its name{link_lang_note}.
+
+- Begin with a high-level motivation explaining what problem this abstraction solves{instruction_lang_note}. Start with a central use case as a concrete example. The whole chapter should guide the reader to understand how to solve this use case. Make it very minimal and friendly to beginners.
+
+- If the abstraction is complex, break it down into key concepts. Explain each concept one-by-one in a very beginner-friendly way{instruction_lang_note}.
+
+- Explain how to use this abstraction to solve the use case{instruction_lang_note}. Give example inputs and outputs for code snippets (if the output isn't values, describe at a high level what will happen{instruction_lang_note}).
+
+- Each code block should be BELOW 10 lines! If longer code blocks are needed, break them down into smaller pieces and walk through them one-by-one. Aggressively simplify the code to make it minimal. Use comments{code_comment_note} to skip non-important implementation details. Each code block should have a beginner-friendly explanation right after it{instruction_lang_note}.
+
+- Describe the internal implementation to help understand what's under the hood{instruction_lang_note}. First provide a non-code or code-light walkthrough on what happens step-by-step when the abstraction is called{instruction_lang_note}. It's recommended to use a simple sequenceDiagram with a dummy example - keep it minimal with at most 5 participants to ensure clarity. If a participant name has a space, use: `participant QP as Query Processing`. {mermaid_lang_note}.
+
+- Then dive deeper into code for the internal implementation with references to files. Provide example code blocks, but make them similarly simple and beginner-friendly. Explain{instruction_lang_note}.
+
+- IMPORTANT: When you need to refer to other core abstractions covered in other chapters, ALWAYS use proper Markdown links like this: [Chapter Title](filename.md). Use the Complete Tutorial Structure above to find the correct filename and the chapter title{link_lang_note}. Translate the surrounding text.
+
+- Use mermaid diagrams to illustrate complex concepts (```mermaid``` format). {mermaid_lang_note}.
+
+- Heavily use analogies and examples throughout{instruction_lang_note} to help beginners understand.
+
+- End the chapter with a brief conclusion that summarizes what was learned{instruction_lang_note} and provides a transition to the next chapter{instruction_lang_note}. If there is a next chapter, use a proper Markdown link: [Next Chapter Title](next_chapter_filename){link_lang_note}.
+
+- Ensure the tone is welcoming and easy for a newcomer to understand{tone_note}.
+
+- Output *only* the Markdown content for this chapter.
+ +Now, directly provide a super beginner-friendly Markdown output (DON'T need ```markdown``` tags): +""" + from utils.call_llm import call_llm_with_context + + # Context from related files is already in file_context_str + # Pass it explicitly as RAG context for better separation + response = call_llm_with_context( + prompt=prompt, + context=file_context_str if file_context_str else "", + use_cache=(use_cache and self.cur_retry == 0) + ) + + chapter_content = response + # Basic validation/cleanup + actual_heading = f"# Chapter {chapter_num}: {abstraction_name}" # Use potentially translated name + if not chapter_content.strip().startswith(f"# Chapter {chapter_num}"): + # Add heading if missing or incorrect, trying to preserve content + lines = chapter_content.strip().split("\n") + if lines and lines[0].strip().startswith( + "#" + ): # If there's some heading, replace it + lines[0] = actual_heading + chapter_content = "\n".join(lines) + else: # Otherwise, prepend it + chapter_content = f"{actual_heading}\n\n{chapter_content}" + + # Add the generated content to our temporary list for the next iteration's context + self.chapters_written_so_far.append(chapter_content) + + return chapter_content # Return the Markdown string (potentially translated) + + def post(self, shared, prep_res, exec_res_list): + # exec_res_list contains the generated Markdown for each chapter, in order + shared["chapters"] = exec_res_list + # Clean up the temporary instance variable + del self.chapters_written_so_far + print(f"Finished writing {len(exec_res_list)} chapters.") + + +class CombineTutorial(Node): + def prep(self, shared): + project_name = shared["project_name"] + output_base_dir = shared.get("output_dir", "output") # Default output dir + output_path = os.path.join(output_base_dir, project_name) + repo_url = shared.get("repo_url") # Get the repository URL + # language = shared.get("language", "english") # No longer needed for fixed strings + + # Get potentially translated data + relationships_data = shared[ + "relationships" + ] # {"summary": str, "details": [{"from": int, "to": int, "label": str}]} -> summary/label potentially translated + chapter_order = shared["chapter_order"] # indices + abstractions = shared[ + "abstractions" + ] # list of dicts -> name/description potentially translated + chapters_content = shared[ + "chapters" + ] # list of strings -> content potentially translated + + # --- Generate Mermaid Diagram --- + mermaid_lines = ["flowchart TD"] + # Add nodes for each abstraction using potentially translated names + for i, abstr in enumerate(abstractions): + node_id = f"A{i}" + # Use potentially translated name, sanitize for Mermaid ID and label + sanitized_name = abstr["name"].replace('"', "") + node_label = sanitized_name # Using sanitized name only + mermaid_lines.append( + f' {node_id}["{node_label}"]' + ) # Node label uses potentially translated name + # Add edges for relationships using potentially translated labels + for rel in relationships_data["details"]: + from_node_id = f"A{rel['from']}" + to_node_id = f"A{rel['to']}" + # Use potentially translated label, sanitize + edge_label = ( + rel["label"].replace('"', "").replace("\n", " ") + ) # Basic sanitization + max_label_len = 30 + if len(edge_label) > max_label_len: + edge_label = edge_label[: max_label_len - 3] + "..." 
+ mermaid_lines.append( + f' {from_node_id} -- "{edge_label}" --> {to_node_id}' + ) # Edge label uses potentially translated label + + mermaid_diagram = "\n".join(mermaid_lines) + # --- End Mermaid --- + + # --- Prepare index.md content --- + index_content = f"# Tutorial: {project_name}\n\n" + index_content += f"{relationships_data['summary']}\n\n" # Use the potentially translated summary directly + # Keep fixed strings in English + index_content += f"**Source Repository:** [{repo_url}]({repo_url})\n\n" + + # Add Mermaid diagram for relationships (diagram itself uses potentially translated names/labels) + index_content += "```mermaid\n" + index_content += mermaid_diagram + "\n" + index_content += "```\n\n" + + # Keep fixed strings in English + index_content += f"## Chapters\n\n" + + chapter_files = [] + # Generate chapter links based on the determined order, using potentially translated names + for i, abstraction_index in enumerate(chapter_order): + # Ensure index is valid and we have content for it + if 0 <= abstraction_index < len(abstractions) and i < len(chapters_content): + abstraction_name = abstractions[abstraction_index][ + "name" + ] # Potentially translated name + # Sanitize potentially translated name for filename + safe_name = "".join( + c if c.isalnum() else "_" for c in abstraction_name + ).lower() + filename = f"{i+1:02d}_{safe_name}.md" + index_content += f"{i+1}. [{abstraction_name}]({filename})\n" # Use potentially translated name in link text + + # Add attribution to chapter content (using English fixed string) + chapter_content = chapters_content[i] # Potentially translated content + if not chapter_content.endswith("\n\n"): + chapter_content += "\n\n" + # Keep fixed strings in English + chapter_content += f"---\n\nGenerated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)" + + # Store filename and corresponding content + chapter_files.append({"filename": filename, "content": chapter_content}) + else: + print( + f"Warning: Mismatch between chapter order, abstractions, or content at index {i} (abstraction index {abstraction_index}). Skipping file generation for this entry." + ) + + # Add attribution to index content (using English fixed string) + index_content += f"\n\n---\n\nGenerated by [AI Codebase Knowledge Builder](https://github.com/The-Pocket/Tutorial-Codebase-Knowledge)" + + return { + "output_path": output_path, + "index_content": index_content, + "chapter_files": chapter_files, # List of {"filename": str, "content": str} + } + + def exec(self, prep_res): + output_path = prep_res["output_path"] + index_content = prep_res["index_content"] + chapter_files = prep_res["chapter_files"] + + print(f"Combining tutorial into directory: {output_path}") + # Rely on Node's built-in retry/fallback + os.makedirs(output_path, exist_ok=True) + + # Write index.md + index_filepath = os.path.join(output_path, "index.md") + with open(index_filepath, "w", encoding="utf-8") as f: + f.write(index_content) + print(f" - Wrote {index_filepath}") + + # Write chapter files + for chapter_info in chapter_files: + chapter_filepath = os.path.join(output_path, chapter_info["filename"]) + with open(chapter_filepath, "w", encoding="utf-8") as f: + f.write(chapter_info["content"]) + print(f" - Wrote {chapter_filepath}") + + return output_path # Return the final path + + def post(self, shared, prep_res, exec_res): + shared["final_output_dir"] = exec_res # Store the output path + print(f"\nTutorial generation complete! 
Files are in: {exec_res}") diff --git a/utils/call_llm.py b/utils/call_llm.py index 70c9e83a..fc437f18 100644 --- a/utils/call_llm.py +++ b/utils/call_llm.py @@ -158,6 +158,83 @@ def call_llm(prompt: str, use_cache: bool = True) -> str: return response_text + + +def get_open_webui_context(prompt: str, collection_name: str = "#csharpdocs") -> str: + """ + Retrieve knowledge from Open WebUI RAG server. + """ + api_key = os.getenv("OPEN_WEBUI_API_KEY") + base_url = os.getenv("OPEN_WEBUI_ENDPOINT", "http://localhost:3000") + + if not api_key: + logger.warning("OPEN_WEBUI_API_KEY not set. Skipping remote RAG.") + return "" + + url = f"{base_url}/api/chat/completions" + headers = { + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" + } + + # We use the prompt with the collection hash to trigger RAG + # We ask the model to "Provide relevant information about: ..." + search_prompt = f"{collection_name} Provide relevant information / documentation context for the following task/query: {prompt}" + + payload = { + "model": os.getenv("OLLAMA_MODEL", "qwen3:8b"), # Default to model in env + "messages": [{"role": "user", "content": search_prompt}], + "stream": False + } + + try: + logger.info(f"Querying Open WebUI RAG: {search_prompt[:50]}...") + response = requests.post(url, headers=headers, json=payload, timeout=300) # Longer timeout for RAG + response.raise_for_status() + result = response.json() + content = result["choices"][0]["message"]["content"] + logger.info(f"RAG Retrieval successful. Length: {len(content)}") + return content + except Exception as e: + logger.error(f"Open WebUI RAG Query failed: {e}") + return "" + + +def call_llm_with_context(prompt: str, context: str = "", use_cache: bool = True, include_remote_rag: bool = False) -> str: + """ + Call LLM with optional RAG context (Local + Remote) injected into the prompt. 
+ + Args: + prompt: The main question/instruction for the LLM + context: Additional code snippets or relevant information (Local files) + use_cache: Whether to cache the LLM response + include_remote_rag: Whether to fetch context from Open WebUI RAG server + + Returns: + str: LLM response text + """ + rag_context = context + + if include_remote_rag: + remote_knowledge = get_open_webui_context(prompt) + if remote_knowledge: + rag_context += f"\n\n### Remote Library Knowledge (.NET/C# Docs):\n{remote_knowledge}" + + if rag_context: + # Build enhanced prompt with context + full_prompt = f"""### Relevant Code/Library Context: +{rag_context} + +### Task: +{prompt} + +Use the code context above to provide accurate, specific answers.""" + else: + full_prompt = prompt + + return call_llm(full_prompt, use_cache) + + def _call_llm_gemini(prompt: str) -> str: if os.getenv("GEMINI_PROJECT_ID"): client = genai.Client( diff --git a/utils/call_llm.py.backup b/utils/call_llm.py.backup new file mode 100644 index 00000000..70c9e83a --- /dev/null +++ b/utils/call_llm.py.backup @@ -0,0 +1,185 @@ +from google import genai +import os +import logging +import json +import requests +from datetime import datetime + +# Configure logging +log_directory = os.getenv("LOG_DIR", "logs") +os.makedirs(log_directory, exist_ok=True) +log_file = os.path.join( + log_directory, f"llm_calls_{datetime.now().strftime('%Y%m%d')}.log" +) + +# Set up logger +logger = logging.getLogger("llm_logger") +logger.setLevel(logging.INFO) +logger.propagate = False # Prevent propagation to root logger +file_handler = logging.FileHandler(log_file, encoding='utf-8') +file_handler.setFormatter( + logging.Formatter("%(asctime)s - %(levelname)s - %(message)s") +) +logger.addHandler(file_handler) + +# Simple cache configuration +cache_file = "llm_cache.json" + + +def load_cache(): + try: + with open(cache_file, 'r') as f: + return json.load(f) + except: + logger.warning(f"Failed to load cache.") + return {} + + +def save_cache(cache): + try: + with open(cache_file, 'w') as f: + json.dump(cache, f) + except: + logger.warning(f"Failed to save cache") + + +def get_llm_provider(): + provider = os.getenv("LLM_PROVIDER") + if not provider and (os.getenv("GEMINI_PROJECT_ID") or os.getenv("GEMINI_API_KEY")): + provider = "GEMINI" + # if necessary, add ANTHROPIC/OPENAI + return provider + + +def _call_llm_provider(prompt: str) -> str: + """ + Call an LLM provider based on environment variables. + Environment variables: + - LLM_PROVIDER: "OLLAMA" or "XAI" + - _MODEL: Model name (e.g., OLLAMA_MODEL, XAI_MODEL) + - _BASE_URL: Base URL without endpoint (e.g., OLLAMA_BASE_URL, XAI_BASE_URL) + - _API_KEY: API key (e.g., OLLAMA_API_KEY, XAI_API_KEY; optional for providers that don't require it) + The endpoint /v1/chat/completions will be appended to the base URL. 
+ """ + logger.info(f"PROMPT: {prompt}") # log the prompt + + # Read the provider from environment variable + provider = os.environ.get("LLM_PROVIDER") + if not provider: + raise ValueError("LLM_PROVIDER environment variable is required") + + # Construct the names of the other environment variables + model_var = f"{provider}_MODEL" + base_url_var = f"{provider}_BASE_URL" + api_key_var = f"{provider}_API_KEY" + + # Read the provider-specific variables + model = os.environ.get(model_var) + base_url = os.environ.get(base_url_var) + api_key = os.environ.get(api_key_var, "") # API key is optional, default to empty string + + # Validate required variables + if not model: + raise ValueError(f"{model_var} environment variable is required") + if not base_url: + raise ValueError(f"{base_url_var} environment variable is required") + + # Append the endpoint to the base URL + url = f"{base_url.rstrip('/')}/v1/chat/completions" + + # Configure headers and payload based on provider + headers = { + "Content-Type": "application/json", + } + if api_key: # Only add Authorization header if API key is provided + headers["Authorization"] = f"Bearer {api_key}" + + payload = { + "model": model, + "messages": [{"role": "user", "content": prompt}], + "temperature": 0.7, + } + + try: + response = requests.post(url, headers=headers, json=payload) + response_json = response.json() # Log the response + logger.info("RESPONSE:\n%s", json.dumps(response_json, indent=2)) + #logger.info(f"RESPONSE: {response.json()}") + response.raise_for_status() + return response.json()["choices"][0]["message"]["content"] + except requests.exceptions.HTTPError as e: + error_message = f"HTTP error occurred: {e}" + try: + error_details = response.json().get("error", "No additional details") + error_message += f" (Details: {error_details})" + except: + pass + raise Exception(error_message) + except requests.exceptions.ConnectionError: + raise Exception(f"Failed to connect to {provider} API. Check your network connection.") + except requests.exceptions.Timeout: + raise Exception(f"Request to {provider} API timed out.") + except requests.exceptions.RequestException as e: + raise Exception(f"An error occurred while making the request to {provider}: {e}") + except ValueError: + raise Exception(f"Failed to parse response as JSON from {provider}. The server might have returned an invalid response.") + +# By default, we Google Gemini 2.5 pro, as it shows great performance for code understanding +def call_llm(prompt: str, use_cache: bool = True) -> str: + # Log the prompt + logger.info(f"PROMPT: {prompt}") + + # Check cache if enabled + if use_cache: + # Load cache from disk + cache = load_cache() + # Return from cache if exists + if prompt in cache: + logger.info(f"RESPONSE: {cache[prompt]}") + return cache[prompt] + + provider = get_llm_provider() + if provider == "GEMINI": + response_text = _call_llm_gemini(prompt) + else: # generic method using a URL that is OpenAI compatible API (Ollama, ...) 
+ response_text = _call_llm_provider(prompt) + + # Log the response + logger.info(f"RESPONSE: {response_text}") + + # Update cache if enabled + if use_cache: + # Load cache again to avoid overwrites + cache = load_cache() + # Add to cache and save + cache[prompt] = response_text + save_cache(cache) + + return response_text + + +def _call_llm_gemini(prompt: str) -> str: + if os.getenv("GEMINI_PROJECT_ID"): + client = genai.Client( + vertexai=True, + project=os.getenv("GEMINI_PROJECT_ID"), + location=os.getenv("GEMINI_LOCATION", "us-central1") + ) + elif os.getenv("GEMINI_API_KEY"): + client = genai.Client(api_key=os.getenv("GEMINI_API_KEY")) + else: + raise ValueError("Either GEMINI_PROJECT_ID or GEMINI_API_KEY must be set in the environment") + model = os.getenv("GEMINI_MODEL", "gemini-2.5-pro-exp-03-25") + response = client.models.generate_content( + model=model, + contents=[prompt] + ) + return response.text + +if __name__ == "__main__": + test_prompt = "Hello, how are you?" + + # First call - should hit the API + print("Making call...") + response1 = call_llm(test_prompt, use_cache=False) + print(f"Response: {response1}") From 01b125bcc792b48509aff6aed83a37c800d34099 Mon Sep 17 00:00:00 2001 From: motaz m alharbi Date: Sun, 14 Dec 2025 23:53:04 +0200 Subject: [PATCH 2/2] Add Remote RAG integration for open webui --- README.md | 25 +++++++++++++ nodes.py | 8 ++++- utils/call_llm.py | 91 +++++++++++++++++++++++++++++++++++++---------- 3 files changed, 105 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index cc8ad4e8..ba0a6071 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,31 @@ To run this project in a Docker container, you'll need to pass your API keys as ``` +
+ +
+  New! Remote RAG Integration (Open WebUI)
+
+**Supercharge your tutorials with external knowledge!**
+
+We've added support for **Remote RAG** (Retrieval-Augmented Generation). This lets the tutorial generator pull in your private documentation or the latest tech specs (for example .NET 10 or C# 14) that aren't in the codebase yet.
+
+**Capabilities:**
+- **Hybrid Analysis**: Combines local code analysis with your remote knowledge base.
+- **Always Up to Date**: Tutorials reference the latest documentation from your Open WebUI server.
+- **Plug & Play**: Set your credentials in `.env` and it runs automatically alongside the standard analysis.
+
+**Setup:**
+Add to your `.env`:
+```bash
+# Open WebUI Configuration
+OPEN_WEBUI_ENDPOINT=http://localhost:3000
+OPEN_WEBUI_JWT_TOKEN=your_token_here        # OPEN_WEBUI_API_KEY is accepted as an alternative
+OPEN_WEBUI_COLLECTION=your_collection_here  # falls back to "csharpdocs" if unset
+```
+
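+For orientation, here is a minimal sketch of how the feature is consumed from Python once the variables above are set. It uses `call_llm_with_context` from `utils/call_llm.py` (the same helper the nodes call); the question text is only a placeholder, and it assumes `python-dotenv` is installed, as in the module's own test block.
+
+```python
+# Minimal usage sketch for Remote RAG (assumes the .env values above are set).
+from dotenv import load_dotenv
+from utils.call_llm import call_llm_with_context
+
+load_dotenv()
+
+response = call_llm_with_context(
+    prompt="Summarize how file-based programs work in .NET 10.",  # placeholder question
+    context="",                 # local code snippets would go here
+    use_cache=False,
+    include_remote_rag=True,    # triggers the Open WebUI retrieval step
+)
+print(response[:500])
+```
+
+Under the hood, `get_open_webui_context()` resolves the collection name to an ID via `/api/v1/knowledge/`, then sends a chat completion with the `files` parameter so retrieval runs server-side (using the model in `OLLAMA_MODEL`, default `qwen3:8b`); the retrieved text is prepended to the prompt before the normal `call_llm()` path and its cache take over.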
+ ## 💡 Development Tutorial - I built using [**Agentic Coding**](https://zacharyhuang.substack.com/p/agentic-coding-the-most-fun-way-to), the fastest development paradigm, where humans simply [design](docs/design.md) and agents [code](flow.py). diff --git a/nodes.py b/nodes.py index 8f6692b7..82ff580f 100644 --- a/nodes.py +++ b/nodes.py @@ -517,7 +517,13 @@ def exec(self, prep_res): Now, provide the YAML output: """ - response = call_llm(prompt, use_cache=(use_cache and self.cur_retry == 0)) # Use cache only if enabled and not retrying + from utils.call_llm import call_llm_with_context + response = call_llm_with_context( + prompt=prompt, + context="", # Context is already inside the prompt + use_cache=(use_cache and self.cur_retry == 0), + include_remote_rag=True + ) # --- Validation --- yaml_str = response.strip().split("```yaml")[1].split("```")[0].strip() diff --git a/utils/call_llm.py b/utils/call_llm.py index fc437f18..241f0a14 100644 --- a/utils/call_llm.py +++ b/utils/call_llm.py @@ -160,46 +160,93 @@ def call_llm(prompt: str, use_cache: bool = True) -> str: -def get_open_webui_context(prompt: str, collection_name: str = "#csharpdocs") -> str: + + +def get_open_webui_context( + prompt: str, + collection_name: str = "csharpdocs", + jwt_token: str = None +) -> str: """ - Retrieve knowledge from Open WebUI RAG server. + Query Open WebUI knowledge collection with RAG enabled. + The RAG retrieval happens server-side in Open WebUI using the 'files' parameter or proper knowledge retrieval. """ - api_key = os.getenv("OPEN_WEBUI_API_KEY") - base_url = os.getenv("OPEN_WEBUI_ENDPOINT", "http://localhost:3000") + # 1. Setup Auth and Config + if not jwt_token: + # Try JWT token first, fallback to API Key (which is often a Bearer token anyway) + jwt_token = os.getenv("OPEN_WEBUI_JWT_TOKEN") or os.getenv("OPEN_WEBUI_API_KEY") - if not api_key: - logger.warning("OPEN_WEBUI_API_KEY not set. Skipping remote RAG.") + if not jwt_token: + logger.warning("OPEN_WEBUI_JWT_TOKEN/API_KEY not set. Skipping RAG.") return "" + + # Allow overriding collection name from env + collection_name = os.getenv("OPEN_WEBUI_COLLECTION", collection_name) - url = f"{base_url}/api/chat/completions" + base_url = os.getenv("OPEN_WEBUI_ENDPOINT", "http://localhost:3000") headers = { - "Authorization": f"Bearer {api_key}", - "Content-Type": "application/json" + 'Authorization': f'Bearer {jwt_token}', + 'Content-Type': 'application/json' } - - # We use the prompt with the collection hash to trigger RAG - # We ask the model to "Provide relevant information about: ..." - search_prompt = f"{collection_name} Provide relevant information / documentation context for the following task/query: {prompt}" + + # 2. Get Collection ID from Name + collection_id = None + try: + collections_url = f"{base_url}/api/v1/knowledge/" + # Use short timeout for list + resp = requests.get(collections_url, headers=headers, timeout=10) + resp.raise_for_status() + collections = resp.json() + + for col in collections: + if col.get('name', '').lower() == collection_name.lower(): + collection_id = col['id'] + break + + if not collection_id: + logger.warning(f"Collection '{collection_name}' not found in Open WebUI. Returning empty RAG context.") + return "" + + logger.info(f"Found Knowledge Collection: {collection_name} -> {collection_id}") + + except Exception as e: + logger.error(f"Failed to lookup Knowledge Collection ID: {e}") + return "" + + # 3. 
Query Chat Completions with RAG context + # This triggers the server-side RAG engine because we pass the 'files' parameter + chat_url = f"{base_url}/api/chat/completions" payload = { - "model": os.getenv("OLLAMA_MODEL", "qwen3:8b"), # Default to model in env - "messages": [{"role": "user", "content": search_prompt}], + "model": os.getenv("OLLAMA_MODEL", "qwen3:8b"), + "messages": [ + {"role": "user", "content": prompt} + ], + "files": [ + { + "type": "collection", + "id": collection_id + } + ], "stream": False } try: - logger.info(f"Querying Open WebUI RAG: {search_prompt[:50]}...") - response = requests.post(url, headers=headers, json=payload, timeout=300) # Longer timeout for RAG + logger.info(f"Querying Open WebUI RAG for: '{prompt[:50]}...'") + response = requests.post(chat_url, headers=headers, json=payload, timeout=300) # 5 min timeout for RAG response.raise_for_status() result = response.json() + content = result["choices"][0]["message"]["content"] - logger.info(f"RAG Retrieval successful. Length: {len(content)}") + logger.info(f"RAG Retrieval Successful. Response Length: {len(content)}") return content + except Exception as e: logger.error(f"Open WebUI RAG Query failed: {e}") return "" + def call_llm_with_context(prompt: str, context: str = "", use_cache: bool = True, include_remote_rag: bool = False) -> str: """ Call LLM with optional RAG context (Local + Remote) injected into the prompt. @@ -234,6 +281,14 @@ def call_llm_with_context(prompt: str, context: str = "", use_cache: bool = True return call_llm(full_prompt, use_cache) +if __name__ == "__main__": + # Test block as requested + from dotenv import load_dotenv + load_dotenv() + print("Testing get_open_webui_context...") + res = get_open_webui_context("file-based programs .NET 10") + print(f"Retrieved: {res[:200]}") + def _call_llm_gemini(prompt: str) -> str: if os.getenv("GEMINI_PROJECT_ID"):