From a6d5fe8ddedff5a801e7cdf4a8bdaba2b7b924e0 Mon Sep 17 00:00:00 2001 From: Pranav C Date: Sun, 8 Jun 2025 22:54:11 +0530 Subject: [PATCH] feat: llms file generation using chatgpt(WIP) --- package.json | 2 + scripts/README-gpt-enhanced.md | 175 ++++++++ scripts/generate-llm-content-with-gpt.ts | 521 +++++++++++++++++++++++ 3 files changed, 698 insertions(+) create mode 100644 scripts/README-gpt-enhanced.md create mode 100644 scripts/generate-llm-content-with-gpt.ts diff --git a/package.json b/package.json index 77bea43..f072579 100644 --- a/package.json +++ b/package.json @@ -11,6 +11,7 @@ "generate-blog-sitemap": "tsx scripts/generate-blog-sitemap.ts", "generate-overview": "tsx scripts/generate-llm-content.ts", "generate-llm-content": "tsx scripts/generate-llm-content.ts", + "generate-llm-gpt": "tsx scripts/generate-llm-content-with-gpt.ts", "postbuild": "next-sitemap && npm run generate-blog-sitemap && npm run generate-llm-content" }, "dependencies": { @@ -33,6 +34,7 @@ "next-recaptcha-v3": "^1.5.2", "next-sitemap": "^4.2.3", "next-themes": "^0.4.6", + "openai": "^4.67.3", "react": "^19.1.0", "react-dom": "^19.1.0", "tailwind-merge": "^3.2.0", diff --git a/scripts/README-gpt-enhanced.md b/scripts/README-gpt-enhanced.md new file mode 100644 index 0000000..4bb5e6b --- /dev/null +++ b/scripts/README-gpt-enhanced.md @@ -0,0 +1,175 @@ +# GPT-Enhanced LLM Content Generator + +This script generates an enhanced `llms.txt` file using OpenAI's ChatGPT API to create better structured and more comprehensive documentation for LLM training. 
+ +## Features + +- **AI-Enhanced Content**: Uses GPT-4 to analyze and improve the structure of documentation +- **Rich Content Extraction**: Extracts not just metadata but actual content from HTML pages +- **Intelligent Categorization**: Better categorization of documentation sections +- **Comprehensive Output**: Generates detailed sections including: + - Project Overview + - Key Features + - Getting Started Guide + - Detailed Documentation Structure + - Common Use Cases + - FAQ Section +- **Fallback System**: If the API fails, falls back to manual content generation +- **Rate Limiting**: Respects API limits with built-in delays and batch processing + +## Prerequisites + +1. **OpenAI API Key**: You need an active OpenAI API key +2. **Built Project**: The script reads from the Next.js build output, so run `npm run build` first + +## Setup + +1. **Install dependencies**: + ```bash + npm install + ``` + +2. **Set your OpenAI API Key**: + ```bash + export OPENAI_API_KEY="your-api-key-here" + ``` + +3. **Build the project** (if not already done): + ```bash + npm run build + ``` + +## Usage + +### Basic Usage +```bash +npm run generate-llm-gpt +``` + +### With Environment Variables +```bash +OPENAI_API_KEY="your-key" SITE_URL="https://docs.nocodb.com" OUTPUT_FILE="enhanced-llms.txt" npm run generate-llm-gpt +``` + +### Direct Script Execution +```bash +tsx scripts/generate-llm-content-with-gpt.ts +``` + +## Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `OPENAI_API_KEY` | *required* | Your OpenAI API key | +| `SITE_URL` | `https://docs.nocodb.com` | Base URL for the documentation site | +| `OUTPUT_FILE` | `llms-enhanced.txt` | Output filename for the generated file | + +## Output + +The script generates a comprehensive markdown file with the following structure: + +``` +# Project Title + +## Overview +[AI-generated comprehensive overview] + +## Key Features +- Feature 1 +- Feature 2 +... + +## Quick Start Guide +1. 
Step 1 +2. Step 2 +... + +## Documentation +### Category 1 +#### Page Title +[Description and key points] + +### Category 2 +[More documentation sections] + +## Common Use Cases +- Use case 1 +- Use case 2 +... + +## Frequently Asked Questions +**Q: Question 1** +A: Answer 1 +... +``` + +## How It Works + +1. **Content Extraction**: Scans the built HTML files and extracts: + - Page titles and metadata + - Main content text (first 2000 characters) + - URLs and categorization + +2. **AI Enhancement**: Sends a comprehensive summary to GPT-4 with instructions to: + - Create a compelling project overview + - Organize content into logical sections + - Generate helpful getting started guides + - Create FAQ sections + - Identify common use cases + +3. **Document Generation**: Combines the AI-generated structure with detailed page information to create the final document + +## Comparison with Standard Script + +| Feature | Standard Script | GPT-Enhanced Script | +|---------|----------------|-------------------| +| Content Analysis | Metadata only | Full content + metadata | +| Structure | Basic | AI-optimized | +| Overview Generation | Template-based | AI-generated | +| Getting Started | Simple list | Structured guide | +| FAQ Section | None | AI-generated | +| Use Cases | Basic | AI-identified | +| Fallback | None | Full fallback system | + +## API Costs + +The script uses GPT-4 which costs approximately: +- ~$0.03 per 1K tokens for input +- ~$0.06 per 1K tokens for output + +For a typical documentation site with 50-100 pages, expect costs around $0.50-$2.00 per run. 
+ +## Troubleshooting + +### "OPENAI_API_KEY environment variable is required" +Set your API key: `export OPENAI_API_KEY="your-key"` + +### "No build output found" +Run `npm run build` first to generate the HTML files + +### "Error with GPT API" +- Check your API key is valid +- Ensure you have sufficient API credits +- The script will fall back to manual generation if the API fails + +### Rate Limiting +The script includes built-in rate limiting. If you hit limits: +- Wait a few minutes before retrying +- Consider reducing the `batchSize` in the script + +## Customization + +You can modify the script to: +- Change the GPT model (currently uses `gpt-4`) +- Adjust the content extraction length +- Modify the categorization logic +- Customize the output format +- Add additional AI prompts for specific content types + +## Contributing + +To improve the script: +1. Test with different documentation structures +2. Enhance the content extraction logic +3. Improve the AI prompts for better output +4. 
Add support for additional content types \ No newline at end of file diff --git a/scripts/generate-llm-content-with-gpt.ts b/scripts/generate-llm-content-with-gpt.ts new file mode 100644 index 0000000..d613edb --- /dev/null +++ b/scripts/generate-llm-content-with-gpt.ts @@ -0,0 +1,521 @@ +import fs from 'fs'; +import path from 'path'; +import { JSDOM } from 'jsdom'; +import OpenAI from 'openai'; + +interface SitemapUrl { + url: string; + title?: string; + metaDescription?: string; + ogTitle?: string; + ogDescription?: string; + keywords?: string; + category?: string; + content?: string; +} + +interface EnhancedStructuredContent { + title: string; + tagline: string; + overview: string; + keyFeatures: string[]; + gettingStarted: { + title: string; + steps: string[]; + }; + documentation: { + [category: string]: { + title: string; + description: string; + pages: { + title: string; + url: string; + description: string; + keyPoints: string[]; + }[]; + }; + }; + faq: { + question: string; + answer: string; + }[]; + useCases: string[]; +} + +class GPTEnhancedLLMGenerator { + private openai: OpenAI; + private siteUrl: string; + private outputFile: string; + private maxConcurrent: number = 5; + + constructor(apiKey: string, siteUrl: string, outputFile: string = 'llms-enhanced.txt') { + this.openai = new OpenAI({ apiKey }); + this.siteUrl = siteUrl; + this.outputFile = outputFile; + } + + async generateEnhancedLLMContent() { + console.log('🚀 Starting GPT-enhanced LLM content generation...'); + + // Extract content from built files + const processedUrls = await this.extractFromBuiltFiles(); + console.log(`📄 Found ${processedUrls.length} documentation files`); + + // Enhance content with GPT + const enhancedContent = await this.enhanceWithGPT(processedUrls); + + // Generate the enhanced LLM file + await this.generateEnhancedDocument(enhancedContent); + + console.log(`✅ Enhanced llms.txt file generated: ${this.outputFile}`); + } + + private async extractFromBuiltFiles(): 
Promise<SitemapUrl[]> { + const results: SitemapUrl[] = []; + + // Check for Next.js build output + const nextAppDir = path.join(process.cwd(), '.next', 'server', 'app'); + const outDir = path.join(process.cwd(), 'out'); + + let buildDir: string; + + if (fs.existsSync(outDir)) { + buildDir = outDir; + } else if (fs.existsSync(nextAppDir)) { + buildDir = nextAppDir; + } else { + throw new Error('No build output found. Please run "npm run build" first.'); + } + + console.log(`📁 Reading from build directory: ${buildDir}`); + + // Find HTML files recursively + const htmlFiles = await this.findHtmlFiles(buildDir); + console.log(`📄 Found ${htmlFiles.length} HTML files`); + + // Process files in batches to avoid overwhelming the API + const batchSize = 10; + for (let i = 0; i < htmlFiles.length; i += batchSize) { + const batch = htmlFiles.slice(i, i + batchSize); + const batchPromises = batch.map(filePath => this.extractContentFromFile(filePath, buildDir)); + const batchResults = await Promise.all(batchPromises); + + results.push(...batchResults.filter(Boolean) as SitemapUrl[]); + + // Add a small delay to be respectful to the API + if (i + batchSize < htmlFiles.length) { + await new Promise(resolve => setTimeout(resolve, 1000)); + } + } + + return results; + } + + private async findHtmlFiles(dir: string): Promise<string[]> { + const htmlFiles: string[] = []; + + const traverse = (currentDir: string) => { + if (!fs.existsSync(currentDir)) return; + + const items = fs.readdirSync(currentDir); + + for (const item of items) { + const itemPath = path.join(currentDir, item); + const stat = fs.statSync(itemPath); + + if (stat.isDirectory()) { + if (['node_modules', '.git', '_next', 'static'].includes(item)) { + continue; + } + traverse(itemPath); + } else if (item.endsWith('.html')) { + htmlFiles.push(itemPath); + } + } + }; + + traverse(dir); + return htmlFiles; + } + + private async extractContentFromFile(filePath: string, buildDir: string): Promise<SitemapUrl | null> { + try { + const html = fs.readFileSync(filePath,
'utf-8'); + const dom = new JSDOM(html); + const document = dom.window.document; + + // Extract metadata + const title = document.querySelector('title')?.textContent?.trim() || ''; + const metaDescription = document.querySelector('meta[name="description"]')?.getAttribute('content')?.trim() || ''; + const ogTitle = document.querySelector('meta[property="og:title"]')?.getAttribute('content')?.trim() || ''; + const ogDescription = document.querySelector('meta[property="og:description"]')?.getAttribute('content')?.trim() || ''; + const keywords = document.querySelector('meta[name="keywords"]')?.getAttribute('content')?.trim() || ''; + + // Extract main content text + const mainContent = document.querySelector('main, [role="main"], .main-content, #main-content, article'); + let content = ''; + + if (mainContent) { + // Remove script tags, style tags, and navigation elements + const elementsToRemove = mainContent.querySelectorAll('script, style, nav, .nav, .navigation, .sidebar, .breadcrumb, .header, .footer'); + elementsToRemove.forEach(el => el.remove()); + + content = mainContent.textContent?.replace(/\s+/g, ' ').trim() || ''; + } + + // Skip if no meaningful content found + if (!title && !metaDescription && !content) { + return null; + } + + const url = this.filePathToUrl(filePath, buildDir); + + // Focus on documentation pages + if (!url.includes('/docs') && !url.includes('/blog')) { + return null; + } + + console.log(`📖 Processing: ${url}`); + + const category = this.categorizeUrl(url); + + return { + url, + title: title || ogTitle || 'Untitled', + metaDescription, + ogTitle, + ogDescription, + keywords, + category, + content: content.substring(0, 2000) // Limit content length for API efficiency + }; + + } catch (error) { + console.error(`❌ Error processing ${filePath}:`, error); + return null; + } + } + + private filePathToUrl(filePath: string, buildDir: string): string { + let urlPath = path.relative(buildDir, filePath); + + if (urlPath.endsWith('.html')) { + 
urlPath = urlPath.replace('.html', ''); + } + + if (urlPath === 'index') { + urlPath = ''; + } + + urlPath = urlPath.replace(/\\/g, '/'); + + if (urlPath && !urlPath.startsWith('/')) { + urlPath = '/' + urlPath; + } + + return this.siteUrl + urlPath; + } + + private categorizeUrl(url: string): string { + const path = url.toLowerCase(); + + if (path.includes('/blog')) return 'blog'; + if (path.includes('/docs/scripts')) return 'automation-scripts'; + if (path.includes('/docs/self-hosting')) return 'deployment'; + if (path.includes('/docs/product-docs')) return 'core-features'; + if (path.includes('/docs/changelog')) return 'updates'; + if (path.includes('/docs/getting-started')) return 'getting-started'; + if (path.includes('/docs/api')) return 'api-reference'; + if (path.includes('/docs')) return 'documentation'; + + return 'general'; + } + + private async enhanceWithGPT(urls: SitemapUrl[]): Promise<EnhancedStructuredContent> { + console.log('🤖 Enhancing content with GPT...'); + + // Group content by category + const categorizedContent = this.groupByCategory(urls); + + // Create a comprehensive content summary for GPT + const contentSummary = this.createContentSummary(categorizedContent); + + const prompt = ` +You are an expert technical writer creating comprehensive documentation for NocoDB, a no-code database platform. + +Based on the following content from the NocoDB documentation website, create a well-structured, comprehensive overview that would be perfect for an LLM training document. + +Content Summary: +${contentSummary} + +Please create a structured response with the following sections: + +1. **Project Overview**: A clear, compelling description of what NocoDB is and what problems it solves +2. **Key Features**: List the main features and capabilities +3. **Getting Started**: Step-by-step guide for new users +4. **Documentation Structure**: Organized breakdown of different documentation areas with descriptions +5. **Common Use Cases**: Real-world scenarios where NocoDB excels +6.
**FAQ**: Address common questions and concerns + +Make the content: +- Clear and accessible to both technical and non-technical users +- Well-organized with proper hierarchy +- Comprehensive but concise +- Professional and engaging +- Include specific examples where relevant + +Format your response as a JSON object with the structure matching the EnhancedStructuredContent interface. +`; + + try { + const response = await this.openai.chat.completions.create({ + model: 'gpt-4', + messages: [ + { + role: 'system', + content: 'You are an expert technical writer specializing in creating comprehensive documentation for software platforms. Respond with valid JSON only.' + }, + { + role: 'user', + content: prompt + } + ], + max_tokens: 4000, + temperature: 0.7 + }); + + const gptResponse = response.choices[0]?.message?.content; + if (!gptResponse) { + throw new Error('Empty response from GPT'); + } + + // Parse GPT response + let enhancedContent: EnhancedStructuredContent; + try { + enhancedContent = JSON.parse(gptResponse); + } catch (parseError) { + console.warn('⚠️ Failed to parse GPT JSON response, using fallback structure'); + enhancedContent = this.createFallbackStructure(urls); + } + + // Enhance with detailed page information + enhancedContent.documentation = await this.enhanceDocumentationSections(categorizedContent, enhancedContent.documentation); + + return enhancedContent; + + } catch (error) { + console.error('❌ Error with GPT API:', error); + console.log('📝 Falling back to manual content generation...'); + return this.createFallbackStructure(urls); + } + } + + private groupByCategory(urls: SitemapUrl[]): { [category: string]: SitemapUrl[] } { + const grouped: { [category: string]: SitemapUrl[] } = {}; + + for (const url of urls) { + const category = url.category || 'general'; + if (!grouped[category]) { + grouped[category] = []; + } + grouped[category].push(url); + } + + return grouped; + } + + private createContentSummary(categorizedContent: { [category: 
string]: SitemapUrl[] }): string { + let summary = ''; + + for (const [category, pages] of Object.entries(categorizedContent)) { + summary += `\n## ${category.toUpperCase()}\n`; + for (const page of pages.slice(0, 5)) { // Limit to avoid token limits + summary += `- ${page.title}: ${page.metaDescription || page.content?.substring(0, 200) || 'No description available'}\n`; + } + } + + return summary; + } + + private async enhanceDocumentationSections( + categorizedContent: { [category: string]: SitemapUrl[] }, + baseDocumentation: any + ): Promise<any> { + const enhanced: any = {}; + + for (const [category, pages] of Object.entries(categorizedContent)) { + if (category === 'blog') continue; // Skip blog posts for documentation + + const categoryInfo = baseDocumentation?.[category] || { + title: this.formatCategoryTitle(category), + description: `Documentation for ${category}`, + }; + + enhanced[category] = { + ...categoryInfo, + pages: pages.map(page => ({ + title: page.title || 'Untitled', + url: page.url, + description: page.metaDescription || page.ogDescription || 'No description available', + keyPoints: this.extractKeyPoints(page.content || '') + })) + }; + } + + return enhanced; + } + + private formatCategoryTitle(category: string): string { + return category + .split('-') + .map(word => word.charAt(0).toUpperCase() + word.slice(1)) + .join(' '); + } + + private extractKeyPoints(content: string): string[] { + if (!content) return []; + + // Simple extraction of key points from content + const sentences = content.split(/[.!?]+/).filter(s => s.trim().length > 20); + return sentences.slice(0, 3).map(s => s.trim()); + } + + private createFallbackStructure(urls: SitemapUrl[]): EnhancedStructuredContent { + const categorized = this.groupByCategory(urls); + + return { + title: "NocoDB - Open Source Airtable Alternative", + tagline: "Turn any database into a smart spreadsheet with a no-code platform", + overview: "NocoDB is an open-source no-code platform that transforms
any database into a collaborative workspace similar to Airtable. It provides a spreadsheet interface for your databases, making data management accessible to both technical and non-technical users.", + keyFeatures: [ + "Connect to existing databases (MySQL, PostgreSQL, SQLite, SQL Server, etc.)", + "Rich spreadsheet interface with collaborative features", + "REST & GraphQL APIs with JWT authentication", + "Team collaboration with fine-grained access control", + "App store for third-party integrations", + "Programmatic access via APIs and SDKs" + ], + gettingStarted: { + title: "Quick Start Guide", + steps: [ + "Install NocoDB using Docker, npm, or our cloud service", + "Connect your existing database or create a new one", + "Configure your tables and relationships", + "Set up user access and permissions", + "Start collaborating with your team" + ] + }, + documentation: {}, + faq: [ + { + question: "What databases does NocoDB support?", + answer: "NocoDB supports MySQL, PostgreSQL, SQLite, SQL Server, and many other popular databases." + }, + { + question: "Is NocoDB really free?", + answer: "Yes, NocoDB is open-source and free to use. We also offer cloud hosting and enterprise features." 
+ } + ], + useCases: [ + "Project management and tracking", + "Customer relationship management (CRM)", + "Content management systems", + "Inventory and asset management", + "Data analysis and reporting" + ] + }; + } + + private async generateEnhancedDocument(content: EnhancedStructuredContent): Promise<void> { + let document = ''; + + // Header + document += `# ${content.title}\n\n`; + document += `${content.tagline}\n\n`; + + // Overview + document += `## Overview\n\n${content.overview}\n\n`; + + // Key Features + document += `## Key Features\n\n`; + content.keyFeatures.forEach(feature => { + document += `- ${feature}\n`; + }); + document += '\n'; + + // Getting Started + document += `## ${content.gettingStarted.title}\n\n`; + content.gettingStarted.steps.forEach((step, index) => { + document += `${index + 1}. ${step}\n`; + }); + document += '\n'; + + // Documentation + document += `## Documentation\n\n`; + for (const [category, info] of Object.entries(content.documentation)) { + document += `### ${info.title}\n`; + document += `${info.description}\n\n`; + + info.pages.forEach((page: any) => { + document += `#### ${page.title}\n`; + document += `${page.description}\n`; + document += `URL: ${page.url}\n`; + if (page.keyPoints.length > 0) { + document += `Key Points:\n`; + page.keyPoints.forEach((point: string) => { + document += `- ${point}\n`; + }); + } + document += '\n'; + }); + } + + // Use Cases + document += `## Common Use Cases\n\n`; + content.useCases.forEach(useCase => { + document += `- ${useCase}\n`; + }); + document += '\n'; + + // FAQ + document += `## Frequently Asked Questions\n\n`; + content.faq.forEach(item => { + document += `**Q: ${item.question}**\n`; + document += `A: ${item.answer}\n\n`; + }); + + // Footer + document += `---\n`; + document += `Generated on: ${new Date().toISOString()}\n`; + document += `Total pages processed: ${Object.values(content.documentation).reduce((acc: number, cat: any) => acc + cat.pages.length, 0)}\n`; + + // Write to
file + fs.writeFileSync(this.outputFile, document, 'utf-8'); + } +} + +async function main() { + const apiKey = process.env.OPENAI_API_KEY; + + if (!apiKey) { + console.error('❌ OPENAI_API_KEY environment variable is required'); + console.log('💡 Set your OpenAI API key: export OPENAI_API_KEY="your-api-key-here"'); + process.exit(1); + } + + const siteUrl = process.env.SITE_URL || 'https://docs.nocodb.com'; + const outputFile = process.env.OUTPUT_FILE || 'llms-enhanced.txt'; + + const generator = new GPTEnhancedLLMGenerator(apiKey, siteUrl, outputFile); + + try { + await generator.generateEnhancedLLMContent(); + } catch (error) { + console.error('❌ Generation failed:', error); + process.exit(1); + } +} + +if (require.main === module) { + main(); +} \ No newline at end of file