From a6d5fe8ddedff5a801e7cdf4a8bdaba2b7b924e0 Mon Sep 17 00:00:00 2001 From: Pranav C Date: Sun, 8 Jun 2025 22:54:11 +0530 Subject: [PATCH] feat: llms file generation using chatgpt(WIP) --- package.json | 2 + scripts/README-gpt-enhanced.md | 175 ++++++++ scripts/generate-llm-content-with-gpt.ts | 521 +++++++++++++++++++++++ 3 files changed, 698 insertions(+) create mode 100644 scripts/README-gpt-enhanced.md create mode 100644 scripts/generate-llm-content-with-gpt.ts diff --git a/package.json b/package.json index 77bea43..f072579 100644 --- a/package.json +++ b/package.json @@ -11,6 +11,7 @@ "generate-blog-sitemap": "tsx scripts/generate-blog-sitemap.ts", "generate-overview": "tsx scripts/generate-llm-content.ts", "generate-llm-content": "tsx scripts/generate-llm-content.ts", + "generate-llm-gpt": "tsx scripts/generate-llm-content-with-gpt.ts", "postbuild": "next-sitemap && npm run generate-blog-sitemap && npm run generate-llm-content" }, "dependencies": { @@ -33,6 +34,7 @@ "next-recaptcha-v3": "^1.5.2", "next-sitemap": "^4.2.3", "next-themes": "^0.4.6", + "openai": "^4.67.3", "react": "^19.1.0", "react-dom": "^19.1.0", "tailwind-merge": "^3.2.0", diff --git a/scripts/README-gpt-enhanced.md b/scripts/README-gpt-enhanced.md new file mode 100644 index 0000000..4bb5e6b --- /dev/null +++ b/scripts/README-gpt-enhanced.md @@ -0,0 +1,175 @@ +# GPT-Enhanced LLM Content Generator + +This script generates an enhanced `llms.txt` file using OpenAI's ChatGPT API to create better structured and more comprehensive documentation for LLM training. 
+ +## Features + +- **AI-Enhanced Content**: Uses GPT-4 to analyze and improve the structure of documentation +- **Rich Content Extraction**: Extracts not just metadata but actual content from HTML pages +- **Intelligent Categorization**: Better categorization of documentation sections +- **Comprehensive Output**: Generates detailed sections including: + - Project Overview + - Key Features + - Getting Started Guide + - Detailed Documentation Structure + - Common Use Cases + - FAQ Section +- **Fallback System**: If the API fails, falls back to manual content generation +- **Rate Limiting**: Respects API limits with built-in delays and batch processing + +## Prerequisites + +1. **OpenAI API Key**: You need an active OpenAI API key +2. **Built Project**: The script reads from the Next.js build output, so run `npm run build` first + +## Setup + +1. **Install dependencies**: + ```bash + npm install + ``` + +2. **Set your OpenAI API Key**: + ```bash + export OPENAI_API_KEY="your-api-key-here" + ``` + +3. **Build the project** (if not already done): + ```bash + npm run build + ``` + +## Usage + +### Basic Usage +```bash +npm run generate-llm-gpt +``` + +### With Environment Variables +```bash +OPENAI_API_KEY="your-key" SITE_URL="https://docs.nocodb.com" OUTPUT_FILE="enhanced-llms.txt" npm run generate-llm-gpt +``` + +### Direct Script Execution +```bash +tsx scripts/generate-llm-content-with-gpt.ts +``` + +## Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `OPENAI_API_KEY` | *required* | Your OpenAI API key | +| `SITE_URL` | `https://docs.nocodb.com` | Base URL for the documentation site | +| `OUTPUT_FILE` | `llms-enhanced.txt` | Output filename for the generated file | + +## Output + +The script generates a comprehensive markdown file with the following structure: + +``` +# Project Title + +## Overview +[AI-generated comprehensive overview] + +## Key Features +- Feature 1 +- Feature 2 +... + +## Quick Start Guide +1. 
Step 1 +2. Step 2 +... + +## Documentation +### Category 1 +#### Page Title +[Description and key points] + +### Category 2 +[More documentation sections] + +## Common Use Cases +- Use case 1 +- Use case 2 +... + +## Frequently Asked Questions +**Q: Question 1** +A: Answer 1 +... +``` + +## How It Works + +1. **Content Extraction**: Scans the built HTML files and extracts: + - Page titles and metadata + - Main content text (first 2000 characters) + - URLs and categorization + +2. **AI Enhancement**: Sends a comprehensive summary to GPT-4 with instructions to: + - Create a compelling project overview + - Organize content into logical sections + - Generate helpful getting started guides + - Create FAQ sections + - Identify common use cases + +3. **Document Generation**: Combines the AI-generated structure with detailed page information to create the final document + +## Comparison with Standard Script + +| Feature | Standard Script | GPT-Enhanced Script | +|---------|----------------|-------------------| +| Content Analysis | Metadata only | Full content + metadata | +| Structure | Basic | AI-optimized | +| Overview Generation | Template-based | AI-generated | +| Getting Started | Simple list | Structured guide | +| FAQ Section | None | AI-generated | +| Use Cases | Basic | AI-identified | +| Fallback | None | Full fallback system | + +## API Costs + +The script uses GPT-4 which costs approximately: +- ~$0.03 per 1K tokens for input +- ~$0.06 per 1K tokens for output + +For a typical documentation site with 50-100 pages, expect costs around $0.50-$2.00 per run. 
+ +## Troubleshooting + +### "OPENAI_API_KEY environment variable is required" +Set your API key: `export OPENAI_API_KEY="your-key"` + +### "No build output found" +Run `npm run build` first to generate the HTML files + +### "Error with GPT API" +- Check your API key is valid +- Ensure you have sufficient API credits +- The script will fall back to manual generation if the API fails + +### Rate Limiting +The script includes built-in rate limiting. If you hit limits: +- Wait a few minutes before retrying +- Consider reducing the `batchSize` in the script + +## Customization + +You can modify the script to: +- Change the GPT model (currently uses `gpt-4`) +- Adjust the content extraction length +- Modify the categorization logic +- Customize the output format +- Add additional AI prompts for specific content types + +## Contributing + +To improve the script: +1. Test with different documentation structures +2. Enhance the content extraction logic +3. Improve the AI prompts for better output +4. 
Add support for additional content types \ No newline at end of file diff --git a/scripts/generate-llm-content-with-gpt.ts b/scripts/generate-llm-content-with-gpt.ts new file mode 100644 index 0000000..d613edb --- /dev/null +++ b/scripts/generate-llm-content-with-gpt.ts @@ -0,0 +1,521 @@ +import fs from 'fs'; +import path from 'path'; +import { JSDOM } from 'jsdom'; +import OpenAI from 'openai'; + +interface SitemapUrl { + url: string; + title?: string; + metaDescription?: string; + ogTitle?: string; + ogDescription?: string; + keywords?: string; + category?: string; + content?: string; +} + +interface EnhancedStructuredContent { + title: string; + tagline: string; + overview: string; + keyFeatures: string[]; + gettingStarted: { + title: string; + steps: string[]; + }; + documentation: { + [category: string]: { + title: string; + description: string; + pages: { + title: string; + url: string; + description: string; + keyPoints: string[]; + }[]; + }; + }; + faq: { + question: string; + answer: string; + }[]; + useCases: string[]; +} + +class GPTEnhancedLLMGenerator { + private openai: OpenAI; + private siteUrl: string; + private outputFile: string; + private maxConcurrent: number = 5; + + constructor(apiKey: string, siteUrl: string, outputFile: string = 'llms-enhanced.txt') { + this.openai = new OpenAI({ apiKey }); + this.siteUrl = siteUrl; + this.outputFile = outputFile; + } + + async generateEnhancedLLMContent() { + console.log('🚀 Starting GPT-enhanced LLM content generation...'); + + // Extract content from built files + const processedUrls = await this.extractFromBuiltFiles(); + console.log(`📄 Found ${processedUrls.length} documentation files`); + + // Enhance content with GPT + const enhancedContent = await this.enhanceWithGPT(processedUrls); + + // Generate the enhanced LLM file + await this.generateEnhancedDocument(enhancedContent); + + console.log(`✅ Enhanced llms.txt file generated: ${this.outputFile}`); + } + + private async extractFromBuiltFiles(): 
Promise<SitemapUrl[]> { + const results: SitemapUrl[] = []; + + // Check for Next.js build output + const nextAppDir = path.join(process.cwd(), '.next', 'server', 'app'); + const outDir = path.join(process.cwd(), 'out'); + + let buildDir: string; + + if (fs.existsSync(outDir)) { + buildDir = outDir; + } else if (fs.existsSync(nextAppDir)) { + buildDir = nextAppDir; + } else { + throw new Error('No build output found. Please run "npm run build" first.'); + } + + console.log(`📁 Reading from build directory: ${buildDir}`); + + // Find HTML files recursively + const htmlFiles = await this.findHtmlFiles(buildDir); + console.log(`📄 Found ${htmlFiles.length} HTML files`); + + // Process files in batches to avoid overwhelming the API + const batchSize = 10; + for (let i = 0; i < htmlFiles.length; i += batchSize) { + const batch = htmlFiles.slice(i, i + batchSize); + const batchPromises = batch.map(filePath => this.extractContentFromFile(filePath, buildDir)); + const batchResults = await Promise.all(batchPromises); + + results.push(...batchResults.filter(Boolean) as SitemapUrl[]); + + // Add a small delay to be respectful to the API + if (i + batchSize < htmlFiles.length) { + await new Promise(resolve => setTimeout(resolve, 1000)); + } + } + + return results; + } + + private async findHtmlFiles(dir: string): Promise<string[]> { + const htmlFiles: string[] = []; + + const traverse = (currentDir: string) => { + if (!fs.existsSync(currentDir)) return; + + const items = fs.readdirSync(currentDir); + + for (const item of items) { + const itemPath = path.join(currentDir, item); + const stat = fs.statSync(itemPath); + + if (stat.isDirectory()) { + if (['node_modules', '.git', '_next', 'static'].includes(item)) { + continue; + } + traverse(itemPath); + } else if (item.endsWith('.html')) { + htmlFiles.push(itemPath); + } + } + }; + + traverse(dir); + return htmlFiles; + } + + private async extractContentFromFile(filePath: string, buildDir: string): Promise<SitemapUrl | null> { + try { + const html = fs.readFileSync(filePath,
'utf-8'); + const dom = new JSDOM(html); + const document = dom.window.document; + + // Extract metadata + const title = document.querySelector('title')?.textContent?.trim() || ''; + const metaDescription = document.querySelector('meta[name="description"]')?.getAttribute('content')?.trim() || ''; + const ogTitle = document.querySelector('meta[property="og:title"]')?.getAttribute('content')?.trim() || ''; + const ogDescription = document.querySelector('meta[property="og:description"]')?.getAttribute('content')?.trim() || ''; + const keywords = document.querySelector('meta[name="keywords"]')?.getAttribute('content')?.trim() || ''; + + // Extract main content text + const mainContent = document.querySelector('main, [role="main"], .main-content, #main-content, article'); + let content = ''; + + if (mainContent) { + // Remove script tags, style tags, and navigation elements + const elementsToRemove = mainContent.querySelectorAll('script, style, nav, .nav, .navigation, .sidebar, .breadcrumb, .header, .footer'); + elementsToRemove.forEach(el => el.remove()); + + content = mainContent.textContent?.replace(/\s+/g, ' ').trim() || ''; + } + + // Skip if no meaningful content found + if (!title && !metaDescription && !content) { + return null; + } + + const url = this.filePathToUrl(filePath, buildDir); + + // Focus on documentation pages + if (!url.includes('/docs') && !url.includes('/blog')) { + return null; + } + + console.log(`📖 Processing: ${url}`); + + const category = this.categorizeUrl(url); + + return { + url, + title: title || ogTitle || 'Untitled', + metaDescription, + ogTitle, + ogDescription, + keywords, + category, + content: content.substring(0, 2000) // Limit content length for API efficiency + }; + + } catch (error) { + console.error(`❌ Error processing ${filePath}:`, error); + return null; + } + } + + private filePathToUrl(filePath: string, buildDir: string): string { + let urlPath = path.relative(buildDir, filePath); + + if (urlPath.endsWith('.html')) { + 
urlPath = urlPath.replace('.html', ''); + } + + if (urlPath === 'index') { + urlPath = ''; + } + + urlPath = urlPath.replace(/\\/g, '/'); + + if (urlPath && !urlPath.startsWith('/')) { + urlPath = '/' + urlPath; + } + + return this.siteUrl + urlPath; + } + + private categorizeUrl(url: string): string { + const path = url.toLowerCase(); + + if (path.includes('/blog')) return 'blog'; + if (path.includes('/docs/scripts')) return 'automation-scripts'; + if (path.includes('/docs/self-hosting')) return 'deployment'; + if (path.includes('/docs/product-docs')) return 'core-features'; + if (path.includes('/docs/changelog')) return 'updates'; + if (path.includes('/docs/getting-started')) return 'getting-started'; + if (path.includes('/docs/api')) return 'api-reference'; + if (path.includes('/docs')) return 'documentation'; + + return 'general'; + } + + private async enhanceWithGPT(urls: SitemapUrl[]): Promise<EnhancedStructuredContent> { + console.log('🤖 Enhancing content with GPT...'); + + // Group content by category + const categorizedContent = this.groupByCategory(urls); + + // Create a comprehensive content summary for GPT + const contentSummary = this.createContentSummary(categorizedContent); + + const prompt = ` +You are an expert technical writer creating comprehensive documentation for NocoDB, a no-code database platform. + +Based on the following content from the NocoDB documentation website, create a well-structured, comprehensive overview that would be perfect for an LLM training document. + +Content Summary: +${contentSummary} + +Please create a structured response with the following sections: + +1. **Project Overview**: A clear, compelling description of what NocoDB is and what problems it solves +2. **Key Features**: List the main features and capabilities +3. **Getting Started**: Step-by-step guide for new users +4. **Documentation Structure**: Organized breakdown of different documentation areas with descriptions +5. **Common Use Cases**: Real-world scenarios where NocoDB excels +6.
**FAQ**: Address common questions and concerns + +Make the content: +- Clear and accessible to both technical and non-technical users +- Well-organized with proper hierarchy +- Comprehensive but concise +- Professional and engaging +- Include specific examples where relevant + +Format your response as a JSON object with the structure matching the EnhancedStructuredContent interface. +`; + + try { + const response = await this.openai.chat.completions.create({ + model: 'gpt-4', + messages: [ + { + role: 'system', + content: 'You are an expert technical writer specializing in creating comprehensive documentation for software platforms. Respond with valid JSON only.' + }, + { + role: 'user', + content: prompt + } + ], + max_tokens: 4000, + temperature: 0.7 + }); + + const gptResponse = response.choices[0]?.message?.content; + if (!gptResponse) { + throw new Error('Empty response from GPT'); + } + + // Parse GPT response + let enhancedContent: EnhancedStructuredContent; + try { + enhancedContent = JSON.parse(gptResponse); + } catch (parseError) { + console.warn('⚠️ Failed to parse GPT JSON response, using fallback structure'); + enhancedContent = this.createFallbackStructure(urls); + } + + // Enhance with detailed page information + enhancedContent.documentation = await this.enhanceDocumentationSections(categorizedContent, enhancedContent.documentation); + + return enhancedContent; + + } catch (error) { + console.error('❌ Error with GPT API:', error); + console.log('📝 Falling back to manual content generation...'); + return this.createFallbackStructure(urls); + } + } + + private groupByCategory(urls: SitemapUrl[]): { [category: string]: SitemapUrl[] } { + const grouped: { [category: string]: SitemapUrl[] } = {}; + + for (const url of urls) { + const category = url.category || 'general'; + if (!grouped[category]) { + grouped[category] = []; + } + grouped[category].push(url); + } + + return grouped; + } + + private createContentSummary(categorizedContent: { [category: 
string]: SitemapUrl[] }): string { + let summary = ''; + + for (const [category, pages] of Object.entries(categorizedContent)) { + summary += `\n## ${category.toUpperCase()}\n`; + for (const page of pages.slice(0, 5)) { // Limit to avoid token limits + summary += `- ${page.title}: ${page.metaDescription || page.content?.substring(0, 200) || 'No description available'}\n`; + } + } + + return summary; + } + + private async enhanceDocumentationSections( + categorizedContent: { [category: string]: SitemapUrl[] }, + baseDocumentation: any + ): Promise<any> { + const enhanced: any = {}; + + for (const [category, pages] of Object.entries(categorizedContent)) { + if (category === 'blog') continue; // Skip blog posts for documentation + + const categoryInfo = baseDocumentation?.[category] || { + title: this.formatCategoryTitle(category), + description: `Documentation for ${category}`, + }; + + enhanced[category] = { + ...categoryInfo, + pages: pages.map(page => ({ + title: page.title || 'Untitled', + url: page.url, + description: page.metaDescription || page.ogDescription || 'No description available', + keyPoints: this.extractKeyPoints(page.content || '') + })) + }; + } + + return enhanced; + } + + private formatCategoryTitle(category: string): string { + return category + .split('-') + .map(word => word.charAt(0).toUpperCase() + word.slice(1)) + .join(' '); + } + + private extractKeyPoints(content: string): string[] { + if (!content) return []; + + // Simple extraction of key points from content + const sentences = content.split(/[.!?]+/).filter(s => s.trim().length > 20); + return sentences.slice(0, 3).map(s => s.trim()); + } + + private createFallbackStructure(urls: SitemapUrl[]): EnhancedStructuredContent { + const categorized = this.groupByCategory(urls); + + return { + title: "NocoDB - Open Source Airtable Alternative", + tagline: "Turn any database into a smart spreadsheet with a no-code platform", + overview: "NocoDB is an open-source no-code platform that transforms
any database into a collaborative workspace similar to Airtable. It provides a spreadsheet interface for your databases, making data management accessible to both technical and non-technical users.", + keyFeatures: [ + "Connect to existing databases (MySQL, PostgreSQL, SQLite, SQL Server, etc.)", + "Rich spreadsheet interface with collaborative features", + "REST & GraphQL APIs with JWT authentication", + "Team collaboration with fine-grained access control", + "App store for third-party integrations", + "Programmatic access via APIs and SDKs" + ], + gettingStarted: { + title: "Quick Start Guide", + steps: [ + "Install NocoDB using Docker, npm, or our cloud service", + "Connect your existing database or create a new one", + "Configure your tables and relationships", + "Set up user access and permissions", + "Start collaborating with your team" + ] + }, + documentation: {}, + faq: [ + { + question: "What databases does NocoDB support?", + answer: "NocoDB supports MySQL, PostgreSQL, SQLite, SQL Server, and many other popular databases." + }, + { + question: "Is NocoDB really free?", + answer: "Yes, NocoDB is open-source and free to use. We also offer cloud hosting and enterprise features." 
+ } + ], + useCases: [ + "Project management and tracking", + "Customer relationship management (CRM)", + "Content management systems", + "Inventory and asset management", + "Data analysis and reporting" + ] + }; + } + + private async generateEnhancedDocument(content: EnhancedStructuredContent): Promise<void> { + let document = ''; + + // Header + document += `# ${content.title}\n\n`; + document += `${content.tagline}\n\n`; + + // Overview + document += `## Overview\n\n${content.overview}\n\n`; + + // Key Features + document += `## Key Features\n\n`; + content.keyFeatures.forEach(feature => { + document += `- ${feature}\n`; + }); + document += '\n'; + + // Getting Started + document += `## ${content.gettingStarted.title}\n\n`; + content.gettingStarted.steps.forEach((step, index) => { + document += `${index + 1}. ${step}\n`; + }); + document += '\n'; + + // Documentation + document += `## Documentation\n\n`; + for (const [category, info] of Object.entries(content.documentation)) { + document += `### ${info.title}\n`; + document += `${info.description}\n\n`; + + info.pages.forEach((page: any) => { + document += `#### ${page.title}\n`; + document += `${page.description}\n`; + document += `URL: ${page.url}\n`; + if (page.keyPoints.length > 0) { + document += `Key Points:\n`; + page.keyPoints.forEach((point: string) => { + document += `- ${point}\n`; + }); + } + document += '\n'; + }); + } + + // Use Cases + document += `## Common Use Cases\n\n`; + content.useCases.forEach(useCase => { + document += `- ${useCase}\n`; + }); + document += '\n'; + + // FAQ + document += `## Frequently Asked Questions\n\n`; + content.faq.forEach(item => { + document += `**Q: ${item.question}**\n`; + document += `A: ${item.answer}\n\n`; + }); + + // Footer + document += `---\n`; + document += `Generated on: ${new Date().toISOString()}\n`; + document += `Total pages processed: ${Object.values(content.documentation).reduce((acc: number, cat: any) => acc + cat.pages.length, 0)}\n`; + + // Write to
file + fs.writeFileSync(this.outputFile, document, 'utf-8'); + } +} + +async function main() { + const apiKey = process.env.OPENAI_API_KEY; + + if (!apiKey) { + console.error('❌ OPENAI_API_KEY environment variable is required'); + console.log('💡 Set your OpenAI API key: export OPENAI_API_KEY="your-api-key-here"'); + process.exit(1); + } + + const siteUrl = process.env.SITE_URL || 'https://docs.nocodb.com'; + const outputFile = process.env.OUTPUT_FILE || 'llms-enhanced.txt'; + + const generator = new GPTEnhancedLLMGenerator(apiKey, siteUrl, outputFile); + + try { + await generator.generateEnhancedLLMContent(); + } catch (error) { + console.error('❌ Generation failed:', error); + process.exit(1); + } +} + +if (require.main === module) { + main(); +} \ No newline at end of file