18 changes: 17 additions & 1 deletion ingest/postgres_docs.py
@@ -533,16 +533,32 @@ def chunk_files(conn: psycopg.Connection, version: int) -> None:
print(f"Processed {page_count} pages.")


def get_required_env(name: str) -> str:
"""Get a required environment variable, raising an error if not set."""
value = os.environ.get(name)
if not value:
raise ValueError(f"Required environment variable {name} is not set")
return value


def main():
parser = argparse.ArgumentParser(
description="Ingest Postgres documentation into the database."
)
parser.add_argument("version", type=int, help="Postgres version to ingest")
args = parser.parse_args()
version = args.version

# Validate required environment variables before proceeding
pg_user = get_required_env('PGUSER')
pg_password = get_required_env('PGPASSWORD')
pg_host = get_required_env('PGHOST')
pg_port = get_required_env('PGPORT')
pg_database = get_required_env('PGDATABASE')

update_repo()
tag = get_version_tag(version)
db_uri = f"postgresql://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}"
db_uri = f"postgresql://{pg_user}:{pg_password}@{pg_host}:{pg_port}/{pg_database}"
with psycopg.connect(db_uri) as conn:
print(f"Building Postgres {version} ({tag}) documentation...")
checkout_tag(tag)
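For reference, a minimal sketch of the fail-fast behaviour the new `get_required_env` helper gives `main()`; the placeholder values below are illustrative only and are not part of the patch.

```python
import os


def get_required_env(name: str) -> str:
    """Return a required environment variable, raising if it is missing or empty."""
    value = os.environ.get(name)
    if not value:
        raise ValueError(f"Required environment variable {name} is not set")
    return value


# With the helper, a missing credential surfaces immediately, before
# update_repo() or any network work runs.
os.environ.setdefault("PGUSER", "postgres")   # placeholder value for this sketch
print(get_required_env("PGUSER"))             # -> "postgres"

os.environ.pop("PGPASSWORD", None)            # ensure it is unset for the demo
try:
    get_required_env("PGPASSWORD")
except ValueError as exc:
    print(exc)                                # -> Required environment variable PGPASSWORD is not set
```

Note that an empty string also counts as unset, since the check uses `if not value` rather than testing for presence alone.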
31 changes: 26 additions & 5 deletions ingest/tiger_docs.py
@@ -387,16 +387,34 @@ def get_text_embeddings(self, texts):
def get_sitemap_urls(self, domain):
"""Get sitemap URLs from robots.txt, fallback to common locations"""
sitemap_urls = []
# Maximum size for robots.txt (1MB should be more than enough)
MAX_ROBOTS_SIZE = 1024 * 1024

# Try to get sitemaps from robots.txt
robots_url = f'https://{domain}/robots.txt'
try:
self.logger.info(f'Checking robots.txt at: {robots_url}')
- response = requests.get(robots_url, timeout=10)
+ response = requests.get(robots_url, timeout=10, stream=True)
response.raise_for_status()

# Check content length before reading
content_length = response.headers.get('Content-Length')
if content_length and int(content_length) > MAX_ROBOTS_SIZE:
self.logger.warning(f'robots.txt too large ({content_length} bytes), skipping')
response.close()
raise ValueError(f'robots.txt exceeds maximum size of {MAX_ROBOTS_SIZE} bytes')

# Read with size limit
content = response.raw.read(MAX_ROBOTS_SIZE + 1, decode_content=True)
response.close()
if len(content) > MAX_ROBOTS_SIZE:
self.logger.warning(f'robots.txt too large, skipping')
raise ValueError(f'robots.txt exceeds maximum size of {MAX_ROBOTS_SIZE} bytes')

robots_text = content.decode('utf-8', errors='replace')

# Parse robots.txt for sitemap entries
- for line in response.text.split('\n'):
+ for line in robots_text.split('\n'):
line = line.strip()
if line.lower().startswith('sitemap:'):
sitemap_url = line.split(':', 1)[1].strip()
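For reference, the bounded-read pattern above pulled out as a standalone sketch; `fetch_robots_txt` is an illustrative name and `example.com` a placeholder domain, neither of which appears in the patch.

```python
import requests

MAX_ROBOTS_SIZE = 1024 * 1024  # 1 MiB cap, matching the patch


def fetch_robots_txt(domain: str, timeout: float = 10.0) -> str:
    """Fetch robots.txt with a hard size limit (illustrative helper)."""
    response = requests.get(f"https://{domain}/robots.txt", timeout=timeout, stream=True)
    try:
        response.raise_for_status()

        # Reject early when the server declares an oversized body.
        declared = response.headers.get("Content-Length")
        if declared and int(declared) > MAX_ROBOTS_SIZE:
            raise ValueError(f"robots.txt exceeds {MAX_ROBOTS_SIZE} bytes")

        # Read one byte past the cap so an oversized body is still caught
        # when Content-Length is missing or wrong.
        content = response.raw.read(MAX_ROBOTS_SIZE + 1, decode_content=True)
        if len(content) > MAX_ROBOTS_SIZE:
            raise ValueError(f"robots.txt exceeds {MAX_ROBOTS_SIZE} bytes")

        return content.decode("utf-8", errors="replace")
    finally:
        response.close()


# Extract Sitemap: entries the same way the scraper does (placeholder domain).
sitemaps = [
    line.split(":", 1)[1].strip()
    for line in fetch_robots_txt("example.com").splitlines()
    if line.strip().lower().startswith("sitemap:")
]
print(sitemaps)
```

Streaming through `response.raw` is what makes the cap effective: `response.text` on a non-streamed request would buffer the entire body before any size check could run.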
@@ -930,8 +948,8 @@ def generate_filename(self, url):

# Ensure filename isn't too long
if len(safe_path) > 100:
- # Create hash of original path and truncate
- hash_suffix = hashlib.md5(path.encode()).hexdigest()[:8]
+ # Create hash of original path using SHA256 (cryptographically secure)
+ hash_suffix = hashlib.sha256(path.encode()).hexdigest()[:16]
safe_path = safe_path[:80] + '_' + hash_suffix

return f"{safe_path}.md"
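A short, isolated sketch of the truncation rule above; `shorten_filename` is an illustrative name, while the 100-character threshold, 80-character prefix, and 16-character digest suffix come from the patched code.

```python
import hashlib


def shorten_filename(safe_path: str, original_path: str) -> str:
    """Cap long sanitized paths, keyed by a SHA-256 digest of the original path."""
    if len(safe_path) > 100:
        # 16 hex characters keep 64 bits of the digest, versus 32 bits with the
        # previous md5[:8] suffix, so distinct long URLs are far less likely to
        # collide on the same output filename.
        hash_suffix = hashlib.sha256(original_path.encode()).hexdigest()[:16]
        safe_path = safe_path[:80] + "_" + hash_suffix
    return f"{safe_path}.md"


print(shorten_filename("docs/" + "very-long-segment-" * 10, "/docs/very/long/original/path"))
```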
@@ -965,9 +983,12 @@ def generate_filename(self, url):
default='scraped_docs',
help='Output directory for scraped files (default: scraped_docs)')

# Maximum allowed pages to prevent resource exhaustion
MAX_PAGES_LIMIT = 10000

parser.add_argument('-m', '--max-pages',
type=int,
- help='Maximum number of pages to scrape (default: unlimited)')
+ help=f'Maximum number of pages to scrape (default: unlimited, max: {MAX_PAGES_LIMIT})')

parser.add_argument('--strip-images',
action='store_true',
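The patch introduces `MAX_PAGES_LIMIT` and mentions it in the `--max-pages` help text, but the enforcement itself sits outside the visible hunks. A hypothetical sketch of how such a check might look, purely as an assumption:

```python
MAX_PAGES_LIMIT = 10000  # hard cap taken from the patch


def clamp_max_pages(requested: int | None) -> int | None:
    """Reject a --max-pages value above the hard limit (hypothetical check)."""
    if requested is not None and requested > MAX_PAGES_LIMIT:
        raise SystemExit(f"--max-pages must be <= {MAX_PAGES_LIMIT}, got {requested}")
    return requested


print(clamp_max_pages(500))    # -> 500
print(clamp_max_pages(None))   # -> None (no user-supplied limit)
```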