18 changes: 17 additions & 1 deletion ingest/postgres_docs.py
@@ -533,16 +533,32 @@ def chunk_files(conn: psycopg.Connection, version: int) -> None:
print(f"Processed {page_count} pages.")


def get_required_env(name: str) -> str:
"""Get a required environment variable, raising an error if not set."""
value = os.environ.get(name)
if not value:
raise ValueError(f"Required environment variable {name} is not set")
return value


def main():
parser = argparse.ArgumentParser(
description="Ingest Postgres documentation into the database."
)
parser.add_argument("version", type=int, help="Postgres version to ingest")
args = parser.parse_args()
version = args.version

# Validate required environment variables before proceeding
pg_user = get_required_env('PGUSER')
pg_password = get_required_env('PGPASSWORD')
pg_host = get_required_env('PGHOST')
pg_port = get_required_env('PGPORT')
pg_database = get_required_env('PGDATABASE')

update_repo()
tag = get_version_tag(version)
db_uri = f"postgresql://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}"
db_uri = f"postgresql://{pg_user}:{pg_password}@{pg_host}:{pg_port}/{pg_database}"
with psycopg.connect(db_uri) as conn:
print(f"Building Postgres {version} ({tag}) documentation...")
checkout_tag(tag)
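For reference, a minimal sketch of the fail-fast behaviour the new `get_required_env` helper gives `main()`; the placeholder values below are illustrative only and are not part of the patch.

```python
import os


def get_required_env(name: str) -> str:
    """Return a required environment variable, raising if it is missing or empty."""
    value = os.environ.get(name)
    if not value:
        raise ValueError(f"Required environment variable {name} is not set")
    return value


# With the helper, a missing credential surfaces immediately, before
# update_repo() or any network work runs.
os.environ.setdefault("PGUSER", "postgres")   # placeholder value for this sketch
print(get_required_env("PGUSER"))             # -> "postgres"

os.environ.pop("PGPASSWORD", None)            # ensure it is unset for the demo
try:
    get_required_env("PGPASSWORD")
except ValueError as exc:
    print(exc)                                # -> Required environment variable PGPASSWORD is not set
```

Note that an empty string also counts as unset, since the check uses `if not value` rather than testing for presence alone.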
31 changes: 26 additions & 5 deletions ingest/tiger_docs.py
@@ -387,16 +387,34 @@ def get_text_embeddings(self, texts):
def get_sitemap_urls(self, domain):
"""Get sitemap URLs from robots.txt, fallback to common locations"""
sitemap_urls = []
# Maximum size for robots.txt (1MB should be more than enough)
MAX_ROBOTS_SIZE = 1024 * 1024

# Try to get sitemaps from robots.txt
robots_url = f'https://{domain}/robots.txt'
try:
self.logger.info(f'Checking robots.txt at: {robots_url}')
- response = requests.get(robots_url, timeout=10)
+ response = requests.get(robots_url, timeout=10, stream=True)
response.raise_for_status()

# Check content length before reading
content_length = response.headers.get('Content-Length')
if content_length and int(content_length) > MAX_ROBOTS_SIZE:
self.logger.warning(f'robots.txt too large ({content_length} bytes), skipping')
response.close()
raise ValueError(f'robots.txt exceeds maximum size of {MAX_ROBOTS_SIZE} bytes')

# Read with size limit
content = response.raw.read(MAX_ROBOTS_SIZE + 1, decode_content=True)
response.close()
if len(content) > MAX_ROBOTS_SIZE:
self.logger.warning(f'robots.txt too large, skipping')
raise ValueError(f'robots.txt exceeds maximum size of {MAX_ROBOTS_SIZE} bytes')

robots_text = content.decode('utf-8', errors='replace')

# Parse robots.txt for sitemap entries
- for line in response.text.split('\n'):
+ for line in robots_text.split('\n'):
line = line.strip()
if line.lower().startswith('sitemap:'):
sitemap_url = line.split(':', 1)[1].strip()
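For reference, the bounded-read pattern above pulled out as a standalone sketch; `fetch_robots_txt` is an illustrative name and `example.com` a placeholder domain, neither of which appears in the patch.

```python
import requests

MAX_ROBOTS_SIZE = 1024 * 1024  # 1 MiB cap, matching the patch


def fetch_robots_txt(domain: str, timeout: float = 10.0) -> str:
    """Fetch robots.txt with a hard size limit (illustrative helper)."""
    response = requests.get(f"https://{domain}/robots.txt", timeout=timeout, stream=True)
    try:
        response.raise_for_status()

        # Reject early when the server declares an oversized body.
        declared = response.headers.get("Content-Length")
        if declared and int(declared) > MAX_ROBOTS_SIZE:
            raise ValueError(f"robots.txt exceeds {MAX_ROBOTS_SIZE} bytes")

        # Read one byte past the cap so an oversized body is still caught
        # when Content-Length is missing or wrong.
        content = response.raw.read(MAX_ROBOTS_SIZE + 1, decode_content=True)
        if len(content) > MAX_ROBOTS_SIZE:
            raise ValueError(f"robots.txt exceeds {MAX_ROBOTS_SIZE} bytes")

        return content.decode("utf-8", errors="replace")
    finally:
        response.close()


# Extract Sitemap: entries the same way the scraper does (placeholder domain).
sitemaps = [
    line.split(":", 1)[1].strip()
    for line in fetch_robots_txt("example.com").splitlines()
    if line.strip().lower().startswith("sitemap:")
]
print(sitemaps)
```

Streaming through `response.raw` is what makes the cap effective: `response.text` on a non-streamed request would buffer the entire body before any size check could run.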
@@ -930,8 +948,8 @@ def generate_filename(self, url):

# Ensure filename isn't too long
if len(safe_path) > 100:
- # Create hash of original path and truncate
- hash_suffix = hashlib.md5(path.encode()).hexdigest()[:8]
+ # Create hash of original path using SHA256 (cryptographically secure)
+ hash_suffix = hashlib.sha256(path.encode()).hexdigest()[:16]
safe_path = safe_path[:80] + '_' + hash_suffix

return f"{safe_path}.md"
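A short, isolated sketch of the truncation rule above; `shorten_filename` is an illustrative name, while the 100-character threshold, 80-character prefix, and 16-character digest suffix come from the patched code.

```python
import hashlib


def shorten_filename(safe_path: str, original_path: str) -> str:
    """Cap long sanitized paths, keyed by a SHA-256 digest of the original path."""
    if len(safe_path) > 100:
        # 16 hex characters keep 64 bits of the digest, versus 32 bits with the
        # previous md5[:8] suffix, so distinct long URLs are far less likely to
        # collide on the same output filename.
        hash_suffix = hashlib.sha256(original_path.encode()).hexdigest()[:16]
        safe_path = safe_path[:80] + "_" + hash_suffix
    return f"{safe_path}.md"


print(shorten_filename("docs/" + "very-long-segment-" * 10, "/docs/very/long/original/path"))
```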
@@ -965,9 +983,12 @@ def generate_filename(self, url):
default='scraped_docs',
help='Output directory for scraped files (default: scraped_docs)')

# Maximum allowed pages to prevent resource exhaustion
MAX_PAGES_LIMIT = 10000

parser.add_argument('-m', '--max-pages',
type=int,
- help='Maximum number of pages to scrape (default: unlimited)')
+ help=f'Maximum number of pages to scrape (default: unlimited, max: {MAX_PAGES_LIMIT})')

parser.add_argument('--strip-images',
action='store_true',
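The patch introduces `MAX_PAGES_LIMIT` and mentions it in the `--max-pages` help text, but the enforcement itself sits outside the visible hunks. A hypothetical sketch of how such a check might look, purely as an assumption:

```python
MAX_PAGES_LIMIT = 10000  # hard cap taken from the patch


def clamp_max_pages(requested: int | None) -> int | None:
    """Reject a --max-pages value above the hard limit (hypothetical check)."""
    if requested is not None and requested > MAX_PAGES_LIMIT:
        raise SystemExit(f"--max-pages must be <= {MAX_PAGES_LIMIT}, got {requested}")
    return requested


print(clamp_max_pages(500))    # -> 500
print(clamp_max_pages(None))   # -> None (no user-supplied limit)
```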