diff --git a/code/python/data_loading/db_load.py b/code/python/data_loading/db_load.py old mode 100644 new mode 100755 index c44fb5701..953863cf0 --- a/code/python/data_loading/db_load.py +++ b/code/python/data_loading/db_load.py @@ -15,6 +15,10 @@ import tempfile import traceback from urllib.parse import urlparse +import textwrap +import copy +import re +from bs4 import BeautifulSoup, Tag from typing import List, Dict, Any, Tuple, Union, Optional @@ -24,6 +28,8 @@ read_file_lines, prepare_documents_from_json, documents_from_csv_line, + int64_hash, + get_item_name ) # Import vector database client directly @@ -499,59 +505,234 @@ async def process_csv_file(file_path: str, site: str) -> List[Dict[str, Any]]: return documents - async def process_rss_feed(file_path: str, site: str) -> List[Dict[str, Any]]: """ Process an RSS/Atom feed into document objects. - + Args: file_path: Path to the RSS file or URL site: Site identifier - + Returns: List of document objects """ print(f"Processing RSS/Atom feed: {file_path}") - + try: # Convert feed to schema.org format - podcast_episodes = rss2schema.feed_to_schema(file_path) - + items = rss2schema.feed_to_schema(file_path) + documents = [] - - # Process each episode in the feed - for episode in podcast_episodes: - # Extract URL + + # Process each item in the feed + for episode in items: + # Extract (or synthesise) the canonical URL url = episode.get("url") - - # Generate a synthetic URL if needed if not url and "name" in episode: url = f"synthetic:{site}:{episode['name']}" episode["url"] = url - print(f"Generated synthetic URL for episode: {episode['name']}") - elif not url: - # Skip items without any identifiable information + print(f"Generated synthetic URL for item: {episode['name']}") + elif not url: # Skip items with no identifier at all continue + + # CHUNKING Introduction ------------------------------------------------------------------------------ + """ + Heading-aware chunking (v2) — Misha + - Splits long HTML content into retrieval-friendly chunks + - H1 → pageTitle (metadata); H2 → parentSection (context only). + - Preserves

blocks intact + - Promotes

/

to their own “lowerHeading” chunks + - Never cuts inside a

,