# Patch summary (leading commentary; patch tools skip text before the first "diff --git" line).
#
# app/database/models.py
#   - Resource: adds a nullable JSON column `table_of_contents`; drops the commented-out
#     `resource_sections` relationship.
#   - Chunk: adds nullable integer fields `chapter_number` and `subchapter_number`; drops the
#     commented-out `section_id` field / `section_` relationship, the chunk-type TODO, and the
#     note explaining the embedding length.
#   - Deletes the commented-out `Section` model.
# migrations/versions/cdd298ac2606_.py (new file)
#   - upgrade(): adds chunks.chapter_number, chunks.subchapter_number, resources.table_of_contents.
#   - downgrade(): drops the same three columns in reverse order.
# scripts/database/resource_ingestion.py
#   - Adds the `SubChapter` model, `Chapter.subchapters`, and `Chunk.subchapter_number`.
#   - get_parsed_book(): builds SubChapter entries when a chapter has a non-empty "subchapters"
#     value (otherwise None) and reads the optional "subchapter_number" of each chunk.
#   - Persists `table_of_contents`, `chapter_number` and `subchapter_number` on insert.
#
# NOTE(review): line breaks, indentation and blank context lines were reconstructed from the hunk
#   headers' line counts; verify against the target files before applying.
# NOTE(review): SubChapter.number is `str`, while Chunk.subchapter_number is `int | None` and the
#   DB column is Integer -- confirm the two are not meant to hold the same value.
# NOTE(review): `.dict()` is deprecated in Pydantic v2 in favour of `.model_dump()` -- confirm
#   which Pydantic version the ingestion script runs on.
# NOTE(review): the migration imports `sqlmodel` without using it and its docstring has no
#   message line -- presumably autogenerated; consider adding a description.
# NOTE(review): the models.py hunks do not show an import for `JSON` -- confirm it is in scope.
# NOTE(review): the removed embedding-length note was the only explanation of Vector(1024) --
#   confirm it is documented elsewhere.
diff --git a/app/database/models.py b/app/database/models.py
index b99eafd..6d93393 100644
--- a/app/database/models.py
+++ b/app/database/models.py
@@ -393,6 +393,7 @@ class Resource(SQLModel, table=True):
     """ FIELDS """
     id: Optional[int] = Field(default=None, primary_key=True)
     name: str = Field(max_length=100)
+    table_of_contents: Optional[dict[str, Any]] = Field(default=None, sa_column=Column(JSON))
     type: Optional[enums.ResourceType] = Field(max_length=30)
     authors: Optional[list[str]] = Field(sa_column=Column(ARRAY(String(50))))
     created_at: Optional[datetime] = Field(
@@ -406,9 +407,6 @@ class Resource(SQLModel, table=True):
     resource_classes: Optional[list["ClassResource"]] = Relationship(
         back_populates="resource_", cascade_delete=True
     )
-    # resource_sections: Optional[list["Section"]] = Relationship(
-    #     back_populates="resource_", cascade_delete=True
-    # )
     resource_chunks: Optional[list["Chunk"]] = Relationship(
         back_populates="resource_", cascade_delete=True
     )
@@ -443,18 +441,11 @@ class Chunk(SQLModel, table=True):
     """ FIELDS """
     id: Optional[int] = Field(default=None, primary_key=True)
     resource_id: int = Field(foreign_key="resources.id", index=True, ondelete="CASCADE")
-    # section_id: Optional[int] = Field(
-    #     foreign_key="sections.id", index=True, ondelete="CASCADE", default=None
-    # )
     content: str
     page_number: Optional[int] = Field(default=None)
-    # TODO: Define the different types of chunks in an enum
+    chapter_number: Optional[int] = Field(default=None)
+    subchapter_number: Optional[int] = Field(default=None)
     chunk_type: Optional[enums.ChunkType] = Field(max_length=30, default=None)
-    """
-    XXX: FILL IN THE EMBEDDING LENGTH FOR YOUR EMBEDDINGS
-    - Default is set to 1024 (for bge-large vectors)
-    - Replace with 1536 for text-embedding-3-small if using OpenAI's embedder
-    """
     embedding: Any = Field(sa_column=Column(Vector(1024)))
     top_level_section_index: Optional[str] = Field(max_length=10, default=None)
     top_level_section_title: Optional[str] = Field(max_length=100, default=None)
@@ -467,48 +458,3 @@ class Chunk(SQLModel, table=True):
 
     """ RELATIONSHIPS """
     resource_: Optional["Resource"] = Relationship(back_populates="resource_chunks")
-    # section_: Optional["Section"] = Relationship(back_populates="section_chunks")
-
-
-# class Section(SQLModel, table=True):
-#     __tablename__ = "sections"
-
-#     """ FIELDS """
-#     id: Optional[int] = Field(default=None, primary_key=True)
-#     resource_id: int = Field(foreign_key="resources.id", index=True, ondelete="CASCADE")
-#     parent_section_id: Optional[int] = Field(
-#         default=None, foreign_key="sections.id", nullable=True
-#     )
-#     section_index: Optional[str] = Field(max_length=20, default=None)
-#     section_title: Optional[str] = Field(max_length=100, default=None)
-#     section_type: Optional[str] = Field(max_length=15, default=None)
-#     section_order: int
-#     page_range: Optional[list[int]] = Field(sa_column=Column(ARRAY(Integer)))
-#     summary: Optional[str] = Field(default=None)
-#     created_at: Optional[datetime] = Field(
-#         default_factory=lambda: datetime.now(timezone.utc),
-#         sa_type=DateTime(timezone=True),  # type: ignore
-#         sa_column_kwargs={"server_default": sa.func.now()},
-#         nullable=False,
-#     )
-
-#     """ RELATIONSHIPS """
-#     resource_: Resource = Relationship(back_populates="resource_sections")
-#     parent: Optional["Section"] = Relationship(
-#         back_populates="children",
-#         sa_relationship_kwargs={
-#             "remote_side": "[Section.id]"  # Quote wrapped to handle forward references
-#         },
-#     )
-
-#     # Only part I'm not too sure about
-#     children: Optional[list["Section"]] = Relationship(
-#         back_populates="parent",
-#         cascade_delete=True,
-#         sa_relationship_kwargs={
-#             "single_parent": True,  # This ensures a child can only have one parent
-#         },
-#     )
-#     section_chunks: Optional[list["Chunk"]] = Relationship(
-#         back_populates="section_", cascade_delete=True
-#     )
diff --git a/migrations/versions/cdd298ac2606_.py b/migrations/versions/cdd298ac2606_.py
new file mode 100644
index 0000000..021aa5a
--- /dev/null
+++ b/migrations/versions/cdd298ac2606_.py
@@ -0,0 +1,31 @@
+"""
+
+Revision ID: cdd298ac2606
+Revises: a1f4c7e2d3b9
+Create Date: 2026-06-03 17:25:20.071735
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+import sqlmodel
+
+
+# revision identifiers, used by Alembic.
+revision: str = 'cdd298ac2606'
+down_revision: Union[str, None] = 'a1f4c7e2d3b9'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    op.add_column('chunks', sa.Column('chapter_number', sa.Integer(), nullable=True))
+    op.add_column('chunks', sa.Column('subchapter_number', sa.Integer(), nullable=True))
+    op.add_column('resources', sa.Column('table_of_contents', sa.JSON(), nullable=True))
+
+
+def downgrade() -> None:
+    op.drop_column('resources', 'table_of_contents')
+    op.drop_column('chunks', 'subchapter_number')
+    op.drop_column('chunks', 'chapter_number')
diff --git a/scripts/database/resource_ingestion.py b/scripts/database/resource_ingestion.py
index c082741..4bfa922 100644
--- a/scripts/database/resource_ingestion.py
+++ b/scripts/database/resource_ingestion.py
@@ -39,12 +39,20 @@ class Chunk(BaseModel):
     embedding: list[float]
     page_number: int
     chapter_number: int
+    subchapter_number: int | None = None
+
+
+class SubChapter(BaseModel):
+    name: str
+    number: str
+    start_page: int
 
 
 class Chapter(BaseModel):
     name: str
     number: int
     start_page: int
+    subchapters: list[SubChapter] | None = None
 
 
 class TableOfContents(BaseModel):
@@ -85,6 +93,14 @@ def get_parsed_book(file_name: str) -> ParsedBook:
                     name=chapter["name"],
                     number=chapter["number"],
                     start_page=chapter["start_page"],
+                    subchapters=[
+                        SubChapter(
+                            name=subchapter["name"],
+                            number=subchapter["number"],
+                            start_page=subchapter["start_page"]
+                        )
+                        for subchapter in chapter.get("subchapters", [])
+                    ] if chapter.get("subchapters") else None,
                 )
                 for chapter in book_content_raw["table_of_contents"]["chapters"]
             ]
@@ -95,6 +111,7 @@ def get_parsed_book(file_name: str) -> ParsedBook:
                 embedding=chunk_raw["embedding"],
                 page_number=chunk_raw["page_number"],
                 chapter_number=chunk_raw["chapter_number"],
+                subchapter_number=chunk_raw.get("subchapter_number"),
             )
             for chunk_raw in book_content_raw["chunks"]
         ],
@@ -147,6 +164,7 @@ async def inject_subject_class_and_resource_data(parsed_book: ParsedBook):
             name=parsed_book.resource.name,
             type=parsed_book.resource.type,
             authors=parsed_book.resource.authors,
+            table_of_contents=parsed_book.table_of_contents.dict(),
         )
         session.add(resource)
         await session.flush()
@@ -202,6 +220,8 @@ async def process_chunks(
                 content=item.content,
                 chunk_type=ChunkType.text,  # TODO: include in json
                 page_number=item.page_number,
+                chapter_number=item.chapter_number,
+                subchapter_number=item.subchapter_number,
                 top_level_section_index=str(item.chapter_number),
                 top_level_section_title=title[:100] if title else None,
                 embedding=item.embedding,