Tanzania-AI-Community · alvaro-mazcu · Jun 3, 2026
diff --git a/app/database/models.py b/app/database/models.py
@@ -393,6 +393,7 @@ class Resource(SQLModel, table=True):
     """ FIELDS """
     id: Optional[int] = Field(default=None, primary_key=True)
     name: str = Field(max_length=100)
+    table_of_contents: Optional[dict[str, Any]] = Field(default=None, sa_column=Column(JSON))
     type: Optional[enums.ResourceType] = Field(max_length=30)
     authors: Optional[list[str]] = Field(sa_column=Column(ARRAY(String(50))))
     created_at: Optional[datetime] = Field(
@@ -406,9 +407,6 @@ class Resource(SQLModel, table=True):
     resource_classes: Optional[list["ClassResource"]] = Relationship(
         back_populates="resource_", cascade_delete=True
     )
-    # resource_sections: Optional[list["Section"]] = Relationship(
-    #     back_populates="resource_", cascade_delete=True
-    # )
     resource_chunks: Optional[list["Chunk"]] = Relationship(
         back_populates="resource_", cascade_delete=True
     )
@@ -443,18 +441,11 @@ class Chunk(SQLModel, table=True):
     """ FIELDS """
     id: Optional[int] = Field(default=None, primary_key=True)
     resource_id: int = Field(foreign_key="resources.id", index=True, ondelete="CASCADE")
-    # section_id: Optional[int] = Field(
-    #     foreign_key="sections.id", index=True, ondelete="CASCADE", default=None
-    # )
     content: str
     page_number: Optional[int] = Field(default=None)
-    # TODO: Define the different types of chunks in an enum
+    chapter_number: Optional[int] = Field(default=None)
+    subchapter_number: Optional[int] = Field(default=None)
     chunk_type: Optional[enums.ChunkType] = Field(max_length=30, default=None)
-    """
-    XXX: FILL IN THE EMBEDDING LENGTH FOR YOUR EMBEDDINGS
-    - Default is set to 1024 (for bge-large vectors)
-    - Replace with 1536 for text-embedding-3-small if using OpenAI's embedder
-    """
     embedding: Any = Field(sa_column=Column(Vector(1024)))
     top_level_section_index: Optional[str] = Field(max_length=10, default=None)
     top_level_section_title: Optional[str] = Field(max_length=100, default=None)
@@ -467,48 +458,3 @@ class Chunk(SQLModel, table=True):
 
     """ RELATIONSHIPS """
     resource_: Optional["Resource"] = Relationship(back_populates="resource_chunks")
-    # section_: Optional["Section"] = Relationship(back_populates="section_chunks")
-
-
-# class Section(SQLModel, table=True):
-#     __tablename__ = "sections"
-
-#     """ FIELDS """
-#     id: Optional[int] = Field(default=None, primary_key=True)
-#     resource_id: int = Field(foreign_key="resources.id", index=True, ondelete="CASCADE")
-#     parent_section_id: Optional[int] = Field(
-#         default=None, foreign_key="sections.id", nullable=True
-#     )
-#     section_index: Optional[str] = Field(max_length=20, default=None)
-#     section_title: Optional[str] = Field(max_length=100, default=None)
-#     section_type: Optional[str] = Field(max_length=15, default=None)
-#     section_order: int
-#     page_range: Optional[list[int]] = Field(sa_column=Column(ARRAY(Integer)))
-#     summary: Optional[str] = Field(default=None)
-#     created_at: Optional[datetime] = Field(
-#         default_factory=lambda: datetime.now(timezone.utc),
-#         sa_type=DateTime(timezone=True),  # type: ignore
-#         sa_column_kwargs={"server_default": sa.func.now()},
-#         nullable=False,
-#     )
-
-#     """ RELATIONSHIPS """
-#     resource_: Resource = Relationship(back_populates="resource_sections")
-#     parent: Optional["Section"] = Relationship(
-#         back_populates="children",
-#         sa_relationship_kwargs={
-#             "remote_side": "[Section.id]"  # Quote wrapped to handle forward references
-#         },
-#     )
-
-#     # Only part I'm not too sure about
-#     children: Optional[list["Section"]] = Relationship(
-#         back_populates="parent",
-#         cascade_delete=True,
-#         sa_relationship_kwargs={
-#             "single_parent": True,  # This ensures a child can only have one parent
-#         },
-#     )
-#     section_chunks: Optional[list["Chunk"]] = Relationship(
-#         back_populates="section_", cascade_delete=True
-#     )
diff --git a/migrations/versions/cdd298ac2606_.py b/migrations/versions/cdd298ac2606_.py
@@ -0,0 +1,31 @@
+"""Add chapter/subchapter numbers to chunks and table_of_contents to resources.
+
+Revision ID: cdd298ac2606
+Revises: a1f4c7e2d3b9
+Create Date: 2026-06-03 17:25:20.071735
+
+"""
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+import sqlmodel
+
+
+# revision identifiers, used by Alembic.
+revision: str = 'cdd298ac2606'
+down_revision: Union[str, None] = 'a1f4c7e2d3b9'
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:  # Add nullable chapter/subchapter columns to chunks and a nullable JSON table_of_contents to resources.
+    op.add_column('chunks', sa.Column('chapter_number', sa.Integer(), nullable=True))
+    op.add_column('chunks', sa.Column('subchapter_number', sa.Integer(), nullable=True))
+    op.add_column('resources', sa.Column('table_of_contents', sa.JSON(), nullable=True))
+
+
+def downgrade() -> None:  # Drop the three columns added by upgrade(), in reverse order of their creation.
+    op.drop_column('resources', 'table_of_contents')
+    op.drop_column('chunks', 'subchapter_number')
+    op.drop_column('chunks', 'chapter_number')
diff --git a/scripts/database/resource_ingestion.py b/scripts/database/resource_ingestion.py
@@ -39,12 +39,20 @@ class Chunk(BaseModel):
     embedding: list[float]
     page_number: int
     chapter_number: int
+    subchapter_number: int | None = None
+
+
+class SubChapter(BaseModel):  # One subchapter entry listed under a Chapter in the parsed table of contents.
+    name: str
+    number: str  # NOTE(review): str here, but Chunk.subchapter_number is int | None — confirm this is intended
+    start_page: int
 
 
 class Chapter(BaseModel):
     name: str
     number: int
     start_page: int
+    subchapters: list[SubChapter] | None = None  # None when the source chapter has no (or an empty) "subchapters" list
 
 
 class TableOfContents(BaseModel):
@@ -85,6 +93,14 @@ def get_parsed_book(file_name: str) -> ParsedBook:
                     name=chapter["name"],
                     number=chapter["number"],
                     start_page=chapter["start_page"],
+                    subchapters=[
+                        SubChapter(
+                            name=subchapter["name"],
+                            number=subchapter["number"],
+                            start_page=subchapter["start_page"]
+                        )
+                        for subchapter in chapter.get("subchapters", [])
+                    ] if chapter.get("subchapters") else None,
                 )
                 for chapter in book_content_raw["table_of_contents"]["chapters"]
             ]
@@ -95,6 +111,7 @@ def get_parsed_book(file_name: str) -> ParsedBook:
                 embedding=chunk_raw["embedding"],
                 page_number=chunk_raw["page_number"],
                 chapter_number=chunk_raw["chapter_number"],
+                subchapter_number=chunk_raw.get("subchapter_number"),
             )
             for chunk_raw in book_content_raw["chunks"]
         ],
@@ -147,6 +164,7 @@ async def inject_subject_class_and_resource_data(parsed_book: ParsedBook):
                     name=parsed_book.resource.name,
                     type=parsed_book.resource.type,
                     authors=parsed_book.resource.authors,
+                    table_of_contents=parsed_book.table_of_contents.dict(),
                 )
                 session.add(resource)
                 await session.flush()
@@ -202,6 +220,8 @@ async def process_chunks(
                     content=item.content,
                     chunk_type=ChunkType.text,  # TODO: include in json
                     page_number=item.page_number,
+                    chapter_number=item.chapter_number,
+                    subchapter_number=item.subchapter_number,
                     top_level_section_index=str(item.chapter_number),
                     top_level_section_title=title[:100] if title else None,
                     embedding=item.embedding,