Try to implement notion backend #725

Open · wants to merge 1 commit into main

7 changes: 7 additions & 0 deletions examples/notion/.env.example
@@ -0,0 +1,7 @@
# Database Configuration
COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex

# Notion Configuration
NOTION_TOKEN=secret_your_notion_integration_token_here
NOTION_DATABASE_IDS=database_id_1,database_id_2
NOTION_PAGE_IDS=page_id_1,page_id_2
69 changes: 69 additions & 0 deletions examples/notion/README.md
@@ -0,0 +1,69 @@
This example builds an embedding index based on Notion databases and pages.
It continuously updates the index as content is added, updated, or deleted in Notion,
keeping the index in sync with your Notion workspace effortlessly.

## Prerequisites

Before running the example, you need to:

1. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.

2. Prepare for Notion integration.
   - Create a Notion integration at https://www.notion.so/my-integrations
   - Copy the integration token (starts with `secret_`)
   - Share your databases and pages with the integration

3. Create a `.env` file with your Notion token and database/page IDs.
   Start by copying `.env.example`, then edit it to fill in your configuration.

   ```bash
   cp .env.example .env
   $EDITOR .env
   ```

Example `.env` file:
```
# Database Configuration
COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex

# Notion Configuration
NOTION_TOKEN=secret_your_notion_integration_token_here
NOTION_DATABASE_IDS=database_id_1,database_id_2
NOTION_PAGE_IDS=page_id_1,page_id_2
```

Note: You can specify database IDs, page IDs, or both. If both are set, this example currently indexes only the databases (database IDs take precedence in `main.py`).
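
For reference, `main.py` reads these variables roughly like this (a minimal sketch; blank entries are skipped):

```python
import os

# Comma-separated ID lists from .env; missing variables yield empty lists.
database_ids = [
    db_id.strip()
    for db_id in os.environ.get("NOTION_DATABASE_IDS", "").split(",")
    if db_id.strip()
]
page_ids = [
    page_id.strip()
    for page_id in os.environ.get("NOTION_PAGE_IDS", "").split(",")
    if page_id.strip()
]
```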

## Run

Install dependencies:

```sh
pip install -e .
```

Run:

```sh
python main.py
```

While it runs, it keeps observing changes in your Notion workspace and updates the index automatically.
At the same time, it accepts queries from the terminal and performs searches against the up-to-date index.
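
Under the hood, `main.py` wraps the query loop in `cocoindex.FlowLiveUpdater`, which keeps the flow syncing in the background while queries run. A minimal sketch of the pattern (`run_query_loop` is a hypothetical stand-in for the interactive loop in `main.py`):

```python
import cocoindex

# While the `with` block is active, the flow keeps picking up Notion changes.
with cocoindex.FlowLiveUpdater(notion_text_embedding_flow):
    run_query_loop()
```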

## CocoInsight
CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3-minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).

Run CocoInsight to understand your RAG data pipeline:

```sh
cocoindex server -ci main.py
```

You can also add a `-L` flag to have the server keep updating the index as the source changes:

```sh
cocoindex server -ci -L main.py
```

Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
160 changes: 160 additions & 0 deletions examples/notion/main.py
@@ -0,0 +1,160 @@
from dotenv import load_dotenv
from psycopg_pool import ConnectionPool
import cocoindex
import os
from typing import Any


@cocoindex.transform_flow()
def text_to_embedding(
    text: cocoindex.DataSlice[str],
) -> cocoindex.DataSlice[list[float]]:
    """
    Embed the text using a SentenceTransformer model.
    This is shared logic between indexing and querying, so it is extracted into a function.
    """
    return text.transform(
        cocoindex.functions.SentenceTransformerEmbed(
            model="sentence-transformers/all-MiniLM-L6-v2"
        )
    )
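
# Note: besides being composed inside a flow, a transform flow can also be
# evaluated directly on plain values -- see text_to_embedding.eval() in search() below.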


@cocoindex.flow_def(name="NotionTextEmbedding")
def notion_text_embedding_flow(
    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
) -> None:
    """
    Define an example flow that embeds text from Notion databases and pages into a vector database.
    """
    notion_token = os.environ["NOTION_TOKEN"]

    # Parse the comma-separated ID lists from the environment, skipping blank entries.
    database_ids = [
        db_id.strip()
        for db_id in os.environ.get("NOTION_DATABASE_IDS", "").split(",")
        if db_id.strip()
    ]
    page_ids = [
        page_id.strip()
        for page_id in os.environ.get("NOTION_PAGE_IDS", "").split(",")
        if page_id.strip()
    ]

    # For now, use only one source type at a time to avoid conflicts;
    # database IDs take precedence over page IDs.
    if database_ids:
        data_scope["notion_content"] = flow_builder.add_source(
            cocoindex.sources.Notion(
                token=notion_token,
                source_type="database",
                database_ids=database_ids,
            )
        )
    elif page_ids:
        data_scope["notion_content"] = flow_builder.add_source(
            cocoindex.sources.Notion(
                token=notion_token,
                source_type="page",
                page_ids=page_ids,
            )
        )
    else:
        # If no IDs are provided, fall back to a dummy source that produces no data.
        data_scope["notion_content"] = flow_builder.add_source(
            cocoindex.sources.Notion(
                token=notion_token,
                source_type="page",
                page_ids=[],
            )
        )

    doc_embeddings = data_scope.add_collector()

    # Process Notion content: split each entry into chunks, then embed each chunk.
    with data_scope["notion_content"].row() as notion_entry:
        notion_entry["chunks"] = notion_entry["content"].transform(
            cocoindex.functions.SplitRecursively(),
            language="markdown",
            chunk_size=200,
            chunk_overlap=0,
        )

        with notion_entry["chunks"].row() as chunk:
            chunk["embedding"] = text_to_embedding(chunk["text"])
            doc_embeddings.collect(
                notion_id=notion_entry["id"],
                title=notion_entry["title"],
                location=chunk["location"],
                text=chunk["text"],
                embedding=chunk["embedding"],
            )

    doc_embeddings.export(
        "doc_embeddings",
        cocoindex.targets.Postgres(),
        primary_key_fields=["notion_id", "location"],
        vector_indexes=[
            cocoindex.VectorIndexDef(
                field_name="embedding",
                metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY,
            )
        ],
    )
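    # The target table name is derived from the flow and target names; search()
    # below looks it up via cocoindex.utils.get_target_default_name().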


def search(pool: ConnectionPool, query: str, top_k: int = 5) -> list[dict[str, Any]]:
    # Get the table name for the export target in the notion_text_embedding_flow above.
    table_name = cocoindex.utils.get_target_default_name(
        notion_text_embedding_flow, "doc_embeddings"
    )
    # Evaluate the transform flow defined above with the input query, to get the embedding.
    query_vector = text_to_embedding.eval(query)
    # Run the query and get the results. "<=>" is pgvector's cosine distance operator.
    with pool.connection() as conn:
        with conn.cursor() as cur:
            cur.execute(
                f"""
                SELECT title, text, embedding <=> %s::vector AS distance
                FROM {table_name} ORDER BY distance LIMIT %s
                """,
                (query_vector, top_k),
            )
            return [
                {
                    "title": row[0],
                    "text": row[1],
                    # Convert cosine distance back to a similarity score.
                    "score": 1.0 - row[2],
                }
                for row in cur.fetchall()
            ]


def _main() -> None:
    # Initialize the database connection pool.
    pool = ConnectionPool(os.environ["COCOINDEX_DATABASE_URL"])

    notion_text_embedding_flow.setup()
    # Keep the index up to date with Notion while serving queries.
    with cocoindex.FlowLiveUpdater(notion_text_embedding_flow):
        # Run queries in a loop to demonstrate the query capabilities.
        while True:
            query = input("Enter search query (or Enter to quit): ")
            if query == "":
                break
            # Run the query function with the database connection pool and the query.
            results = search(pool, query)
            print("\nSearch results:")
            for result in results:
                print(f"[{result['score']:.3f}] {result['title']}")
                print(f" {result['text']}")
                print("---")
            print()


if __name__ == "__main__":
    load_dotenv()
    cocoindex.init()
    _main()
9 changes: 9 additions & 0 deletions examples/notion/pyproject.toml
@@ -0,0 +1,9 @@
[project]
name = "notion-text-embedding"
version = "0.1.0"
description = "Simple example for cocoindex: build embedding index based on Notion databases and pages."
requires-python = ">=3.11"
dependencies = ["cocoindex[embeddings]>=0.1.63", "python-dotenv>=1.0.1"]

[tool.setuptools]
packages = []
11 changes: 11 additions & 0 deletions python/cocoindex/sources.py
@@ -43,3 +43,14 @@ class AmazonS3(op.SourceSpec):
    included_patterns: list[str] | None = None
    excluded_patterns: list[str] | None = None
    sqs_queue_url: str | None = None


class Notion(op.SourceSpec):
    """Import data from Notion databases and pages."""

    _op_category = op.OpCategory.SOURCE

    token: str
    source_type: str  # "database" or "page"
    database_ids: list[str] | None = None
    page_ids: list[str] | None = None
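
# Example usage (mirrors examples/notion/main.py), e.g. inside a flow definition:
#   flow_builder.add_source(
#       cocoindex.sources.Notion(
#           token=notion_token, source_type="database", database_ids=database_ids
#       )
#   )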
1 change: 1 addition & 0 deletions src/ops/registration.rs
@@ -11,6 +11,7 @@ fn register_executor_factories(registry: &mut ExecutorFactoryRegistry) -> Result
    sources::local_file::Factory.register(registry)?;
    sources::google_drive::Factory.register(registry)?;
    sources::amazon_s3::Factory.register(registry)?;
    sources::notion::Factory.register(registry)?;

    functions::parse_json::Factory.register(registry)?;
    functions::split_recursively::register(registry)?;
1 change: 1 addition & 0 deletions src/ops/sources/mod.rs
@@ -1,3 +1,4 @@
pub mod amazon_s3;
pub mod google_drive;
pub mod local_file;
pub mod notion;