feat: new pinecone API (#285)

gotochkin · web-flow · commit db14c9f4a6b8 · 2025-02-13T14:21:16.000-05:00
diff --git a/infrastructure/movie-search-app/README.md b/infrastructure/movie-search-app/README.md
@@ -96,7 +96,59 @@ Here is the [link to the documentation for AlloyDB](https://cloud.google.com/all
 Create a database with the name movies and the user movies_owner. You can choose your own names for the database and the user. The application takes it from environment variables. Optionally you can modify the application to use secret manager in Google Cloud as more secured approach.
 
 ### Migrate data from Pinecone to AlloyDB
-- Move the data from Pinecone to AlloyDB
+Move the data from Pinecone to AlloyDB
+- A Pinecone index record consists primarily of three main parts:
+  ID - unique row ID
+  VALUES	- vector embedding value (text-embedding-004 from Google)
+  METADATA	- Supplemental information about the data in key/value format
+
+- The future AlloyDB/PostgreSQL table, as it is defined in the app, will have the following structure:
+   ```
+                     Table "public.alloydb_table"
+         Column       |    Type     | Collation | Nullable | Default
+   --------------------+-------------+-----------+----------+---------
+   langchain_id       | uuid        |           | not null |
+   content            | text        |           | not null |
+   embedding          | vector(768) |           | not null |
+   langchain_metadata | json        |           |          |
+   Indexes:
+      "alloydb_table_pkey" PRIMARY KEY, btree (langchain_id)
+   ```
+   And here are the JSON keys for the langchain_metadata column (from the movie dataset):
+   ```
+     jsonb_object_keys
+   ---------------------
+   tags
+   genre
+   image
+   title
+   actors
+   poster
+   writer
+   runtime
+   summary
+   director
+   imdblink
+   boxoffice
+   imdbscore
+   imdbvotes
+   languages
+   viewrating
+   netflixlink
+   releasedate
+   tmdbtrailer
+   trailersite
+   seriesormovie
+   awardsreceived
+   hiddengemscore
+   metacriticscore
+   productionhouse
+   awardsnominatedfor
+   netflixreleasedate
+   countryavailability
+   rottentomatoesscore
+   ```
+- All the metadata keys are taken from the Pinecone metadata, keeping the same structure.
 
 ### Enable virtual environment for Python
 You can use either your laptop or a virtual machnie for deployment. Using a VM deployed in the same Google Cloud project simplifies deployeent and network configuration. On a Debian Linux you can enable it in the shell using the following command:
@@ -126,9 +178,9 @@ pip install -r requirements.txt
 export PINECONE_INDEX_NAME=netflix-index-01
 export PORT=8080
 export DB_USER=movies_owner
-export DB_PASS=DatabasePassword
+export DB_PASS={DATABASEPASSWORD}
 export DB_NAME=movies
-export INSTANCE_HOST=ALLOYDB_IP
+export INSTANCE_HOST={ALLOYDB_IP}
 export DB_PORT=5432
 ```
 - Here is the command used to start the application
diff --git a/infrastructure/movie-search-app/movie_search.py b/infrastructure/movie-search-app/movie_search.py
@@ -209,13 +209,13 @@ def get_movies(db: sqlalchemy.engine.base.Engine, embeddings: str) -> dict:
     stmt = sqlalchemy.text(
         """
         SELECT
-                mj.metadata->'title' as title,
-                mj.metadata->'summary' as summary,
-                mj.metadata->'director' as director,
-                mj.metadata->'actors' as actors,
+                mj.langchain_metadata->'title' as title,
+                mj.langchain_metadata->'summary' as summary,
+                mj.langchain_metadata->'director' as director,
+                mj.langchain_metadata->'actors' as actors,
                 (mj.embedding <=> (:embeddings)::vector) as distance
         FROM
-                movies_json mj
+                alloydb_table mj
         ORDER BY
                 distance ASC
         LIMIT 5;
diff --git a/infrastructure/movie-search-app/pinecone_model.py b/infrastructure/movie-search-app/pinecone_model.py
@@ -14,7 +14,7 @@
 
 import google.generativeai as genai
 from typing import Iterable
-from pinecone import Pinecone # as Pinecone
+from pinecone.grpc import PineconeGRPC as Pinecone
 import logging
 import os
 from data_model import ChatMessage, State
@@ -58,10 +58,10 @@ def get_movies(embedding: list[float]) -> dict:
         logging.warning("PINECONE_INDEX_NAME not set, using default: %s", PINECONE_INDEX_NAME)
     pc = Pinecone(api_key=state.pinecone_api_key)
     index = pc.Index(name=PINECONE_INDEX_NAME)
-    query_resp = index.query(vector=embedding, namespace="sandpaper", top_k=5)
+    query_resp = index.query(vector=embedding, namespace="sandpaper", top_k=5, include_metadata=True)
     movies_list = []
     for match in query_resp.matches:
-        meta = index.fetch(ids=[match['id']], namespace="sandpaper")["vectors"][match['id']]["metadata"]
+        meta = match["metadata"]
         movies_list.append({"title":meta["title"],"summary":meta["summary"],"director":meta["director"],"genre": meta["genre"],"actors": meta["actors"]})
     return movies_list