Adding GCP Bigquery vector search with ChatGPT cookbook (#1344)

pap-openai · aaronwilkowitz-openai · web-flow · commit 872a32286811 · 2024-08-07T15:21:46.000-07:00
Co-authored-by: Aaron Wilkowitz <157151487+aaronwilkowitz-openai@users.noreply.github.com>
diff --git a/examples/chatgpt/rag-quickstart/gcp/Getting_started_with_bigquery_vector_search_and_openai.ipynb b/examples/chatgpt/rag-quickstart/gcp/Getting_started_with_bigquery_vector_search_and_openai.ipynb
diff --git a/examples/chatgpt/rag-quickstart/gcp/main.py b/examples/chatgpt/rag-quickstart/gcp/main.py
@@ -0,0 +1,77 @@
+from google.cloud import bigquery
+import functions_framework
+import os
+from openai import OpenAI
+import json
+
+# Module-level configuration, read once from environment variables at import
+# time. os.getenv returns None for any variable that is not set.
+
+# Shared OpenAI client used by generate_embeddings().
+openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
+# Embedding model name passed to the OpenAI embeddings endpoint.
+embeddings_model = os.getenv('EMBEDDINGS_MODEL')
+# Together these form the `project.dataset.table` reference that
+# openai_docs_search() interpolates into its BigQuery SQL.
+project_id = os.getenv('PROJECT_ID')
+dataset_id = os.getenv('DATASET_ID')
+table_id = os.getenv('TABLE_ID')
+
+def generate_embeddings(text, model):
+    print(f'Generating embedding for: {text}')
+    # Generate embeddings for the provided text using the specified model
+    embeddings_response = openai_client.embeddings.create(model=model, input=text)
+    # Extract the embedding data from the response
+    embedding = embeddings_response.data[0].embedding
+    return embedding
+
+@functions_framework.http
+def openai_docs_search(request):
+    print('received a request')
+    client = bigquery.Client()
+    
+    request_json = request.get_json(silent=True)
+    print(request_json)
+    
+    if not request_json:
+        return json.dumps({"error": "Invalid JSON in request"}), 400, {'Content-Type': 'application/json'}
+    
+    query = request_json.get('query')
+    top_k = request_json.get('top_k', 3)
+    category = request_json.get('category', '')
+
+    if not query:
+        return json.dumps({"error": "Query parameter is required"}), 400, {'Content-Type': 'application/json'}
+    
+    embedding_query = generate_embeddings(query, embeddings_model)
+    embedding_query_list = ', '.join(map(str, embedding_query))
+    
+    sql_query = f"""
+    WITH search_results AS (
+        SELECT query.id AS query_id, base.id AS base_id, distance
+        FROM VECTOR_SEARCH(
+            TABLE `{project_id}.{dataset_id}.{table_id}`, 'content_vector',
+            (SELECT ARRAY[{embedding_query_list}] AS content_vector, 'query_vector' AS id),
+            top_k => {top_k}, distance_type => 'COSINE', options => '{{"use_brute_force": true}}')
+    )
+    SELECT sr.query_id, sr.base_id, sr.distance, ed.text, ed.title, ed.category
+    FROM search_results sr
+    JOIN `{project_id}.{dataset_id}.{table_id}` ed ON sr.base_id = ed.id
+    """
+    
+    if category:
+        sql_query += f" WHERE ed.category = '{category}'"
+
+    sql_query += " ORDER BY sr.distance;"
+    
+    query_job = client.query(sql_query)  # Make an API request.
+    
+    rows = []
+    for row in query_job:
+        print(row.title)
+        rows.append({
+            "text": row.text,
+            "title": row.title,
+            "distance": row.distance,
+            "category": row.category
+        })
+
+    response = {
+        "items": rows
+    }
+    print('sending response')
+    print(len(rows))
+    return json.dumps(response), 200
diff --git a/examples/chatgpt/rag-quickstart/gcp/requirements.txt b/examples/chatgpt/rag-quickstart/gcp/requirements.txt
@@ -0,0 +1,3 @@
+google-cloud-bigquery
+functions-framework
+openai
diff --git a/registry.yaml b/registry.yaml
@@ -1457,6 +1457,18 @@
     - gpt-actions-library
     - chatgpt
 
+- title: GCP BigQuery Vector Search with GCP Functions and GPT Actions in ChatGPT
+  path: examples/chatgpt/rag-quickstart/gcp/Getting_started_with_bigquery_vector_search_and_openai.ipynb
+  date: 2024-08-02
+  authors:
+    - pap-openai
+    - maxreid-openai
+  tags:
+    - embeddings
+    - chatgpt
+    - tiktoken
+    - completions
+
 - title: GPT Actions library - Zapier
   path: examples/chatgpt/gpt_actions_library/gpt_action_zapier.ipynb
   date: 2024-08-05

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+google-cloud-bigquery`
	`2`	`+functions-framework`
	`3`	`+openai`