cohere-ai · trentfowlercohere · Jan 17, 2025 · Jan 17, 2025
@@ -30,7 +30,7 @@ As you've seen before, semantic search goes way beyond keyword search. The appli
 ### 1. Download the Dependencies
 
 ```python PYTHON
-#title Import libraries (Run this cell to execute required code) {display-mode: "form"}
+# title Import libraries (Run this cell to execute required code) {display-mode: "form"}
 
 import cohere
 import numpy as np
@@ -43,8 +43,9 @@ import altair as alt
 from sklearn.metrics.pairwise import cosine_similarity
 from annoy import AnnoyIndex
 import warnings
-warnings.filterwarnings('ignore')
-pd.set_option('display.max_colwidth', None)
+
+warnings.filterwarnings("ignore")
+pd.set_option("display.max_colwidth", None)
 ```
 
 ### 2. Get the Archive of Questions
@@ -83,14 +84,15 @@ To get a thousand embeddings of this length should take a few seconds.
 
 ```python PYTHON
 # Paste your API key here. Remember not to share it publicly
-api_key = ''
+api_key = ""
 
 # Create and retrieve a Cohere API key from dashboard.cohere.ai/welcome/register
 co = cohere.Client(api_key)
 
 # Get the embeddings
-embeds = co.embed(texts=list(df['text']),
-                  model='embed-english-v2.0').embeddings
+embeds = co.embed(
+    texts=list(df["text"]), model="embed-english-v2.0"
+).embeddings
 ```
 
 ### 4. Build the Index, search Using an Index and Conduct Nearest Neighbour Search
@@ -101,12 +103,12 @@ Let's build an index using the library called <a target="_blank" href="https://g
 
 ```python PYTHON
 # Create the search index, pass the size of embedding
-search_index = AnnoyIndex(np.array(embeds).shape[1], 'angular')
+search_index = AnnoyIndex(np.array(embeds).shape[1], "angular")
 # Add all the vectors to the search index
 for i in range(len(embeds)):
     search_index.add_item(i, embeds[i])
-search_index.build(10) # 10 trees
-search_index.save('test.ann')
+search_index.build(10)  # 10 trees
+search_index.save("test.ann")
 ```
 
 After building the index, we can use it to retrieve the nearest neighbours either of existing questions (section 3.1), or of new questions that we embed (section 3.2).
@@ -119,11 +121,16 @@ If we're only interested in measuring the similarities between the questions in
 # Choose an example (we'll retrieve others similar to it)
 example_id = 92
 # Retrieve nearest neighbors
-similar_item_ids = search_index.get_nns_by_item(example_id,10,
-                                                include_distances=True)
+similar_item_ids = search_index.get_nns_by_item(
+    example_id, 10, include_distances=True
+)
 # Format and print the text and distances
-results = pd.DataFrame(data={'texts': df.iloc[similar_item_ids[0]]['text'],
-                             'distance': similar_item_ids[1]}).drop(example_id)
+results = pd.DataFrame(
+    data={
+        "texts": df.iloc[similar_item_ids[0]]["text"],
+        "distance": similar_item_ids[1],
+    }
+).drop(example_id)
 print(f"Question:'{df.iloc[example_id]['text']}'\nNearest neighbors:")
 results
 ```
@@ -154,15 +161,21 @@ We're not limited to searching using existing items. If we get a query, we can e
 query = "What is the tallest mountain in the world?"
 
 # Get the query's embedding
-query_embed = co.embed(texts=[query],
-                  model="embed-english-v2.0").embeddings
+query_embed = co.embed(
+    texts=[query], model="embed-english-v2.0"
+).embeddings
 
 # Retrieve the nearest neighbors
-similar_item_ids = search_index.get_nns_by_vector(query_embed[0],10,
-                                                include_distances=True)
+similar_item_ids = search_index.get_nns_by_vector(
+    query_embed[0], 10, include_distances=True
+)
 # Format the results
-results = pd.DataFrame(data={'texts': df.iloc[similar_item_ids[0]]['text'], 
-                             'distance': similar_item_ids[1]})
+results = pd.DataFrame(
+    data={
+        "texts": df.iloc[similar_item_ids[0]]["text"],
+        "distance": similar_item_ids[1],
+    }
+)
 
 
 print(f"Query:'{query}'\nNearest neighbors:")
@@ -185,31 +198,27 @@ results
 ### 5. Visualize the archive
 
 ```python PYTHON
-#@title Plot the archive {display-mode: "form"}
+# @title Plot the archive {display-mode: "form"}
 
 # UMAP reduces the dimensions from 1024 to 2 dimensions that we can plot
-reducer = umap.UMAP(n_neighbors=20) 
+reducer = umap.UMAP(n_neighbors=20)
 umap_embeds = reducer.fit_transform(embeds)
 # Prepare the data to plot an interactive visualization
 # using Altair
-df_explore = pd.DataFrame(data={'text': df['text']})
-df_explore['x'] = umap_embeds[:,0]
-df_explore['y'] = umap_embeds[:,1]
+df_explore = pd.DataFrame(data={"text": df["text"]})
+df_explore["x"] = umap_embeds[:, 0]
+df_explore["y"] = umap_embeds[:, 1]
 
 # Plot
-chart = alt.Chart(df_explore).mark_circle(size=60).encode(
-    x=#'x',
-    alt.X('x',
-        scale=alt.Scale(zero=False)
-    ),
-    y=
-    alt.Y('y',
-        scale=alt.Scale(zero=False)
-    ),
-    tooltip=['text']
-).properties(
-    width=700,
-    height=400
+chart = (
+    alt.Chart(df_explore)
+    .mark_circle(size=60)
+    .encode(
+        x=alt.X("x", scale=alt.Scale(zero=False)),  #'x',
+        y=alt.Y("y", scale=alt.Scale(zero=False)),
+        tooltip=["text"],
+    )
+    .properties(width=700, height=400)
 )
 chart.interactive()
 ```

@@ -47,7 +47,9 @@ We'll work with [the CaseHOLD dataset](https://huggingface.co/datasets/casehold/
 We'll work with an [IterableDataset](https://huggingface.co/docs/datasets/en/about_mapstyle_vs_iterable) and load only a small fraction of examples at a time to avoid loading the entire dataset in memory.
 
 ```python PYTHON
-iterable_dataset = load_dataset("casehold/casehold", split="train", streaming=True)
+iterable_dataset = load_dataset(
+    "casehold/casehold", split="train", streaming=True
+)
 ```
 
 For this example, we'll use a subset of only 420 data points, to be split across training, validation and test sets. 
@@ -73,14 +75,18 @@ d = []
 # Store each dataset entry as dictionary within Python list
 for example in iterable_dataset.take(num_examples):
     selected_passage_idx = "holding_{}".format(example["label"])
-    hard_negatives_idx = [x for x in all_labels if x != selected_passage_idx]
+    hard_negatives_idx = [
+        x for x in all_labels if x != selected_passage_idx
+    ]
     d.append(
         {
-            'query': example["citing_prompt"],
-            'docs': [example.get(key) for key in all_labels],
-            'label': int(example["label"]),
-            'relevant_passages': [example[selected_passage_idx]],
-            'hard_negatives': [example.get(key) for key in hard_negatives_idx]
+            "query": example["citing_prompt"],
+            "docs": [example.get(key) for key in all_labels],
+            "label": int(example["label"]),
+            "relevant_passages": [example[selected_passage_idx]],
+            "hard_negatives": [
+                example.get(key) for key in hard_negatives_idx
+            ],
         }
     )
 
@@ -98,8 +104,8 @@ test_num = 10
 
 # Do train-validation-test split
 df_train = df[:train_num].copy()
-df_valid = df[train_num:train_num+valid_num].copy()
-df_test = df[train_num+valid_num:].copy()
+df_valid = df[train_num : train_num + valid_num].copy()
+df_test = df[train_num + valid_num :].copy()
 ```
 
 ### Step 2: Assess the Pre-Trained Model
@@ -116,10 +122,12 @@ To get predictions, we'll use the [`rerank()` method](/reference/rerank-1) of th
 ```python PYTHON
 # Predict index of document that correctly answers query
 def get_prediction(item, model="rerank-english-v3.0"):
-    response = co.rerank(model=model,
-                         query=item["query"], 
-                         documents=item["docs"], 
-                         top_n=1)
+    response = co.rerank(
+        model=model,
+        query=item["query"],
+        documents=item["docs"],
+        top_n=1,
+    )
     prediction = response.results[0].index
     return prediction
 ```
@@ -129,7 +137,11 @@ We apply this function to every row in the test set and save the predictions in
 ```python PYTHON
 # Calculate pre-trained model's test accuracy
 df_test["baseline_prediction"] = df_test.apply(get_prediction, axis=1)
-print("Baseline accuracy:", sum(df_test["baseline_prediction"] == df_test["label"])/len(df_test))
+print(
+    "Baseline accuracy:",
+    sum(df_test["baseline_prediction"] == df_test["label"])
+    / len(df_test),
+)
 ```
 
 The pre-trained model gets 60% accuracy, which isn't bad!  But we can do better with fine-tuning.
@@ -150,20 +162,26 @@ def create_rerank_ft_data(query, relevant_passages, hard_negatives):
     formatted_data = {
         "query": query,
         "relevant_passages": relevant_passages,
-        "hard_negatives": hard_negatives
+        "hard_negatives": hard_negatives,
     }
     return formatted_data
 
+
 # Creates the jsonl file if it does not already exist
 def create_jsonl_from_list(file_name, df):
-    path = f'{file_name}.jsonl'
+    path = f"{file_name}.jsonl"
     if not os.path.isfile(path):
-        with open(path, 'w+') as file:
+        with open(path, "w+") as file:
             for idx, row in df.iterrows():
-                formatted_data = create_rerank_ft_data(row["query"], row["relevant_passages"], row["hard_negatives"])
-                file.write(json.dumps(formatted_data) + '\n')
+                formatted_data = create_rerank_ft_data(
+                    row["query"],
+                    row["relevant_passages"],
+                    row["hard_negatives"],
+                )
+                file.write(json.dumps(formatted_data) + "\n")
             file.close()
 
+
 # Create training and validation jsonl files
 create_jsonl_from_list("casehold_train", df_train)
 create_jsonl_from_list("casehold_valid", df_valid)
@@ -208,8 +226,15 @@ In the following code, we calculate the test accuracy of the fine-tuned model. W
 
 ```python PYTHON
 # Calculate fine-tuned model's test accuracy
-df_test['ft_prediction'] = df_test.apply(get_prediction, model='9f22e08a-f1ab-4cee-9524-607dcb08c954-ft', axis=1)
-print("Fine-tune accuracy:", sum(df_test["ft_prediction"] == df_test["label"])/len(df_test))
+df_test["ft_prediction"] = df_test.apply(
+    get_prediction,
+    model="9f22e08a-f1ab-4cee-9524-607dcb08c954-ft",
+    axis=1,
+)
+print(
+    "Fine-tune accuracy:",
+    sum(df_test["ft_prediction"] == df_test["label"]) / len(df_test),
+)
 ```
 
 The fine-tuned model has test accuracy 80%, which is a meaningful improvement over the pre-trained model's accuracy of 60%. 

@@ -47,7 +47,8 @@ prediction_without_search = [
     co.chat(
         message=query,
         max_tokens=50,
-    ) for _ in range(5)
+    )
+    for _ in range(5)
 ]
 ```
 
@@ -88,7 +89,7 @@ Next, we’ll feed these 20 paragraphs to a generative model, and instruct it to
 In order to get the generative model to answer a question based on a certain context, we need to create a prompt. And in this prompt, we need to give it a command and a context. The context will be the concatenation of all the paragraphs retrieved in the search step, which we can obtain using this line of code:
 
 ```python PYTHON
-context = [r['text'] for r in responses]
+context = [r["text"] for r in responses]
 ```
 
 The array `context` contains a lot of text, and, given the good results we’ve been obtaining with search mechanisms, we are fairly confident that somewhere in this text lies the answer to our original question. Now, we invoke the `Chat` endpoint. The prompt we’ll use is the following.
@@ -108,10 +109,8 @@ In other words, we’ve prompted the model to answer the question, but only from
 
 ```python PYTHON
 prediction_with_search = [
-    co.chat(
-        message=prompt,
-        max_tokens=50)
-    for _ in range(5)]
+    co.chat(message=prompt, max_tokens=50) for _ in range(5)
+]
 ```
 
 The five responses we get are the following (just like before, they are truncated):

@@ -31,13 +31,15 @@ co = cohere.Client(cohere_api_key)
 
 # Connect to the Weaviate demo database containing 10M Wikipedia vectors
 # This uses a public READ-ONLY Weaviate API key
-auth_config = weaviate.auth.AuthApiKey(api_key="76320a90-53d8-42bc-b41d-678647c6672e")
+auth_config = weaviate.auth.AuthApiKey(
+    api_key="76320a90-53d8-42bc-b41d-678647c6672e"
+)
 client = weaviate.Client(
     url="https://cohere-demo.weaviate.network/",
     auth_client_secret=auth_config,
     additional_headers={
         "X-Cohere-Api-Key": cohere_api_key,
-    }
+    },
 )
 ```
 
@@ -46,25 +48,30 @@ client = weaviate.Client(
 To use keyword matching, we’ll first define the following function for keyword search. In this function, we’ll tell the vector database what properties we want from each retrieved document. We’ll also filter them to the English language (using results_lang), but feel free to explore searching in other languages as well!
 
 ```python PYTHON
-def keyword_search(query, results_lang='en', num_results=10):
-    properties = ["text", "title", "url", "views", "lang", "_additional {distance}"]
+def keyword_search(query, results_lang="en", num_results=10):
+    properties = [
+        "text",
+        "title",
+        "url",
+        "views",
+        "lang",
+        "_additional {distance}",
+    ]
 
     where_filter = {
         "path": ["lang"],
         "operator": "Equal",
-        "valueString": results_lang
+        "valueString": results_lang,
     }
 
     response = (
         client.query.get("Articles", properties)
-        .with_bm25(
-            query=query
-        )
+        .with_bm25(query=query)
         .with_where(where_filter)
         .with_limit(num_results)
         .do()
     )
-    result = response['data']['Get']['Articles']
+    result = response["data"]["Get"]["Articles"]
     return result
 ```
 

@@ -157,7 +157,7 @@ def get_similarity(target: List[float], candidates: List[float], top_k: int):
   cos_scores = torch.mm(target, candidates)
 
   scores, indices = torch.topk(cos_scores, k=top_k)  
-  similarity_hits = \[{'id': idx, 'score': score} for idx, score in  
+  similarity_hits = [{'id': idx, 'score': score} for idx, score in  
     zip(indices[0].tolist(), scores[0].tolist())]
 
   return similarity_hits

@@ -61,11 +61,11 @@ Ok, there’s a high chance that the answer is there. Let’s see if Rerank can
 ```python PYTHON
 def rerank_responses(query, responses, num_responses=3):
     reranked_responses = co.rerank(
-        query = query,
-        documents = responses,
-        top_n = num_responses,
-        model = 'rerank-english-v3.0',
-        return_documents=True
+        query=query,
+        documents=responses,
+        top_n=num_responses,
+        model="rerank-english-v3.0",
+        return_documents=True,
     )
     return reranked_responses
 ```