Merge pull request #33 from longtermrisk/fix_inspect_ai_job

nielsrolf · web-flow · commit a2a86e96372d · 2025-07-22T11:36:35.000+02:00
Fixing inference with OpenAI API
diff --git a/.env.example b/.env.example
@@ -1,3 +1,4 @@
 OPENWEIGHTS_API_KEY=<your_openweights_api_key>
 HF_ORG=longtermrisk
 OW_DEFAULT_API_KEY=<optional key that will be used by vllm API deployments>
+OPENAI_API_KEY=<optional key required for examples using the OpenAI API>
diff --git a/example/run_inference_job_wt_openai_model.py b/example/run_inference_job_wt_openai_model.py
@@ -0,0 +1,146 @@
+"""Create an inference job with an OpenAI model and poll its results."""
+
+import json
+import logging
+import os
+import random
+import time
+from typing import Dict
+
+from dotenv import load_dotenv
+
+from openweights import OpenWeights
+import openweights.jobs.inference
+
+
+def run_inference_job_and_get_outputs(
+    filepath_conversations: str,
+    model_to_evaluate: str,
+    wait_for_completion: bool = False,
+    display_log_file: bool = False,
+    n_examples_to_log: int = 0,
+    inference_hyperparameters: Dict = None,
+):
+    load_dotenv()
+    client = OpenWeights()
+
+    # Upload inference file
+    with open(filepath_conversations, "rb") as file:
+        file = client.files.create(file, purpose="conversations")
+    file_id = file["id"]
+
+    keys_to_rm = [
+        "learning_rate",
+        "per_device_train_batch_size",
+        "gradient_accumulation_steps",
+        "max_seq_length",
+        "load_in_4bit",
+        "split",
+    ]
+    for key in keys_to_rm:
+        if key in inference_hyperparameters:
+            del inference_hyperparameters[key]
+
+    # Create an inference job
+    logging.info(
+        f"Running inference for {model_to_evaluate} with parameters: {json.dumps(inference_hyperparameters, indent=4)}"
+    )
+    job = client.inference.create(
+        model=model_to_evaluate,
+        input_file_id=file_id,
+        **inference_hyperparameters,
+    )
+
+    if isinstance(job, dict):
+        if "results" in job:  # Completed OpenAI jobs
+            output = job["results"]
+            logging.info(f"Returning loaded outputs with length {len(output)}")
+            if n_examples_to_log > 0:
+                logging.info(f"Logging {n_examples_to_log} random outputs:")
+                random_state = random.getstate()
+                for i in random.sample(
+                    range(len(output)), min(n_examples_to_log, len(output))
+                ):
+                    logging.info(json.dumps(output[i], indent=4))
+                random.setstate(random_state)
+        elif "batch_job_info" in job:  # Failed or running OpenAI batch jobs
+            logging.info(f"Got batch job: {json.dumps(job, indent=4)}")
+            logging.info(f"Retry when the OpenAI batch job is complete...")
+            return None
+        else:
+            raise ValueError(f"Unknown job type: {type(job)}")
+    else:  # Regular OpenWeigths Jobs
+        logging.info(job)
+
+        # Poll job status
+        current_status = job["status"]
+        while True:
+            job = client.jobs.retrieve(job["id"])
+            if job["status"] != current_status:
+                # logging.info(job)
+                current_status = job["status"]
+            if job["status"] in ["completed", "failed", "canceled"]:
+                break
+            if not wait_for_completion:
+                break
+            time.sleep(5)
+
+        if not wait_for_completion and job["status"] != "completed":
+            logging.info(
+                f"Job {job['id']} did not complete, current status: {job['status']}"
+            )
+            return None
+
+        # Get log file:
+        if display_log_file:
+            runs = client.runs.list(job_id=job["id"])
+            for run in runs:
+                print(run)
+            if run["log_file"]:
+                log = client.files.content(run["log_file"]).decode("utf-8")
+                print(log)
+            print("---")
+
+        # Get output
+        job = client.jobs.retrieve(job["id"])
+        output_file_id = job["outputs"]["file"]
+        output = client.files.content(output_file_id).decode("utf-8")
+        output = [json.loads(line) for line in output.splitlines() if line.strip()]
+
+    return output
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+
+    output = run_inference_job_and_get_outputs(
+        filepath_conversations=os.path.join(
+            os.path.dirname(__file__), "../tests/inference_dataset_with_prefill.jsonl"
+        ),
+        model_to_evaluate="openai/gpt-4.1-mini",
+        inference_hyperparameters={
+            "max_tokens": 1000,
+            "temperature": 0.8,
+            "max_model_len": 2048,
+            "n_completions_per_prompt": 1,
+            "use_batch": False,
+        },
+        n_examples_to_log=1,
+    )
+    print("parallel output:", output)
+
+    output = run_inference_job_and_get_outputs(
+        filepath_conversations=os.path.join(
+            os.path.dirname(__file__), "../tests/inference_dataset_with_prefill.jsonl"
+        ),
+        model_to_evaluate="openai/gpt-4.1-mini",
+        inference_hyperparameters={
+            "max_tokens": 1000,
+            "temperature": 0.8,
+            "max_model_len": 2048,
+            "n_completions_per_prompt": 1,
+            "use_batch": True,
+        },
+        n_examples_to_log=1,
+    )
+    print("batch output:", output)
diff --git a/openweights/jobs/inference/openai_support.py b/openweights/jobs/inference/openai_support.py
@@ -22,6 +22,10 @@ def create_openai_inference_batch_request(
         import logging
         import time
 
+        logging.warning(
+            "OpenAI batch API support through OpenWeigths is not tested.\nIssues include:\n-Files sent twice to OpenAI produce different file IDs. This should now be solved with the permanent caching on the function sending the file."
+        )
+
         # Initialize OpenAI client
         client = self._init_openai_client()
 
@@ -60,11 +64,23 @@ def create_openai_inference_batch_request(
         )
 
         # Check for existing batch jobs using this batch file
+        found_batch = False
         try:
             logging.info(f"Checking for existing batch jobs for file {batch_file.id}")
             existing_batches = client.batches.list()
+            # First check for completed batch jobs
+            for batch in existing_batches.data:
+                if batch.input_file_id == batch_file.id and batch.status == "completed":
+                    found_batch = True
+                    logging.info(
+                        f"Found existing batch job {batch.id} for batch file {batch_file.id}"
+                    )
+                    batch_job = client.batches.retrieve(batch.id)
+                    return self.get_batch_job_data(client, batch_job)
+            # Then check for running batch jobs
             for batch in existing_batches.data:
                 if batch.input_file_id == batch_file.id:
+                    found_batch = True
                     logging.info(
                         f"Found existing batch job {batch.id} for batch file {batch_file.id}"
                     )
@@ -73,6 +89,12 @@ def create_openai_inference_batch_request(
         except Exception as e:
             logging.error(f"Error checking existing batch jobs: {str(e)}")
 
+        if found_batch:
+            return {
+                "status": "completed",
+                "results": "Failed to retrieve batch job data",
+            }
+
         # If no existing batch found, create new batch job
         batch_job = client.batches.create(
             input_file_id=batch_file.id,
@@ -298,10 +320,28 @@ def get_batch_job_data(self, openai_client, batch_job):
         logging.info(f"Batch job status: {batch_data.status}")
         if batch_data.status == "completed":
             logging.info(f"Retrieving results for file {batch_data.output_file_id}")
-            file_data = openai_client.files.retrieve(batch_data.output_file_id)
+            file_content = openai_client.files.content(batch_data.output_file_id)
+
+            result_file_name = os.path.join(
+                os.path.dirname(os.path.dirname(__file__)),
+                "tmp.jsonl",
+            )
+            with open(result_file_name, "wb") as file:
+                file.write(file_content.content)
+
+            # Loading data from saved file
+            results = []
+            with open(result_file_name, "r") as file:
+                for line in file:
+                    # Parsing the JSON string into a dict and appending to the list of results
+                    json_object = json.loads(line.strip())
+                    results.append(json_object)
+
+            os.remove(result_file_name)
+
             return {
                 "status": "completed",
-                "results": json.loads(file_data.content),
+                "results": results,
                 "batch_job_info": json.loads(json.dumps(batch_data.model_dump())),
             }
         else: