From 623aa89ccebfbf350fd54db13bddf4f21a3c0cb8 Mon Sep 17 00:00:00 2001
From: "balyan.sid@gmail.com" <balyan.sid@gmail.com>
Date: Tue, 6 Jan 2026 23:54:19 +0530
Subject: [PATCH] feat: add Oolong benchmark example with recursive rollouts

Adds an example demonstrating recursive RLM rollouts on the Oolong
benchmark. Previously, examples lacked coverage of recursive calls.

- Loads context and question from oolongbench/oolong-real dataset
- Runs RLM completion with logging enabled
- Validates response against expected answer

Original-Author: alt-glitch (balyan.sid@gmail.com)
Upstream-PR: alexzhang13/rlm#34
---
 examples/oolong_example.py | 79 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 examples/oolong_example.py

diff --git a/examples/oolong_example.py b/examples/oolong_example.py
new file mode 100644
index 0000000..a1fcef4
--- /dev/null
+++ b/examples/oolong_example.py
@@ -0,0 +1,79 @@
+"""
+Example: An example from the Oolong Benchmark from the RLM paper: https://arxiv.org/abs/2512.24601v1
+"""
+
+import os
+import sys
+from itertools import islice
+
+from dotenv import load_dotenv
+
+from rlm import RLM
+from rlm.logger import RLMLogger
+
+load_dotenv()
+
+try:
+    from datasets import load_dataset
+except ImportError:
+    # 'datasets' is an optional dependency for examples; fail with a helpful hint.
+    print(
+        "Please install the 'datasets' library to run this example. Run `uv pip install datasets`"
+    )
+    sys.exit(1)
+
+
+def load_oolong_row(index: int = 1) -> dict:
+    """Load a single row from the Oolong benchmark."""
+    streaming_ds = load_dataset("oolongbench/oolong-real", "toy_dnd", split="test", streaming=True)
+    row = next(islice(streaming_ds, index, index + 1))
+    return row
+
+
+def main():
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        raise ValueError("OPENAI_API_KEY environment variable is not set.")
+
+    # Load benchmark data
+    row = load_oolong_row(index=1)
+    context = row["context_window_text"]
+    question = row["question"]
+    expected_answer = row["answer"]
+
+    print(f"Question: {question}")
+    print(f"Expected answer: {expected_answer}")
+    print("-" * 50)
+
+    # Create logger
+    logger = RLMLogger(log_dir="./logs")
+
+    # Create RLM instance
+    rlm = RLM(
+        backend="openai",
+        backend_kwargs={
+            "model_name": "gpt-5-mini",
+            "api_key": api_key,
+        },
+        environment="local",
+        max_iterations=30,
+        logger=logger,
+        verbose=True,
+    )
+
+    # Run completion with context and question
+    result = rlm.completion(prompt=context, root_prompt=question)
+
+    print("-" * 50)
+    print(f"RLM Response: {result.response}")
+    print(f"Expected: {expected_answer}")
+
+    # Simple validation (exact match or contained)
+    is_correct = (
+        expected_answer.lower() in result.response.lower()
+        or result.response.lower() in expected_answer.lower()
+    )
+    print(f"Match: {is_correct}")
+
+
+if __name__ == "__main__":
+    main()