diff --git a/poetry.lock b/poetry.lock index 8a70b417..9bff30e7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.2 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -1269,26 +1269,26 @@ test = ["Pillow", "contourpy[test-no-images]", "matplotlib"] test-no-images = ["pytest", "pytest-cov", "pytest-rerunfailures", "pytest-xdist", "wurlitzer"] [[package]] -name = "cowdao-cowpy-kongzii" -version = "1.0.0rc2" +name = "cowdao-cowpy" +version = "1.0.0rc5" description = "" optional = false python-versions = "<4.0,>=3.10" groups = ["main"] files = [ - {file = "cowdao_cowpy_kongzii-1.0.0rc2-py3-none-any.whl", hash = "sha256:359d22d06faff41e3e84ec47c7ce7729f9f46a5af22b41cd04b4c9361c3e0628"}, - {file = "cowdao_cowpy_kongzii-1.0.0rc2.tar.gz", hash = "sha256:b78ef90e4e0ca4f1f37bae4d8f62af7270828df93b4677f5f6532432d617dab5"}, + {file = "cowdao_cowpy-1.0.0rc5-py3-none-any.whl", hash = "sha256:07ce36296ee9993ade27796757b9c84990dd9a738961651e3fb18b31b50d0e1d"}, + {file = "cowdao_cowpy-1.0.0rc5.tar.gz", hash = "sha256:5941532beb215851c37f322e05d5e258e28f9f6c7bdb53247f1a0a6dccc9b86a"}, ] [package.dependencies] aiolimiter = ">=1.1.0,<2.0.0" backoff = ">=2.2.1,<3.0.0" -httpx = ">=0.25.0,<1.0.0" +httpx = ">=0.23.0,<1.0.0" multiformats = ">=0.3.1.post4,<0.4.0" pybars3 = ">=0.9.7,<0.10.0" pydantic = ">=2.7.0,<3.0.0" pytest-mock = ">=3.14.0,<4.0.0" -web3 = ">=6.15.1,<7.0.0" +web3 = ">=6,<7" [[package]] name = "cron-validator" @@ -5219,20 +5219,20 @@ test = ["anthropic", "coverage", "django", "flake8", "freezegun (==1.5.1)", "lan [[package]] name = "prediction-market-agent-tooling" -version = "0.65.3" +version = "0.66.4" description = "Tools to benchmark, deploy and monitor prediction market agents." 
optional = false python-versions = "<3.13,>=3.10" groups = ["main"] files = [ - {file = "prediction_market_agent_tooling-0.65.3-py3-none-any.whl", hash = "sha256:58d0efec25e26dc8e3fe93cb45c14d44e152609bed177273381c02238609808c"}, - {file = "prediction_market_agent_tooling-0.65.3.tar.gz", hash = "sha256:733aedb195d0ea5b2b12cb47b4d89316a317acd0c43a515054608014bf97e9bd"}, + {file = "prediction_market_agent_tooling-0.66.4-py3-none-any.whl", hash = "sha256:5066c01cd29052daaa1428adf947cdf8b63c876ac306b4bebd1958cf48b42a30"}, + {file = "prediction_market_agent_tooling-0.66.4.tar.gz", hash = "sha256:069613ef96f234f6204ebc8e31ef13a95c88aa8d979cfc78b3c95e77fc39009d"}, ] [package.dependencies] autoflake = ">=2.2.1,<3.0.0" base58 = ">=1.0.2,<2.0" -cowdao-cowpy-kongzii = "1.0.0rc2" +cowdao-cowpy = "1.0.0rc5" cron-validator = ">=1.0.8,<2.0.0" eth-account = ">=0.8.0,<0.12.0" eth-keys = ">=0.6.1,<0.7.0" @@ -8885,4 +8885,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "1edbb8ba2ff6e1f096d69f4680e6a950d19c1b189d348ce497e275d3ebd44bd4" +content-hash = "eed298081e18b62db6d7ef71a16b2d239d4755c2a100b77cc6a63d95c92af92e" diff --git a/prediction_prophet/autonolas/research.py b/prediction_prophet/autonolas/research.py index dd3ad2d0..3f2fc1ff 100644 --- a/prediction_prophet/autonolas/research.py +++ b/prediction_prophet/autonolas/research.py @@ -39,6 +39,7 @@ from prediction_market_agent_tooling.loggers import logger from prediction_market_agent_tooling.tools.google_utils import search_google_gcp from prediction_market_agent_tooling.logprobs_parser import LogprobsParser, FieldLogprobs +from prediction_market_agent_tooling.gtypes import Wei load_dotenv() @@ -118,6 +119,51 @@ * The sum of "p_yes" and "p_no" must equal 1. * Output only the JSON object in your response. Do not include any other contents in your response. 
""" + +PREDICTION_PROMPT_SCALAR = """ +INTRODUCTION: +You are a Large Language Model (LLM) within a multi-agent system. Your primary task is to accurately estimate the 'scalar_value' for the outcome of a 'market question', \ +found in 'USER_PROMPT'. The market question is part of a prediction market, where users can place bets on the outcomes of market questions and earn rewards if the predicted 'scalar_value' is close to the actual outcome. +Each market has {market_upper_bound} and {market_lower_bound} values use those values to calibrate your expectation about your prediction . +Each market has a closing date at which the outcome is evaluated. This date is typically stated within the market question. \ +The closing date is considered to be 23:59:59 of the date provided in the market question. \ +You are provided an itemized list of information under the label "ADDITIONAL_INFORMATION", which is \ +sourced from a Google search engine query performed a few seconds ago and is meant to assist you in your 'scalar_value' estimation. You must adhere to the following 'INSTRUCTIONS'. + + +INSTRUCTIONS: +* Examine the user's input labeled 'USER_PROMPT'. Focus on the part enclosed in double quotes, which contains the 'market question'. +* Estimate the 'scalar_value' for the outcome of the market question. +* Consider the prediction market with the market question, the closing date and the outcomes in an isolated context that has no influence on the protagonists that are involved in the event in the real world, specified in the market question. The closing date is always arbitrarily set by the market creator and has no influence on the real world. So it is likely that the protagonists of the event in the real world are not even aware of the prediction market and do not care about the market's closing date. +* The 'scalar_value' estimations of the market question outcomes must be as accurate as possible, as an inaccurate estimation will lead to financial loss for the user. 
+* Utilize your training data and the information provided under "ADDITIONAL_INFORMATION" to generate 'scalar_value' estimations for the outcomes of the 'market question'. +* Examine the itemized list under "ADDITIONAL_INFORMATION" thoroughly and use all the relevant information for your 'scalar_value' estimation. This data is sourced from a Google search engine query done a few seconds ago. +* Use any relevant item in "ADDITIONAL_INFORMATION" in addition to your training data to make the 'scalar_value' estimation. You can assume that you have been provided with the most current and relevant information available on the internet. Still pay close attention to the release and modification timestamps provided in parentheses right before each information item. Some information might be outdated and not relevant anymore. +* More recent information indicated by the timestamps provided in parentheses right before each information item overrides older information within ADDITIONAL_INFORMATION and holds more weight for your 'scalar_value' estimation. +* If there is contradicting information, evaluate the release and modification dates of that information, prioritize the information that is more recent, and adjust your confidence in the 'scalar_value' estimation accordingly. +* Even if not all information might be released today, you can assume that there haven't been publicly available updates in the meantime except for those inside ADDITIONAL_INFORMATION. +* You must provide your response in the format specified under "OUTPUT_FORMAT". +* Do not include any other contents in your response. + + +USER_PROMPT: +``` +{user_prompt} +``` + +ADDITIONAL_INFORMATION: +``` +{additional_information} +``` + +OUTPUT_FORMAT: +* Your output response must be only a single JSON object to be parsed by Python's "json.loads()". +* The JSON must contain {n_fields} fields: {fields_list}. +{fields_description} +* The 'scalar_value' is a float number. 
+* Output only the JSON object in your response. Do not include any other contents in your response. +""" + PREDICTION_PROMPT_CATEGORICAL = """ INTRODUCTION: You are a Large Language Model (LLM) within a multi-agent system. Your primary task is to accurately estimate the probabilities for the outcome of a 'market question', \ @@ -184,6 +230,13 @@ "info_utility": "Utility of the information provided in 'ADDITIONAL_INFORMATION' to help you make the probability estimation ranging from 0 (lowest utility) to 1 (maximum utility).", } +FIELDS_DESCRIPTIONS_SCALAR = { + "reasoning": "A string containing the reasoning behind your decision, and the rest of the answer you're about to give.", + "scalar_value": "Predicted value of the market question, float number", + "confidence": "Indicating the confidence in the estimated value you provided ranging from 0 (lowest confidence) to 1 (maximum confidence). Confidence can be calculated based on the quality and quantity of data used for the estimation.", + "info_utility": "Utility of the information provided in 'ADDITIONAL_INFORMATION' to help you make the probability estimation ranging from 0 (lowest utility) to 1 (maximum utility).", +} + URL_QUERY_PROMPT = """ You are a Large Language Model in a multi-agent system. Your task is to formulate search engine queries based on \ a user's 'event question', which specifies an event and any accompanying conditions. 
The 'event question' allows \ @@ -388,6 +441,15 @@ class Prediction(BaseModel): logprobs: Optional[list[FieldLogprobs]] = [] +class ScalarPrediction(TypedDict): + scalar_value: Wei + upperBound: Wei + lowerBound: Wei + confidence: float + info_utility: float + reasoning: Optional[str] + logprobs: Optional[list[FieldLogprobs]] + class CategoricalPrediction(TypedDict): decision: str probabilities: Dict[str, Probability] @@ -1324,6 +1386,76 @@ def make_prediction_categorical( return response +def avg(key: str, parsed: list[dict[str, Any]]) -> float: + vals = [p[key] for p in parsed if key in p] + return float(sum(vals) / len(vals)) if vals else float("nan") + +@observe() +def make_prediction_scalar( + prompt: str, + market_upper_bound: Wei, + market_lower_bound: Wei, + additional_information: str, + agent: Agent | None, + include_reasoning: bool = False, +) -> ScalarPrediction: + agent = agent or Agent(model="gpt-3.5-turbo-0125", model_settings=ModelSettings(temperature=0.7)) + + current_time_utc = datetime.now(timezone.utc) + formatted_time_utc = current_time_utc.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-6] + "Z" + + field_descriptions = FIELDS_DESCRIPTIONS_SCALAR.copy() + if not include_reasoning: + field_descriptions.pop("reasoning") + prediction_prompt = PREDICTION_PROMPT_SCALAR.format( + user_prompt=prompt, + market_upper_bound=market_upper_bound, + market_lower_bound=market_lower_bound, + additional_information=additional_information, + n_fields=len(field_descriptions), + fields_list=list_to_list_str(list(field_descriptions)), + fields_description=fields_dict_to_bullet_list(field_descriptions), + timestamp=formatted_time_utc, + ) + result = agent.run_sync(prediction_prompt) + completion = result.data + + if agent.model and hasattr(agent.model, "_n") and agent.model._n > 1: + jsons = re.findall(r"\{[^{}]*\}", result.data, flags=re.S) + + parsed: list[dict[str, Any]] = [] + for block in jsons: + try: + parsed.append(json.loads(block)) + except json.JSONDecodeError: + 
continue # silently drop malformed blocks + + responses: ScalarPrediction = { + "scalar_value": Wei(int(avg("scalar_value", parsed))), + "confidence": avg("confidence", parsed), + "info_utility": avg("info_utility", parsed), + "upperBound": market_upper_bound, + "lowerBound": market_lower_bound, + "reasoning": "\n\n---\n\n".join(p.get("reasoning", "") for p in parsed if p.get("reasoning")), + "logprobs": [lp for p in parsed for lp in p.get("logprobs", [])] or None, + } + return responses + + else: + completion = result.data + logger.info(f"Completion: {completion}") + completion_clean = clean_completion_json(completion) + logger.info(f"Completion cleaned: {completion_clean}") + + try: + response: ScalarPrediction = json.loads(completion_clean) + except json.decoder.JSONDecodeError as e: + raise UnexpectedModelBehavior(f"The response from {agent=} could not be parsed as JSON: {completion_clean=}") from e + + response['upperBound'] = market_upper_bound + response['lowerBound'] = market_lower_bound + + return response def clean_completion_json(completion: str) -> str: """ diff --git a/prediction_prophet/benchmark/agents.py b/prediction_prophet/benchmark/agents.py index 051ef0ba..ce469bc4 100644 --- a/prediction_prophet/benchmark/agents.py +++ b/prediction_prophet/benchmark/agents.py @@ -6,16 +6,17 @@ from prediction_market_agent_tooling.benchmark.agents import ( AbstractBenchmarkedAgent, ) +from prediction_market_agent_tooling.gtypes import Wei from prediction_market_agent_tooling.benchmark.utils import ( - Prediction, + Prediction, ScalarPrediction ) -from prediction_market_agent_tooling.markets.data_models import ProbabilisticAnswer, CategoricalProbabilisticAnswer +from prediction_market_agent_tooling.markets.data_models import ProbabilisticAnswer, CategoricalProbabilisticAnswer, ScalarProbabilisticAnswer from prediction_market_agent_tooling.tools.is_predictable import is_predictable_binary from prediction_market_agent_tooling.tools.langfuse_ import observe from 
pydantic_ai import Agent from prediction_prophet.autonolas.research import EmbeddingModel -from prediction_prophet.autonolas.research import make_prediction, get_urls_from_queries, make_prediction_categorical +from prediction_prophet.autonolas.research import make_prediction, get_urls_from_queries, make_prediction_categorical, make_prediction_scalar from prediction_prophet.autonolas.research import research as research_autonolas from prediction_prophet.functions.rephrase_question import rephrase_question from prediction_prophet.functions.research import NoResulsFoundError, NotEnoughScrapedSitesError, Research, \ @@ -46,7 +47,24 @@ def _make_prediction( ) return ProbabilisticAnswer.model_validate(prediction) - +@observe() +def _make_prediction_scalar( + market_question: str, + market_upper_bound: Wei, + market_lower_bound: Wei, + additional_information: str, + agent: Agent, + include_reasoning: bool = False, +) -> ScalarProbabilisticAnswer: + prediction = make_prediction_scalar( + prompt=market_question, + market_upper_bound=market_upper_bound, + market_lower_bound=market_lower_bound, + additional_information=additional_information, + agent=agent, + include_reasoning=include_reasoning, + ) + return ScalarProbabilisticAnswer.model_validate(prediction) @observe() def _make_prediction_categorical( @@ -209,6 +227,26 @@ def research(self, market_question: str) -> Research: logger=self.logger, ) + @observe() + def _make_prediction_scalar( + self, + market_question: str, + market_upper_bound: Wei, + market_lower_bound: Wei, + additional_information: str, + agent: Agent, + include_reasoning: bool = False, + ) -> ScalarProbabilisticAnswer: + prediction = make_prediction_scalar( + prompt=market_question, + market_upper_bound=market_upper_bound, + market_lower_bound=market_lower_bound, + additional_information=additional_information, + agent=agent, + include_reasoning=include_reasoning, + ) + return ScalarProbabilisticAnswer.model_validate(prediction) + def predict(self, 
market_question: str) -> Prediction: try: research = self.research(market_question) @@ -244,6 +282,31 @@ def predict_categorical(self, market_question: str, market_outcomes: t.Sequence[ self.logger.error(f"Error in PredictionProphet's predict_categorical: {e}") return Prediction() + def predict_scalar(self, market_question: str, market_upper_bound: Wei, market_lower_bound: Wei) -> ScalarPrediction: + try: + research = self.research(market_question) + prediction=_make_prediction_scalar( + market_question=market_question, + market_upper_bound=market_upper_bound, + market_lower_bound=market_lower_bound, + additional_information=research.report, + agent=self.prediction_agent, + include_reasoning=self.include_reasoning, + ) + return ScalarPrediction( + outcome_prediction=ScalarProbabilisticAnswer( + scalar_value=prediction.scalar_value, + upperBound=market_upper_bound, + lowerBound=market_lower_bound, + confidence=prediction.confidence, + reasoning=prediction.reasoning, + logprobs=prediction.logprobs, + ) + ) + except (NoResulsFoundError, NotEnoughScrapedSitesError) as e: + self.logger.warning(f"Problem in PredictionProphet's predict_scalar: {e}") + return ScalarPrediction() + def predict_restricted( self, market_question: str, time_restriction_up_to: datetime ) -> Prediction: diff --git a/pyproject.toml b/pyproject.toml index 9739f8dd..1dc835c0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ scikit-learn = "^1.4.0" typer = ">=0.9.0,<1.0.0" types-requests = "^2.31.0.20240125" types-python-dateutil = "^2.9.0" -prediction-market-agent-tooling = { version = ">=0.65.3,<1", extras = ["langchain", "google"] } +prediction-market-agent-tooling = { version = ">=0.66.4,<1", extras = ["langchain", "google"] } langchain-community = "^0.3.0" memory-profiler = "^0.61.0" matplotlib = "^3.8.3"