Integrate Tom's SOTA forecaster (#5477)

rgambee · github-actions[bot] · commit b250a57f2183 · 2026-04-21T18:40:36.000Z
This integrates Tom's SOTA forecaster into the app. Users can toggle
between it and the existing forecaster by setting the `effort_level`
parameter (`low` or `high`; defaults to `low`) when using the SDK or MCP
server.

I tried to keep the implementation faithful to Tom's prototypes. We run
three research agents on each question, each producing its own estimate.
Then we refine those three estimates with one additional LLM call (no
further research) to produce a final forecast. Lastly, we produce a
user-facing summary. A potential future optimization would be to combine
the refinement and summary steps into one, which would save time.

The model selections are the same ones Tom settled on. The prompts are
also the same, except for some slight wording changes; the most notable
change is the addition of units to the prompts for numeric forecasts.

Based on my limited testing, the cost is about $0.75 per row, but it
would be good to average across a wider range of tasks.

Sourced from commit e26e249e30afd2673271297dcc11e4e065f634b7
diff --git a/futuresearch-mcp/src/futuresearch_mcp/models.py b/futuresearch-mcp/src/futuresearch_mcp/models.py
@@ -9,6 +9,7 @@
 from futuresearch.generated.models.dedupe_operation_strategy import (
     DedupeOperationStrategy,
 )
+from futuresearch.generated.models.forecast_effort_level import ForecastEffortLevel
 from futuresearch.generated.models.llm_enum_public import LLMEnumPublic
 from futuresearch.task import EffortLevel
 from jsonschema import SchemaError
@@ -420,6 +421,10 @@ class ForecastInput(_SingleSourceInput):
         "as YYYY-MM-DD strings for timing questions like 'When will X happen?'. "
         "Requires output_field when 'numeric' or 'date'.",
     )
+    effort_level: ForecastEffortLevel | None = Field(
+        default=None,
+        description="Affects accuracy and cost of forecast. Default: low.",
+    )
     output_field: str | None = Field(
         default=None,
         description="Name of the numeric quantity being forecast (e.g. 'price', 'count'). "
diff --git a/futuresearch-mcp/src/futuresearch_mcp/tools.py b/futuresearch-mcp/src/futuresearch_mcp/tools.py
@@ -696,6 +696,7 @@ async def futuresearch_forecast(
             session=session,
             input=input_data,
             forecast_type=params.forecast_type,
+            effort_level=params.effort_level,
             output_field=params.output_field,
             units=params.units,
         )
diff --git a/src/futuresearch/generated/models/__init__.py b/src/futuresearch/generated/models/__init__.py
@@ -23,10 +23,11 @@
 from .dedupe_operation_strategy import DedupeOperationStrategy
 from .error_response import ErrorResponse
 from .error_response_details_type_0 import ErrorResponseDetailsType0
+from .forecast_effort_level import ForecastEffortLevel
 from .forecast_operation import ForecastOperation
-from .forecast_operation_forecast_type import ForecastOperationForecastType
 from .forecast_operation_input_type_1_item import ForecastOperationInputType1Item
 from .forecast_operation_input_type_2 import ForecastOperationInputType2
+from .forecast_type import ForecastType
 from .health_response import HealthResponse
 from .http_validation_error import HTTPValidationError
 from .insufficient_balance_response import InsufficientBalanceResponse
@@ -108,10 +109,11 @@
     "DedupeOperationStrategy",
     "ErrorResponse",
     "ErrorResponseDetailsType0",
+    "ForecastEffortLevel",
     "ForecastOperation",
-    "ForecastOperationForecastType",
     "ForecastOperationInputType1Item",
     "ForecastOperationInputType2",
+    "ForecastType",
     "HealthResponse",
     "HTTPValidationError",
     "InsufficientBalanceResponse",
diff --git a/src/futuresearch/generated/models/forecast_effort_level.py b/src/futuresearch/generated/models/forecast_effort_level.py
@@ -0,0 +1,9 @@
+from enum import Enum
+
+
+class ForecastEffortLevel(str, Enum):
+    HIGH = "high"
+    LOW = "low"
+
+    def __str__(self) -> str:
+        return str(self.value)
diff --git a/src/futuresearch/generated/models/forecast_operation.py b/src/futuresearch/generated/models/forecast_operation.py
@@ -7,7 +7,8 @@
 from attrs import define as _attrs_define
 from attrs import field as _attrs_field
 
-from ..models.forecast_operation_forecast_type import ForecastOperationForecastType
+from ..models.forecast_effort_level import ForecastEffortLevel
+from ..models.forecast_type import ForecastType
 from ..types import UNSET, Unset
 
 if TYPE_CHECKING:
@@ -27,25 +28,24 @@ class ForecastOperation:
             of a list of JSON objects
         task (str): Overall context or instructions for the forecast. Each row in the input should contain the
             question/scenario to forecast.
-        forecast_type (ForecastOperationForecastType): Type of forecast. 'binary': yes/no probability (0-100) for
-            questions like 'Will X happen?'. 'numeric': percentile estimates (p10-p90) for questions like 'What will the
-            price/value/count be?'. 'date': date percentile estimates (p10-p90) as YYYY-MM-DD strings for timing questions
-            like 'When will X happen?'. Requires output_field when 'numeric' or 'date'.
+        forecast_type (ForecastType):
         session_id (None | Unset | UUID): Session ID. If not provided, a new session is auto-created for this task.
         webhook_url (None | str | Unset): Optional URL to receive a POST callback when the task completes or fails.
         output_field (None | str | Unset): Name of the numeric quantity being forecast (e.g. 'price', 'count'). Required
             when forecast_type is 'numeric'. Output columns will be named {output_field}_p10 through {output_field}_p90.
         units (None | str | Unset): Units for the numeric forecast (e.g. 'USD per barrel', 'thousands'). Required when
             forecast_type is 'numeric'.
+        effort_level (ForecastEffortLevel | Unset):
     """
 
     input_: ForecastOperationInputType2 | list[ForecastOperationInputType1Item] | UUID
     task: str
-    forecast_type: ForecastOperationForecastType
+    forecast_type: ForecastType
     session_id: None | Unset | UUID = UNSET
     webhook_url: None | str | Unset = UNSET
     output_field: None | str | Unset = UNSET
     units: None | str | Unset = UNSET
+    effort_level: ForecastEffortLevel | Unset = UNSET
     additional_properties: dict[str, Any] = _attrs_field(init=False, factory=dict)
 
     def to_dict(self) -> dict[str, Any]:
@@ -91,6 +91,10 @@ def to_dict(self) -> dict[str, Any]:
         else:
             units = self.units
 
+        effort_level: str | Unset = UNSET
+        if not isinstance(self.effort_level, Unset):
+            effort_level = self.effort_level.value
+
         field_dict: dict[str, Any] = {}
         field_dict.update(self.additional_properties)
         field_dict.update(
@@ -108,6 +112,8 @@ def to_dict(self) -> dict[str, Any]:
             field_dict["output_field"] = output_field
         if units is not UNSET:
             field_dict["units"] = units
+        if effort_level is not UNSET:
+            field_dict["effort_level"] = effort_level
 
         return field_dict
 
@@ -150,7 +156,7 @@ def _parse_input_(data: object) -> ForecastOperationInputType2 | list[ForecastOp
 
         task = d.pop("task")
 
-        forecast_type = ForecastOperationForecastType(d.pop("forecast_type"))
+        forecast_type = ForecastType(d.pop("forecast_type"))
 
         def _parse_session_id(data: object) -> None | Unset | UUID:
             if data is None:
@@ -196,6 +202,13 @@ def _parse_units(data: object) -> None | str | Unset:
 
         units = _parse_units(d.pop("units", UNSET))
 
+        _effort_level = d.pop("effort_level", UNSET)
+        effort_level: ForecastEffortLevel | Unset
+        if isinstance(_effort_level, Unset):
+            effort_level = UNSET
+        else:
+            effort_level = ForecastEffortLevel(_effort_level)
+
         forecast_operation = cls(
             input_=input_,
             task=task,
@@ -204,6 +217,7 @@ def _parse_units(data: object) -> None | str | Unset:
             webhook_url=webhook_url,
             output_field=output_field,
             units=units,
+            effort_level=effort_level,
         )
 
         forecast_operation.additional_properties = d
diff --git a/src/futuresearch/generated/models/forecast_type.py b/src/futuresearch/generated/models/forecast_type.py
@@ -1,7 +1,7 @@
 from enum import Enum
 
 
-class ForecastOperationForecastType(str, Enum):
+class ForecastType(str, Enum):
     BINARY = "binary"
     DATE = "date"
     NUMERIC = "numeric"
diff --git a/src/futuresearch/ops.py b/src/futuresearch/ops.py
@@ -27,9 +27,10 @@
     DedupeOperation,
     DedupeOperationInputType1Item,
     DedupeOperationStrategy,
+    ForecastEffortLevel,
     ForecastOperation,
-    ForecastOperationForecastType,
     ForecastOperationInputType1Item,
+    ForecastType,
     LLMEnumPublic,
     MergeOperation,
     MergeOperationLeftInputType1Item,
@@ -832,6 +833,7 @@ async def forecast(
     session: Session | None = None,
     *,
     forecast_type: Literal["binary", "numeric", "date"],
+    effort_level: ForecastEffortLevel | None = None,
     output_field: str | None = None,
     units: str | None = None,
 ) -> TableResult:
@@ -867,6 +869,7 @@ async def forecast(
         session: Optional session. If not provided, one will be created automatically.
         forecast_type: ``"binary"`` for probability forecasts, ``"numeric"`` for
             percentile estimates, ``"date"`` for date percentile estimates.
+        effort_level: affects accuracy and cost of forecast. Default: low.
         output_field: Name of the quantity being forecast (required for numeric
             and date, e.g. ``"price"``, ``"launch_date"``).
         units: Units for numeric forecasts (e.g. ``"USD per barrel"``).
@@ -883,6 +886,7 @@ async def forecast(
                 session=internal_session,
                 input=input,
                 forecast_type=forecast_type,
+                effort_level=effort_level,
                 output_field=output_field,
                 units=units,
             )
@@ -908,7 +912,9 @@ async def forecast_async(
     task: str,
     session: Session,
     input: DataFrame | UUID | TableResult,
+    *,
     forecast_type: Literal["binary", "numeric", "date"],
+    effort_level: ForecastEffortLevel | None = None,
     output_field: str | None = None,
     units: str | None = None,
 ) -> EveryrowTask[BaseModel]:
@@ -920,6 +926,7 @@ async def forecast_async(
         input: Input data.
         forecast_type: ``"binary"`` for yes/no probability, ``"numeric"`` for
             percentile estimates, ``"date"`` for date percentile estimates.
+        effort_level: affects accuracy and cost of forecast. Default: low.
         output_field: Name of the quantity (required for numeric and date).
         units: Units for numeric forecasts (required for numeric).
 
@@ -929,10 +936,11 @@ async def forecast_async(
     input_data = _prepare_table_input(input, ForecastOperationInputType1Item)
 
     body = ForecastOperation(
-        input_=input_data,  # type: ignore
+        input_=input_data,
         task=task,
         session_id=session.session_id,
-        forecast_type=ForecastOperationForecastType(forecast_type),
+        forecast_type=ForecastType(forecast_type),
+        effort_level=effort_level if effort_level is not None else UNSET,
         output_field=output_field,
         units=units,
     )

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -696,6 +696,7 @@ async def futuresearch_forecast(` |
| `696` | `696` | `session=session,` |
| `697` | `697` | `input=input_data,` |
| `698` | `698` | `forecast_type=params.forecast_type,` |
| | `699` | `+ effort_level=params.effort_level,` |
| `699` | `700` | `output_field=params.output_field,` |
| `700` | `701` | `units=params.units,` |
| `701` | `702` | `)` |