From 8fc1a5d61e439d5d193fddc0535e942261e27ffa Mon Sep 17 00:00:00 2001 From: cafzal Date: Thu, 18 Jun 2026 09:53:03 -0700 Subject: [PATCH 1/2] Add latent-objective recognition eval to multi-objective skill Signed-off-by: cafzal --- .../evals/evals.json | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/skills/cuopt-multi-objective-exploration/evals/evals.json b/skills/cuopt-multi-objective-exploration/evals/evals.json index ff2158104e..ddb5b94bff 100644 --- a/skills/cuopt-multi-objective-exploration/evals/evals.json +++ b/skills/cuopt-multi-objective-exploration/evals/evals.json @@ -53,5 +53,20 @@ "Does NOT trace a frontier or sweep the budget as if it were a tradeoff dial", "Returns one recommended supplier set (one solve), citing the binding budget and demand coverage" ] + }, + { + "id": "multiobj-explore-eval-005-latent-objective", + "question": "A planner runs a multi-period production model to MAXIMIZE priority-weighted finished-goods inventory at the end of a 10-period horizon. The model also carries full cost data — per-item unit and holding costs, per-resource hourly production cost — but the current objective ignores it, and leadership has set no budget. The planner asks: 'Push supply as high as it will go — what's the plan?' Using cuOpt, how would you respond?", + "expected_skill": "cuopt-multi-objective-exploration", + "expected_script": null, + "ground_truth": "The agent recognizes that cost is a SECOND objective sitting latent in the problem — the data is present and no budget pins it down — and does NOT simply return the single maximum-supply plan, nor silently fold cost into a weighted-sum blend (maximize supply minus lambda*cost) with a self-chosen lambda. It surfaces the supply-vs-cost tradeoff and traces the Pareto frontier with cuOpt by epsilon-constraint (cap total cost, maximize supply, sweep the cap from tight to slack). Because the model is a MILP (no usable duals), it estimates the supply-per-dollar exchange rate by differencing adjacent frontier points, reports supply in interpretable units rather than the raw priority-weighted total, flags the knee where supply per dollar collapses, names two or three candidate operating points, and defers the budget decision to leadership. It distinguishes this from a hard-budget case: cost is unconstrained here, so the right move is to expose the tradeoff, not to pick one plan.", + "expected_behavior": [ + "Recognizes a LATENT second objective (cost) present in the data but unstated; does NOT optimize the single stated objective (supply) in isolation", + "Does NOT silently collapse to a weighted-sum blend (maximize supply minus lambda*cost) with a self-chosen weight", + "Surfaces the supply-vs-cost tradeoff and traces the Pareto frontier via epsilon-constraint (sweep a total-cost cap, maximize supply)", + "Since the model is MILP (no duals), estimates the supply-per-dollar exchange rate by differencing adjacent frontier points", + "Reports supply in interpretable units, flags the knee, names candidate operating points, and defers the budget call to leadership", + "Distinguishes this from a hard-budget case (cf. the decoy): cost is unconstrained here, so it exposes the tradeoff rather than returning a single plan" + ] } ] From 8275bf7f8583828e1c6dd3b89cf6e8a0c49de48f Mon Sep 17 00:00:00 2001 From: cafzal Date: Thu, 18 Jun 2026 10:20:10 -0700 Subject: [PATCH 2/2] eval-005: defer per-solve mechanics to api/formulation skills (match 002/004 house style) Signed-off-by: cafzal --- skills/cuopt-multi-objective-exploration/evals/evals.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skills/cuopt-multi-objective-exploration/evals/evals.json b/skills/cuopt-multi-objective-exploration/evals/evals.json index ddb5b94bff..44b2620185 100644 --- a/skills/cuopt-multi-objective-exploration/evals/evals.json +++ b/skills/cuopt-multi-objective-exploration/evals/evals.json @@ -59,7 +59,7 @@ "question": "A planner runs a multi-period production model to MAXIMIZE priority-weighted finished-goods inventory at the end of a 10-period horizon. The model also carries full cost data — per-item unit and holding costs, per-resource hourly production cost — but the current objective ignores it, and leadership has set no budget. The planner asks: 'Push supply as high as it will go — what's the plan?' Using cuOpt, how would you respond?", "expected_skill": "cuopt-multi-objective-exploration", "expected_script": null, - "ground_truth": "The agent recognizes that cost is a SECOND objective sitting latent in the problem — the data is present and no budget pins it down — and does NOT simply return the single maximum-supply plan, nor silently fold cost into a weighted-sum blend (maximize supply minus lambda*cost) with a self-chosen lambda. It surfaces the supply-vs-cost tradeoff and traces the Pareto frontier with cuOpt by epsilon-constraint (cap total cost, maximize supply, sweep the cap from tight to slack). Because the model is a MILP (no usable duals), it estimates the supply-per-dollar exchange rate by differencing adjacent frontier points, reports supply in interpretable units rather than the raw priority-weighted total, flags the knee where supply per dollar collapses, names two or three candidate operating points, and defers the budget decision to leadership. It distinguishes this from a hard-budget case: cost is unconstrained here, so the right move is to expose the tradeoff, not to pick one plan.", + "ground_truth": "The agent recognizes that cost is a SECOND objective sitting latent in the problem — the data is present and no budget pins it down — and does NOT simply return the single maximum-supply plan, nor silently fold cost into a weighted-sum blend (maximize supply minus lambda*cost) with a self-chosen lambda. It surfaces the supply-vs-cost tradeoff and traces the Pareto frontier with cuOpt by epsilon-constraint (cap total cost, maximize supply, sweep the cap from tight to slack). Because the model is a MILP (no usable duals), it estimates the supply-per-dollar exchange rate by differencing adjacent frontier points, reports supply in interpretable units rather than the raw priority-weighted total, flags the knee where supply per dollar collapses, names two or three candidate operating points, and defers the budget decision to leadership. It distinguishes this from a hard-budget case: cost is unconstrained here, so the right move is to expose the tradeoff, not to pick one plan. It defers per-solve mechanics to the api-* skills and formulation to cuopt-numerical-optimization-formulation.", "expected_behavior": [ "Recognizes a LATENT second objective (cost) present in the data but unstated; does NOT optimize the single stated objective (supply) in isolation", "Does NOT silently collapse to a weighted-sum blend (maximize supply minus lambda*cost) with a self-chosen weight",