meta-pytorch · jiannanWang · Apr 28, 2026
diff --git a/examples/configs/beam_search_diverse.yaml b/examples/configs/beam_search_diverse.yaml
@@ -0,0 +1,68 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Diverse beam search — multi-LLM, multi-sample expansion with PTX dedup.
+#
+# SPREAD variant — broad beam expansion.
+#
+# Layout per round:
+#   5 expanding parents (top half of beam)
+#     × 3 bottlenecks
+#     × 3 LLMs (Claude / GPT / Gemini)
+#     × 2 samples per prompt
+#   = 90 worker processes, each producing one candidate kernel.
+#
+# Pairs with examples/configs/beam_search_diverse_concentrated.yaml
+# (top-2 parents × 5 samples each, same 90-worker budget) for A/B
+# comparison of "spread vs. concentrated" expansion strategies.
+#
+# After workers return:
+#   1. Candidates (and the existing 10-kernel beam) are deduplicated by
+#      normalized-PTX fingerprint — kernels that compile to identical PTX
+#      collapse to the fastest representative.
+#   2. The surviving pool is sorted by runtime.
+#   3. The top 10 become the next round's beam.
+#
+# Usage:
+#   python examples/run_opt_manager.py \
+#       --kernel-dir examples/optimize_01_matvec \
+#       --config examples/configs/beam_search_diverse.yaml
+
+strategy: beam_search
+num_workers: 90                 # 5 parents × 3 bottlenecks × 3 models × 2 samples
+strategy_config:
+  num_top_kernels: 10            # beam width (candidate pool size)
+  num_expanding_parents: 5       # spread expansion across top-5
+  num_bottlenecks: 3             # 3 ranked bottlenecks per parent (post-plumbing-fix)
+  samples_per_prompt: 2          # two LLM draws per (parent, bottleneck, model)
+  # Three models routed via the Relay provider.  Names not present in
+  # utils/providers/available_models.py are auto-routed to Relay by
+  # get_model_provider (see models.py:70–79), so the plugboard server
+  # resolves them.
+  models:
+    - claude-opus-4.6            # Claude
+    - gpt-5-4                    # GPT
+    - gemini-2-5-pro             # Gemini
+
+# Default LLM (used when no per-candidate override is set; here it's
+# overridden per-candidate via the `models` list above).
+openai_model: claude-opus-4.6
+high_reasoning_effort: true
+
+# Worker configuration
+benchmark_warmup: 25             # presumably untimed warmup iterations — confirm against the worker
+benchmark_repeat: 100            # presumably timed iterations per measurement — confirm
+divergence_threshold: 50.0       # NOTE(review): meaning/units not documented here — confirm
+target_platform: cuda
+gpu_name: "NVIDIA H100 NVL 94GB"
diff --git a/examples/configs/beam_search_diverse_concentrated.yaml b/examples/configs/beam_search_diverse_concentrated.yaml
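@@ -0,0 +1,70 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+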
+# CONCENTRATED variant — push the leaders harder.
+#
+# Layout per round:
+#   2 expanding parents (top of beam)
+#     × 3 bottlenecks
+#     × 3 LLMs (Claude / GPT / Gemini)
+#     × 5 samples per prompt
+#   = 90 worker processes, each producing one candidate kernel.
+#
+# Same 90-worker budget as beam_search_diverse.yaml (the spread variant)
+# but reallocated toward more LLM draws per (parent, bottleneck, model)
+# triple instead of more parents.  Use this when you trust the leader-
+# kernel pair and want to squeeze them rather than explore broadly.
+#
+# Motivation: in our smoke A/B, Design A (P=1, C=10) outperformed
+# Design B (P=2, C=1) on the same problem and budget.  This config is
+# the production-scale version of that observation: keep enough beam-
+# diversity to dedup against (P=2, beam=10) but spend most of the budget
+# concentrating attempts on the current leaders.
+#
+# After workers return:
+#   1. Candidates (and the existing 10-kernel beam) are deduplicated by
+#      normalized-PTX fingerprint — kernels that compile to identical PTX
+#      collapse to the fastest representative.
+#   2. The surviving pool is sorted by runtime.
+#   3. The top 10 become the next round's beam.
+#
+# Usage:
+#   python examples/run_opt_manager.py \
+#       --kernel-dir examples/optimize_01_matvec \
+#       --config examples/configs/beam_search_diverse_concentrated.yaml
+
+strategy: beam_search
+num_workers: 90                 # 2 parents × 3 bottlenecks × 3 models × 5 samples
+strategy_config:
+  num_top_kernels: 10            # beam width (candidate pool size)
+  num_expanding_parents: 2       # concentrate on top-2 leaders
+  num_bottlenecks: 3             # 3 ranked bottlenecks per parent
+  samples_per_prompt: 5          # five LLM draws per (parent, bottleneck, model)
+  models:
+    # Three models routed via the Relay provider.
+    - claude-opus-4.6            # Claude
+    - gpt-5-4                    # GPT
+    - gemini-2-5-pro             # Gemini
+
+# Default LLM (used when no per-candidate override is set; here it's
+# overridden per-candidate via the `models` list above).
+openai_model: claude-opus-4.6
+high_reasoning_effort: true
+
+# Worker configuration
+benchmark_warmup: 25             # presumably untimed warmup iterations — confirm against the worker
+benchmark_repeat: 100            # presumably timed iterations per measurement — confirm
+divergence_threshold: 50.0       # NOTE(review): meaning/units not documented here — confirm
+target_platform: cuda
+gpu_name: "NVIDIA H100 NVL 94GB"
diff --git a/examples/configs/beam_search_diverse_smoke.yaml b/examples/configs/beam_search_diverse_smoke.yaml
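@@ -0,0 +1,41 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+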
+# Smoke-test variant of beam_search_diverse.yaml.
+#
+# Per-round fanout: 1 parent × 3 bottlenecks × 3 models × 1 sample = 9 workers.
+# Exercises the bottleneck-plumbing fix — for each model, the three
+# bottleneck IDs should produce 3 distinct optimization directions, not
+# 3 copies of the top-ranked bottleneck.
+
+strategy: beam_search
+num_workers: 9                  # 1 parent × 3 bottlenecks × 3 models × 1 sample
+strategy_config:
+  num_top_kernels: 4            # beam width (candidate pool size)
+  num_expanding_parents: 1      # expand a single parent per round
+  num_bottlenecks: 3            # ← drives BottleneckAnalyzer to request 3 ranked options
+  samples_per_prompt: 1         # one LLM draw per (parent, bottleneck, model)
+  models:
+    - claude-opus-4.6           # same three-model list as beam_search_diverse.yaml
+    - gpt-5-4
+    - gemini-2-5-pro
+
+openai_model: claude-opus-4.6   # top-level default model; strategy_config.models is listed separately above
+high_reasoning_effort: true
+
+benchmark_warmup: 25            # presumably untimed warmup iterations — confirm against the worker
+benchmark_repeat: 100           # presumably timed iterations per measurement — confirm
+divergence_threshold: 50.0      # NOTE(review): meaning/units not documented here — confirm
+target_platform: cuda
+gpu_name: "NVIDIA H100 NVL 94GB"