Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions examples/configs/beam_search_diverse.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Diverse beam search — multi-LLM, multi-sample expansion with PTX dedup.
#
# SPREAD variant — broad beam expansion.
#
# Layout per round:
# 5 expanding parents (top half of beam)
# × 3 bottlenecks
# × 3 LLMs (Claude / GPT / Gemini)
# × 2 samples per prompt
# = 90 worker processes, each producing one candidate kernel.
#
# Pairs with examples/configs/beam_search_diverse_concentrated.yaml
# (top-2 parents × 5 samples each, same 90-worker budget) for A/B
# comparison of "spread vs. concentrated" expansion strategies.
#
# After workers return:
# 1. Candidates (and the existing 10-kernel beam) are deduplicated by
# normalized-PTX fingerprint — kernels that compile to identical PTX
# collapse to the fastest representative.
# 2. The surviving pool is sorted by runtime.
# 3. The top 10 become the next round's beam.
#
# Usage:
# python examples/run_opt_manager.py \
# --kernel-dir examples/optimize_01_matvec \
# --config examples/configs/beam_search_diverse.yaml

# Search strategy and per-round worker budget.
strategy: beam_search
# 5 parents × 3 bottlenecks × 3 models × 2 samples = 90 workers per round.
num_workers: 90

# Strategy-specific knobs. Everything indented under `strategy_config` is
# part of this mapping; the keys below it at column 0 are top-level settings.
strategy_config:
  num_top_kernels: 10  # beam width (candidate pool size)
  num_expanding_parents: 5  # spread expansion across top-5
  num_bottlenecks: 3  # 3 ranked bottlenecks per parent (post-plumbing-fix)
  samples_per_prompt: 2  # two LLM draws per (parent, bottleneck, model)
  # Three models routed via the Relay provider. Names not present in
  # utils/providers/available_models.py are auto-routed to Relay by
  # get_model_provider (see models.py:70–79), so the plugboard server
  # resolves them.
  models:
    - claude-opus-4.6
    - gpt-5-4
    - gemini-2-5-pro

# Default LLM (used when no per-candidate override is set; here it's
# overridden per-candidate via the `models` list above).
openai_model: claude-opus-4.6
high_reasoning_effort: true

# Worker configuration
benchmark_warmup: 25
benchmark_repeat: 100
divergence_threshold: 50.0
target_platform: cuda
gpu_name: "NVIDIA H100 NVL 94GB"
64 changes: 64 additions & 0 deletions examples/configs/beam_search_diverse_concentrated.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0

# CONCENTRATED variant — push the leaders harder.
#
# Layout per round:
# 2 expanding parents (top of beam)
# × 3 bottlenecks
# × 3 LLMs (Claude / GPT / Gemini)
# × 5 samples per prompt
# = 90 worker processes, each producing one candidate kernel.
#
# Same 90-worker budget as beam_search_diverse.yaml (the spread variant)
# but reallocated toward more LLM draws per (parent, bottleneck, model)
# triple instead of more parents. Use this when you trust the leader-
# kernel pair and want to squeeze them rather than explore broadly.
#
# Motivation: in our smoke A/B, Design A (P=1, C=10) outperformed
# Design B (P=2, C=1) on the same problem and budget. This config is
# the production-scale version of that observation: keep enough beam-
# diversity to dedup against (P=2, beam=10) but spend most of the budget
# concentrating attempts on the current leaders.
#
# After workers return:
# 1. Candidates (and the existing 10-kernel beam) are deduplicated by
# normalized-PTX fingerprint — kernels that compile to identical PTX
# collapse to the fastest representative.
# 2. The surviving pool is sorted by runtime.
# 3. The top 10 become the next round's beam.
#
# Usage:
# python examples/run_opt_manager.py \
# --kernel-dir examples/optimize_01_matvec \
# --config examples/configs/beam_search_diverse_concentrated.yaml

# Search strategy and per-round worker budget.
strategy: beam_search
# 2 parents × 3 bottlenecks × 3 models × 5 samples = 90 workers per round.
num_workers: 90

# Strategy-specific knobs. Everything indented under `strategy_config` is
# part of this mapping; the keys below it at column 0 are top-level settings.
strategy_config:
  num_top_kernels: 10  # beam width (candidate pool size)
  num_expanding_parents: 2  # concentrate on top-2 leaders
  num_bottlenecks: 3  # 3 ranked bottlenecks per parent
  samples_per_prompt: 5  # five LLM draws per (parent, bottleneck, model)
  # Three models routed via the Relay provider.
  models:
    - claude-opus-4.6
    - gpt-5-4
    - gemini-2-5-pro

# Default LLM (used when no per-candidate override is set; here it's
# overridden per-candidate via the `models` list above).
openai_model: claude-opus-4.6
high_reasoning_effort: true

# Worker configuration
benchmark_warmup: 25
benchmark_repeat: 100
divergence_threshold: 50.0
target_platform: cuda
gpu_name: "NVIDIA H100 NVL 94GB"
35 changes: 35 additions & 0 deletions examples/configs/beam_search_diverse_smoke.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0

# Smoke-test variant of beam_search_diverse.yaml.
#
# Per-round fanout: 1 parent × 3 bottlenecks × 3 models × 1 sample = 9 workers.
# Exercises the bottleneck-plumbing fix — for each model, the 3 bottleneck_ids
# should produce 3 distinct optimization directions, not 3 copies of the
# top-ranked bottleneck.

# Search strategy and per-round worker budget.
strategy: beam_search
# 1 parent × 3 bottlenecks × 3 models × 1 sample = 9 workers per round.
num_workers: 9

# Strategy-specific knobs. Everything indented under `strategy_config` is
# part of this mapping; the keys below it at column 0 are top-level settings.
strategy_config:
  num_top_kernels: 4
  num_expanding_parents: 1
  num_bottlenecks: 3  # ← drives BottleneckAnalyzer to request 3 ranked options
  samples_per_prompt: 1
  models:
    - claude-opus-4.6
    - gpt-5-4
    - gemini-2-5-pro

# Default LLM; the `models` list above supplies the per-candidate models.
openai_model: claude-opus-4.6
high_reasoning_effort: true

# Worker configuration
benchmark_warmup: 25
benchmark_repeat: 100
divergence_threshold: 50.0
target_platform: cuda
gpu_name: "NVIDIA H100 NVL 94GB"
Loading
Loading