Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions config/act_patch/mimo_mimo_act_patch_cf.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# MiMo-7B Activation Patching Configuration
# Uses MiMo-7B for activation patching predictions

# Model identifiers, generation settings, special tokens and output locations
# for the activation-patching run.
# Both paths name the same Hugging Face checkpoint.
# presumably target_model_path is the model being analysed and model_path the
# model trained to predict its behaviour — confirm against the loader.
target_model_path: XiaomiMiMo/MiMo-7B-RL
model_path: XiaomiMiMo/MiMo-7B-RL
# presumably forwarded to the Hugging Face loader, where it allows the repo's
# own modeling code to run locally — confirm, and consider pinning a revision.
trust_remote_code: true

# MiMo-specific settings
# Sampling is enabled with temperature 0.6.
generation_config:
temperature: 0.6
do_sample: true

# begin_continuous / end_continuous correspond to the {begin_continuous} and
# {end_continuous} placeholders used by the prompt templates in this file;
# continuous_rep is not referenced by any template here.
# NOTE(review): these strings follow Llama-3's reserved-token naming — confirm
# they exist in (or are added to) the MiMo tokenizer.
continuous_tokens:
"begin_continuous": "<|reserved_special_token_0|>"
"end_continuous": "<|reserved_special_token_1|>"
"continuous_rep": "<|reserved_special_token_2|>"

# Output and cache locations, both under /data/artifacts/mimo/.
output_dir: /data/artifacts/mimo/checkpoints/mimo_act_patch_cf
cache_dir: /data/artifacts/mimo/cache/

# Training settings for the activation-patching task.
train:
# Very large cap; presumably means "use every available sample" — confirm.
num_samples: 100000000
batch_size: 16 # Reduced for 7B model memory
# Checkpointing and evaluation are both step-based and share the same
# interval value (2000).
save_strategy: steps
save_total_limit: 10
save_steps: 2000
# Explicit !!float tag: `5e-5` has no decimal point, and some YAML 1.1
# loaders (e.g. PyYAML) would otherwise read it as a string.
learning_rate: !!float 5e-5
num_epochs: 20
eval_strategy: steps
eval_steps: 2000
# LoRA is switched on here; lora_r is presumably the adapter rank — confirm.
peft_lora: true
lora_r: 128
dataset: ["counterfact"]
evaluation_type: mixed
# The same intervention_path value is repeated in the test section of this
# file.
intervention_path: Transluce/act_patch_mimo_7b_counterfact
hf_data_cache_dir: /data/artifacts/mimo/datasets/
# MiMo-7B has 32 layers (similar to Llama-3.1-8B)
# NOTE(review): verify this count against `num_hidden_layers` in the
# checkpoint's config.json — if the model has more layers, the list below
# (indices 0-31) leaves the upper ones out.
layers: [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
bf16: true
tasks:
act_patch:
num_samples: 100000000
question_types:
generative_explanation:
evaluation_type: exact_match
weight: 1.0
# Five paraphrases of the same question. Every user turn uses the {feature},
# {layer}, {tokens} and {input} placeholders together with the
# {begin_continuous}/{end_continuous} markers, and every conversation ends
# with an assistant turn whose content is the empty string.
prompts:
- messages:
- role: "user"
content: "If feature {begin_continuous}{feature}{end_continuous} at layer {layer} is added to tokens {tokens} when processing the text <<<{input}>>>, how would the output change?"
- role: "assistant"
content: ""
- messages:
- role: "user"
content: "When feature {begin_continuous}{feature}{end_continuous} at layer {layer} is added at tokens {tokens} in the input <<<{input}>>>, what happens to the model's output?"
- role: "assistant"
content: ""
- messages:
- role: "user"
content: "Consider the input text: <<<{input}>>>. If we steer layer {layer} towards feature {begin_continuous}{feature}{end_continuous} at tokens {tokens}, how does this affect the generated continuation?"
- role: "assistant"
content: ""
- messages:
- role: "user"
content: "Given the text <<<{input}>>>, what would be the effect on the output if feature {begin_continuous}{feature}{end_continuous} at layer {layer} is added to tokens {tokens}?"
- role: "assistant"
content: ""
- messages:
- role: "user"
content: "If we steer towards feature {begin_continuous}{feature}{end_continuous} at layer {layer} and tokens {tokens} when processing <<<{input}>>>, how would the model's response differ?"
- role: "assistant"
content: ""

# Test-time settings: batch size 8 (train uses 16), 3200 samples for the
# act_patch task, the same intervention_path value as the train section, and
# exact_match evaluation.
test:
batch_size: 8
tasks:
act_patch:
num_samples: 3200
intervention_path: Transluce/act_patch_mimo_7b_counterfact
evaluation_type: exact_match
80 changes: 80 additions & 0 deletions config/feature_descriptions/mimo_131k.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# MiMo-7B Feature Descriptions Configuration
# Uses MiMo-7B as the explainer model for SAE feature descriptions of Llama-3.1-8B

# Model identifiers, generation settings, special tokens and output locations
# for the feature-description run.
# Unlike the other MiMo configs, the two paths differ: target_model_path names
# a Llama-3.1-8B checkpoint while model_path names the MiMo-7B-RL checkpoint.
target_model_path: meta-llama/Llama-3.1-8B
model_path: XiaomiMiMo/MiMo-7B-RL
# presumably forwarded to the Hugging Face loader, where it allows the repo's
# own modeling code to run locally — confirm, and consider pinning a revision.
trust_remote_code: true

# MiMo-specific settings
# MiMo recommends empty system prompt and temperature 0.6
generation_config:
temperature: 0.6
do_sample: true

# begin_continuous / end_continuous correspond to the {begin_continuous} and
# {end_continuous} placeholders used by the prompt templates in this file;
# continuous_rep is not referenced by any template here.
# NOTE(review): these strings follow Llama-3's reserved-token naming — confirm
# they exist in (or are added to) the MiMo tokenizer.
continuous_tokens:
"begin_continuous": "<|reserved_special_token_0|>"
"end_continuous": "<|reserved_special_token_1|>"
"continuous_rep": "<|reserved_special_token_2|>"

# Output and cache locations, both under /data/artifacts/mimo/.
output_dir: /data/artifacts/mimo/checkpoints/mimo_feature_descriptions
cache_dir: /data/artifacts/mimo/cache/

# Enabled for this config; the effect of this flag is defined by the consuming
# code and is not visible here.
use_embed_proj: true

# Training settings for the SAE feature-description task.
train:
# Very large cap; presumably means "use every available sample" — confirm.
num_samples: 1000000000
batch_size: 32 # Reduced for 7B model memory
# Checkpointing and evaluation are both step-based and share the same
# interval value (8000).
save_strategy: steps
save_total_limit: 10
save_steps: 8000
# Explicit !!float tag: `5e-5` has no decimal point, and some YAML 1.1
# loaders (e.g. PyYAML) would otherwise read it as a string.
learning_rate: !!float 5e-5
num_epochs: 100
eval_strategy: steps
eval_steps: 8000
# LoRA is switched off, yet lora_r is still set.
# presumably lora_r is ignored while peft_lora is false — confirm.
peft_lora: false
lora_r: 128
dataset: ["sae_explanations"]
evaluation_type: mixed
# NOTE(review): this directory sits under /data/artifacts/bzl/, a different
# root from output_dir and cache_dir (/data/artifacts/mimo/) — confirm it is
# reachable in the target environment.
explanation_dir: /data/artifacts/bzl/autointerp/datasets/SAE_feature_explanations_llama3.1_8b/
# Both paths below carry a {split} placeholder.
split_keys: explanations/{split}_layer_feature_idxs.pkl
split_explanations_data: explanations/{split}_explanations.pkl
sae_save_path: features
# Layer indices 0-31; the identical list is repeated for the
# generative_explanation question type further down.
layers: [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
bf16: true
tasks:
features_explain:
question_types:
generative_explanation:
evaluation_type: semantic_similarity
weight: 1.0
layers: [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
# Five prompt paraphrases for the feature-description task. Every user turn
# contains the {begin_continuous}{feature}{end_continuous} span and the
# {layer} placeholder.
# Every assistant turn carries an explicit empty `content`, so all messages
# share the same role/content shape; a bare `role` entry would leave the key
# missing for whatever code reads these messages.
prompts:
- messages:
- role: "user"
content: "Generate a description of this feature at layer {layer}: {begin_continuous}{feature}{end_continuous}"
- role: "assistant"
content: ""
- messages:
- role: "user"
content: "What does {begin_continuous}{feature}{end_continuous} encode at layer {layer}?"
- role: "assistant"
content: ""
- messages:
- role: "user"
content: "{begin_continuous}{feature}{end_continuous} activates at layer {layer} for inputs with the following features:"
- role: "assistant"
content: ""
- messages:
- role: "user"
content: "What does {begin_continuous}{feature}{end_continuous} mean at layer {layer}?"
- role: "assistant"
content: ""
- messages:
- role: "user"
content: "Layer {layer}, {begin_continuous}{feature}{end_continuous} means?"
- role: "assistant"
content: ""

# Test-time settings: batch size 32, 1600 samples for the features_explain
# task, evaluated with semantic_similarity.
test:
batch_size: 32
tasks:
features_explain:
num_samples: 1600
evaluation_type: semantic_similarity
61 changes: 61 additions & 0 deletions config/input_ablation/mimo_mimo_hint.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# MiMo-7B Input Ablation Configuration
# Uses MiMo-7B for predicting input ablation effects

# Model identifiers, generation settings, special tokens and output locations
# for the input-ablation (hint) run.
# Both paths name the same Hugging Face checkpoint.
# presumably target_model_path is the model being analysed and model_path the
# model trained to predict its behaviour — confirm against the loader.
target_model_path: XiaomiMiMo/MiMo-7B-RL
model_path: XiaomiMiMo/MiMo-7B-RL
# presumably forwarded to the Hugging Face loader, where it allows the repo's
# own modeling code to run locally — confirm, and consider pinning a revision.
trust_remote_code: true

# MiMo-specific settings - empty system prompt recommended
# Sampling is enabled with temperature 0.6.
generation_config:
temperature: 0.6
do_sample: true

# None of these token names is referenced by the single prompt template in
# this file, which only uses {user_prompt}.
# NOTE(review): these strings follow Llama-3's reserved-token naming — confirm
# they exist in (or are added to) the MiMo tokenizer.
continuous_tokens:
"begin_continuous": "<|reserved_special_token_0|>"
"end_continuous": "<|reserved_special_token_1|>"
"continuous_rep": "<|reserved_special_token_2|>"

# Output and cache locations, both under /data/artifacts/mimo/.
output_dir: /data/artifacts/mimo/checkpoints/mimo_input_ablation
cache_dir: /data/artifacts/mimo/cache/

# Disabled for this config; the effect of this flag is defined by the
# consuming code and is not visible here.
use_embed_proj: false

# Training settings for the hint-attribution (input ablation) task.
train:
# Very large cap; presumably means "use every available sample" — confirm.
num_samples: 100000000
batch_size: 8
# Checkpointing and evaluation are both step-based and share the same
# interval value (2000).
save_strategy: steps
save_total_limit: 10
save_steps: 2000
# Explicit !!float tag: `5e-5` has no decimal point, and some YAML 1.1
# loaders (e.g. PyYAML) would otherwise read it as a string.
learning_rate: !!float 5e-5
num_epochs: 20
eval_strategy: steps
eval_steps: 2000
# LoRA is switched off, yet lora_r is still set.
# presumably lora_r is ignored while peft_lora is false — confirm.
peft_lora: false
lora_r: 128
dataset: ["hint"]
evaluation_type: mixed
hint_path: Transluce/input_ablation_mimo_7b_mmlu_hint
hf_data_cache_dir: /data/artifacts/mimo/datasets/
# Layer indices 0-31.
# NOTE(review): target_model_path is MiMo-7B-RL — verify the layer count
# against `num_hidden_layers` in the checkpoint's config.json.
layers: [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
bf16: true
tasks:
hint_attribution:
num_samples: 100000000
weight: 1.0
self_consistency: false
# evaluation_type: exact_match appears twice within this task: once here and
# once under generative_explanation.
evaluation_type: exact_match
question_types:
generative_explanation:
evaluation_type: exact_match
# A single prompt template. The user turn is double-quoted so that `\n\n` is
# parsed as two newline characters between {user_prompt} and the question; the
# assistant turn has empty content.
prompts:
- messages:
- role: "user"
content: "{user_prompt}\n\nIf the hint were removed how would the assistant answer change?"
- role: "assistant"
content: ""

# Test-time settings: batch size 8 and 3200 samples for the hint_attribution
# task. No evaluation_type is set in this section.
test:
batch_size: 8
tasks:
hint_attribution:
num_samples: 3200
2 changes: 2 additions & 0 deletions model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from model.continuous_gemma2 import ContinuousGemma2ForCausalLM
from model.continuous_llama import ContinuousLlama
from model.continuous_qwen import ContinuousQwen3ForCausalLM
from model.continuous_mimo import ContinuousMiMo
from model.nearest_neighbor import NearestNeighborModel
from model.continuous_peft import ContinuousPeft
from model.self_explanations import SelfExplanationsModel
Expand All @@ -11,6 +12,7 @@
"ContinuousGemma3ForCausalLM",
"ContinuousGemma2ForCausalLM",
"ContinuousQwen3ForCausalLM",
"ContinuousMiMo",
"ContinuousPeft",
"NearestNeighborModel",
"SelfExplanationsModel",
Expand Down
Loading