# Ray Distributed Training Configuration
# This configuration file defines settings for Ray Train distributed training
# =============================================================================
# Ray Cluster Configuration
# =============================================================================
cluster:
  # Ray cluster address
  # - "auto": Initialize local Ray cluster automatically
  # - "ray://<head_ip>:10001": Connect to remote Ray cluster
  ray_address: "auto"
# =============================================================================
# Scaling Configuration
# =============================================================================
scaling:
  # Number of training workers (typically equals number of GPUs)
  num_workers: 8

  # Whether to use GPU for training
  use_gpu: true

  # Resources allocated per worker
  resources_per_worker:
    CPU: 4  # CPU cores per worker
    GPU: 1  # GPUs per worker (1 for single-GPU workers)
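
# The scaling block maps naturally onto Ray Train's ScalingConfig; a minimal
# sketch, assuming the training script builds it from this file:
#
#   from ray.train import ScalingConfig
#
#   scaling_config = ScalingConfig(
#       num_workers=8,
#       use_gpu=True,
#       resources_per_worker={"CPU": 4, "GPU": 1},
#   )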
# =============================================================================
# Training Configuration
# =============================================================================
training:
  # PyTorch distributed backend
  # - "nccl": For GPU training (recommended)
  # - "gloo": For CPU training
  backend: "nccl"

  # DeepSpeed integration
  use_deepspeed: true

  # DeepSpeed configuration (maps to Accelerate's deepspeed_config)
  deepspeed_config:
    # ZeRO optimization configuration
    zero_optimization:
      stage: 2  # ZeRO-2 optimization (1, 2, or 3)
      # Note: ZeRO-3 requires additional memory management

    # Mixed precision training
    bf16:
      enabled: true  # Use BFloat16 for mixed precision

    # Gradient configuration
    gradient_clipping: 1.0
    gradient_accumulation_steps: 1

    # Batch size per GPU (should match train_batch_size below)
    train_micro_batch_size_per_gpu: 8
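
# One way this block could reach DeepSpeed via Accelerate (illustrative sketch;
# the actual integration lives in the training script, and ds_config /
# the plugin variable are hypothetical names):
#
#   from accelerate import Accelerator
#   from accelerate.utils import DeepSpeedPlugin
#
#   ds_config = {
#       "zero_optimization": {"stage": 2},
#       "bf16": {"enabled": True},
#       "gradient_clipping": 1.0,
#       "gradient_accumulation_steps": 1,
#       "train_micro_batch_size_per_gpu": 8,
#   }
#   accelerator = Accelerator(deepspeed_plugin=DeepSpeedPlugin(hf_ds_config=ds_config))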
# =============================================================================
# Fault Tolerance Configuration
# =============================================================================
fault_tolerance:
  # Maximum number of failures before giving up
  max_failures: 3

  # Number of checkpoints to keep
  checkpoint_num_to_keep: 3

  # Checkpoint selection criteria
  checkpoint_score_attribute: "loss"  # Metric to use for checkpoint selection
  checkpoint_score_order: "min"       # "min" for loss, "max" for accuracy
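
# These fields map onto Ray Train's failure and checkpoint handling; a minimal
# sketch (the run_config variable name is illustrative, the Ray APIs are standard):
#
#   from ray.train import RunConfig, FailureConfig, CheckpointConfig
#
#   run_config = RunConfig(
#       failure_config=FailureConfig(max_failures=3),
#       checkpoint_config=CheckpointConfig(
#           num_to_keep=3,
#           checkpoint_score_attribute="loss",
#           checkpoint_score_order="min",
#       ),
#   )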
# =============================================================================
# Model and Data Configuration
# =============================================================================
# Model path (HuggingFace model or local path)
model_path: "Qwen/Qwen2.5-0.5B" # Example: change to your model
# Experiment identification
experiment_id: "ray_f2llm_training"
# Output directories
output_dir: "./outputs/ray_train"
tb_dir: "./tensorboard/ray_train"
cache_dir: "./cache"
# Training data
train_data_path: "./data/train"
# =============================================================================
# Training Hyperparameters
# =============================================================================
# Batch size per device
train_batch_size: 8
# Maximum sequence length
max_seq_length: 2048
# Optimizer settings
learning_rate: 1.0e-4
min_lr: 1.0e-6
weight_decay: 0.01
# Learning rate schedule
warmup_steps: 100
# Embedding training settings
num_hard_neg: 7 # Number of hard negatives per sample
# Training duration
# train_steps: -1 means use train_epochs instead
train_steps: -1
train_epochs: 5
# Logging and checkpointing
log_interval: 20 # Log every N steps
checkpointing_steps: 100 # Save checkpoint every N steps
validation_steps: 100 # Run validation every N steps
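
# These hyperparameters are usually forwarded to each worker through Ray Train's
# train_loop_config and read back inside the per-worker loop; a hedged sketch
# (the function name and the keys read here are illustrative):
#
#   def train_loop_per_worker(config):
#       lr = config["learning_rate"]             # 1.0e-4
#       batch_size = config["train_batch_size"]  # 8
#       max_len = config["max_seq_length"]       # 2048
#       ...  # build model, dataloaders, optimizer; run the training loop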
# =============================================================================
# Notes and Tips
# =============================================================================
# 1. For single-node training:
# - Keep ray_address: "auto"
# - Set num_workers to number of GPUs
#
# 2. For multi-node training:
# - Start Ray cluster on head node: ray start --head --port=6379
# - Start Ray on worker nodes: ray start --address=<head_ip>:6379
# - Set ray_address: "ray://<head_ip>:10001"
# - Set num_workers to total GPUs across all nodes
#
# 3. For fault-tolerant training (spot instances):
# - Increase max_failures (e.g., 5-10)
# - Decrease checkpointing_steps (e.g., 50)
# - Increase checkpoint_num_to_keep (e.g., 5)
#
# 4. DeepSpeed ZeRO stages:
# - ZeRO-1: Optimizer state partitioning
# - ZeRO-2: + Gradient partitioning (recommended)
# - ZeRO-3: + Parameter partitioning (for very large models)
#
# 5. For debugging:
# - Set num_workers: 1
# - Use local_mode: ray.init(local_mode=True) in code
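#
# 6. Putting it together (a hedged end-to-end sketch; train_loop_per_worker and
#    the scaling_config / run_config variables from the sketches above are
#    illustrative, the Ray Train APIs themselves are standard):
#
#      import yaml
#      import ray
#      from ray.train.torch import TorchTrainer, TorchConfig
#
#      cfg = yaml.safe_load(open("ray_config.yaml"))
#      ray.init(address=cfg["cluster"]["ray_address"])
#
#      trainer = TorchTrainer(
#          train_loop_per_worker,                   # your training function
#          train_loop_config=cfg,                   # hyperparameters, paths, etc.
#          torch_config=TorchConfig(backend=cfg["training"]["backend"]),
#          scaling_config=scaling_config,           # see Scaling Configuration above
#          run_config=run_config,                   # see Fault Tolerance Configuration above
#      )
#      result = trainer.fit()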