sgl-project · curnane-lab · May 25, 2026 · gemini-code-assist · May 25, 2026
@@ -0,0 +1,47 @@
+{
+    "architectures": [
+      "DFlashDraftModel"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "auto_map": {
+      "AutoModel": "dflash.DFlashDraftModel"
+    },
+    "block_size": 16,
+    "bos_token_id": 248043,
+    "dflash_config": {
+      "mask_token_id": 248070,
+      "target_layer_ids": [1, 8, 15, 22, 29]
+    },
+    "dtype": "bfloat16",
+    "eos_token_id": 248044,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2560,
+    "initializer_range": 0.02,
+    "intermediate_size": 9728,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 262144,
+    "max_window_layers": 5,
+    "model_type": "qwen3",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 5,
+    "num_key_value_heads": 8,
+    "num_target_layers": 32,
+    "pad_token_id": 248044,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 10000000,
+    "sliding_window": null,
+    "tie_word_embeddings": true,
+    "transformers_version": "4.57.1",
+    "use_cache": true,
+    "use_sliding_window": false,
+    "vocab_size": 248320
+}
@@ -0,0 +1,55 @@
+#!/bin/bash
+#
+# Train a DFlash draft model for Qwen3.5-4B with torchrun.
+#
+# Optional positional arguments:
+#   $1  NUM_GPUS, passed to torchrun as --nproc_per_node, default 8
+#   $2  ATTENTION_BACKEND, passed as --attention-backend, default flex_attention
+#
+# Replace PATH/TO/Qwen3.5-4B below with the real target model path before running.
+
+# Stop on the first failed command, unset variable, or failed pipeline stage.
+set -euo pipefail
+
+# ROOT_DIR is the parent of the directory that holds this script.
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+ROOT_DIR=$( dirname -- "$SCRIPT_DIR" )
+
+TP_SIZE=1
+BUILD_DATASET_NUM_PROC=64
+
+# Both cache directories live under ROOT_DIR/cache.
+export HF_DATASETS_CACHE="$ROOT_DIR/cache/hf_datasets"
+export TORCHINDUCTOR_CACHE_DIR="$ROOT_DIR/cache/compiled_kernels"
+
+NUM_GPUS=${1:-8}
+ATTENTION_BACKEND=${2:-flex_attention}
+
+# All expansions are quoted so a checkout path containing spaces stays one argument.
+torchrun \
+    --standalone \
+    --nproc_per_node "$NUM_GPUS" \
+    "$ROOT_DIR/scripts/train_dflash.py" \
+    --target-model-path PATH/TO/Qwen3.5-4B \
+    --draft-config-path "$ROOT_DIR/configs/qwen3.5-4b-dflash.json" \
+    --train-data-path "$ROOT_DIR/cache/dataset/train_regen.jsonl" \
+    --output-dir "$ROOT_DIR/outputs/qwen3.5-4b-dflash" \
+    --num-epochs 10 \
+    --batch-size 2 \
+    --accumulation-steps 4 \
+    --learning-rate 6e-4 \
+    --warmup-ratio 0.04 \
+    --max-grad-norm 1.0 \
+    --max-length 3072 \
+    --chat-template qwen3.5 \
+    --attention-backend "$ATTENTION_BACKEND" \
+    --num-anchors 512 \
+    --loss-decay-gamma 7.0 \
+    --log-interval 50 \
+    --save-interval 10000 \
+    --report-to tensorboard \
+    --target-model-backend hf \
+    --tp-size "$TP_SIZE" \
+    --build-dataset-num-proc "$BUILD_DATASET_NUM_PROC" \
+    --block-size 16 \
+    --trust-remote-code