From 02ebc899207a0d4cdff96958effc601547e99e69 Mon Sep 17 00:00:00 2001
From: Curnane
Date: Mon, 25 May 2026 17:42:05 +0800
Subject: [PATCH] feat(configs): add Qwen3.5-4B DFlash draft config and training example

- Add configs/qwen3.5-4b-dflash.json with model architecture aligned to
  Qwen3.5-4B (hidden_size=2560, vocab_size=248320,
  target_layer_ids=[1,8,15,22,29]).
- Add examples/run_qwen3.5_4b_dflash_online_npu.sh as a reference training
  script using HF backend and paper-aligned hyperparameters:
  max_length=3072, num_anchors=512, lr=6e-4, gamma=7.0.
- The config is compatible with the existing DFlash training pipeline and
  uses the same block_size=16 and attention backends as other Qwen3/3.5
  examples.
---
 configs/qwen3.5-4b-dflash.json               | 47 ++++++++++++++++++
 examples/run_qwen3.5_4b_dflash_online_npu.sh | 52 ++++++++++++++++++++
 2 files changed, 99 insertions(+)
 create mode 100644 configs/qwen3.5-4b-dflash.json
 create mode 100644 examples/run_qwen3.5_4b_dflash_online_npu.sh

diff --git a/configs/qwen3.5-4b-dflash.json b/configs/qwen3.5-4b-dflash.json
new file mode 100644
index 00000000..1acd248f
--- /dev/null
+++ b/configs/qwen3.5-4b-dflash.json
@@ -0,0 +1,47 @@
+{
+  "architectures": [
+    "DFlashDraftModel"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "auto_map": {
+    "AutoModel": "dflash.DFlashDraftModel"
+  },
+  "block_size": 16,
+  "bos_token_id": 248043,
+  "dflash_config": {
+    "mask_token_id": 248070,
+    "target_layer_ids": [1, 8, 15, 22, 29]
+  },
+  "dtype": "bfloat16",
+  "eos_token_id": 248044,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 262144,
+  "max_window_layers": 5,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 5,
+  "num_key_value_heads": 8,
+  "num_target_layers": 32,
+  "pad_token_id": 248044,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 10000000,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.1",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 248320
+}
diff --git a/examples/run_qwen3.5_4b_dflash_online_npu.sh b/examples/run_qwen3.5_4b_dflash_online_npu.sh
new file mode 100644
index 00000000..9c4c997e
--- /dev/null
+++ b/examples/run_qwen3.5_4b_dflash_online_npu.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+#
+# Reference launcher for DFlash draft-model training against Qwen3.5-4B.
+#
+# Usage: run_qwen3.5_4b_dflash_online_npu.sh [NUM_GPUS] [ATTENTION_BACKEND]
+#   NUM_GPUS           value for torchrun --nproc_per_node (default: 8)
+#   ATTENTION_BACKEND  value for --attention-backend (default: flex_attention)
+#
+# Replace the PATH/TO/Qwen3.5-4B placeholder below before running.
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+# Quoted so that a checkout path containing spaces is not word-split.
+ROOT_DIR=$(dirname -- "$SCRIPT_DIR")
+
+# Train DFlash for Qwen3.5-4B
+TP_SIZE=1
+BUILD_DATASET_NUM_PROC=64
+
+export HF_DATASETS_CACHE="$ROOT_DIR/cache/hf_datasets"
+export TORCHINDUCTOR_CACHE_DIR="$ROOT_DIR/cache/compiled_kernels"
+
+# Positional arguments: $1 = process count, $2 = attention backend.
+NUM_GPUS=${1:-8}
+ATTENTION_BACKEND=${2:-flex_attention}
+
+torchrun \
+  --standalone \
+  --nproc_per_node "$NUM_GPUS" \
+  "$ROOT_DIR/scripts/train_dflash.py" \
+  --target-model-path PATH/TO/Qwen3.5-4B \
+  --draft-config-path "$ROOT_DIR/configs/qwen3.5-4b-dflash.json" \
+  --train-data-path "$ROOT_DIR/cache/dataset/train_regen.jsonl" \
+  --output-dir "$ROOT_DIR/outputs/qwen3.5-4b-dflash" \
+  --num-epochs 10 \
+  --batch-size 2 \
+  --accumulation-steps 4 \
+  --learning-rate 6e-4 \
+  --warmup-ratio 0.04 \
+  --max-grad-norm 1.0 \
+  --max-length 3072 \
+  --chat-template qwen3.5 \
+  --attention-backend "$ATTENTION_BACKEND" \
+  --num-anchors 512 \
+  --loss-decay-gamma 7.0 \
+  --log-interval 50 \
+  --save-interval 10000 \
+  --report-to tensorboard \
+  --target-model-backend hf \
+  --tp-size "$TP_SIZE" \
+  --build-dataset-num-proc "$BUILD_DATASET_NUM_PROC" \
+  --block-size 16 \
+  --trust-remote-code