From 02ebc899207a0d4cdff96958effc601547e99e69 Mon Sep 17 00:00:00 2001
From: Curnane <mingliangfu@users.noreply.github.com>
Date: Mon, 25 May 2026 17:42:05 +0800
Subject: [PATCH] feat(configs): add Qwen3.5-4B DFlash draft config and
 training example

- Add configs/qwen3.5-4b-dflash.json with model architecture aligned
  to Qwen3.5-4B (hidden_size=2560, vocab_size=248320,
  target_layer_ids=[1,8,15,22,29]).

- Add examples/run_qwen3.5_4b_dflash_online_npu.sh as a reference training script
  using HF backend and paper-aligned hyperparameters:
  max_length=3072, num_anchors=512, lr=6e-4, gamma=7.0.

- The config is compatible with the existing DFlash training pipeline
  and uses the same block_size=16 and attention backends as other
  Qwen3/3.5 examples.
---
 configs/qwen3.5-4b-dflash.json               | 47 ++++++++++++++++++++
 examples/run_qwen3.5_4b_dflash_online_npu.sh | 42 +++++++++++++++++
 2 files changed, 89 insertions(+)
 create mode 100644 configs/qwen3.5-4b-dflash.json
 create mode 100644 examples/run_qwen3.5_4b_dflash_online_npu.sh

diff --git a/configs/qwen3.5-4b-dflash.json b/configs/qwen3.5-4b-dflash.json
new file mode 100644
index 00000000..1acd248f
--- /dev/null
+++ b/configs/qwen3.5-4b-dflash.json
@@ -0,0 +1,47 @@
+{
+    "architectures": [
+      "DFlashDraftModel"
+    ],
+    "attention_bias": false,
+    "attention_dropout": 0.0,
+    "auto_map": {
+      "AutoModel": "dflash.DFlashDraftModel"
+    },
+    "block_size": 16,
+    "bos_token_id": 248043,
+    "dflash_config": {
+      "mask_token_id": 248070,
+      "target_layer_ids": [1, 8, 15, 22, 29]
+    },
+    "dtype": "bfloat16",
+    "eos_token_id": 248044,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2560,
+    "initializer_range": 0.02,
+    "intermediate_size": 9728,
+    "layer_types": [
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention",
+      "full_attention"
+    ],
+    "max_position_embeddings": 262144,
+    "max_window_layers": 5,
+    "model_type": "qwen3",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 5,
+    "num_key_value_heads": 8,
+    "num_target_layers": 32,
+    "pad_token_id": 248044,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": null,
+    "rope_theta": 10000000,
+    "sliding_window": null,
+    "tie_word_embeddings": true,
+    "transformers_version": "4.57.1",
+    "use_cache": true,
+    "use_sliding_window": false,
+    "vocab_size": 248320
+}
diff --git a/examples/run_qwen3.5_4b_dflash_online_npu.sh b/examples/run_qwen3.5_4b_dflash_online_npu.sh
new file mode 100644
index 00000000..9c4c997e
--- /dev/null
+++ b/examples/run_qwen3.5_4b_dflash_online_npu.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Train a DFlash draft model for Qwen3.5-4B via torchrun (--standalone).
+# Usage: run_qwen3.5_4b_dflash_online_npu.sh [NUM_GPUS=8] [ATTENTION_BACKEND=flex_attention]
+set -euo pipefail  # abort on failed commands / unset vars instead of training with bad paths
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+ROOT_DIR=$(dirname -- "$SCRIPT_DIR")
+TP_SIZE=1
+BUILD_DATASET_NUM_PROC=64
+export HF_DATASETS_CACHE="$ROOT_DIR/cache/hf_datasets"
+export TORCHINDUCTOR_CACHE_DIR="$ROOT_DIR/cache/compiled_kernels"
+
+NUM_GPUS=${1:-8}                        # $1: forwarded to --nproc_per_node
+ATTENTION_BACKEND=${2:-flex_attention}  # $2: forwarded to --attention-backend
+
+torchrun \
+    --standalone \
+    --nproc_per_node "$NUM_GPUS" \
+    "$ROOT_DIR/scripts/train_dflash.py" \
+    --target-model-path PATH/TO/Qwen3.5-4B \
+    --draft-config-path "$ROOT_DIR/configs/qwen3.5-4b-dflash.json" \
+    --train-data-path "$ROOT_DIR/cache/dataset/train_regen.jsonl" \
+    --output-dir "$ROOT_DIR/outputs/qwen3.5-4b-dflash" \
+    --num-epochs 10 \
+    --batch-size 2 \
+    --accumulation-steps 4 \
+    --learning-rate 6e-4 \
+    --warmup-ratio 0.04 \
+    --max-grad-norm 1.0 \
+    --max-length 3072 \
+    --chat-template qwen3.5 \
+    --attention-backend "$ATTENTION_BACKEND" \
+    --num-anchors 512 \
+    --loss-decay-gamma 7.0 \
+    --log-interval 50 \
+    --save-interval 10000 \
+    --report-to tensorboard \
+    --target-model-backend hf \
+    --tp-size "$TP_SIZE" \
+    --build-dataset-num-proc "$BUILD_DATASET_NUM_PROC" \
+    --block-size 16 \
+    --trust-remote-code