36 | 36 | approx_topk_threshold
37 | 37 | ) |
38 | 38 |
39 | | -from src.modeling_utils import ( |
40 | | -    FastLoRAProjection, BaseModelOutputWithPastAndPredictorLoss
41 | | -) |
42 | | - |
43 | 39 | logger = logging.get_logger(__name__) |
44 | 40 |
| 41 | +@dataclass
| 42 | +class BaseModelOutputWithPastAndPredictorLoss(ModelOutput):
| 43 | +    loss: Optional[torch.FloatTensor] = None
| 44 | +    last_hidden_state: Optional[torch.FloatTensor] = None
| 45 | +    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
| 46 | +    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
| 47 | +    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
| 48 | +
| 49 | +
| 50 | +class FastLoRAProjection(nn.Module):
| 51 | +    def __init__(self, hidden_size, intermediate_size, lora_size):
| 52 | +        super().__init__()
| 53 | +        self.hidden_size = hidden_size
| 54 | +        self.intermediate_size = intermediate_size
| 55 | +        self.lora_size = lora_size
| 56 | +        # Force creation of linear layers with actual tensors (not meta tensors)
| 57 | +        self.down = nn.Linear(hidden_size, lora_size, bias=False)
| 58 | +        self.up = nn.Linear(lora_size, intermediate_size, bias=False)
| 59 | +        # Pre-allocate buffers on CPU initially
| 60 | +        self.register_buffer('intermediate', torch.zeros(1, lora_size))
| 61 | +        self.register_buffer('output', torch.zeros(1, intermediate_size))
| 62 | +
| 63 | +    def to(self, *args, **kwargs):
| 64 | +        # Move buffers to same device as model when .to() is called
| 65 | +        device = args[0] if args else kwargs.get('device')
| 66 | +
| 67 | +        if device:
| 68 | +            self.intermediate = self.intermediate.to(device)
| 69 | +            self.output = self.output.to(device)
| 70 | +        return super().to(*args, **kwargs)
| 71 | +
| 72 | +    def _fix_unloaded_weights(self):
| 73 | +        out = self.to_empty(device="cpu")
| 74 | +        with torch.no_grad():
| 75 | +            torch.nn.init.xavier_normal_(out.down.weight)
| 76 | +            torch.nn.init.zeros_(out.up.weight)  # Initialize up projection to zeros for stable training
| 77 | +        return out
| 78 | +
| 79 | +    def _resize_buffers(self, batch_size: int, dtype: torch.dtype):
| 80 | +        if self.intermediate.size(0) != batch_size or self.intermediate.dtype != dtype:  # rebuild when batch size or dtype changes
| 81 | +            self.intermediate.resize_(batch_size, self.lora_size)
| 82 | +            self.intermediate = self.intermediate.to(dtype=dtype)
| 83 | +            self.intermediate.fill_(0.0)  # Explicitly initialize with zeros
| 84 | +            self.output.resize_(batch_size, self.intermediate_size)
| 85 | +            self.output = self.output.to(dtype=dtype)
| 86 | +            self.output.fill_(0.0)  # Explicitly initialize with zeros
| 87 | + |
| 88 | +    def forward(self, x):
| 89 | +        batch_size = x.size(0)
| 90 | +
| 91 | +        # Check if gradients are required (training mode)
| 92 | +        if self.training:
| 93 | +            # Use regular matrix multiplication for gradient computation
| 94 | +            intermediate = torch.mm(x, self.down.weight.t())
| 95 | +            output = torch.mm(intermediate, self.up.weight.t())
| 96 | +            return output
| 97 | +        else:
| 98 | +            # Use optimized in-place matmuls into the pre-allocated buffers for inference;
| 99 | +            # equivalent out-of-place version:
| 100 | +            #   intermediate = torch.mm(x, self.down.weight.t())
| 101 | +            #   output = torch.mm(intermediate, self.up.weight.t())
| 102 | +
| 103 | +            self._resize_buffers(batch_size, x.dtype)
| 104 | +            torch.mm(x, self.down.weight.t(), out=self.intermediate)
| 105 | +            torch.mm(self.intermediate, self.up.weight.t(), out=self.output)
| 106 | +            return self.output
45 | 107 |
46 | 108 | class SkipMLP(nn.Module): |
47 | 109 |     def __init__(self, hidden_size: int, intermediate_size: int, sparsity: float, bias: bool = False):
@@ -415,6 +477,15 @@ def __init__(self, config): |
415 | 477 |
416 | 478 |         # Initialize weights and apply final processing
417 | 479 |         self.post_init()
| 480 | + |
| 481 | +    @classmethod
| 482 | +    def from_pretrained(cls, *args, **kwargs):
| 483 | +        out = super(SkipConnectionModelForCausalLM, cls).from_pretrained(*args, **kwargs)
| 484 | +        for module in out.modules():
| 485 | +            if any(hasattr(p, 'is_meta') and p.is_meta for p in module.parameters()) and \
| 486 | +                    hasattr(module, '_fix_unloaded_weights'):
| 487 | +                module._fix_unloaded_weights()  # re-initializes the module's weights in place
| 488 | +        return out
418 | 489 |
419 | 490 |     def get_input_embeddings(self):
420 | 491 |         return self.model.embed_tokens
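For context, a minimal usage sketch (not part of this diff) of the inference path added above; it assumes only the `FastLoRAProjection` definition from this change, with sizes chosen purely for illustration, and shows that in eval mode the module writes into and returns its pre-allocated `output` buffer:

```python
import torch

# Hypothetical sizes, for illustration only.
proj = FastLoRAProjection(hidden_size=64, intermediate_size=256, lora_size=8)
proj.eval()  # eval mode takes the buffer-backed, in-place torch.mm path

x = torch.randn(4, 64)
with torch.no_grad():
    y = proj(x)

# The result lives in the module's pre-allocated 'output' buffer,
# which is resized to the current batch size on the first call.
assert y.shape == (4, 256)
assert y.data_ptr() == proj.output.data_ptr()
```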