Efficient-Large-Model
/

Fast_dVLM_3B

@@ -81,6 +81,49 @@ def hybrid_block_causal_mask_multiturn(b, h, q_idx, kv_idx, response_block_idx=N
     return block_diagonal | offset_block_causal | x0_causal
 def eval_block_diff_mask(q_idx, kv_idx, block_size=None):
     # Compute block indices
     block_q = q_idx // block_size
@@ -710,16 +753,42 @@ class Fast_dVLMAttention(nn.Module):
         key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
         cos, sin = position_embeddings
         if self.training:
-            #split q into two parts
-            q_1 = query_states[:,:,:query_states.shape[2]//2]
-            q_2 = query_states[:,:,query_states.shape[2]//2:]
-            #split k into two parts
-            k_1 = key_states[:,:,:key_states.shape[2]//2]
-            k_2 = key_states[:,:,key_states.shape[2]//2:]
-            q_1, k_1 = apply_multimodal_rotary_pos_emb(q_1, k_1, cos, sin, self.rope_scaling["mrope_section"])
-            q_2, k_2 = apply_multimodal_rotary_pos_emb(q_2, k_2, cos, sin, self.rope_scaling["mrope_section"])
             query_states = torch.cat((q_1, q_2), dim=-2)
             key_states = torch.cat((k_1, k_2), dim=-2)
         else:
@@ -1504,6 +1573,11 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
         self.minimum_noise_level = getattr(config, 'minimum_noise_level', 0.0)
         self.im_end_token_id = 151645  # <|im_end|> token id
         # Vision-to-text aligner (if vision output dim != text hidden dim)
         vision_out_dim = config.vision_config.out_hidden_size
         text_hidden = config.text_config.hidden_size
@@ -1614,6 +1688,27 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
         )
         return mask
     @can_return_tuple
     @auto_docstring
     def forward(
@@ -1726,6 +1821,8 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
                 labels = torch.cat([labels, complementary_labels], dim=0)
                 attention_mask = self.gen_hybrid_block_causal_mask(seq_len, response_block_idx, turn_idx, input_ids.shape[0], self.config.num_attention_heads)
             else:
                 # Multimodal block diffusion path.
@@ -1808,11 +1905,26 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
                 labels_noisy = labels.clone()
                 labels_noisy[~mask_indices] = -100
-                # Concatenate [noisy | clean] along the sequence dimension.
                 input_ids_pair1 = torch.cat([noisy_input_ids, original_input_ids], dim=1)
                 embeds_pair1 = torch.cat([noisy_embeds, original_embeds], dim=1)
                 labels_pair1 = labels_noisy
-                position_ids_pair1 = original_position_ids
                 # Complementary pair: mask the positions that were left clean above.
                 complementary_mask_indices = response_mask & ~mask_indices
@@ -1825,13 +1937,16 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
                 complementary_noisy_embeds_raw = self.model.language_model.embed_tokens(complementary_noisy_input_ids)
                 complementary_noisy_embeds = torch.where(vision_mask_3d, original_embeds, complementary_noisy_embeds_raw)
                 complementary_labels = original_labels.clone()
                 complementary_labels[~complementary_mask_indices] = -100
                 input_ids_pair2 = torch.cat([complementary_noisy_input_ids, original_input_ids], dim=1)
                 embeds_pair2 = torch.cat([complementary_noisy_embeds, original_embeds], dim=1)
                 labels_pair2 = complementary_labels
-                position_ids_pair2 = original_position_ids
                 # Stack the complementary pair along the batch dimension.
                 input_ids = torch.cat([input_ids_pair1, input_ids_pair2], dim=0)
@@ -1839,11 +1954,18 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
                 labels = torch.cat([labels_pair1, labels_pair2], dim=0)
                 position_ids = torch.cat([position_ids_pair1, position_ids_pair2], dim=1)
-                attention_mask = self.gen_hybrid_block_causal_mask(L, response_block_idx, turn_idx, input_ids.shape[0], self.config.num_attention_heads)
             # Phase D: forward through the inner model. Vision features (if any)
             # have already been scattered into inputs_embeds, so pixel_values are
-            # cleared to skip re-processing inside `Fast_dVLMModel`.
             outputs = self.model(
                 input_ids=input_ids,
                 pixel_values=None,
@@ -1889,7 +2011,8 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
         loss = None
         if self.training:
-            mdm_hidden_states = hidden_states[:, :hidden_states.shape[1]//2, :]
             # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
             slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
             logits = self.lm_head(mdm_hidden_states[:, slice_indices, :])
@@ -1901,7 +2024,7 @@ class Fast_dVLMForConditionalGeneration(Fast_dVLMPreTrainedModel, GenerationMixi
                 loss = self.loss_function(
                     logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **new_kwargs
                 ) * 0.5
-            causal_hidden_states = hidden_states[:hidden_states.shape[0]//2, hidden_states.shape[1]//2:, :]
             causal_logits = self.lm_head(causal_hidden_states[:, slice_indices, :])
             loss += self.loss_function(
                 logits=causal_logits, labels=original_labels, vocab_size=self.config.text_config.vocab_size, **new_kwargs

     return block_diagonal | offset_block_causal | x0_causal
+def hybrid_block_causal_mask_multiturn_asymmetric(
+    b, h, q_idx, kv_idx,
+    turn_idx_noisy=None,
+    turn_idx_clean=None,
+    n_noisy=None,
+):
+    """
+    Asymmetric variant of `hybrid_block_causal_mask_multiturn` used by the
+    efficient vision path.
+    Layout: ``[noisy(L_text) | clean(L)]`` where the noisy half drops vision
+    tokens, so ``L_text < L``. Separate ``turn_idx`` tensors are required for
+    the two halves (the noisy half indexes into the compressed positions, the
+    clean half into the original positions). Mask rules are identical to the
+    symmetric version:
+      * block_diagonal: x_t ↔ x_t within the same turn.
+      * offset_block_causal: x_t may attend to x_0 of strictly earlier turns.
+      * x0_causal: standard causal masking inside the x_0 region.
+
+    Args:
+        b: Batch index supplied by the mask machinery; not read by this function.
+        h: Head index supplied by the mask machinery; not read by this function.
+        q_idx: Query index into the concatenated ``[noisy | clean]`` sequence.
+        kv_idx: Key/value index into the same concatenated sequence.
+        turn_idx_noisy: 1-D lookup from noisy-half position to turn id.
+        turn_idx_clean: 1-D lookup from clean-half position to turn id.
+        n_noisy: Length of the noisy half; indices ``>= n_noisy`` are treated
+            as belonging to the clean (x_0) half.
+
+    Returns:
+        The element-wise OR of the three boolean rules above for the given
+        ``(q_idx, kv_idx)`` pair.
+
+    Note:
+        The ``None`` defaults are placeholders only. ``n_noisy`` is compared
+        against and both ``turn_idx_*`` tensors are indexed unconditionally,
+        so callers must bind all three (e.g. via ``functools.partial``).
+    """
+    x0_flag_q = (q_idx >= n_noisy)  # True when the query lies in the clean (x_0) half
+    x0_flag_kv = (kv_idx >= n_noisy)  # True when the key lies in the clean (x_0) half
+    pos_q = torch.where(x0_flag_q, q_idx - n_noisy, q_idx)  # position local to its own half
+    pos_kv = torch.where(x0_flag_kv, kv_idx - n_noisy, kv_idx)  # position local to its own half
+    turn_q = torch.where(
+        x0_flag_q,
+        turn_idx_clean[torch.clamp(pos_q, max=turn_idx_clean.shape[0] - 1)],  # both lookups are evaluated, so the index is clamped into range for each table
+        turn_idx_noisy[torch.clamp(pos_q, max=turn_idx_noisy.shape[0] - 1)],  # a clean-half position can exceed the shorter noisy table without the clamp
+    )
+    turn_kv = torch.where(
+        x0_flag_kv,
+        turn_idx_clean[torch.clamp(pos_kv, max=turn_idx_clean.shape[0] - 1)],  # same clamped lookup as for the query side
+        turn_idx_noisy[torch.clamp(pos_kv, max=turn_idx_noisy.shape[0] - 1)],
+    )
+    block_diagonal = ~x0_flag_q & ~x0_flag_kv & (turn_q == turn_kv)  # noisy -> noisy, same turn only
+    offset_block_causal = (turn_q > turn_kv) & x0_flag_kv & (~x0_flag_q)  # noisy -> clean, strictly earlier turns
+    x0_causal = x0_flag_q & x0_flag_kv & (pos_q >= pos_kv)  # clean -> clean, token-level causal
+    return block_diagonal | offset_block_causal | x0_causal
 def eval_block_diff_mask(q_idx, kv_idx, block_size=None):
     # Compute block indices
     block_q = q_idx // block_size
         key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
+        # `noisy_seq_len` is an MDM-specific kwarg; pop it before the value can
+        # leak into attention backends (e.g. flash_attention_2) that don't
+        # understand it.
+        noisy_seq_len = kwargs.pop("noisy_seq_len", None)
         cos, sin = position_embeddings
         if self.training:
+            total_seq_len = query_states.shape[2]
+            # The noisy half can be shorter than the clean half (multimodal
+            # batches drop vision tokens from the noisy side). When the caller
+            # tells us its length explicitly we honor it; otherwise we fall back
+            # to the symmetric split used for text-only batches.
+            if noisy_seq_len is not None:
+                noisy_len = int(noisy_seq_len)
+            else:
+                noisy_len = total_seq_len // 2
+            q_1 = query_states[:, :, :noisy_len]
+            q_2 = query_states[:, :, noisy_len:]
+            k_1 = key_states[:, :, :noisy_len]
+            k_2 = key_states[:, :, noisy_len:]
+            if cos.shape[2] >= total_seq_len:
+                cos_1 = cos[:, :, :noisy_len, :]
+                sin_1 = sin[:, :, :noisy_len, :]
+                cos_2 = cos[:, :, noisy_len:, :]
+                sin_2 = sin[:, :, noisy_len:, :]
+            else:
+                # `position_ids` only covers the clean half length. Both halves
+                # share the same RoPE — valid only for the symmetric layout
+                # where noisy_len == clean_len.
+                cos_1, sin_1 = cos, sin
+                cos_2, sin_2 = cos, sin
+            q_1, k_1 = apply_multimodal_rotary_pos_emb(q_1, k_1, cos_1, sin_1, self.rope_scaling["mrope_section"])
+            q_2, k_2 = apply_multimodal_rotary_pos_emb(q_2, k_2, cos_2, sin_2, self.rope_scaling["mrope_section"])
             query_states = torch.cat((q_1, q_2), dim=-2)
             key_states = torch.cat((k_1, k_2), dim=-2)
         else:
         self.minimum_noise_level = getattr(config, 'minimum_noise_level', 0.0)
         self.im_end_token_id = 151645  # <|im_end|> token id
+        # Length of the noisy half passed through to attention. For text-only
+        # batches it equals the (symmetric) sequence length; for multimodal
+        # batches the noisy half drops vision tokens, so it is shorter.
+        self._noisy_seq_len: Optional[int] = None
         # Vision-to-text aligner (if vision output dim != text hidden dim)
         vision_out_dim = config.vision_config.out_hidden_size
         text_hidden = config.text_config.hidden_size
         )
         return mask
+    def gen_hybrid_block_causal_mask_asymmetric(
+        self, L_text, L_clean, turn_idx_noisy, turn_idx_clean, B, H
+    ):
+        """Generate the asymmetric hybrid mask used by the efficient vision path.
+        Layout: ``[noisy(L_text) | clean(L_clean)]`` where vision tokens have
+        been removed from the noisy half.
+
+        Args:
+            L_text: Length of the noisy half (number of positions kept).
+            L_clean: Length of the clean half.
+            turn_idx_noisy: Turn-id lookup for the noisy half, forwarded to
+                `hybrid_block_causal_mask_multiturn_asymmetric`.
+            turn_idx_clean: Turn-id lookup for the clean half, forwarded to
+                the same mask function.
+            B: Batch size passed through to `create_block_mask`.
+            H: Number of heads passed through to `create_block_mask`.
+
+        Returns:
+            The object returned by `create_block_mask` for a square
+            ``(L_text + L_clean)`` x ``(L_text + L_clean)`` attention pattern.
+
+        NOTE(review): `create_block_mask` is called without an explicit
+        ``device`` argument while ``n_noisy_t`` is placed on ``self.device``;
+        confirm the default device matches the model's device.
+        """
+        n_noisy_t = torch.tensor(L_text, device=self.device, dtype=torch.int32)  # boundary as a 0-dim tensor; presumably so the mask function can capture it as a tensor -- TODO confirm
+        total = L_text + L_clean  # full length of the concatenated [noisy | clean] sequence
+        mask = create_block_mask(
+            partial(
+                hybrid_block_causal_mask_multiturn_asymmetric,
+                turn_idx_noisy=turn_idx_noisy,
+                turn_idx_clean=turn_idx_clean,
+                n_noisy=n_noisy_t,
+            ),
+            B=B, H=H, Q_LEN=total, KV_LEN=total,
+        )
+        return mask
     @can_return_tuple
     @auto_docstring
     def forward(
                 labels = torch.cat([labels, complementary_labels], dim=0)
                 attention_mask = self.gen_hybrid_block_causal_mask(seq_len, response_block_idx, turn_idx, input_ids.shape[0], self.config.num_attention_heads)
+                # Text-only path: noisy and clean halves have identical length.
+                self._noisy_seq_len = seq_len
             else:
                 # Multimodal block diffusion path.
                 labels_noisy = labels.clone()
                 labels_noisy[~mask_indices] = -100
+                # Efficient vision: drop vision tokens from the noisy half so the
+                # model only processes the (much shorter) text portion on that
+                # side, while the clean half keeps the full sequence so visual
+                # context is still reachable through the hybrid attention mask.
+                text_positions = (~vision_token_mask[0]).nonzero(as_tuple=True)[0]
+                L_text = text_positions.shape[0]
+                noisy_embeds = noisy_embeds[:, text_positions, :]
+                noisy_input_ids = noisy_input_ids[:, text_positions]
+                labels_noisy = labels_noisy[:, text_positions]
+                noisy_position_ids = original_position_ids[:, :, text_positions]
+                combined_position_ids = torch.cat([noisy_position_ids, original_position_ids], dim=2)
+                turn_idx_noisy = turn_idx[text_positions]
+                # Concatenate [noisy(L_text) | clean(L)] along the sequence dim.
                 input_ids_pair1 = torch.cat([noisy_input_ids, original_input_ids], dim=1)
                 embeds_pair1 = torch.cat([noisy_embeds, original_embeds], dim=1)
                 labels_pair1 = labels_noisy
+                position_ids_pair1 = combined_position_ids
                 # Complementary pair: mask the positions that were left clean above.
                 complementary_mask_indices = response_mask & ~mask_indices
                 complementary_noisy_embeds_raw = self.model.language_model.embed_tokens(complementary_noisy_input_ids)
                 complementary_noisy_embeds = torch.where(vision_mask_3d, original_embeds, complementary_noisy_embeds_raw)
+                complementary_noisy_embeds = complementary_noisy_embeds[:, text_positions, :]
+                complementary_noisy_input_ids = complementary_noisy_input_ids[:, text_positions]
                 complementary_labels = original_labels.clone()
                 complementary_labels[~complementary_mask_indices] = -100
+                complementary_labels = complementary_labels[:, text_positions]
                 input_ids_pair2 = torch.cat([complementary_noisy_input_ids, original_input_ids], dim=1)
                 embeds_pair2 = torch.cat([complementary_noisy_embeds, original_embeds], dim=1)
                 labels_pair2 = complementary_labels
+                position_ids_pair2 = combined_position_ids
                 # Stack the complementary pair along the batch dimension.
                 input_ids = torch.cat([input_ids_pair1, input_ids_pair2], dim=0)
                 labels = torch.cat([labels_pair1, labels_pair2], dim=0)
                 position_ids = torch.cat([position_ids_pair1, position_ids_pair2], dim=1)
+                attention_mask = self.gen_hybrid_block_causal_mask_asymmetric(
+                    L_text, L, turn_idx_noisy, turn_idx,
+                    input_ids.shape[0], self.config.num_attention_heads,
+                )
+                self._noisy_seq_len = L_text
             # Phase D: forward through the inner model. Vision features (if any)
             # have already been scattered into inputs_embeds, so pixel_values are
+            # cleared to skip re-processing inside `Fast_dVLMModel`. The noisy
+            # half length is forwarded as a kwarg so attention can split the
+            # asymmetric `[noisy | clean]` layout correctly.
+            kwargs['noisy_seq_len'] = self._noisy_seq_len
             outputs = self.model(
                 input_ids=input_ids,
                 pixel_values=None,
         loss = None
         if self.training:
+            noisy_len = self._noisy_seq_len
+            mdm_hidden_states = hidden_states[:, :noisy_len, :]
             # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
             slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
             logits = self.lm_head(mdm_hidden_states[:, slice_indices, :])
                 loss = self.loss_function(
                     logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **new_kwargs
                 ) * 0.5
+            causal_hidden_states = hidden_states[:hidden_states.shape[0]//2, noisy_len:, :]
             causal_logits = self.lm_head(causal_hidden_states[:, slice_indices, :])
             loss += self.loss_function(
                 logits=causal_logits, labels=original_labels, vocab_size=self.config.text_config.vocab_size, **new_kwargs