test

#25

by jasonmoo - opened Oct 27, 2025

base: refs/heads/main

←

from: refs/pr/25

Discussion Files changed

+784

-1207

Files changed (12) hide show

hf_moondream.py +1 -8
layers.py +13 -38
lora.py +56 -411
model.safetensors.index.json +0 -0
model_fp8.pt +0 -3
modelv2-00001-of-00004.safetensors +0 -3
modelv2-00002-of-00004.safetensors +0 -3
modelv2-00003-of-00004.safetensors +0 -3
modelv2-00004-of-00004.safetensors +0 -3
moondream.py +29 -56
region.py +0 -2
text.py +23 -12

hf_moondream.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
 from transformers import PreTrainedModel, PretrainedConfig
 from typing import Union
@@ -43,14 +44,6 @@ class HfMoondream(PreTrainedModel):
             MoondreamConfig.from_dict(config.config), setup_caches=False
         )
         self._is_kv_cache_setup = False
-        self.post_init()
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        output = super().from_pretrained(*args, **kwargs)
-        model = output[0] if isinstance(output, tuple) else output
-        model.model._refresh_runtime_buffers()
-        return output
     def _setup_caches(self):
         if not self._is_kv_cache_setup:

 import torch
 import torch.nn as nn
 from transformers import PreTrainedModel, PretrainedConfig
 from typing import Union
             MoondreamConfig.from_dict(config.config), setup_caches=False
         )
         self._is_kv_cache_setup = False
     def _setup_caches(self):
         if not self._is_kv_cache_setup:

layers.py CHANGED Viewed

@@ -5,14 +5,6 @@ import torch.nn.functional as F
 from dataclasses import dataclass
 from typing import Literal, Optional
-from .lora import (
-    DenseLoRALayer,
-    MoELoRALayer,
-    apply_dense_lora,
-    apply_moe_lora_fc1_flat,
-    apply_moe_lora_fc2_flat,
-)
 try:
     from torchao import quantize_
     from torchao.quantization import int4_weight_only
@@ -134,12 +126,11 @@ class MLPWeights:
     act: Literal["gelu_approx"] = "gelu_approx"
-def mlp(
-    x: torch.Tensor, w: MLPWeights, lora: Optional[DenseLoRALayer] = None
-) -> torch.Tensor:
     x0 = w.fc1(x)
     if lora is not None:
-        x = x0 + apply_dense_lora(x, lora.up_a, lora.up_b)
     else:
         x = x0
@@ -147,7 +138,8 @@ def mlp(
     x0 = w.fc2(x)
     if lora is not None:
-        x = x0 + apply_dense_lora(x, lora.down_a, lora.down_b)
     else:
         x = x0
@@ -155,10 +147,7 @@ def mlp(
 def moe_mlp(
-    x: torch.Tensor,
-    mlp_module: nn.Module,
-    experts_per_token: int,
-    lora: Optional[MoELoRALayer] = None,
 ) -> torch.Tensor:
     B, T, C = x.shape
     x = x.reshape(-1, C)
@@ -178,23 +167,21 @@ def moe_mlp(
         flat_weights = topk_weights.view(-1)  # [T*A]
         # Select expert weights
-        w1_selected = w1_weight[flat_idxs]
-        w2_selected = w2_weight[flat_idxs]
         # Expand input for all token-expert pairs
         x_expanded = x.unsqueeze(1).expand(-1, top_k, -1).reshape(-1, C)  # [T*A, D]
         # First linear layer with GeGLU: [T*A, H, D] @ [T*A, D, 1] -> [T*A, H]
-        x1_full = torch.bmm(w1_selected, x_expanded.unsqueeze(-1)).squeeze(-1)  # [T*A, H]
-        if lora is not None:
-            x1_full = x1_full + apply_moe_lora_fc1_flat(x_expanded, lora, flat_idxs)
         x1, g = x1_full.chunk(2, dim=-1)
         x1 = F.gelu(x1) * (g + 1)
         # Second linear layer: [T*A, D, H] @ [T*A, H, 1] -> [T*A, D]
         expert_outs = torch.bmm(w2_selected, x1.unsqueeze(-1)).squeeze(-1)  # [T*A, D]
-        if lora is not None:
-            expert_outs = expert_outs + apply_moe_lora_fc2_flat(x1, lora, flat_idxs)
         # Apply weights and reshape
         weighted_outs = expert_outs * flat_weights.unsqueeze(-1)  # [T*A, D]
@@ -216,22 +203,10 @@ def moe_mlp(
             x_tok = x.index_select(0, token_pos)
             gate_tok = topk_weights[token_pos, which_k]
-            w1 = mlp_module.fc1.weight[expert_id]
-            h_full = F.linear(x_tok, w1)
-            if lora is not None:
-                lora_up_a = lora.up_a[expert_id]
-                lora_up_b = lora.up_b[expert_id]
-                lora_mid = F.linear(x_tok, lora_up_a)
-                h_full = h_full + F.linear(lora_mid, lora_up_b)
             h, g = h_full.chunk(2, dim=-1)
             h = F.gelu(h) * (g + 1)
-            w2 = mlp_module.fc2.weight[expert_id]
-            y = F.linear(h, w2)
-            if lora is not None:
-                lora_down_a = lora.down_a[expert_id]
-                lora_down_b = lora.down_b[expert_id]
-                lora_mid = F.linear(h, lora_down_a)
-                y = y + F.linear(lora_mid, lora_down_b)
             y.mul_(gate_tok.unsqueeze(-1))
             out.index_add_(0, token_pos, y)

 from dataclasses import dataclass
 from typing import Literal, Optional
 try:
     from torchao import quantize_
     from torchao.quantization import int4_weight_only
     act: Literal["gelu_approx"] = "gelu_approx"
+def mlp(x: torch.Tensor, w: MLPWeights, lora: Optional[dict] = None) -> torch.Tensor:
     x0 = w.fc1(x)
     if lora is not None:
+        x1 = F.linear(F.linear(x, lora["fc1"]["A"]), lora["fc1"]["B"])
+        x = x0 + x1
     else:
         x = x0
     x0 = w.fc2(x)
     if lora is not None:
+        x1 = F.linear(F.linear(x, lora["fc2"]["A"]), lora["fc2"]["B"])
+        x = x0 + x1
     else:
         x = x0
 def moe_mlp(
+    x: torch.Tensor, mlp_module: nn.Module, experts_per_token: int
 ) -> torch.Tensor:
     B, T, C = x.shape
     x = x.reshape(-1, C)
         flat_weights = topk_weights.view(-1)  # [T*A]
         # Select expert weights
+        w1_selected = w1_weight[flat_idxs]  # [T*A, H, D]
+        w2_selected = w2_weight[flat_idxs]  # [T*A, D, H]
         # Expand input for all token-expert pairs
         x_expanded = x.unsqueeze(1).expand(-1, top_k, -1).reshape(-1, C)  # [T*A, D]
         # First linear layer with GeGLU: [T*A, H, D] @ [T*A, D, 1] -> [T*A, H]
+        x1_full = torch.bmm(w1_selected, x_expanded.unsqueeze(-1)).squeeze(
+            -1
+        )  # [T*A, H]
         x1, g = x1_full.chunk(2, dim=-1)
         x1 = F.gelu(x1) * (g + 1)
         # Second linear layer: [T*A, D, H] @ [T*A, H, 1] -> [T*A, D]
         expert_outs = torch.bmm(w2_selected, x1.unsqueeze(-1)).squeeze(-1)  # [T*A, D]
         # Apply weights and reshape
         weighted_outs = expert_outs * flat_weights.unsqueeze(-1)  # [T*A, D]
             x_tok = x.index_select(0, token_pos)
             gate_tok = topk_weights[token_pos, which_k]
+            h_full = F.linear(x_tok, mlp_module.fc1.weight[expert_id])
             h, g = h_full.chunk(2, dim=-1)
             h = F.gelu(h) * (g + 1)
+            y = F.linear(h, mlp_module.fc2.weight[expert_id])
             y.mul_(gate_tok.unsqueeze(-1))
             out.index_add_(0, token_pos, y)

lora.py CHANGED Viewed

@@ -1,437 +1,82 @@
-import json
 import os
-import re
 import shutil
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Any, Dict, Optional, Tuple
-from urllib.request import Request, urlopen
 import torch
-from .config import TextConfig
-class AdapterLoadError(RuntimeError):
-    pass
-def _cache_root() -> Path:
     hf_hub_cache = os.environ.get("HF_HUB_CACHE")
-    if hf_hub_cache:
-        return Path(hf_hub_cache)
     hf_home = os.environ.get("HF_HOME")
-    if hf_home:
-        return Path(hf_home) / "hub"
-    return Path("~/.cache/huggingface/hub").expanduser()
-def adapter_cache_dir() -> Path:
-    return _cache_root() / "md_finetunes"
-def normalize_adapter_id(value: Optional[str]) -> Optional[str]:
-    if not value:
-        return None
-    tail = value.split("/")[-1].strip()
-    if "@" not in tail:
-        return None
-    return tail
-def parse_adapter_id(adapter_id: str) -> Tuple[str, str]:
-    if not adapter_id or "@" not in adapter_id:
-        raise AdapterLoadError(
-            f"Invalid adapter id '{adapter_id}'. Expected 'finetune_id@step'."
-        )
-    finetune_id, step = adapter_id.split("@", 1)
-    if not finetune_id or not step:
-        raise AdapterLoadError(
-            f"Invalid adapter id '{adapter_id}'. Expected 'finetune_id@step'."
-        )
-    return finetune_id, step
-def _fetch_presigned_url(finetune_id: str, step: str) -> str:
-    endpoint = os.getenv("MOONDREAM_ENDPOINT", "https://api.moondream.ai").rstrip("/")
     api_key = os.getenv("MOONDREAM_API_KEY")
-    if not api_key:
-        raise AdapterLoadError("MOONDREAM_API_KEY is required to load finetune adapters.")
-    headers = {"User-Agent": "moondream-torch", "X-Moondream-Auth": api_key}
-    url = f"{endpoint}/v1/tuning/finetunes/{finetune_id}/checkpoints/{step}/download"
-    req = Request(url, headers=headers)
-    try:
-        with urlopen(req) as r:
-            payload = json.loads(r.read().decode("utf-8"))
-    except Exception as e:
-        raise AdapterLoadError(f"Failed to fetch adapter URL: {e}") from e
-    presigned = payload.get("url")
-    if not presigned:
-        raise AdapterLoadError("Adapter URL response missing 'url' field.")
-    return presigned
-def cached_adapter_path(adapter_id: str) -> Path:
-    finetune_id, step = parse_adapter_id(adapter_id)
-    cache_dir = adapter_cache_dir() / finetune_id / step
-    cache_dir.mkdir(parents=True, exist_ok=True)
-    for name in ("adapter.pt", "adapter.safetensors"):
-        path = cache_dir / name
-        if path.exists() and path.stat().st_size > 0:
-            return path
-    presigned_url = _fetch_presigned_url(finetune_id, step)
-    dest = cache_dir / "adapter.pt"
-    try:
-        with urlopen(presigned_url) as r, open(dest, "wb") as f:
-            shutil.copyfileobj(r, f)
-    except Exception as e:
-        raise AdapterLoadError(f"Failed to download adapter: {e}") from e
     return dest
-def _load_state_dict(path: Path, device: torch.device) -> Dict[str, Any]:
-    if path.suffix == ".safetensors":
-        try:
-            from safetensors.torch import safe_open
-        except Exception as e:
-            raise AdapterLoadError(
-                "safetensors is required to load .safetensors adapters."
-            ) from e
-        data = {}
-        with safe_open(str(path), framework="pt") as f:
-            for key in f.keys():
-                data[key] = f.get_tensor(key).to(device=device)
-        return data
-    try:
-        return torch.load(path, map_location=device, weights_only=True)
-    except TypeError:
-        return torch.load(path, map_location=device)
-@dataclass
-class DenseLoRALayer:
-    up_a: torch.Tensor
-    up_b: torch.Tensor
-    down_a: torch.Tensor
-    down_b: torch.Tensor
-@dataclass
-class MoELoRALayer:
-    up_a: torch.Tensor
-    up_b: torch.Tensor
-    down_a: torch.Tensor
-    down_b: torch.Tensor
-class TextLoRA:
-    def __init__(
-        self,
-        text_config: TextConfig,
-        *,
-        rank: int,
-        max_rank: int,
-        dtype: torch.dtype,
-        device: torch.device,
-        adapter_id: Optional[str] = None,
-    ) -> None:
-        if rank <= 0:
-            raise AdapterLoadError("LoRA rank must be positive.")
-        if max_rank < rank:
-            raise AdapterLoadError("max_rank must be >= rank.")
-        self.text_config = text_config
-        self.rank = rank
-        self.max_rank = max_rank
-        self.adapter_id = adapter_id
-        moe_cfg = text_config.moe
-        self.start_layer = moe_cfg.start_layer if moe_cfg else text_config.n_layers
-        if moe_cfg is not None:
-            self.rank_per_expert = rank // moe_cfg.experts_per_token
-            if self.rank_per_expert < 1:
-                raise AdapterLoadError(
-                    f"rank ({rank}) must be >= experts_per_token ({moe_cfg.experts_per_token})"
-                )
-            self.max_rank_per_expert = max_rank // moe_cfg.experts_per_token
-            if self.max_rank_per_expert < 1:
-                raise AdapterLoadError(
-                    f"max_rank ({max_rank}) must be >= experts_per_token ({moe_cfg.experts_per_token})"
-                )
-        else:
-            self.rank_per_expert = 0
-            self.max_rank_per_expert = 0
-        d_model = text_config.dim
-        d_ffn = text_config.ff_dim
-        self.dense: list[DenseLoRALayer] = []
-        for _ in range(self.start_layer):
-            self.dense.append(
-                DenseLoRALayer(
-                    up_a=torch.zeros((max_rank, d_model), device=device, dtype=dtype),
-                    up_b=torch.zeros((d_ffn, max_rank), device=device, dtype=dtype),
-                    down_a=torch.zeros((max_rank, d_ffn), device=device, dtype=dtype),
-                    down_b=torch.zeros((d_model, max_rank), device=device, dtype=dtype),
-                )
-            )
-        self.moe: list[MoELoRALayer] = []
-        if moe_cfg is not None:
-            num_experts = moe_cfg.num_experts
-            d_expert = moe_cfg.expert_inner_dim
-            for _ in range(text_config.n_layers - self.start_layer):
-                self.moe.append(
-                    MoELoRALayer(
-                        up_a=torch.zeros(
-                            (num_experts, self.max_rank_per_expert, d_model),
-                            device=device,
-                            dtype=dtype,
-                        ),
-                        up_b=torch.zeros(
-                            (num_experts, d_expert * 2, self.max_rank_per_expert),
-                            device=device,
-                            dtype=dtype,
-                        ),
-                        down_a=torch.zeros(
-                            (num_experts, self.max_rank_per_expert, d_expert),
-                            device=device,
-                            dtype=dtype,
-                        ),
-                        down_b=torch.zeros(
-                            (num_experts, d_model, self.max_rank_per_expert),
-                            device=device,
-                            dtype=dtype,
-                        ),
-                    )
-                )
-    def dense_layer(self, layer_idx: int) -> Optional[DenseLoRALayer]:
-        if layer_idx < len(self.dense):
-            return self.dense[layer_idx]
-        return None
-    def moe_layer(self, layer_idx: int) -> Optional[MoELoRALayer]:
-        moe_idx = layer_idx - self.start_layer
-        if 0 <= moe_idx < len(self.moe):
-            return self.moe[moe_idx]
         return None
-    @staticmethod
-    def _pad_axis(tensor: torch.Tensor, target: int, axis: int) -> torch.Tensor:
-        if tensor.shape[axis] == target:
-            return tensor
-        if tensor.shape[axis] > target:
-            raise AdapterLoadError(
-                f"LoRA tensor rank {tensor.shape[axis]} exceeds max {target}"
-            )
-        pad_shape = list(tensor.shape)
-        pad_shape[axis] = target - tensor.shape[axis]
-        pad = torch.zeros(pad_shape, device=tensor.device, dtype=tensor.dtype)
-        return torch.cat([tensor, pad], dim=axis)
-    @staticmethod
-    def detect_rank(state_dict: Dict[str, Any], text_config: TextConfig) -> int:
-        for key, tensor in state_dict.items():
-            if "dense" in key and "up_a" in key:
-                return int(tensor.shape[0])
-        for key, tensor in state_dict.items():
-            if "moe" in key and "up_a" in key:
-                rank_per_expert = int(tensor.shape[1])
-                moe_cfg = text_config.moe
-                if moe_cfg:
-                    return rank_per_expert * moe_cfg.experts_per_token
-                return rank_per_expert
-        raise AdapterLoadError("Could not detect LoRA rank from state dict.")
-    @classmethod
-    def from_state_dict(
-        cls,
-        state_dict: Dict[str, Any],
-        *,
-        text_config: TextConfig,
-        max_rank: int,
-        dtype: torch.dtype,
-        device: torch.device,
-        adapter_id: Optional[str] = None,
-    ) -> "TextLoRA":
-        rank = cls.detect_rank(state_dict, text_config)
-        if rank > max_rank:
-            raise AdapterLoadError(
-                f"Adapter rank ({rank}) exceeds max_rank ({max_rank})."
-            )
-        lora = cls(
-            text_config,
-            rank=rank,
-            max_rank=max_rank,
-            dtype=dtype,
-            device=device,
-            adapter_id=adapter_id,
-        )
-        dense_seen = set()
-        moe_seen = set()
-        pattern = re.compile(r"(dense|moe)\.(\d+)\.(up_a|up_b|down_a|down_b)$")
-        for key, tensor in state_dict.items():
-            match = pattern.search(key)
-            if not match:
-                continue
-            kind, idx_str, name = match.group(1), match.group(2), match.group(3)
-            idx = int(idx_str)
-            arr = tensor.to(device=device, dtype=dtype)
-            if kind == "dense":
-                if idx >= len(lora.dense):
-                    raise AdapterLoadError(f"Dense LoRA layer index {idx} out of range.")
-                layer = lora.dense[idx]
-                if name in ("up_a", "down_a"):
-                    arr = cls._pad_axis(arr, lora.max_rank, axis=0)
-                else:
-                    arr = cls._pad_axis(arr, lora.max_rank, axis=1)
-                setattr(layer, name, arr)
-                dense_seen.add((idx, name))
-            else:
-                if idx >= len(lora.moe):
-                    raise AdapterLoadError(f"MoE LoRA layer index {idx} out of range.")
-                layer = lora.moe[idx]
-                if name in ("up_a", "down_a"):
-                    arr = cls._pad_axis(arr, lora.max_rank_per_expert, axis=1)
-                else:
-                    arr = cls._pad_axis(arr, lora.max_rank_per_expert, axis=2)
-                setattr(layer, name, arr)
-                moe_seen.add((idx, name))
-        for layer_idx in range(len(lora.dense)):
-            for name in ("up_a", "up_b", "down_a", "down_b"):
-                if (layer_idx, name) not in dense_seen:
-                    raise AdapterLoadError(
-                        f"Adapter missing dense LoRA for layer {layer_idx} ({name})."
-                    )
-        for layer_idx in range(len(lora.moe)):
-            for name in ("up_a", "up_b", "down_a", "down_b"):
-                if (layer_idx, name) not in moe_seen:
-                    raise AdapterLoadError(
-                        f"Adapter missing MoE LoRA for layer {layer_idx} ({name})."
-                    )
-        return lora
-def select_layer_lora(
-    lora: Optional[TextLoRA], layer_idx: int, *, is_moe: bool
-) -> Optional[object]:
-    if lora is None:
-        return None
-    return lora.moe_layer(layer_idx) if is_moe else lora.dense_layer(layer_idx)
-def apply_dense_lora(
-    x: torch.Tensor, lora_a: torch.Tensor, lora_b: torch.Tensor
-) -> torch.Tensor:
-    b, t, c = x.shape
-    x_flat = x.reshape(-1, c)
-    lora_mid = torch.matmul(x_flat, lora_a.t())
-    lora_out = torch.matmul(lora_mid, lora_b.t())
-    return lora_out.reshape(b, t, -1)
-def apply_moe_lora_fc1_flat(
-    x_expanded: torch.Tensor, lora: MoELoRALayer, flat_idxs: torch.Tensor
-) -> torch.Tensor:
-    lora_up_a = lora.up_a[flat_idxs]
-    lora_up_b = lora.up_b[flat_idxs]
-    lora_mid = torch.bmm(lora_up_a, x_expanded.unsqueeze(-1)).squeeze(-1)
-    lora_up = torch.bmm(lora_up_b, lora_mid.unsqueeze(-1)).squeeze(-1)
-    return lora_up
-def apply_moe_lora_fc2_flat(
-    h: torch.Tensor, lora: MoELoRALayer, flat_idxs: torch.Tensor
-) -> torch.Tensor:
-    lora_down_a = lora.down_a[flat_idxs]
-    lora_down_b = lora.down_b[flat_idxs]
-    lora_mid = torch.bmm(lora_down_a, h.unsqueeze(-1)).squeeze(-1)
-    lora_down = torch.bmm(lora_down_b, lora_mid.unsqueeze(-1)).squeeze(-1)
-    return lora_down
-_ADAPTER_CACHE: Dict[Tuple[str, str, str, Tuple], TextLoRA] = {}
-_CACHE_ORDER: list[Tuple[str, str, str, Tuple]] = []
-_CACHE_SIZE = 8
-def _config_key(text_config: TextConfig) -> Tuple:
-    moe = text_config.moe
-    moe_key = None
-    if moe is not None:
-        moe_key = (
-            moe.num_experts,
-            moe.start_layer,
-            moe.experts_per_token,
-            moe.expert_inner_dim,
-        )
-    return (
-        text_config.dim,
-        text_config.ff_dim,
-        text_config.n_layers,
-        moe_key,
-    )
-def load_adapter(
-    adapter_id: Optional[str],
-    *,
-    text_config: TextConfig,
-    device: torch.device,
-    dtype: torch.dtype,
-    max_rank: int = 16,
-) -> Optional[TextLoRA]:
-    if adapter_id is None:
-        return None
-    adapter_id = normalize_adapter_id(adapter_id)
-    if adapter_id is None:
-        return None
-    key = (adapter_id, str(device), str(dtype), _config_key(text_config))
-    cached = _ADAPTER_CACHE.get(key)
-    if cached is not None:
-        return cached
-    path = cached_adapter_path(adapter_id)
-    checkpoint = _load_state_dict(path, device)
-    if not isinstance(checkpoint, dict):
-        raise AdapterLoadError("Invalid adapter checkpoint format.")
-    state_dict = checkpoint.get("lora_state_dict", checkpoint)
-    if not isinstance(state_dict, dict):
-        raise AdapterLoadError("Adapter checkpoint missing lora_state_dict.")
-    lora = TextLoRA.from_state_dict(
-        state_dict,
-        text_config=text_config,
-        max_rank=max_rank,
-        dtype=dtype,
-        device=device,
-        adapter_id=adapter_id,
     )
-    _ADAPTER_CACHE[key] = lora
-    _CACHE_ORDER.append(key)
-    if len(_CACHE_ORDER) > _CACHE_SIZE:
-        old = _CACHE_ORDER.pop(0)
-        _ADAPTER_CACHE.pop(old, None)
-    return lora

+import functools
 import os
 import shutil
 import torch
+from pathlib import Path
+from urllib.request import Request, urlopen
+from typing import Optional
+def variant_cache_dir():
     hf_hub_cache = os.environ.get("HF_HUB_CACHE")
+    if hf_hub_cache is not None:
+        return Path(hf_hub_cache) / "md_variants"
     hf_home = os.environ.get("HF_HOME")
+    if hf_home is not None:
+        return Path(hf_home) / "hub" / "md_variants"
+    return Path("~/.cache/huggingface/hub").expanduser() / "md_variants"
+def cached_variant_path(variant_id: str):
+    variant, *rest = variant_id.split("/", 1)
+    step = rest[0] if rest else "final"
+    cache_dir = variant_cache_dir() / variant
+    os.makedirs(cache_dir, exist_ok=True)
+    dest = cache_dir / f"{step}.pt"
+    if dest.exists():
+        return dest
+    md_endpoint = os.getenv("MOONDREAM_ENDPOINT", "https://api.moondream.ai")
+    headers = {"User-Agent": "moondream-torch"}
     api_key = os.getenv("MOONDREAM_API_KEY")
+    if api_key is not None:
+        headers["X-Moondream-Auth"] = api_key
+    req = Request(f"{md_endpoint}/v1/variants/{variant_id}/download", headers=headers)
+    with urlopen(req) as r, open(dest, "wb") as f:
+        shutil.copyfileobj(r, f)
     return dest
+def nest(flat):
+    tree = {}
+    for k, v in flat.items():
+        parts = k.split(".")
+        d = tree
+        for p in parts[:-1]:
+            d = d.setdefault(p, {})
+        d[parts[-1]] = v
+    return tree
+@functools.lru_cache(maxsize=5)
+def variant_state_dict(variant_id: Optional[str] = None, device: str = "cpu"):
+    if variant_id is None:
         return None
+    state_dict = torch.load(
+        cached_variant_path(variant_id), map_location=device, weights_only=True
     )
+    # TODO: Move these into the training code that saves checkpoints...
+    rename_rules = [
+        ("text_model.transformer.h", "text.blocks"),
+        (".mixer", ".attn"),
+        (".out_proj", ".proj"),
+        (".Wqkv", ".qkv"),
+        (".parametrizations.weight.0", ""),
+    ]
+    new_state_dict = {}
+    for key, tensor in state_dict.items():
+        new_key = key
+        for old, new in rename_rules:
+            if old in new_key:
+                new_key = new_key.replace(old, new)
+        new_state_dict[new_key] = tensor
+    return nest(new_state_dict)

model.safetensors.index.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

model_fp8.pt DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:699bd3876f9e105440d60a5fe30c26bc33fbdf008f5bd611a3557663b24bd371
-size 10505451019

modelv2-00001-of-00004.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:79006ed488cca15b173cd5c0c7c1a467c20aaf5508e13934c36378d071d48c13
-size 4907406296

modelv2-00002-of-00004.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:40202c61286ec7386d9bbce31d87af3064e42931b10323ed4b3e44158c0521e3
-size 4736548872

modelv2-00003-of-00004.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ff46835f23bac47c7409032391e02a095821e274f3faaeea3f826a960db9bf80
-size 4502742464

modelv2-00004-of-00004.safetensors DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0a4d39e1bcb0ab835b9a00c7f458dedca4faf8741fc0b23fd2caf2af4547bca6
-size 4390628760

moondream.py CHANGED Viewed

@@ -21,13 +21,12 @@ from .region import (
     SpatialRefs,
 )
 from .layers import QuantizedLinear
-from .lora import load_adapter, normalize_adapter_id
-from .rope import precompute_freqs_cis
 from .utils import remove_outlier_points
 ImageEncodingSettings = TypedDict(
     "ImageEncodingSettings",
-    {"adapter": str, "model": str},
     total=False,
 )
@@ -37,15 +36,14 @@ TextSamplingSettings = TypedDict(
         "max_tokens": int,
         "temperature": float,
         "top_p": float,
-        "adapter": str,
-        "model": str,
     },
     total=False,
 )
 ObjectSamplingSettings = TypedDict(
     "ObjectSamplingSettings",
-    {"max_objects": int, "adapter": str, "model": str},
     total=False,
 )
@@ -122,7 +120,6 @@ class MoondreamModel(nn.Module):
                 "size_decoder": linear_cls(
                     config.region.dim, config.region.size_out_dim, dtype=dtype
                 ),
-                "ln": nn.LayerNorm(config.region.dim, dtype=dtype),
             }
         )
         self.region.coord_features = nn.Parameter(
@@ -172,26 +169,6 @@ class MoondreamModel(nn.Module):
             )
         return self._point_gen_indices
-    def _refresh_runtime_buffers(self):
-        attn_mask = torch.tril(
-            torch.ones(
-                1,
-                1,
-                self.config.text.max_context,
-                self.config.text.max_context,
-                dtype=torch.bool,
-                device=self.device,
-            )
-        )
-        patch_w = self.config.vision.crop_size // self.config.vision.enc_patch_size
-        prefix_attn_len = 1 + patch_w**2
-        attn_mask[..., :prefix_attn_len, :prefix_attn_len] = 1
-        self.attn_mask = attn_mask
-        self.text.freqs_cis = precompute_freqs_cis(
-            self.config.text.dim // (2 * self.config.text.n_heads),
-            self.config.text.max_context,
-        ).to(device=self.device)
     def _setup_caches(self):
         c = self.config.text
         for b in self.text.blocks:
@@ -204,29 +181,6 @@ class MoondreamModel(nn.Module):
                 dtype=self.vision.pos_emb.dtype,
             )
-    def _adapter_id_from_settings(self, settings: Optional[dict]) -> Optional[str]:
-        if settings is None:
-            return None
-        adapter = settings.get("adapter")
-        if adapter is not None:
-            return normalize_adapter_id(adapter)
-        model_value = settings.get("model")
-        if isinstance(model_value, str):
-            return normalize_adapter_id(model_value)
-        return None
-    def _resolve_lora(self, settings: Optional[dict]) -> Optional[object]:
-        adapter_id = self._adapter_id_from_settings(settings)
-        if adapter_id is None:
-            return None
-        return load_adapter(
-            adapter_id,
-            text_config=self.config.text,
-            device=self.device,
-            dtype=self.vision.pos_emb.dtype,
-        )
     @property
     def device(self):
         return self.vision.pos_emb.device
@@ -349,7 +303,11 @@ class MoondreamModel(nn.Module):
         elif not isinstance(image, Image.Image):
             raise ValueError("image must be a PIL Image or EncodedImage")
-        lora = self._resolve_lora(settings)
         # Run through text model in addition to the vision encoder, to minimize
         # re-computation if multiple queries are performed on this image.
@@ -450,7 +408,11 @@ class MoondreamModel(nn.Module):
             if settings
             else DEFAULT_TEMPERATURE
         )
-        lora = self._resolve_lora(settings)
         top_p = settings.get("top_p", DEFAULT_TOP_P) if settings else DEFAULT_TOP_P
         eos_id = self.config.tokenizer.answer_id
@@ -562,7 +524,11 @@ class MoondreamModel(nn.Module):
         )
         top_p = settings.get("top_p", DEFAULT_TOP_P) if settings else DEFAULT_TOP_P
         eos_id = eos_id if eos_id is not None else self.config.tokenizer.eos_id
-        lora = self._resolve_lora(settings)
         _, _, next_token, pos = self._prefill_prompt(
             prompt_tokens,
@@ -705,7 +671,6 @@ class MoondreamModel(nn.Module):
             reasoning_dict = {
                 "reasoning": {"text": reasoning_text, "grounding": reasoning_grounding}
             }
-            spatial_refs = None
         else:
             prompt_tokens[0] += self.config.tokenizer.templates["query"]["suffix"]
             reasoning_dict = {}
@@ -869,7 +834,11 @@ class MoondreamModel(nn.Module):
             device=self.device,
         )
-        lora = self._resolve_lora(settings)
         _, hidden, next_token, pos = self._prefill_prompt(
             prompt_tokens, image.pos, temperature=0, top_p=0, lora=lora
@@ -913,7 +882,11 @@ class MoondreamModel(nn.Module):
             device=self.device,
         )
-        lora = self._resolve_lora(settings)
         _, hidden, next_token, pos = self._prefill_prompt(
             prompt_tokens, image.pos, temperature=0, top_p=0, lora=lora

     SpatialRefs,
 )
 from .layers import QuantizedLinear
+from .lora import variant_state_dict
 from .utils import remove_outlier_points
 ImageEncodingSettings = TypedDict(
     "ImageEncodingSettings",
+    {"variant": str},
     total=False,
 )
         "max_tokens": int,
         "temperature": float,
         "top_p": float,
+        "variant": str,
     },
     total=False,
 )
 ObjectSamplingSettings = TypedDict(
     "ObjectSamplingSettings",
+    {"max_objects": int, "variant": str},
     total=False,
 )
                 "size_decoder": linear_cls(
                     config.region.dim, config.region.size_out_dim, dtype=dtype
                 ),
             }
         )
         self.region.coord_features = nn.Parameter(
             )
         return self._point_gen_indices
     def _setup_caches(self):
         c = self.config.text
         for b in self.text.blocks:
                 dtype=self.vision.pos_emb.dtype,
             )
     @property
     def device(self):
         return self.vision.pos_emb.device
         elif not isinstance(image, Image.Image):
             raise ValueError("image must be a PIL Image or EncodedImage")
+        lora = (
+            variant_state_dict(settings["variant"], device=self.device)
+            if settings is not None and "variant" in settings
+            else None
+        )
         # Run through text model in addition to the vision encoder, to minimize
         # re-computation if multiple queries are performed on this image.
             if settings
             else DEFAULT_TEMPERATURE
         )
+        lora = (
+            variant_state_dict(settings["variant"], device=self.device)
+            if settings is not None and "variant" in settings
+            else None
+        )
         top_p = settings.get("top_p", DEFAULT_TOP_P) if settings else DEFAULT_TOP_P
         eos_id = self.config.tokenizer.answer_id
         )
         top_p = settings.get("top_p", DEFAULT_TOP_P) if settings else DEFAULT_TOP_P
         eos_id = eos_id if eos_id is not None else self.config.tokenizer.eos_id
+        lora = (
+            variant_state_dict(settings["variant"], device=self.device)
+            if settings is not None and "variant" in settings
+            else None
+        )
         _, _, next_token, pos = self._prefill_prompt(
             prompt_tokens,
             reasoning_dict = {
                 "reasoning": {"text": reasoning_text, "grounding": reasoning_grounding}
             }
         else:
             prompt_tokens[0] += self.config.tokenizer.templates["query"]["suffix"]
             reasoning_dict = {}
             device=self.device,
         )
+        lora = (
+            variant_state_dict(settings["variant"], device=self.device)
+            if settings is not None and "variant" in settings
+            else None
+        )
         _, hidden, next_token, pos = self._prefill_prompt(
             prompt_tokens, image.pos, temperature=0, top_p=0, lora=lora
             device=self.device,
         )
+        lora = (
+            variant_state_dict(settings["variant"], device=self.device)
+            if settings is not None and "variant" in settings
+            else None
+        )
         _, hidden, next_token, pos = self._prefill_prompt(
             prompt_tokens, image.pos, temperature=0, top_p=0, lora=lora

region.py CHANGED Viewed

@@ -52,7 +52,6 @@ def decode_coordinate(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
     Returns:
         A single logit representing the predicted coordinate value (x or y)
     """
-    hidden_state = w.ln(hidden_state)
     return w.coord_decoder(hidden_state)
@@ -89,7 +88,6 @@ def decode_size(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
         A tensor containing logits for 1024 bins for width and height.
         Shape is (2, 1024) where the first dimension corresponds to width and height.
     """
-    hidden_state = w.ln(hidden_state)
     return w.size_decoder(hidden_state).view(2, -1)

     Returns:
         A single logit representing the predicted coordinate value (x or y)
     """
     return w.coord_decoder(hidden_state)
         A tensor containing logits for 1024 bins for width and height.
         Shape is (2, 1024) where the first dimension corresponds to width and height.
     """
     return w.size_decoder(hidden_state).view(2, -1)

text.py CHANGED Viewed

@@ -8,7 +8,6 @@ from typing import Optional
 from .layers import layer_norm, mlp, QuantizedLinear, moe_mlp
 from .rope import apply_rotary_emb, precompute_freqs_cis
 from .config import TextConfig
-from .lora import select_layer_lora
 def text_encoder(input_ids: torch.Tensor, w: nn.Module):
@@ -24,12 +23,15 @@ def attn(
     n_heads: int,
     n_kv_heads: int,
     position_ids: torch.Tensor,
     flex_block_mask_slice=None,
 ):
     bsz, q_len, d_model = x.shape
     head_dim = d_model // n_heads
     qkv_out = w.qkv(x)  # shape: (bsz, q_len, (n_heads + 2*n_kv_heads)*head_dim)
     q_dim = n_heads * head_dim
     kv_dim = n_kv_heads * head_dim
     q, k, v = qkv_out.split([q_dim, kv_dim, kv_dim], dim=-1)
@@ -67,7 +69,14 @@ def attn(
     out = out.transpose(1, 2).reshape(bsz, q_len, d_model)
-    return w.proj(out)
 def text_decoder(
@@ -76,13 +85,17 @@ def text_decoder(
     attn_mask: torch.Tensor,
     position_ids: torch.Tensor,
     config: TextConfig,
-    lora: Optional[object] = None,
     flex_block_mask_slice=None,
 ):
     for i, block in enumerate(w.blocks):
-        layer_lora = select_layer_lora(
-            lora, i, is_moe=config.moe is not None and i >= config.moe.start_layer
-        )
         l_in = layer_norm(x, block.ln)
         l_attn = attn(
@@ -94,15 +107,14 @@ def text_decoder(
             n_heads=config.n_heads,
             n_kv_heads=config.n_kv_heads,
             position_ids=position_ids,
             flex_block_mask_slice=flex_block_mask_slice,
         )
         if config.moe is not None and i >= config.moe.start_layer:
-            l_mlp = moe_mlp(
-                l_in, block.mlp, config.moe.experts_per_token, lora=layer_lora
-            )
         else:
-            l_mlp = mlp(l_in, block.mlp, lora=layer_lora)
         x = x + l_attn + l_mlp
@@ -133,7 +145,7 @@ def build_dense_mlp(d_model, d_ffn, dtype, linear_cls):
 def build_moe_mlp(d_model, d_ffn, n_experts, dtype):
     # For GeGLU, fc1 needs to output 2 * d_ffn (for gating)
-    mlp = nn.ModuleDict(
         {
             "router": nn.Linear(d_model, n_experts, dtype=dtype),
             "fc1": nn.ParameterDict(
@@ -152,7 +164,6 @@ def build_moe_mlp(d_model, d_ffn, n_experts, dtype):
             ),
         }
     )
-    return mlp
 def build_text_model(config: TextConfig, dtype: torch.dtype) -> nn.Module:

 from .layers import layer_norm, mlp, QuantizedLinear, moe_mlp
 from .rope import apply_rotary_emb, precompute_freqs_cis
 from .config import TextConfig
 def text_encoder(input_ids: torch.Tensor, w: nn.Module):
     n_heads: int,
     n_kv_heads: int,
     position_ids: torch.Tensor,
+    lora: Optional[dict] = None,
     flex_block_mask_slice=None,
 ):
     bsz, q_len, d_model = x.shape
     head_dim = d_model // n_heads
     qkv_out = w.qkv(x)  # shape: (bsz, q_len, (n_heads + 2*n_kv_heads)*head_dim)
+    if lora is not None:
+        qkv_out += F.linear(F.linear(x, lora["qkv"]["A"]), lora["qkv"]["B"])
     q_dim = n_heads * head_dim
     kv_dim = n_kv_heads * head_dim
     q, k, v = qkv_out.split([q_dim, kv_dim, kv_dim], dim=-1)
     out = out.transpose(1, 2).reshape(bsz, q_len, d_model)
+    out0 = w.proj(out)
+    if lora is not None:
+        out1 = F.linear(F.linear(x, lora["proj"]["A"]), lora["proj"]["B"])
+        out = out0 + out1
+    else:
+        out = out0
+    return out
 def text_decoder(
     attn_mask: torch.Tensor,
     position_ids: torch.Tensor,
     config: TextConfig,
+    lora: Optional[dict] = None,
     flex_block_mask_slice=None,
 ):
     for i, block in enumerate(w.blocks):
+        if lora is not None:
+            layer_lora = lora["text"]["blocks"][str(i)]
+            mlp_lora = layer_lora["mlp"]
+            attn_lora = layer_lora["attn"]
+        else:
+            mlp_lora = None
+            attn_lora = None
         l_in = layer_norm(x, block.ln)
         l_attn = attn(
             n_heads=config.n_heads,
             n_kv_heads=config.n_kv_heads,
             position_ids=position_ids,
+            lora=attn_lora,
             flex_block_mask_slice=flex_block_mask_slice,
         )
         if config.moe is not None and i >= config.moe.start_layer:
+            l_mlp = moe_mlp(l_in, block.mlp, config.moe.experts_per_token)
         else:
+            l_mlp = mlp(l_in, block.mlp, lora=mlp_lora)
         x = x + l_attn + l_mlp
 def build_moe_mlp(d_model, d_ffn, n_experts, dtype):
     # For GeGLU, fc1 needs to output 2 * d_ffn (for gating)
+    return nn.ModuleDict(
         {
             "router": nn.Linear(d_model, n_experts, dtype=dtype),
             "fc1": nn.ParameterDict(
             ),
         }
     )
 def build_text_model(config: TextConfig, dtype: torch.dtype) -> nn.Module: