Spaces:

silveroxides
/

Lodestone-Tagger-UI

Running on Zero

App Files Files Community

silveroxides committed on Apr 16

Commit

aa5c7de

1 Parent(s): 81c8dbd

Add image similarity mode using forward_embedding FEATURE_DIM descriptors

Browse files

Files changed (3) hide show

app.py +55 -0
inference_tagger_standalone.py +136 -82
tagger_ui/templates/index.html +238 -3

app.py CHANGED Viewed

@@ -183,6 +183,20 @@ def _build_custom_pca(
 # ---------------------------------------------------------------------------
 @spaces.GPU
 def _gpu_infer(pixel_values: torch.Tensor) -> torch.Tensor:
     """Move tensor to device, run model forward, return CPU logits."""
@@ -282,6 +296,47 @@ def get_pca(
     )
 # ---- FastAPI routes --------------------------------------------------------

 # ---------------------------------------------------------------------------
+@spaces.GPU
+def _gpu_extract_descriptor(pixel_values: torch.Tensor) -> np.ndarray:
+    """Extract the FEATURE_DIM=6400 image descriptor via forward_embedding.
+
+    The input is moved to ``model.device`` and ``forward_embedding`` is run
+    under ``torch.no_grad()`` and ``torch.autocast`` (with ``model.dtype``).
+    Only the first item of the batch is returned, so callers are expected
+    to pass a single image (batch size 1).
+
+    Returns a [6400] float32 numpy array on CPU.
+    """
+    pv = pixel_values.to(model.device)
+    with (
+        torch.no_grad(),
+        torch.autocast(device_type=model.device.type, dtype=model.dtype),
+    ):
+        features = model.model.forward_embedding(pv)  # [1, 6400]
+    return features[0].cpu().numpy()  # [6400]
 @spaces.GPU
 def _gpu_infer(pixel_values: torch.Tensor) -> torch.Tensor:
     """Move tensor to device, run model forward, return CPU logits."""
     )
+@app.api(name="get_similarity")
+def get_similarity(image_a: str, image_b: str, max_size: int = 1024) -> str:
+    """Extract FEATURE_DIM=6400 descriptors for two images and return their
+    cosine similarity.
+
+    Each input is passed through _resolve_image_source, _open_image and
+    _preprocess (with ``max_size``) before its descriptor is extracted on
+    the GPU. Both descriptors are L2-normalised, so their dot product is
+    the cosine similarity.
+
+    Args:
+      image_a: first image source, as accepted by _resolve_image_source.
+      image_b: second image source, as accepted by _resolve_image_source.
+      max_size: forwarded to _preprocess (presumably the long-edge cap in
+        pixels; confirm against _preprocess).
+
+    Returns JSON:
+      {
+        "score": float,          # cosine similarity in [-1, 1]
+        "desc_a": [6400 floats], # L2-normalised descriptor for image A
+        "desc_b": [6400 floats], # L2-normalised descriptor for image B
+      }
+    The score is rounded to 6 decimal places; the descriptors are not rounded.
+    """
+    src_a = _resolve_image_source(image_a)
+    src_b = _resolve_image_source(image_b)
+    img_a = _open_image(src_a)
+    img_b = _open_image(src_b)
+    pv_a = _preprocess(img_a, max_size)
+    pv_b = _preprocess(img_b, max_size)
+    # Run both through the backbone in separate GPU calls
+    # (spaces.GPU does not support batching across different-sized tensors)
+    feat_a = _gpu_extract_descriptor(pv_a)  # [6400]
+    feat_b = _gpu_extract_descriptor(pv_b)  # [6400]
+    # L2-normalise (the 1e-8 term avoids division by zero for an all-zero descriptor)
+    feat_a = feat_a / (np.linalg.norm(feat_a) + 1e-8)
+    feat_b = feat_b / (np.linalg.norm(feat_b) + 1e-8)
+    score = float(np.dot(feat_a, feat_b))
+    return json.dumps(
+        {
+            "score": round(score, 6),
+            "desc_a": feat_a.tolist(),
+            "desc_b": feat_b.tolist(),
+        }
+    )
 # ---- FastAPI routes --------------------------------------------------------

inference_tagger_standalone.py CHANGED Viewed

@@ -83,6 +83,7 @@ FEATURE_DIM = (1 + N_REGISTERS) * D_MODEL  # 6400
 # RoPE helpers
 # ---------------------------------------------------------------------------
 @lru_cache(maxsize=32)
 def _patch_coords_cached(h: int, w: int, device_str: str) -> torch.Tensor:
     device = torch.device(device_str)
@@ -94,11 +95,14 @@ def _patch_coords_cached(h: int, w: int, device_str: str) -> torch.Tensor:
     return coords  # [h*w, 2]
-def _build_rope(h_patches: int, w_patches: int,
-                dtype: torch.dtype, device: torch.device):
     coords = _patch_coords_cached(h_patches, w_patches, str(device))
-    inv_freq = 1.0 / (ROPE_THETA ** torch.arange(
-        0, 1, 4 / HEAD_DIM, dtype=torch.float32, device=device))
     angles = 2 * math.pi * coords[:, :, None] * inv_freq[None, None, :]
     angles = angles.flatten(1, 2).tile(2)
     cos = torch.cos(angles).to(dtype).unsqueeze(0).unsqueeze(0)
@@ -111,8 +115,7 @@ def _rotate_half(x: torch.Tensor) -> torch.Tensor:
     return torch.cat((-x[..., h:], x[..., :h]), dim=-1)
-def _apply_rope(q: torch.Tensor, k: torch.Tensor,
-                cos: torch.Tensor, sin: torch.Tensor):
     n_pre = 1 + N_REGISTERS
     q_pre, q_pat = q[..., :n_pre, :], q[..., n_pre:, :]
     k_pre, k_pat = k[..., :n_pre, :], k[..., n_pre:, :]
@@ -125,6 +128,7 @@ def _apply_rope(q: torch.Tensor, k: torch.Tensor,
 # Transformer blocks
 # ---------------------------------------------------------------------------
 class _Attention(nn.Module):
     def __init__(self):
         super().__init__()
@@ -139,7 +143,7 @@ class _Attention(nn.Module):
         k = self.k_proj(x).view(B, S, N_HEADS, HEAD_DIM).transpose(1, 2)
         v = self.v_proj(x).view(B, S, N_HEADS, HEAD_DIM).transpose(1, 2)
         q, k = _apply_rope(q, k, cos, sin)
-        out = F.scaled_dot_product_attention(q, k, v, scale=HEAD_DIM ** -0.5)
         return self.o_proj(out.transpose(1, 2).reshape(B, S, D_MODEL))
@@ -179,13 +183,15 @@ class _Embeddings(nn.Module):
         self.mask_token = nn.Parameter(torch.zeros(1, 1, D_MODEL))
         self.register_tokens = nn.Parameter(torch.zeros(1, N_REGISTERS, D_MODEL))
         self.patch_embeddings = nn.Conv2d(
-            3, D_MODEL, kernel_size=PATCH_SIZE, stride=PATCH_SIZE)
     def forward(self, pixel_values):
         B = pixel_values.shape[0]
         dtype = self.patch_embeddings.weight.dtype
-        patches = self.patch_embeddings(
-            pixel_values.to(dtype)).flatten(2).transpose(1, 2)
         cls = self.cls_token.expand(B, -1, -1)
         regs = self.register_tokens.expand(B, -1, -1)
         return torch.cat([cls, regs, patches], dim=1)
@@ -224,7 +230,7 @@ class DINOv3ViTH(nn.Module):
             x = block(x, cos, sin)
         x = self.norm(x)
         # token layout: [CLS, reg_0..reg_R-1, patch_0..patch_N]
-        patch_tokens = x[:, 1 + N_REGISTERS:, :]  # [B, h_p*w_p, D_MODEL]
         return patch_tokens, h_p, w_p
@@ -232,16 +238,18 @@ class DINOv3ViTH(nn.Module):
 # Head — auto-detected from the checkpoint
 # =============================================================================
 class _LowRankHead(nn.Module):
     """Two-matrix low-rank projection head.
-        features (in_dim)
-          → Linear(in_dim, rank, bias=?)
-          → Linear(rank, num_tags, bias=?)
     """
-    def __init__(self, in_dim: int, rank: int, num_tags: int,
-                 down_bias: bool, up_bias: bool):
         super().__init__()
         self.proj_down = nn.Linear(in_dim, rank, bias=down_bias)
         self.proj_up = nn.Linear(rank, num_tags, bias=up_bias)
@@ -265,15 +273,15 @@ def _build_head_from_checkpoint(
     Returns (module, remapped_state_dict) where the remapped state dict
     matches the module's own key names so strict loading works.
     """
-    weights_2d = [(k, v) for k, v in head_sd.items()
-                  if k.endswith(".weight") and v.ndim == 2]
     # --- Case 1: single dense linear ---------------------------------------
-    singles = [(k, v) for k, v in weights_2d
-               if tuple(v.shape) == (num_tags, in_dim)]
     if len(weights_2d) <= 2 and len(singles) == 1:
         wkey, wval = singles[0]
-        base = wkey[:-len(".weight")]
         bias_key = base + ".bias"
         has_bias = bias_key in head_sd
         module = nn.Linear(in_dim, num_tags, bias=has_bias)
@@ -285,12 +293,13 @@ def _build_head_from_checkpoint(
         extra = set(head_sd) - expected_src
         if extra:
             raise RuntimeError(
-                f"Head has single-linear shape but extra unknown keys: {sorted(extra)}")
         return module, remapped
     # --- Case 2: low-rank pair ---------------------------------------------
     down = None  # (key, tensor) with shape [rank, in_dim]
-    up = None    # (key, tensor) with shape [num_tags, rank]
     for k, v in weights_2d:
         if v.shape[1] == in_dim and v.shape[0] != num_tags:
             down = (k, v)
@@ -303,12 +312,13 @@ def _build_head_from_checkpoint(
         if rank_down != rank_up:
             raise RuntimeError(
                 f"Low-rank head: inner dims disagree "
-                f"(down out={rank_down}, up in={rank_up})")
         down_key, down_w = down
         up_key, up_w = up
-        down_base = down_key[:-len(".weight")]
-        up_base = up_key[:-len(".weight")]
         down_bias_key = down_base + ".bias"
         up_bias_key = up_base + ".bias"
         has_down_bias = down_bias_key in head_sd
@@ -340,11 +350,14 @@ def _build_head_from_checkpoint(
         if extra:
             raise RuntimeError(
                 f"Low-rank head detected but checkpoint has extra unknown "
-                f"head keys: {sorted(extra)}")
-        print(f"[Tagger] Detected low-rank head: "
-              f"in_dim={in_dim}, rank={rank_down}, num_tags={num_tags} "
-              f"(down_bias={has_down_bias}, up_bias={has_up_bias})")
         return module, remapped
     raise RuntimeError(
@@ -357,6 +370,7 @@ def _build_head_from_checkpoint(
 # Tagger wrapper module
 # =============================================================================
 class DINOv3Tagger(nn.Module):
     """Backbone + head. The head is attached after the checkpoint is
     inspected (so we can build the right shape)."""
@@ -369,15 +383,26 @@ class DINOv3Tagger(nn.Module):
     def forward(self, pixel_values):
         hidden = self.backbone(pixel_values)
         cls = hidden[:, 0, :]
-        regs = hidden[:, 1: 1 + N_REGISTERS, :].flatten(1)
         features = torch.cat([cls, regs], dim=-1).float()  # fp32 for head
         return self.head(features)
 # =============================================================================
 # Checkpoint loading helpers
 # =============================================================================
 def _split_and_clean_state_dict(sd: dict) -> tuple[dict, dict]:
     """Split full state dict into (backbone_sd, head_sd), stripping the
     ``backbone.`` prefix and applying the remaps needed to match
@@ -395,10 +420,10 @@ def _split_and_clean_state_dict(sd: dict) -> tuple[dict, dict]:
     head_sd: dict = {}
     for k, v in sd.items():
         if k.startswith("backbone."):
-            nk = k[len("backbone."):]
             # Remap (1): strip intermediate "model." before "layer."
             if nk.startswith("model.layer."):
-                nk = nk[len("model."):]
             backbone_sd[nk] = v
         else:
             head_sd[k] = v
@@ -406,7 +431,7 @@ def _split_and_clean_state_dict(sd: dict) -> tuple[dict, dict]:
     # Remap (2): layer.N.layer_scale{1,2}.lambda1 → layer.N.layer_scale{1,2}
     for k in list(backbone_sd.keys()):
         if ".layer_scale" in k and k.endswith(".lambda1"):
-            backbone_sd[k[:-len(".lambda1")]] = backbone_sd.pop(k)
     # Remap (3): drop rope buffers (recomputed on the fly)
     for k in list(backbone_sd.keys()):
@@ -454,18 +479,21 @@ def preprocess_image(source, max_size: int = 1024) -> torch.Tensor:
     new_w = _snap(max(PATCH_SIZE, round(w * scale)), PATCH_SIZE)
     new_h = _snap(max(PATCH_SIZE, round(h * scale)), PATCH_SIZE)
-    return v2.Compose([
-        v2.Resize((new_h, new_w), interpolation=v2.InterpolationMode.LANCZOS),
-        v2.ToImage(),
-        v2.ToDtype(torch.float32, scale=True),
-        v2.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
-    ])(img).unsqueeze(0)
 # =============================================================================
 # Tagger wrapper
 # =============================================================================
 class Tagger:
     """Inference wrapper for DINOv3Tagger (ViT-H/16+).
@@ -519,12 +547,15 @@ class Tagger:
         if not head_sd:
             raise RuntimeError(
-                "Checkpoint contains no non-backbone keys — cannot build head.")
         # --- Build model, inferring head shape from the checkpoint --------
         self.model = DINOv3Tagger()
         head_module, head_sd_remapped = _build_head_from_checkpoint(
-            head_sd, in_dim=FEATURE_DIM, num_tags=self.num_tags,
         )
         self.model.head = head_module
@@ -533,10 +564,8 @@ class Tagger:
         self.model.head.load_state_dict(head_sd_remapped, strict=True)
         # --- Move to device. Backbone → bf16/fp16; head stays fp32. --------
-        self.model.backbone = self.model.backbone.to(
-            device=self.device, dtype=dtype)
-        self.model.head = self.model.head.to(
-            device=self.device, dtype=torch.float32)
         self.model.eval()
         print(f"[Tagger] Ready on {self.device} (backbone={dtype}, head=fp32)")
@@ -571,12 +600,20 @@ class Tagger:
             scale = min(1.0, max_size / max(w, h))
             new_w = _snap(round(w * scale), PATCH_SIZE)
             new_h = _snap(round(h * scale), PATCH_SIZE)
-            pv = v2.Compose([
-                v2.Resize((new_h, new_w), interpolation=v2.InterpolationMode.LANCZOS),
-                v2.ToImage(),
-                v2.ToDtype(torch.float32, scale=True),
-                v2.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
-            ])(img).unsqueeze(0).to(self.device)
         else:
             pv = preprocess_image(image, max_size=max_size).to(self.device)
@@ -606,8 +643,9 @@ class Tagger:
         return Image.fromarray(rgb_uint8, mode="RGB")
     @torch.no_grad()
-    def predict(self, image, topk: int | None = 30,
-                threshold: float | None = None) -> list[tuple[str, float]]:
         """Tag a single image (local path or URL)."""
         if topk is None and threshold is None:
             topk = 30
@@ -625,20 +663,23 @@ class Tagger:
             order = values.argsort(descending=True)
             indices, values = indices[order], values[order]
-        return [(self.idx2tag[i], float(v))
-                for i, v in zip(indices.tolist(), values.tolist())]
     @torch.no_grad()
-    def predict_batch(self, images, topk: int | None = 30,
-                      threshold: float | None = None):
-        return [self.predict(img, topk=topk, threshold=threshold)
-                for img in images]
 # =============================================================================
 # Output formatters
 # =============================================================================
 def _fmt_pretty(path: str, results) -> str:
     lines = [f"\n{'─' * 60}", f" {path}", f"{'─' * 60}"]
     for rank, (tag, score) in enumerate(results, 1):
@@ -652,38 +693,53 @@ def _fmt_tags(results) -> str:
 def _fmt_json(path: str, results) -> dict:
-    return {"file": path,
-            "tags": [{"tag": t, "score": round(s, 4)} for t, s in results]}
 # =============================================================================
 # CLI
 # =============================================================================
 def main():
     parser = argparse.ArgumentParser(
         description="DINOv3 ViT-H/16+ tagger inference (standalone)",
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
-    parser.add_argument("--checkpoint", required=True,
-                        help="Path to .safetensors or .pt checkpoint")
-    parser.add_argument("--vocab", required=True,
-                        help="Path to tagger_vocab*.json")
-    parser.add_argument("--images", nargs="+", required=True,
-                        help="Image paths and/or http(s) URLs")
-    parser.add_argument("--device", default="cuda",
-                        help="Device: cuda, cuda:0, cpu (default: cuda)")
-    parser.add_argument("--max-size", type=int, default=1024,
-                        help="Long-edge cap in pixels (default: 1024)")
     mode = parser.add_mutually_exclusive_group()
-    mode.add_argument("--topk", type=int, default=30,
-                      help="Return top-k tags (default: 30)")
-    mode.add_argument("--threshold", type=float,
-                      help="Return all tags with score >= threshold")
-    parser.add_argument("--format", choices=["pretty", "tags", "json"],
-                        default="pretty", help="Output format (default: pretty)")
     args = parser.parse_args()
     tagger = Tagger(
@@ -693,9 +749,7 @@ def main():
         max_size=args.max_size,
     )
-    topk, threshold = (
-        (None, args.threshold) if args.threshold else (args.topk, None)
-    )
     json_out = []
     for src in args.images:
@@ -716,4 +770,4 @@ def main():
 if __name__ == "__main__":
-    main()

 # RoPE helpers
 # ---------------------------------------------------------------------------
 @lru_cache(maxsize=32)
 def _patch_coords_cached(h: int, w: int, device_str: str) -> torch.Tensor:
     device = torch.device(device_str)
     return coords  # [h*w, 2]
+def _build_rope(
+    h_patches: int, w_patches: int, dtype: torch.dtype, device: torch.device
+):
     coords = _patch_coords_cached(h_patches, w_patches, str(device))
+    inv_freq = 1.0 / (
+        ROPE_THETA
+        ** torch.arange(0, 1, 4 / HEAD_DIM, dtype=torch.float32, device=device)
+    )
     angles = 2 * math.pi * coords[:, :, None] * inv_freq[None, None, :]
     angles = angles.flatten(1, 2).tile(2)
     cos = torch.cos(angles).to(dtype).unsqueeze(0).unsqueeze(0)
     return torch.cat((-x[..., h:], x[..., :h]), dim=-1)
+def _apply_rope(q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor):
     n_pre = 1 + N_REGISTERS
     q_pre, q_pat = q[..., :n_pre, :], q[..., n_pre:, :]
     k_pre, k_pat = k[..., :n_pre, :], k[..., n_pre:, :]
 # Transformer blocks
 # ---------------------------------------------------------------------------
 class _Attention(nn.Module):
     def __init__(self):
         super().__init__()
         k = self.k_proj(x).view(B, S, N_HEADS, HEAD_DIM).transpose(1, 2)
         v = self.v_proj(x).view(B, S, N_HEADS, HEAD_DIM).transpose(1, 2)
         q, k = _apply_rope(q, k, cos, sin)
+        out = F.scaled_dot_product_attention(q, k, v, scale=HEAD_DIM**-0.5)
         return self.o_proj(out.transpose(1, 2).reshape(B, S, D_MODEL))
         self.mask_token = nn.Parameter(torch.zeros(1, 1, D_MODEL))
         self.register_tokens = nn.Parameter(torch.zeros(1, N_REGISTERS, D_MODEL))
         self.patch_embeddings = nn.Conv2d(
+            3, D_MODEL, kernel_size=PATCH_SIZE, stride=PATCH_SIZE
+        )
     def forward(self, pixel_values):
         B = pixel_values.shape[0]
         dtype = self.patch_embeddings.weight.dtype
+        patches = (
+            self.patch_embeddings(pixel_values.to(dtype)).flatten(2).transpose(1, 2)
+        )
         cls = self.cls_token.expand(B, -1, -1)
         regs = self.register_tokens.expand(B, -1, -1)
         return torch.cat([cls, regs, patches], dim=1)
             x = block(x, cos, sin)
         x = self.norm(x)
         # token layout: [CLS, reg_0..reg_R-1, patch_0..patch_N]
+        patch_tokens = x[:, 1 + N_REGISTERS :, :]  # [B, h_p*w_p, D_MODEL]
         return patch_tokens, h_p, w_p
 # Head — auto-detected from the checkpoint
 # =============================================================================
 class _LowRankHead(nn.Module):
     """Two-matrix low-rank projection head.
+    features (in_dim)
+      → Linear(in_dim, rank, bias=?)
+      → Linear(rank, num_tags, bias=?)
     """
+    def __init__(
+        self, in_dim: int, rank: int, num_tags: int, down_bias: bool, up_bias: bool
+    ):
         super().__init__()
         self.proj_down = nn.Linear(in_dim, rank, bias=down_bias)
         self.proj_up = nn.Linear(rank, num_tags, bias=up_bias)
     Returns (module, remapped_state_dict) where the remapped state dict
     matches the module's own key names so strict loading works.
     """
+    weights_2d = [
+        (k, v) for k, v in head_sd.items() if k.endswith(".weight") and v.ndim == 2
+    ]
     # --- Case 1: single dense linear ---------------------------------------
+    singles = [(k, v) for k, v in weights_2d if tuple(v.shape) == (num_tags, in_dim)]
     if len(weights_2d) <= 2 and len(singles) == 1:
         wkey, wval = singles[0]
+        base = wkey[: -len(".weight")]
         bias_key = base + ".bias"
         has_bias = bias_key in head_sd
         module = nn.Linear(in_dim, num_tags, bias=has_bias)
         extra = set(head_sd) - expected_src
         if extra:
             raise RuntimeError(
+                f"Head has single-linear shape but extra unknown keys: {sorted(extra)}"
+            )
         return module, remapped
     # --- Case 2: low-rank pair ---------------------------------------------
     down = None  # (key, tensor) with shape [rank, in_dim]
+    up = None  # (key, tensor) with shape [num_tags, rank]
     for k, v in weights_2d:
         if v.shape[1] == in_dim and v.shape[0] != num_tags:
             down = (k, v)
         if rank_down != rank_up:
             raise RuntimeError(
                 f"Low-rank head: inner dims disagree "
+                f"(down out={rank_down}, up in={rank_up})"
+            )
         down_key, down_w = down
         up_key, up_w = up
+        down_base = down_key[: -len(".weight")]
+        up_base = up_key[: -len(".weight")]
         down_bias_key = down_base + ".bias"
         up_bias_key = up_base + ".bias"
         has_down_bias = down_bias_key in head_sd
         if extra:
             raise RuntimeError(
                 f"Low-rank head detected but checkpoint has extra unknown "
+                f"head keys: {sorted(extra)}"
+            )
+        print(
+            f"[Tagger] Detected low-rank head: "
+            f"in_dim={in_dim}, rank={rank_down}, num_tags={num_tags} "
+            f"(down_bias={has_down_bias}, up_bias={has_up_bias})"
+        )
         return module, remapped
     raise RuntimeError(
 # Tagger wrapper module
 # =============================================================================
 class DINOv3Tagger(nn.Module):
     """Backbone + head. The head is attached after the checkpoint is
     inspected (so we can build the right shape)."""
     def forward(self, pixel_values):
+        """Run the backbone, build the feature vector and apply the head.
+
+        The feature vector is the token at index 0 concatenated with the
+        next N_REGISTERS tokens (flattened), cast to float32 before being
+        passed to ``self.head``. Returns the output of ``self.head``.
+        """
         hidden = self.backbone(pixel_values)
         cls = hidden[:, 0, :]
+        regs = hidden[:, 1 : 1 + N_REGISTERS, :].flatten(1)
         features = torch.cat([cls, regs], dim=-1).float()  # fp32 for head
         return self.head(features)
+    def forward_embedding(self, pixel_values):
+        """Return the FEATURE_DIM=6400 image descriptor without applying the head.
+        Same as forward() but stops before self.head — use this for similarity queries.
+
+        The descriptor is the token at index 0 concatenated with the next
+        N_REGISTERS tokens (flattened per batch item), cast to float32.
+        The batch dimension is kept: the result has one row per input image.
+        """
+        hidden = self.backbone(pixel_values)
+        cls = hidden[:, 0, :]
+        regs = hidden[:, 1 : 1 + N_REGISTERS, :].flatten(1)
+        features = torch.cat([cls, regs], dim=-1).float()  # cast to fp32 (no head is applied here)
+        return features
 # =============================================================================
 # Checkpoint loading helpers
 # =============================================================================
 def _split_and_clean_state_dict(sd: dict) -> tuple[dict, dict]:
     """Split full state dict into (backbone_sd, head_sd), stripping the
     ``backbone.`` prefix and applying the remaps needed to match
     head_sd: dict = {}
     for k, v in sd.items():
         if k.startswith("backbone."):
+            nk = k[len("backbone.") :]
             # Remap (1): strip intermediate "model." before "layer."
             if nk.startswith("model.layer."):
+                nk = nk[len("model.") :]
             backbone_sd[nk] = v
         else:
             head_sd[k] = v
     # Remap (2): layer.N.layer_scale{1,2}.lambda1 → layer.N.layer_scale{1,2}
     for k in list(backbone_sd.keys()):
         if ".layer_scale" in k and k.endswith(".lambda1"):
+            backbone_sd[k[: -len(".lambda1")]] = backbone_sd.pop(k)
     # Remap (3): drop rope buffers (recomputed on the fly)
     for k in list(backbone_sd.keys()):
     new_w = _snap(max(PATCH_SIZE, round(w * scale)), PATCH_SIZE)
     new_h = _snap(max(PATCH_SIZE, round(h * scale)), PATCH_SIZE)
+    return v2.Compose(
+        [
+            v2.Resize((new_h, new_w), interpolation=v2.InterpolationMode.LANCZOS),
+            v2.ToImage(),
+            v2.ToDtype(torch.float32, scale=True),
+            v2.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
+        ]
+    )(img).unsqueeze(0)
 # =============================================================================
 # Tagger wrapper
 # =============================================================================
 class Tagger:
     """Inference wrapper for DINOv3Tagger (ViT-H/16+).
         if not head_sd:
             raise RuntimeError(
+                "Checkpoint contains no non-backbone keys — cannot build head."
+            )
         # --- Build model, inferring head shape from the checkpoint --------
         self.model = DINOv3Tagger()
         head_module, head_sd_remapped = _build_head_from_checkpoint(
+            head_sd,
+            in_dim=FEATURE_DIM,
+            num_tags=self.num_tags,
         )
         self.model.head = head_module
         self.model.head.load_state_dict(head_sd_remapped, strict=True)
         # --- Move to device. Backbone → bf16/fp16; head stays fp32. --------
+        self.model.backbone = self.model.backbone.to(device=self.device, dtype=dtype)
+        self.model.head = self.model.head.to(device=self.device, dtype=torch.float32)
         self.model.eval()
         print(f"[Tagger] Ready on {self.device} (backbone={dtype}, head=fp32)")
             scale = min(1.0, max_size / max(w, h))
             new_w = _snap(round(w * scale), PATCH_SIZE)
             new_h = _snap(round(h * scale), PATCH_SIZE)
+            pv = (
+                v2.Compose(
+                    [
+                        v2.Resize(
+                            (new_h, new_w), interpolation=v2.InterpolationMode.LANCZOS
+                        ),
+                        v2.ToImage(),
+                        v2.ToDtype(torch.float32, scale=True),
+                        v2.Normalize(mean=_IMAGENET_MEAN, std=_IMAGENET_STD),
+                    ]
+                )(img)
+                .unsqueeze(0)
+                .to(self.device)
+            )
         else:
             pv = preprocess_image(image, max_size=max_size).to(self.device)
         return Image.fromarray(rgb_uint8, mode="RGB")
     @torch.no_grad()
+    def predict(
+        self, image, topk: int | None = 30, threshold: float | None = None
+    ) -> list[tuple[str, float]]:
         """Tag a single image (local path or URL)."""
         if topk is None and threshold is None:
             topk = 30
             order = values.argsort(descending=True)
             indices, values = indices[order], values[order]
+        return [
+            (self.idx2tag[i], float(v))
+            for i, v in zip(indices.tolist(), values.tolist())
+        ]
     @torch.no_grad()
+    def predict_batch(
+        self, images, topk: int | None = 30, threshold: float | None = None
+    ):
+        """Tag each image in *images* by calling predict() on it in turn.
+
+        Images are processed one at a time (no tensor batching); ``topk`` and
+        ``threshold`` are forwarded unchanged to predict(). Returns a list
+        with one predict() result per input image, in input order.
+        """
+        return [self.predict(img, topk=topk, threshold=threshold) for img in images]
 # =============================================================================
 # Output formatters
 # =============================================================================
 def _fmt_pretty(path: str, results) -> str:
     lines = [f"\n{'─' * 60}", f" {path}", f"{'─' * 60}"]
     for rank, (tag, score) in enumerate(results, 1):
 def _fmt_json(path: str, results) -> dict:
+    """Build the JSON-serialisable record for one image.
+
+    ``results`` is iterated as (tag, score) pairs; each score is rounded to
+    4 decimal places. The record has the keys "file" and "tags".
+    """
+    return {
+        "file": path,
+        "tags": [{"tag": t, "score": round(s, 4)} for t, s in results],
+    }
 # =============================================================================
 # CLI
 # =============================================================================
 def main():
     parser = argparse.ArgumentParser(
         description="DINOv3 ViT-H/16+ tagger inference (standalone)",
         formatter_class=argparse.RawDescriptionHelpFormatter,
     )
+    parser.add_argument(
+        "--checkpoint", required=True, help="Path to .safetensors or .pt checkpoint"
+    )
+    parser.add_argument("--vocab", required=True, help="Path to tagger_vocab*.json")
+    parser.add_argument(
+        "--images", nargs="+", required=True, help="Image paths and/or http(s) URLs"
+    )
+    parser.add_argument(
+        "--device", default="cuda", help="Device: cuda, cuda:0, cpu (default: cuda)"
+    )
+    parser.add_argument(
+        "--max-size",
+        type=int,
+        default=1024,
+        help="Long-edge cap in pixels (default: 1024)",
+    )
     mode = parser.add_mutually_exclusive_group()
+    mode.add_argument(
+        "--topk", type=int, default=30, help="Return top-k tags (default: 30)"
+    )
+    mode.add_argument(
+        "--threshold", type=float, help="Return all tags with score >= threshold"
+    )
+    parser.add_argument(
+        "--format",
+        choices=["pretty", "tags", "json"],
+        default="pretty",
+        help="Output format (default: pretty)",
+    )
     args = parser.parse_args()
     tagger = Tagger(
         max_size=args.max_size,
     )
+    topk, threshold = (None, args.threshold) if args.threshold else (args.topk, None)
     json_out = []
     for src in args.images:
 if __name__ == "__main__":
+    main()

tagger_ui/templates/index.html CHANGED Viewed

@@ -259,6 +259,81 @@
     .tag-pill:hover { opacity: .8; }
     .tag-pill .score { font-size: .66rem; opacity: .7; }
     .tag-pill.hidden { display: none; }
   </style>
 </head>
 <body>
@@ -266,7 +341,14 @@
 <h1>DINOv3 <span>Tagger</span></h1>
 <p class="subtitle">ViT-H/16+ · {{ num_tags | format_number }} tags · {{ vocab_path }}</p>
-<div class="layout">
   <!-- ====== LEFT PANEL ====== -->
   <div class="panel-left">
@@ -367,7 +449,57 @@
     </div>
   </div><!-- /panel-right -->
-</div><!-- /layout -->
 <script>
   // ---- category metadata from server ----
@@ -653,7 +785,7 @@
     if (e.key === 'Enter') runFromUrl();
   });
-  // drag & drop
   const dz = document.getElementById('drop-zone');
   dz.addEventListener('dragover',  e => { e.preventDefault(); dz.classList.add('drag-over'); });
   dz.addEventListener('dragleave', () => dz.classList.remove('drag-over'));
@@ -662,6 +794,55 @@
     const file = e.dataTransfer.files[0];
     if (file) stageFile(file);
   });
 </script>
 <!-- @gradio/client module: patches runFromUrl / submitFile / runPca on window
@@ -773,6 +954,60 @@
   // Re-run with current colour pickers — re-submits full request (backbone
   // result is cached by the Gradio queue so subsequent calls are fast if
   // the same image/max_size is used, but ZeroGPU requires a full round-trip).
   window.rerunCustomPca = async function() {
     if (!_lastPcaRequest) return;
     const spinner = document.getElementById('pca-spinner');

     .tag-pill:hover { opacity: .8; }
     .tag-pill .score { font-size: .66rem; opacity: .7; }
     .tag-pill.hidden { display: none; }
+    /* ---- similarity drop zones ---- */
+    .drop-zone-sim {
+      border: 2px dashed var(--border); border-radius: var(--radius);
+      color: var(--muted); cursor: pointer; font-size: .85rem;
+      padding: 1.2rem; text-align: center;
+      transition: border-color .15s, background .15s;
+    }
+    .drop-zone-sim.drag-over { border-color: var(--accent); background: rgba(124,106,247,.06); }
+    /* ---- mode toggle ---- */
+    .mode-toggle {
+      display: flex; gap: .5rem; margin-bottom: 1.5rem;
+    }
+    .mode-btn {
+      background: var(--surface); border: 1px solid var(--border);
+      border-radius: var(--radius); color: var(--muted); cursor: pointer;
+      font-size: .9rem; font-weight: 600; padding: .5rem 1.4rem;
+      transition: border-color .15s, color .15s, background .15s;
+    }
+    .mode-btn:hover { border-color: var(--accent); color: var(--text); }
+    .mode-btn.active { border-color: var(--accent); color: #fff; background: var(--accent); }
+    /* ---- similarity panel ---- */
+    #similarity-panel { display: none; width: 100%; max-width: 1600px; }
+    .sim-inputs {
+      display: flex; gap: 1.25rem; flex-wrap: wrap; margin-bottom: 1rem;
+    }
+    .sim-inputs .card { flex: 1 1 0; min-width: 260px; }
+    .sim-run-row {
+      display: flex; justify-content: center; margin-bottom: 1.5rem;
+    }
+    /* score display */
+    .sim-score-card {
+      background: var(--surface); border: 1px solid var(--border);
+      border-radius: var(--radius); padding: 1.25rem 1.5rem;
+      margin-bottom: 1.25rem; display: none;
+    }
+    .sim-score-label {
+      font-size: .8rem; color: var(--muted); margin-bottom: .5rem;
+      text-transform: uppercase; letter-spacing: .05em;
+    }
+    .sim-score-value {
+      font-size: 2.4rem; font-weight: 700; letter-spacing: -.02em;
+      margin-bottom: .6rem;
+    }
+    .sim-score-bar-bg {
+      height: 8px; background: var(--border); border-radius: 4px; overflow: hidden;
+    }
+    .sim-score-bar-fill {
+      height: 100%; border-radius: 4px; background: var(--accent);
+      transition: width .4s ease;
+    }
+    .sim-score-note {
+      font-size: .75rem; color: var(--muted); margin-top: .4rem;
+    }
+    /* image previews in similarity mode */
+    .sim-previews {
+      display: flex; gap: 1.25rem; flex-wrap: wrap;
+    }
+    .sim-preview-col {
+      flex: 1 1 0; min-width: 0;
+    }
+    .sim-preview-col img {
+      width: 100%; border-radius: var(--radius); border: 1px solid var(--border);
+      object-fit: contain; max-height: 480px; display: block;
+    }
+    .sim-preview-label {
+      font-size: .75rem; color: var(--muted); margin-bottom: .4rem;
+      text-transform: uppercase; letter-spacing: .04em;
+    }
   </style>
 </head>
 <body>
 <h1>DINOv3 <span>Tagger</span></h1>
 <p class="subtitle">ViT-H/16+ · {{ num_tags | format_number }} tags · {{ vocab_path }}</p>
+<!-- mode toggle -->
+<div class="mode-toggle">
+  <button class="mode-btn active" id="mode-btn-tagger" onclick="setMode('tagger')">Tagger</button>
+  <button class="mode-btn" id="mode-btn-similarity" onclick="setMode('similarity')">Similarity</button>
+</div>
+<!-- ===== TAGGER MODE ===== -->
+<div id="tagger-panels" class="layout">
   <!-- ====== LEFT PANEL ====== -->
   <div class="panel-left">
     </div>
   </div><!-- /panel-right -->
+</div><!-- /tagger-panels -->
+<!-- ===== SIMILARITY MODE ===== -->
+<div id="similarity-panel">
+  <div class="sim-inputs">
+    <!-- Image A -->
+    <div class="card">
+      <div style="font-size:.8rem;font-weight:600;color:var(--muted);text-transform:uppercase;letter-spacing:.05em;margin-bottom:.75rem">Image A</div>
+      <div class="input-row">
+        <input type="text" id="sim-url-a" placeholder="Paste URL…" />
+      </div>
+      <div id="sim-drop-a" class="drop-zone-sim" onclick="document.getElementById('sim-file-a').click()">
+        <input type="file" id="sim-file-a" accept="image/*" style="display:none" onchange="simStageFile('a', this)" />
+        Drop image A here or <strong>click to browse</strong>
+      </div>
+      <img id="sim-preview-a" src="" alt="" style="display:none;width:100%;margin-top:.75rem;border-radius:var(--radius);border:1px solid var(--border);max-height:300px;object-fit:contain" />
+    </div>
+    <!-- Image B -->
+    <div class="card">
+      <div style="font-size:.8rem;font-weight:600;color:var(--muted);text-transform:uppercase;letter-spacing:.05em;margin-bottom:.75rem">Image B</div>
+      <div class="input-row">
+        <input type="text" id="sim-url-b" placeholder="Paste URL…" />
+      </div>
+      <div id="sim-drop-b" class="drop-zone-sim" onclick="document.getElementById('sim-file-b').click()">
+        <input type="file" id="sim-file-b" accept="image/*" style="display:none" onchange="simStageFile('b', this)" />
+        Drop image B here or <strong>click to browse</strong>
+      </div>
+      <img id="sim-preview-b" src="" alt="" style="display:none;width:100%;margin-top:.75rem;border-radius:var(--radius);border:1px solid var(--border);max-height:300px;object-fit:contain" />
+    </div>
+  </div>
+  <div class="sim-run-row">
+    <button class="btn" id="sim-run-btn" onclick="runSimilarity()">Compare</button>
+  </div>
+  <div class="spinner" id="sim-spinner" style="display:none"></div>
+  <div class="error-msg" id="sim-error" style="display:none"></div>
+  <!-- score -->
+  <div class="sim-score-card" id="sim-score-card">
+    <div class="sim-score-label">Cosine Similarity (FEATURE_DIM descriptor)</div>
+    <div class="sim-score-value" id="sim-score-value">—</div>
+    <div class="sim-score-bar-bg">
+      <div class="sim-score-bar-fill" id="sim-score-bar" style="width:0%"></div>
+    </div>
+    <div class="sim-score-note" id="sim-score-note"></div>
+  </div>
+</div><!-- /similarity-panel -->
 <script>
   // ---- category metadata from server ----
     if (e.key === 'Enter') runFromUrl();
   });
+  // drag & drop — tagger
   const dz = document.getElementById('drop-zone');
   dz.addEventListener('dragover',  e => { e.preventDefault(); dz.classList.add('drag-over'); });
   dz.addEventListener('dragleave', () => dz.classList.remove('drag-over'));
     const file = e.dataTransfer.files[0];
     if (file) stageFile(file);
   });
+  // ---- mode toggle ----
+  function setMode(mode) {
+    document.getElementById('tagger-panels').style.display  = mode === 'tagger'     ? 'flex' : 'none';
+    document.getElementById('similarity-panel').style.display = mode === 'similarity' ? 'block' : 'none';
+    document.getElementById('mode-btn-tagger').classList.toggle('active', mode === 'tagger');
+    document.getElementById('mode-btn-similarity').classList.toggle('active', mode === 'similarity');
+  }
+  // ---- similarity: staged files ----
+  const _simStaged = { a: null, b: null };
+  function simStageFile(side, input) {
+    const file = input.files[0];
+    if (!file) return;
+    _simStaged[side] = file;
+    const reader = new FileReader();
+    reader.onload = e => {
+      const img = document.getElementById(`sim-preview-${side}`);
+      img.src = e.target.result;
+      img.style.display = 'block';
+    };
+    reader.readAsDataURL(file);
+  }
+  // drag & drop — similarity A
+  function _wireDrop(dzId, side) {
+    const el = document.getElementById(dzId);
+    el.addEventListener('dragover',  e => { e.preventDefault(); el.classList.add('drag-over'); });
+    el.addEventListener('dragleave', () => el.classList.remove('drag-over'));
+    el.addEventListener('drop', e => {
+      e.preventDefault(); el.classList.remove('drag-over');
+      const file = e.dataTransfer.files[0];
+      if (!file) return;
+      _simStaged[side] = file;
+      const reader = new FileReader();
+      reader.onload = ev => {
+        const img = document.getElementById(`sim-preview-${side}`);
+        img.src = ev.target.result;
+        img.style.display = 'block';
+      };
+      reader.readAsDataURL(file);
+    });
+  }
+  _wireDrop('sim-drop-a', 'a');
+  _wireDrop('sim-drop-b', 'b');
+  // placeholder — replaced by module script
+  function runSimilarity() {}
 </script>
 <!-- @gradio/client module: patches runFromUrl / submitFile / runPca on window
   // Re-run with current colour pickers — re-submits full request (backbone
   // result is cached by the Gradio queue so subsequent calls are fast if
   // the same image/max_size is used, but ZeroGPU requires a full round-trip).
+  // ---- similarity ----
+  window.runSimilarity = async function() {
+    const urlA = document.getElementById('sim-url-a').value.trim();
+    const urlB = document.getElementById('sim-url-b').value.trim();
+    const imageArgA = urlA ? urlA : (_simStaged.a ? await handle_file(_simStaged.a) : null);
+    const imageArgB = urlB ? urlB : (_simStaged.b ? await handle_file(_simStaged.b) : null);
+    if (!imageArgA || !imageArgB) {
+      document.getElementById('sim-error').textContent = 'Provide both images before comparing.';
+      document.getElementById('sim-error').style.display = 'block';
+      return;
+    }
+    document.getElementById('sim-error').style.display = 'none';
+    document.getElementById('sim-run-btn').disabled = true;
+    document.getElementById('sim-spinner').style.display = 'block';
+    document.getElementById('sim-score-card').style.display = 'none';
+    try {
+      const res = await gradioApp.predict("/get_similarity", {
+        image_a:  imageArgA,
+        image_b:  imageArgB,
+        max_size: 1024,
+      });
+      const data = JSON.parse(res.data[0]);
+      const score = data.score;                         // [-1, 1]
+      const pct   = Math.round(((score + 1) / 2) * 100); // map to [0,100]
+      document.getElementById('sim-score-value').textContent = score.toFixed(4);
+      document.getElementById('sim-score-bar').style.width   = pct + '%';
+      document.getElementById('sim-score-note').textContent  =
+        score > 0.9  ? 'Very high similarity — nearly identical semantic content.' :
+        score > 0.7  ? 'High similarity — strongly related images.' :
+        score > 0.5  ? 'Moderate similarity — some shared features.' :
+        score > 0.2  ? 'Low similarity — loosely related.' :
+                       'Very low similarity — likely unrelated images.';
+      // colour the bar by score
+      const barEl = document.getElementById('sim-score-bar');
+      barEl.style.background =
+        score > 0.7 ? '#4ade80' :
+        score > 0.4 ? '#facc15' : '#f87171';
+      document.getElementById('sim-score-card').style.display = 'block';
+    } catch (err) {
+      document.getElementById('sim-error').textContent = String(err);
+      document.getElementById('sim-error').style.display = 'block';
+    } finally {
+      document.getElementById('sim-run-btn').disabled = false;
+      document.getElementById('sim-spinner').style.display = 'none';
+    }
+  };
   window.rerunCustomPca = async function() {
     if (!_lastPcaRequest) return;
     const spinner = document.getElementById('pca-spinner');