pyright-based changes for tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py

Fix torch 2.5.1 / numpy 2.x compatibility in convert_image_encoder_to_gguf.py

- Updated Tensor-to-array conversions to use `np.asarray(..., dtype=...)` per the NumPy 2.x migration rules (avoids the copy error on float16); see the sketch below this list.
- Used explicit typing and `cast(...)` to guide Pyright/Pylance under torch 2.5.1:
  - Annotated `model` as PreTrainedModel.
  - Re-cast `model.vision_model` to `CLIPVisionTransformer` to safely access `.encoder.layers`.
  - Replaced slice assignment with `__init__` to reset ModuleList contents.
- Verified compatibility by converting `openai/clip-vit-base-patch32` using `--clip-model-is-openclip`.
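A minimal, runnable sketch of the conversion pattern from the first bullet, assuming only `torch` and NumPy 2.x; the tensor `t` and its shape are illustrative rather than taken from the converter:

```python
import numpy as np
import torch

t = torch.randn(4, 4)

# Tensor -> ndarray: detach/cpu/squeeze as in the converter, then make the
# buffer contiguous before writing it out.
data = np.ascontiguousarray(t.detach().cpu().squeeze().numpy())

# NumPy 2.x: np.array(..., copy=False) now raises when a copy is unavoidable,
# so dtype changes go through np.asarray(..., dtype=...), which copies only
# when the requested dtype differs from the current one.
data_f16 = np.asarray(data, dtype=np.float16)

print(data_f16.dtype, data_f16.shape)  # float16 (4, 4)
```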
robbiemu committed May 16, 2025
commit c278affb18879adfb3c85af2e8c41cedaa48a710
27 changes: 18 additions & 9 deletions tools/mtmd/legacy-models/convert_image_encoder_to_gguf.py
@@ -6,6 +6,10 @@
 import torch
 import numpy as np
 from gguf import *
+from typing import cast
+from torch.nn import ModuleList
+from transformers.models.clip.modeling_clip import CLIPVisionTransformer
+from transformers import PreTrainedModel
 from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel, SiglipVisionModel
 
 TEXT = "clip.text"
@@ -162,13 +166,13 @@ def bytes_to_unicode():
     ftype = 0
 
 if args.clip_model_is_siglip:
-    model = SiglipVisionModel.from_pretrained(dir_model)
+    model: PreTrainedModel = SiglipVisionModel.from_pretrained(dir_model)
     processor = None
 elif args.clip_model_is_vision or args.clip_model_is_openclip:
-    model = CLIPVisionModel.from_pretrained(dir_model)
+    model: PreTrainedModel = CLIPVisionModel.from_pretrained(dir_model)
     processor = None
 else:
-    model = CLIPModel.from_pretrained(dir_model)
+    model: PreTrainedModel = CLIPModel.from_pretrained(dir_model)
     processor = CLIPProcessor.from_pretrained(dir_model)
 
 fname_middle = None
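A small sketch of the typing pattern applied in the hunk above; the `load_clip_like` helper and its `vision_only` flag are hypothetical, used only to show the idea:

```python
from transformers import CLIPModel, CLIPVisionModel, PreTrainedModel

def load_clip_like(path: str, vision_only: bool) -> PreTrainedModel:
    # Annotating with the common base class gives `model` a single declared
    # type across both branches; subclass-specific attributes are then reached
    # via an explicit cast(...) at the point of use, as in the later hunks.
    model: PreTrainedModel = (
        CLIPVisionModel.from_pretrained(path)
        if vision_only
        else CLIPModel.from_pretrained(path)
    )
    return model
```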
@@ -350,9 +354,14 @@ def get_non_negative_vision_feature_layers(v_hparams):
     # By default, we drop the last layer for llava projector
     # models unless we have explicitly set vision feature layers
     if feature_layers is None:
-        model.vision_model.encoder.layers.pop(-1)
+        vision_model = cast(CLIPVisionTransformer, model.vision_model)
+        encoder_layers = vision_model.encoder.layers
+        encoder_layers.pop(-1)
     else:
-        model.vision_model.encoder.layers = model.vision_model.encoder.layers[:max(feature_layers)]
+        vision_model = cast(CLIPVisionTransformer, model.vision_model)
+        encoder_layers = vision_model.encoder.layers
+        encoder_layers = cast(ModuleList, encoder_layers)
+        encoder_layers.__init__(encoder_layers[:max(feature_layers)])
 
     projector = torch.load(args.llava_projector)
     for name, data in projector.items():
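The `__init__`-based reset in the `else` branch above can be exercised on a toy list. A minimal sketch, assuming a small stack of `nn.Linear` layers in place of the CLIP encoder:

```python
import torch.nn as nn

layers = nn.ModuleList(nn.Linear(8, 8) for _ in range(6))

kept = layers[:4]       # slicing a ModuleList returns a new ModuleList
layers.__init__(kept)   # re-running __init__ empties the list, then re-adds `kept`

print(len(layers))      # 4 -- same object, now holding only the first four layers
```

Re-initializing the existing `ModuleList` mutates it in place rather than re-assigning the `encoder.layers` attribute, which is the slice assignment the commit replaces to satisfy Pyright.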
@@ -375,24 +384,24 @@ def get_non_negative_vision_feature_layers(v_hparams):
             continue
 
         name = get_tensor_name(name)
-        data = data.squeeze().numpy()
+        data = np.ascontiguousarray(data.detach().cpu().squeeze().numpy())
 
         n_dims = len(data.shape)
 
         # ftype == 0 -> float32, ftype == 1 -> float16
         ftype_cur = 0
         if n_dims == 4:
             print(f"tensor {name} is always saved in f16")
-            data = data.astype(np.float16)
+            data = np.asarray(data, dtype=np.float16)
             ftype_cur = 1
         elif ftype == 1:
             if name[-7:] == ".weight" and n_dims == 2:
                 print(" Converting to float16")
-                data = data.astype(np.float16)
+                data = np.asarray(data, dtype=np.float16)
                 ftype_cur = 1
             else:
                 print(" Converting to float32")
-                data = data.astype(np.float32)
+                data = np.asarray(data, dtype=np.float32)
                 ftype_cur = 0
         else:
             if data.dtype != np.float32: