From da81af4fca5bc33d123b541a76c95d96e139247d Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Tue, 8 Jul 2025 21:51:50 +0000 Subject: [PATCH 01/18] Added GGUF mappings for CogVLM model --- gguf-py/gguf/constants.py | 7 ++++ gguf-py/gguf/tensor_mapping.py | 66 ++++++++++++++++++++++++++-------- 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b8ac394580b1f..5f6521dab47c6 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -567,6 +567,13 @@ class MODEL_TENSOR(IntEnum): SHORTCONV_CONV = auto() SHORTCONV_INPROJ = auto() SHORTCONV_OUTPROJ = auto() + VISEXP_ATTN_Q = auto() + VISEXP_ATTN_K = auto() + VISEXP_ATTN_V = auto() + VISEXP_ATTN_OUT = auto() + VISEXP_GATE = auto() + VISEXP_DOWN = auto() + VISEXP_UP = auto() # vision V_MMPROJ = auto() V_MMPROJ_FC = auto() diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index b0c3d65e95847..35839d737d61a 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -155,6 +155,7 @@ class TensorNameMap: "encoder.layer.{bid}.layer_norm_1", # jina-v2-code "rwkv.blocks.{bid}.ln2", # rwkv6 "model.layers.{bid}.ln2", # rwkv7 + "model.layers.{bid}.post_attention_layernorm", # cogvlm ), # Attention query-key-value @@ -195,6 +196,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.q_proj", # llada "layers.{bid}.self_attn.q_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.q_proj", # nemotron-h + "model.layers.{bid}.self_attn.language_expert_query", # cogvlm ), # Attention key @@ -215,6 +217,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.k_proj", # llada "layers.{bid}.self_attn.k_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.k_proj", # nemotron-h + "model.layers.{bid}.self_attn.language_expert_key", # cogvlm ), # Attention value @@ -234,6 +237,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.v_proj", # llada "layers.{bid}.self_attn.v_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.v_proj", # nemotron-h + "model.layers.{bid}.self_attn.language_expert_value", # cogvlm ), # Attention output @@ -270,6 +274,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.attn_out", # llada "layers.{bid}.self_attn.o_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.o_proj", # nemotron-h + "model.layers.{bid}.self_attn.language_expert_dense", # cogvlm ), # Attention output norm @@ -402,6 +407,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.up_proj", # llada "layers.{bid}.mlp.up_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.up_proj", # nemotron-h + "model.layers.{bid}.mlp.language_mlp.up_proj", # cogvlm ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -430,21 +436,22 @@ class TensorNameMap: # Feed-forward gate MODEL_TENSOR.FFN_GATE: ( - "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2 - "layers.{bid}.mlp.gate_proj", # embeddinggemma - "layers.{bid}.feed_forward.w1", # llama-pth - "transformer.h.{bid}.mlp.w2", # qwen - "transformer.h.{bid}.mlp.c_fc2", # jais - "model.layers.layers.{bid}.mlp.gate_proj", # plamo - "model.layers.{bid}.feed_forward.w1", # internlm2 - "encoder.layers.{bid}.mlp.fc12", # nomic-bert - "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 (split up/gate, no longer used) - "transformer.h.{bid}.mlp.linear_1", # refact - "model.layers.{bid}.residual_mlp.w1", # arctic - "transformer.h.{bid}.mlp.c_fc_0", # exaone - "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid - "model.transformer.blocks.{bid}.ff_proj", # llada - 
"layers.{bid}.mlp.gate_proj", # qwen3-embedding + "model.layers.{bid}.mlp.gate_proj", # llama-hf refact olmo2 + "layers.{bid}.mlp.gate_proj", # embeddinggemma + "layers.{bid}.feed_forward.w1", # llama-pth + "transformer.h.{bid}.mlp.w2", # qwen + "transformer.h.{bid}.mlp.c_fc2", # jais + "model.layers.layers.{bid}.mlp.gate_proj", # plamo + "model.layers.{bid}.feed_forward.w1", # internlm2 + "encoder.layers.{bid}.mlp.fc12", # nomic-bert + "encoder.layer.{bid}.mlp.gated_layers_w", # jina-bert-v2 (split up/gate, no longer used) + "transformer.h.{bid}.mlp.linear_1", # refact + "model.layers.{bid}.residual_mlp.w1", # arctic + "transformer.h.{bid}.mlp.c_fc_0", # exaone + "model.layers.{bid}.feed_forward.gate_proj", # llama4 jamba granite-hybrid + "model.transformer.blocks.{bid}.ff_proj", # llada + "layers.{bid}.mlp.gate_proj", # qwen3-embedding + "model.layers.{bid}.mlp.language_mlp.gate_proj", # cogvlm ), MODEL_TENSOR.FFN_GATE_EXP: ( @@ -498,6 +505,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.ff_out", # llada "layers.{bid}.mlp.down_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.down_proj", # nemotron-h + "model.layers.{bid}.mlp.language_mlp.down_proj", # cogvlm ), MODEL_TENSOR.FFN_DOWN_EXP: ( @@ -1014,6 +1022,34 @@ class TensorNameMap: "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5 ), + MODEL_TENSOR.VISEXP_UP: ( + "model.layers.{bid}.mlp.vision_mlp.up_proj", # cogvlm + ), + + MODEL_TENSOR.VISEXP_GATE: ( + "model.layers.{bid}.mlp.vision_mlp.gate_proj", # cogvlm + ), + + MODEL_TENSOR.VISEXP_DOWN: ( + "model.layers.{bid}.mlp.vision_mlp.down_proj", # cogvlm + ), + + MODEL_TENSOR.VISEXP_ATTN_OUT: ( + "model.layers.{bid}.self_attn.vision_expert_dense", # cogvlm + ), + + MODEL_TENSOR.VISEXP_ATTN_Q: ( + "model.layers.{bid}.self_attn.vision_expert_query", # cogvlm + ), + + MODEL_TENSOR.VISEXP_ATTN_K: ( + "model.layers.{bid}.self_attn.vision_expert_key", # cogvlm + ), + + MODEL_TENSOR.VISEXP_ATTN_V: ( + "model.layers.{bid}.self_attn.vision_expert_value", # cogvlm + ), + ############################################################################ # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg MODEL_TENSOR.ENC_OUTPUT_NORM: ( From 6de9d169ca93a7e725849f3d6c06f5fac9ea5864 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Tue, 8 Jul 2025 23:02:36 +0000 Subject: [PATCH 02/18] Add tensor mapping for CogVLM visual encoder --- gguf-py/gguf/constants.py | 3 +++ gguf-py/gguf/tensor_mapping.py | 24 ++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 5f6521dab47c6..2ae331514ea53 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -614,6 +614,9 @@ class MODEL_TENSOR(IntEnum): V_RESMPL_QUERY = auto() # minicpmv V_TOK_EMBD_IMG_BREAK = auto() # pixtral V_MM_PATCH_MERGER = auto() # mistral small 3.1 + V_MM_UP = auto() # cogvlm + V_MM_DOWN = auto() # cogvlm + V_MM_GATE = auto() # cogvlm # audio (mtmd) A_ENC_EMBD_POS = auto() A_ENC_CONV1D = auto() diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 35839d737d61a..135876f4cb7c9 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -99,6 +99,7 @@ class TensorNameMap: "backbone.final_layer_norm", # wavtokenizer "model.norm", # llama4 "model.transformer.ln_f", # llada + "model.norm", # cogvlm ), # Rope frequencies @@ -1157,6 +1158,7 @@ class TensorNameMap: "model.mm_projector.mlp.mlp.{bid}", "vision_model.vision_adapter.mlp.fc{bid}", # llama 4 "mlp1.{bid}", # 
InternVL + "model.vision.linear_proj.linear_proj", # cogvlm ), MODEL_TENSOR.V_MMPROJ_PEG: ( @@ -1179,6 +1181,7 @@ class TensorNameMap: "vision_model.patch_embedding.linear", # llama 4 "visual.patch_embed.proj", # qwen2vl "vision_tower.patch_embed.proj", # kimi-vl + "model.vision.patch_embedding", # cogvlm ), MODEL_TENSOR.V_ENC_EMBD_POS: ( @@ -1200,6 +1203,7 @@ class TensorNameMap: "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral "visual.blocks.{bid}.attn.q", # qwen2vl, generated "vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated + "model.vision.transformer.layers.{bid}.attention.query", # cogvlm ), MODEL_TENSOR.V_ENC_ATTN_Q_NORM: ( @@ -1217,6 +1221,7 @@ class TensorNameMap: "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral "visual.blocks.{bid}.attn.k", # qwen2vl, generated "vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated + "model.vision.transformer.layers.{bid}.attention.key", # cogvlm ), MODEL_TENSOR.V_ENC_ATTN_K_NORM: ( @@ -1234,6 +1239,7 @@ class TensorNameMap: "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral "visual.blocks.{bid}.attn.v", # qwen2vl, generated "vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated + "model.vision.transformer.layers.{bid}.attention.value", # cogvlm ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( @@ -1247,6 +1253,7 @@ class TensorNameMap: "vision_model.model.layers.{bid}.input_layernorm", # llama4 "visual.blocks.{bid}.norm1", # qwen2vl "vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1) + "model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm ), MODEL_TENSOR.V_ENC_ATTN_O: ( @@ -1260,6 +1267,7 @@ class TensorNameMap: "vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral "visual.blocks.{bid}.attn.proj", # qwen2vl "vision_tower.encoder.blocks.{bid}.wo", # kimi-vl + "model.vision.transformer.layers.{bid}.attention.dense", # cogvlm ), MODEL_TENSOR.V_ENC_POST_ATTN_NORM: ( @@ -1273,6 +1281,7 @@ class TensorNameMap: "vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral "visual.blocks.{bid}.norm2", # qwen2vl "vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1) + "model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm ), MODEL_TENSOR.V_ENC_FFN_UP: ( @@ -1286,6 +1295,7 @@ class TensorNameMap: "visual.blocks.{bid}.mlp.fc1", # qwen2vl "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl "vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1) + "model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm ), MODEL_TENSOR.V_ENC_FFN_GATE: ( @@ -1305,6 +1315,7 @@ class TensorNameMap: "visual.blocks.{bid}.mlp.fc2", # qwen2vl "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl "vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1) + "model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm ), MODEL_TENSOR.V_LAYER_SCALE_1: ( @@ -1330,6 +1341,7 @@ class TensorNameMap: "vision_model.layernorm_post", # llama4 "visual.merger.ln_q", # qwen2vl "vision_tower.encoder.final_layernorm", # kimi-vl + "model.vision.linear_proj.norm1", # cogvlm ), MODEL_TENSOR.V_MM_INP_PROJ: ( @@ -1400,6 +1412,18 @@ class TensorNameMap: "patch_merger.merging_layer", # mistral ), + MODEL_TENSOR.V_MM_UP: ( + "model.vision.linear_proj.dense_h_to_4h", # cogvlm + ), + + MODEL_TENSOR.V_MM_DOWN: ( + "model.vision.linear_proj.dense_4h_to_h", # cogvlm + ), + + MODEL_TENSOR.V_MM_GATE: ( + "model.vision.linear_proj.gate_proj", # cogvlm + ), + # audio (mtmd) MODEL_TENSOR.A_ENC_EMBD_POS: ( From 11fac0c0e7a52c48334724a7e2d17abcc656fd88 Mon Sep 17 00:00:00 2001 From: 
Tianyue-Zhao Date: Wed, 9 Jul 2025 22:47:37 +0000 Subject: [PATCH 03/18] Add CogVLM to conversion script, no vision part yet --- convert_hf_to_gguf.py | 26 ++++++++++++++++++++++++++ gguf-py/gguf/constants.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 62a546ee22201..17c61463b4e4a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8693,6 +8693,32 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors + +@ModelBase.register("CogVLMForCausalLM") +class CogVLMModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.COGVLM + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + # block vision tensors + if name.startswith("model.vision."): + return [] + + if "query_key_value" in name: + # Slice tensor into three along first axis + q, k, v = data_torch.split(data_torch.shape[0] // 3, dim=0) + return [ + (self.map_tensor_name(name.replace("query_key_value", "query")), q), + (self.map_tensor_name(name.replace("query_key_value", "key")), k), + (self.map_tensor_name(name.replace("query_key_value", "value")), v), + ] + + return [(self.map_tensor_name(name), data_torch)] + ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 2ae331514ea53..b05bbe8108591 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -392,6 +392,7 @@ class MODEL_ARCH(IntEnum): SMALLTHINKER = auto() LLADA = auto() SEED_OSS = auto() + COGVLM = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -402,6 +403,7 @@ class VISION_PROJECTOR_TYPE(IntEnum): GLM_EDGE = auto() MERGER = auto() GEMMA3 = auto() + COGVLM = auto() class MODEL_TENSOR(IntEnum): @@ -738,6 +740,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.SMALLTHINKER: "smallthinker", MODEL_ARCH.LLADA: "llada", MODEL_ARCH.SEED_OSS: "seed_oss", + MODEL_ARCH.COGVLM: "cogvlm", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -913,6 +916,13 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SHORTCONV_CONV: "blk.{bid}.shortconv.conv", MODEL_TENSOR.SHORTCONV_INPROJ: "blk.{bid}.shortconv.in_proj", MODEL_TENSOR.SHORTCONV_OUTPROJ: "blk.{bid}.shortconv.out_proj", + MODEL_TENSOR.VISEXP_ATTN_Q: "blk.{bid}.vis_attn_q", + MODEL_TENSOR.VISEXP_ATTN_K: "blk.{bid}.vis_attn_k", + MODEL_TENSOR.VISEXP_ATTN_V: "blk.{bid}.vis_attn_v", + MODEL_TENSOR.VISEXP_ATTN_OUT: "blk.{bid}.vis_attn_output", + MODEL_TENSOR.VISEXP_GATE: "blk.{bid}.vis_gate", + MODEL_TENSOR.VISEXP_DOWN: "blk.{bid}.vis_down", + MODEL_TENSOR.VISEXP_UP: "blk.{bid}.vis_up", # vision MODEL_TENSOR.V_MMPROJ: "mm.{bid}", MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc", @@ -2694,6 +2704,27 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.COGVLM: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.VISEXP_ATTN_Q, + MODEL_TENSOR.VISEXP_ATTN_K, + MODEL_TENSOR.VISEXP_ATTN_V, + MODEL_TENSOR.VISEXP_ATTN_OUT, + MODEL_TENSOR.VISEXP_GATE, + MODEL_TENSOR.VISEXP_UP, + MODEL_TENSOR.VISEXP_DOWN, + ], # TODO } From a3adac122ac521371f34053b3967da51a414aced Mon Sep 17 00:00:00 2001 
From: Tianyue-Zhao Date: Thu, 10 Jul 2025 00:27:01 +0000 Subject: [PATCH 04/18] Added CogVLM vision model to conversion script --- convert_hf_to_gguf.py | 25 ++++++++++++++++++++++++- gguf-py/gguf/constants.py | 12 ++++++++++++ gguf-py/gguf/tensor_mapping.py | 12 +++++++++++- 3 files changed, 47 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 17c61463b4e4a..834c688f65bc8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8694,6 +8694,29 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors +@ModelBase.register("CogVLMForCausalLM") +class CogVLMVisionModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.hparams_vision['num_attention_heads'] = self.hparams['num_heads'] + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if not name.startswith("model.vision."): + return [] + + if "query_key_value" in name: + # Split tensor into three along first axis + q, k, v = data_torch.split(data_torch.shape[0] // 3, dim=0) + return [ + (self.map_tensor_name(name.replace("query_key_value", "query")), q), + (self.map_tensor_name(name.replace("query_key_value", "key")), k), + (self.map_tensor_name(name.replace("query_key_value", "value")), v), + ] + + return [(self.map_tensor_name(name), data_torch)] + @ModelBase.register("CogVLMForCausalLM") class CogVLMModel(LlamaModel): model_arch = gguf.MODEL_ARCH.COGVLM @@ -8708,7 +8731,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.startswith("model.vision."): return [] - if "query_key_value" in name: + if "query_key_value.weight" in name: # Slice tensor into three along first axis q, k, v = data_torch.split(data_torch.shape[0] // 3, dim=0) return [ diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b05bbe8108591..e5e4ef719f3a3 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -619,6 +619,8 @@ class MODEL_TENSOR(IntEnum): V_MM_UP = auto() # cogvlm V_MM_DOWN = auto() # cogvlm V_MM_GATE = auto() # cogvlm + V_TOK_BOI = auto() # cogvlm + V_TOK_EOI = auto() # cogvlm # audio (mtmd) A_ENC_EMBD_POS = auto() A_ENC_CONV1D = auto() @@ -963,6 +965,11 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query", MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1 + MODEL_TENSOR.V_MM_UP: "mm.up", + MODEL_TENSOR.V_MM_DOWN: "mm.down", + MODEL_TENSOR.V_MM_GATE: "mm.gate", + MODEL_TENSOR.V_TOK_BOI: "v.boi", + MODEL_TENSOR.V_TOK_EOI: "v.eoi", # audio (mtmd) MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd", MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}", @@ -1031,6 +1038,11 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_RESMPL_QUERY, MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK, MODEL_TENSOR.V_MM_PATCH_MERGER, + MODEL_TENSOR.V_MM_UP, + MODEL_TENSOR.V_MM_DOWN, + MODEL_TENSOR.V_MM_GATE, + MODEL_TENSOR.V_TOK_BOI, + MODEL_TENSOR.V_TOK_EOI, # audio MODEL_TENSOR.A_ENC_EMBD_POS, MODEL_TENSOR.A_ENC_CONV1D, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 135876f4cb7c9..198ad0b02a247 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1169,6 +1169,7 @@ class TensorNameMap: "vision_tower.vision_model.embeddings.class_embedding", "model.vision_tower.embeddings.cls_token", # Intern-S1 
"vision_model.class_embedding", # llama 4 + "model.vision.patch_embedding.cls_embedding", # cogvlm ), MODEL_TENSOR.V_ENC_EMBD_PATCH: ( @@ -1181,7 +1182,7 @@ class TensorNameMap: "vision_model.patch_embedding.linear", # llama 4 "visual.patch_embed.proj", # qwen2vl "vision_tower.patch_embed.proj", # kimi-vl - "model.vision.patch_embedding", # cogvlm + "model.vision.patch_embedding.proj", # cogvlm ), MODEL_TENSOR.V_ENC_EMBD_POS: ( @@ -1191,6 +1192,7 @@ class TensorNameMap: "model.vision_model.embeddings.position_embedding", # SmolVLM "vision_model.positional_embedding_vlm", # llama 4 "vision_tower.patch_embed.pos_emb", # kimi-vl + "model.vision.patch_embedding.position_embedding", # cogvlm ), MODEL_TENSOR.V_ENC_ATTN_Q: ( @@ -1424,6 +1426,14 @@ class TensorNameMap: "model.vision.linear_proj.gate_proj", # cogvlm ), + MODEL_TENSOR.V_TOK_BOI: ( + "model.vision.boi", # cogvlm + ), + + MODEL_TENSOR.V_TOK_EOI: ( + "model.vision.eoi", # cogvlm + ), + # audio (mtmd) MODEL_TENSOR.A_ENC_EMBD_POS: ( From 302678106b4b4ee7cb22958a24ffe791bccd305b Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Tue, 15 Jul 2025 22:42:24 +0000 Subject: [PATCH 05/18] Add graph for CogVLM CLIP model --- gguf-py/gguf/constants.py | 3 + gguf-py/gguf/tensor_mapping.py | 2 +- tools/mtmd/clip-impl.h | 10 +++ tools/mtmd/clip.cpp | 115 +++++++++++++++++++++++++++++++-- 4 files changed, 123 insertions(+), 7 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e5e4ef719f3a3..f5d3780731881 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -616,6 +616,7 @@ class MODEL_TENSOR(IntEnum): V_RESMPL_QUERY = auto() # minicpmv V_TOK_EMBD_IMG_BREAK = auto() # pixtral V_MM_PATCH_MERGER = auto() # mistral small 3.1 + V_MM_POST_FC_NORM = auto() # cogvlm V_MM_UP = auto() # cogvlm V_MM_DOWN = auto() # cogvlm V_MM_GATE = auto() # cogvlm @@ -965,6 +966,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query", MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1 + MODEL_TENSOR.V_MM_POST_FC_NORM: "mm.post_fc_norm", # cogvlm MODEL_TENSOR.V_MM_UP: "mm.up", MODEL_TENSOR.V_MM_DOWN: "mm.down", MODEL_TENSOR.V_MM_GATE: "mm.gate", @@ -1038,6 +1040,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_RESMPL_QUERY, MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK, MODEL_TENSOR.V_MM_PATCH_MERGER, + MODEL_TENSOR.V_MM_POST_FC_NORM, MODEL_TENSOR.V_MM_UP, MODEL_TENSOR.V_MM_DOWN, MODEL_TENSOR.V_MM_GATE, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 198ad0b02a247..858018f938888 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1343,7 +1343,6 @@ class TensorNameMap: "vision_model.layernorm_post", # llama4 "visual.merger.ln_q", # qwen2vl "vision_tower.encoder.final_layernorm", # kimi-vl - "model.vision.linear_proj.norm1", # cogvlm ), MODEL_TENSOR.V_MM_INP_PROJ: ( @@ -1355,6 +1354,7 @@ class TensorNameMap: "multi_modal_projector.layer_norm", "multi_modal_projector.pre_norm", "pre_mm_projector_norm", + "model.vision.linear_proj.norm1", # cogvlm ), MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 664b0c9ac6e36..26ae031934c5a 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -113,6 +113,14 @@ #define TN_MM_NORM_PRE "mm.a.norm_pre.%s" #define TN_MM_NORM_MID "mm.a.norm_mid.%s" +// cogvlm +#define TN_MM_POST_FC_NORM "mm.post_fc_norm.%s" +#define TN_MM_H_TO_4H "mm.up.%s" +#define TN_MM_GATE 
"mm.gate.%s" +#define TN_MM_4H_TO_H "mm.down.%s" +#define TN_TOK_BOI "v.boi" +#define TN_TOK_EOI "v.eoi" + // align x to upper multiple of n #define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n)) @@ -137,6 +145,7 @@ enum projector_type { PROJECTOR_TYPE_LFM2, PROJECTOR_TYPE_KIMIVL, PROJECTOR_TYPE_UNKNOWN, + PROJECTOR_TYPE_COGVLM, }; static std::map PROJECTOR_TYPE_NAMES = { @@ -158,6 +167,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, { PROJECTOR_TYPE_LFM2, "lfm2"}, { PROJECTOR_TYPE_KIMIVL, "kimivl"}, + { PROJECTOR_TYPE_COGVLM, "cogvlm"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index e7c516d2de8d1..856211590175e 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -357,6 +357,15 @@ struct clip_model { ggml_tensor * mm_norm_pre_w = nullptr; ggml_tensor * mm_norm_mid_w = nullptr; + // cogvlm + ggml_tensor * mm_post_fc_norm_w = nullptr; + ggml_tensor * mm_post_fc_norm_b = nullptr; + ggml_tensor * mm_h_to_4h_w = nullptr; + ggml_tensor * mm_gate_w = nullptr; + ggml_tensor * mm_4h_to_h_w = nullptr; + ggml_tensor * mm_boi = nullptr; + ggml_tensor * mm_eoi = nullptr; + bool audio_has_avgpool() const { return proj_type == PROJECTOR_TYPE_QWEN2A || proj_type == PROJECTOR_TYPE_VOXTRAL; @@ -1610,6 +1619,66 @@ struct clip_graph { return gf; } + // cogvlm vision encoder + ggml_cgraph * build_cogvlm() { + GGML_ASSERT(model.class_embedding != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + + const int n_pos = n_patches + 1; // +1 for [CLS] + + // build input and concatenate class embedding + ggml_tensor * inp = build_inp(); + inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + + // build ViT transformer + ggml_tensor * cur = build_vit( + inp, n_pos, + NORM_TYPE_NORMAL, + hparams.ffn_op, + model.position_embeddings, + nullptr); + + // remove CLS token (like build_llama4 does) + cur = ggml_view_2d(ctx0, cur, + n_embd, n_patches, + ggml_row_size(cur->type, n_embd), 0); + + // Multiply with mm_model_proj + cur = ggml_mul_mat(ctx0, model.mm_model_proj_w, cur); + + // Apply layernorm, weight, bias + cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); + + // Apply GELU + // TODO: Not 100% sure about gelu and silu configuration + cur = ggml_gelu_inplace(ctx0, cur); + + // Branch 1: multiply with mm_h_to_4h_w + ggml_tensor * h_to_4h = ggml_mul_mat(ctx0, model.mm_h_to_4h_w, cur); + + // Branch 2: multiply with mm_gate_w + ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur); + + // Apply silu + gate = ggml_silu_inplace(ctx0, gate); + + // Multiply together + cur = ggml_mul(ctx0, gate, h_to_4h); + + // Apply mm_4h_to_h_w + cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, cur); + + // Concatenate with boi and eoi + // TODO: The shape may be incorrect + cur = ggml_concat(ctx0, model.mm_boi, cur, 1); + cur = ggml_concat(ctx0, cur, model.mm_eoi, 1); + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; + } + private: // // utility functions @@ -1678,9 +1747,14 @@ struct clip_graph { auto & layer = model.layers[il]; ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - // layernorm1 - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); - cb(cur, "layer_inp_normed", il); + // Check if this is COGVLM projector type for post-norm layernorm order + const bool is_cogvlm = ctx->proj_type() == PROJECTOR_TYPE_COGVLM; + + // layernorm1 (only for non-COGVLM) + if 
(!is_cogvlm) { + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "layer_inp_normed", il); + } // self-attention { @@ -1734,6 +1808,12 @@ struct clip_graph { cb(cur, "attn_out_scaled", il); } + // Apply layernorm AFTER attention for COGVLM (post-norm) + if (is_cogvlm) { + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "attn_post_norm", il); + } + // re-add the layer input, e.g., residual cur = ggml_add(ctx0, cur, inpL); @@ -1741,9 +1821,11 @@ struct clip_graph { cb(cur, "ffn_inp", il); - // layernorm2 - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); - cb(cur, "ffn_inp_normed", il); + // layernorm2 (only for non-COGVLM) + if (!is_cogvlm) { + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); + } // ffn cur = build_ffn(cur, @@ -1759,6 +1841,12 @@ struct clip_graph { cb(cur, "ffn_out_scaled", il); } + // Apply layernorm AFTER MLP for COGVLM (post-norm) + if (is_cogvlm) { + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_post_norm", il); + } + // residual 2 cur = ggml_add(ctx0, inpL, cur); cb(cur, "layer_out", il); @@ -2122,6 +2210,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { res = graph.build_kimivl(); } break; + case PROJECTOR_TYPE_COGVLM: + { + res = graph.build_cogvlm(); + } break; default: { res = graph.build_llava(); @@ -2750,6 +2842,17 @@ struct clip_model_loader { model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight")); model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight")); } break; + case PROJECTOR_TYPE_COGVLM: + { + model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); + model.mm_post_fc_norm_w = get_tensor(string_format(TN_MM_POST_FC_NORM, "weight")); + model.mm_post_fc_norm_b = get_tensor(string_format(TN_MM_POST_FC_NORM, "bias")); + model.mm_h_to_4h_w = get_tensor(string_format(TN_MM_H_TO_4H, "weight")); + model.mm_gate_w = get_tensor(string_format(TN_MM_GATE, "weight")); + model.mm_4h_to_h_w = get_tensor(string_format(TN_MM_4H_TO_H, "weight")); + model.mm_boi = get_tensor(TN_TOK_BOI); + model.mm_eoi = get_tensor(TN_TOK_EOI); + } break; default: GGML_ASSERT(false && "unknown projector type"); } From 183ca2eb50b0ff76648f8b0efcf5fe7b1e5a0a97 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Thu, 17 Jul 2025 01:00:30 +0000 Subject: [PATCH 06/18] Add graph for CogVLM --- src/llama-arch.cpp | 32 ++++++++++ src/llama-arch.h | 8 +++ src/llama-model.cpp | 146 ++++++++++++++++++++++++++++++++++++++++++++ src/llama-model.h | 9 +++ 4 files changed, 195 insertions(+) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 77b2fecf18fb8..624c609156d4c 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -97,6 +97,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_SMALLTHINKER, "smallthinker" }, { LLM_ARCH_LLADA, "llada" }, { LLM_ARCH_SEED_OSS, "seed_oss" }, + { LLM_ARCH_COGVLM, "cogvlm" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -2152,6 +2153,30 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_COGVLM, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, 
"blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_VISEXP_ATTN_WQ, "blk.%d.visexp_attn_wq" }, + { LLM_TENSOR_VISEXP_ATTN_WK, "blk.%d.visexp_attn_wk" }, + { LLM_TENSOR_VISEXP_ATTN_WV, "blk.%d.visexp_attn_wv" }, + { LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.visexp_attn_wo" }, + { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.visexp_ffn_gate" }, + { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.visexp_ffn_down" }, + { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.visexp_ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -2323,6 +2348,13 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}}, {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_VISEXP_ATTN_WQ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_VISEXP_ATTN_WK, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_VISEXP_ATTN_WV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_VISEXP_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_VISEXP_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, // NextN/MTP tensors are currently ignored (reserved for future MTP support) // These tensors only exist in the last layer(s) and are treated as output tensors {LLM_TENSOR_NEXTN_EH_PROJ, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 21ab47bd7af2a..c863308ce3be0 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -101,6 +101,7 @@ enum llm_arch { LLM_ARCH_SMALLTHINKER, LLM_ARCH_LLADA, LLM_ARCH_SEED_OSS, + LLM_ARCH_COGVLM, LLM_ARCH_UNKNOWN, }; @@ -420,6 +421,13 @@ enum llm_tensor { LLM_TENSOR_SHORTCONV_CONV, LLM_TENSOR_SHORTCONV_INPROJ, LLM_TENSOR_SHORTCONV_OUTPROJ, + LLM_TENSOR_VISEXP_ATTN_WQ, + LLM_TENSOR_VISEXP_ATTN_WK, + LLM_TENSOR_VISEXP_ATTN_WV, + LLM_TENSOR_VISEXP_ATTN_OUT, + LLM_TENSOR_VISEXP_FFN_GATE, + LLM_TENSOR_VISEXP_FFN_DOWN, + LLM_TENSOR_VISEXP_FFN_UP, LLM_TENSOR_NEXTN_EH_PROJ, LLM_TENSOR_NEXTN_EMBED_TOKENS, LLM_TENSOR_NEXTN_ENORM, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index b9e4634a7061c..e2a765c7e3d52 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -5719,6 +5719,34 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert }, 0); } } break; + case LLM_ARCH_COGVLM: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), 
{n_embd_head_k * n_head, n_embd}, 0); + + layer.visexp_attn_wq = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WQ, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.visexp_attn_wk = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WK, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.visexp_attn_wv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WV, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -18564,6 +18592,120 @@ struct llm_build_smallthinker : public llm_graph_context{ } }; +struct llm_build_cogvlm : public llm_graph_context { + llm_build_cogvlm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + float kq_scale = 1.0f / sqrtf(float(n_embd_head)); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * inpL, * cur; + inpL = build_inp_embd(model.tok_embd); + + // check ubatch to see if we have input tokens (text) + // or an input embedding vector (image) + bool is_text; + if (ubatch.token) { + is_text = true; + } else { + is_text = false; + } + + for (int il = 0; il < n_layer; ++il) { + // get either the text or image weight tensors + ggml_tensor * wq, * wk, * wv, * wo; + ggml_tensor * ffn_gate, * ffn_down, * ffn_up; + + if (is_text) { + wq = model.layers[il].wq; + wk = model.layers[il].wk; + wv = model.layers[il].wv; + wo = model.layers[il].wo; + ffn_gate = model.layers[il].ffn_gate; + ffn_down = model.layers[il].ffn_down; + ffn_up = model.layers[il].ffn_up; + } else { + wq = model.layers[il].visexp_attn_wq; + wk = model.layers[il].visexp_attn_wk; + wv = model.layers[il].visexp_attn_wv; + wo = model.layers[il].visexp_attn_wo; + ffn_gate = model.layers[il].visexp_ffn_gate; + ffn_down = model.layers[il].visexp_ffn_down; + ffn_up = model.layers[il].visexp_ffn_up; + } + + ggml_tensor * inpSA = inpL; + cur = build_norm(inpSA, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + + // build self attention + { + ggml_tensor * Qcur = build_lora_mm(wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + ggml_tensor * Kcur = build_lora_mm(wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + ggml_tensor * Vcur = build_lora_mm(wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + // TODO: Check Rope because this might not be the same as cogvlm + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, 
beta_slow); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + + cur = build_attn(inp_attn, gf, wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + } + + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // Make a standard ffn without the build_ffn function + ggml_tensor * tmp = build_lora_mm(ffn_up, cur); + ggml_tensor * gate = build_lora_mm(ffn_gate, cur); + gate = ggml_silu(ctx0, gate); + cur = ggml_mul(ctx0, gate, tmp); + cur = build_lora_mm(ffn_down, cur); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + inpL = cur; + } + + cur = inpL; + + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res->t_embd = cur; + + cur = build_lora_mm(model.output, cur); + cb(cur, "result_output", -1); + res->t_logits = cur; + ggml_build_forward_expand(gf, cur); + + } +}; + llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const { llama_memory_i * res; @@ -19076,6 +19218,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { llm = std::make_unique>(*this, params); } } break; + case LLM_ARCH_COGVLM: + { + llm = std::make_unique(*this, params, gf); + } break; default: GGML_ABORT("fatal error"); } diff --git a/src/llama-model.h b/src/llama-model.h index 10b1767f27228..7514a5cce7773 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -370,6 +370,15 @@ struct llama_layer { // openai-moe struct ggml_tensor * attn_sinks = nullptr; + // cogvlm + struct ggml_tensor * visexp_attn_wq = nullptr; + struct ggml_tensor * visexp_attn_wk = nullptr; + struct ggml_tensor * visexp_attn_wv = nullptr; + struct ggml_tensor * visexp_attn_wo = nullptr; + struct ggml_tensor * visexp_ffn_gate = nullptr; + struct ggml_tensor * visexp_ffn_down = nullptr; + struct ggml_tensor * visexp_ffn_up = nullptr; + struct llama_layer_posnet posnet; struct llama_layer_convnext convnext; From ac3e34895e2e7863a09fbd2f090fc6516bc10a24 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Mon, 21 Jul 2025 21:58:14 +0000 Subject: [PATCH 07/18] Fixes for CogVLM. Now compiles. 
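
Switch the projector matmul to the mm_model_proj member that the loader fills, and add
the missing inp_pos / unified KV attention inputs to the CogVLM LLM graph;
LLM_ARCH_COGVLM is also registered under LLAMA_ROPE_TYPE_NORM.

For reference while reviewing the mmproj graph, a rough PyTorch sketch of the projector
path as wired in build_cogvlm(): linear_proj -> norm1 -> GELU -> SiLU-gated MLP ->
dense_4h_to_h, with the learned BOI/EOI embeddings concatenated around the image
tokens. The class name and the dimensions below are illustrative placeholders, not
values read from the checkpoint; layer names follow the HF tensors mapped in this
series (model.vision.linear_proj.*, model.vision.boi, model.vision.eoi).

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class CogVLMProjectorSketch(nn.Module):
        def __init__(self, vision_dim=1792, lm_dim=4096, ffn_dim=11264):
            super().__init__()
            self.linear_proj   = nn.Linear(vision_dim, lm_dim, bias=False)
            self.norm1         = nn.LayerNorm(lm_dim)            # weight + bias, eps 1e-5
            self.dense_h_to_4h = nn.Linear(lm_dim, ffn_dim, bias=False)
            self.gate_proj     = nn.Linear(lm_dim, ffn_dim, bias=False)
            self.dense_4h_to_h = nn.Linear(ffn_dim, lm_dim, bias=False)
            self.boi           = nn.Parameter(torch.zeros(1, lm_dim))
            self.eoi           = nn.Parameter(torch.zeros(1, lm_dim))

        def forward(self, patches):               # patches: [n_patches, vision_dim]
            x = F.gelu(self.norm1(self.linear_proj(patches)))
            x = self.dense_4h_to_h(F.silu(self.gate_proj(x)) * self.dense_h_to_4h(x))
            return torch.cat([self.boi, x, self.eoi], dim=0)

    # shape check: n_patches embeddings come out as n_patches + 2 (BOI + patches + EOI)
    print(CogVLMProjectorSketch()(torch.randn(1225, 1792)).shape)  # torch.Size([1227, 4096])
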
--- src/llama-model.cpp | 5 +++++ tools/mtmd/clip.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e2a765c7e3d52..039da32a5bc2e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -18603,6 +18603,10 @@ struct llm_build_cogvlm : public llm_graph_context { ggml_tensor * inpL, * cur; inpL = build_inp_embd(model.tok_embd); + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv_unified(); + // check ubatch to see if we have input tokens (text) // or an input embedding vector (image) bool is_text; @@ -19378,6 +19382,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ARCEE: case LLM_ARCH_ERNIE4_5: case LLM_ARCH_ERNIE4_5_MOE: + case LLM_ARCH_COGVLM: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 856211590175e..913ca623dcbf6 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1644,7 +1644,7 @@ struct clip_graph { ggml_row_size(cur->type, n_embd), 0); // Multiply with mm_model_proj - cur = ggml_mul_mat(ctx0, model.mm_model_proj_w, cur); + cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); // Apply layernorm, weight, bias cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); From 0634dc94ebd75d7811223b6b66e5131f6475b4b1 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Tue, 22 Jul 2025 00:15:22 +0000 Subject: [PATCH 08/18] Model now runs --- convert_hf_to_gguf.py | 5 +++++ gguf-py/gguf/constants.py | 1 + gguf-py/gguf/tensor_mapping.py | 6 +++++- src/llama-arch.cpp | 14 +++++++------- src/llama-model.cpp | 23 +++++++++++++++++++++-- tools/mtmd/clip.cpp | 7 +++++++ 6 files changed, 46 insertions(+), 10 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 834c688f65bc8..d8714c7b1344d 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8699,6 +8699,11 @@ class CogVLMVisionModel(MmprojModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.hparams_vision['num_attention_heads'] = self.hparams['num_heads'] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f5d3780731881..c2b23870bae7a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -2965,6 +2965,7 @@ class VisionProjectorType: VOXTRAL = "voxtral" LFM2 = "lfm2" KIMIVL = "kimivl" + COGVLM = "cogvlm" # Items here are (block size, type size) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 858018f938888..8e914caa6c36e 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1152,13 +1152,13 @@ class TensorNameMap: MODEL_TENSOR.V_MMPROJ_FC: ( "model.connector.modality_projection.proj", # SmolVLM + "model.vision.linear_proj.linear_proj", # cogvlm ), MODEL_TENSOR.V_MMPROJ_MLP: ( "model.mm_projector.mlp.mlp.{bid}", "vision_model.vision_adapter.mlp.fc{bid}", # llama 4 "mlp1.{bid}", # InternVL - "model.vision.linear_proj.linear_proj", # cogvlm ), MODEL_TENSOR.V_MMPROJ_PEG: ( @@ -1414,6 +1414,10 @@ class TensorNameMap: "patch_merger.merging_layer", 
# mistral ), + MODEL_TENSOR.V_MM_POST_FC_NORM: ( + "model.vision.linear_proj.norm1", # cogvlm + ), + MODEL_TENSOR.V_MM_UP: ( "model.vision.linear_proj.dense_h_to_4h", # cogvlm ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 624c609156d4c..36eaa20f6f845 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -2168,13 +2168,13 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_VISEXP_ATTN_WQ, "blk.%d.visexp_attn_wq" }, - { LLM_TENSOR_VISEXP_ATTN_WK, "blk.%d.visexp_attn_wk" }, - { LLM_TENSOR_VISEXP_ATTN_WV, "blk.%d.visexp_attn_wv" }, - { LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.visexp_attn_wo" }, - { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.visexp_ffn_gate" }, - { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.visexp_ffn_down" }, - { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.visexp_ffn_up" }, + { LLM_TENSOR_VISEXP_ATTN_WQ, "blk.%d.vis_attn_q" }, + { LLM_TENSOR_VISEXP_ATTN_WK, "blk.%d.vis_attn_k" }, + { LLM_TENSOR_VISEXP_ATTN_WV, "blk.%d.vis_attn_v" }, + { LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.vis_attn_output" }, + { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" }, + { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" }, + { LLM_TENSOR_VISEXP_FFN_UP, "blk.%d.vis_up" }, }, }, { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 039da32a5bc2e..4a93385fcd5bf 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1945,6 +1945,14 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_COGVLM: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 32: type = LLM_TYPE_13B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; default: throw std::runtime_error("unsupported model architecture"); } @@ -5727,6 +5735,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -5741,10 +5754,16 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.visexp_attn_wv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WV, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? 
TENSOR_DUPLICATED : 0)); + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + + layer.visexp_ffn_gate = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.visexp_ffn_down = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.visexp_ffn_up = create_tensor(tn(LLM_TENSOR_VISEXP_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; default: @@ -18670,8 +18689,8 @@ struct llm_build_cogvlm : public llm_graph_context { Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // TODO: Check Rope because this might not be the same as cogvlm - Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); - Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); + Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, GGML_ROPE_TYPE_NEOX); + Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, GGML_ROPE_TYPE_NEOX); cur = build_attn(inp_attn, gf, wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 913ca623dcbf6..9c9548876b154 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3939,6 +3939,10 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im n_patches /= 2; } } break; + case PROJECTOR_TYPE_COGVLM: + { + n_patches_sq += 2; + } break; default: GGML_ABORT("unsupported projector type"); } @@ -4346,6 +4350,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_ULTRAVOX: case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_VOXTRAL: + case PROJECTOR_TYPE_COGVLM: { // do nothing } break; @@ -4449,6 +4454,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_LFM2: case PROJECTOR_TYPE_KIMIVL: return ctx->model.mm_2_w->ne[1]; + case PROJECTOR_TYPE_COGVLM: + return ctx->model.mm_4h_to_h_w->ne[1]; default: GGML_ABORT("Unknown projector type"); } From 4c080bf5ff928e779281bfc5ab0eee780106db6c Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Tue, 29 Jul 2025 21:30:46 +0000 Subject: [PATCH 09/18] Fixes for cogvlm graph --- src/llama-model.cpp | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 4a93385fcd5bf..78a5d29b9d31e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -18665,24 +18665,12 @@ struct llm_build_cogvlm : public llm_graph_context { { ggml_tensor * Qcur = build_lora_mm(wq, cur); cb(Qcur, "Qcur", il); - if (model.layers[il].bq) { - Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); - cb(Qcur, "Qcur", il); - } ggml_tensor * Kcur = build_lora_mm(wk, cur); cb(Kcur, "Kcur", il); - if (model.layers[il].bk) { - Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); - cb(Kcur, "Kcur", il); - } ggml_tensor * Vcur = build_lora_mm(wv, cur); cb(Vcur, "Vcur", il); - if (model.layers[il].bv) { - Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); - cb(Vcur, "Vcur", il); - } Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Kcur = 
ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); @@ -18703,11 +18691,17 @@ struct llm_build_cogvlm : public llm_graph_context { cb(cur, "ffn_norm", il); // Make a standard ffn without the build_ffn function - ggml_tensor * tmp = build_lora_mm(ffn_up, cur); - ggml_tensor * gate = build_lora_mm(ffn_gate, cur); - gate = ggml_silu(ctx0, gate); - cur = ggml_mul(ctx0, gate, tmp); - cur = build_lora_mm(ffn_down, cur); + //ggml_tensor * tmp = build_lora_mm(ffn_up, cur); + //ggml_tensor * gate = build_lora_mm(ffn_gate, cur); + //gate = ggml_silu(ctx0, gate); + //cur = ggml_mul(ctx0, gate, tmp); + //cur = build_lora_mm(ffn_down, cur); + cur = build_ffn(cur, + ffn_up, NULL, NULL, + ffn_gate, NULL, NULL, + ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); From bc3f08474418868d6830689e704000643ad1cbf7 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Sun, 10 Aug 2025 18:38:15 +0000 Subject: [PATCH 10/18] Account for graph context change after rebase --- src/llama-model.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 78a5d29b9d31e..e6ee50686e3bd 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -18612,7 +18612,7 @@ struct llm_build_smallthinker : public llm_graph_context{ }; struct llm_build_cogvlm : public llm_graph_context { - llm_build_cogvlm(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { + llm_build_cogvlm(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; float kq_scale = 1.0f / sqrtf(float(n_embd_head)); @@ -18680,7 +18680,7 @@ struct llm_build_cogvlm : public llm_graph_context { Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, GGML_ROPE_TYPE_NEOX); Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, GGML_ROPE_TYPE_NEOX); - cur = build_attn(inp_attn, gf, wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + cur = build_attn(inp_attn, wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); } @@ -19237,7 +19237,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { } break; case LLM_ARCH_COGVLM: { - llm = std::make_unique(*this, params, gf); + llm = std::make_unique(*this, params); } break; default: GGML_ABORT("fatal error"); From bcbd6ef4979d3f9c62df17160290940a755aa09d Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Sun, 10 Aug 2025 19:10:57 +0000 Subject: [PATCH 11/18] Changes for whitespace --- convert_hf_to_gguf.py | 9 +++++---- gguf-py/gguf/tensor_mapping.py | 4 ++-- tools/mtmd/clip.cpp | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d8714c7b1344d..2c9bc36061ad9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8704,7 +8704,7 @@ def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6)) self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.COGVLM) - + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -8719,13 +8719,14 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter (self.map_tensor_name(name.replace("query_key_value", "key")), k), (self.map_tensor_name(name.replace("query_key_value", 
"value")), v), ] - + return [(self.map_tensor_name(name), data_torch)] + @ModelBase.register("CogVLMForCausalLM") class CogVLMModel(LlamaModel): model_arch = gguf.MODEL_ARCH.COGVLM - + def set_gguf_parameters(self): super().set_gguf_parameters() @@ -8735,7 +8736,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # block vision tensors if name.startswith("model.vision."): return [] - + if "query_key_value.weight" in name: # Slice tensor into three along first axis q, k, v = data_torch.split(data_torch.shape[0] // 3, dim=0) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 8e914caa6c36e..d28b992596d24 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1042,7 +1042,7 @@ class TensorNameMap: MODEL_TENSOR.VISEXP_ATTN_Q: ( "model.layers.{bid}.self_attn.vision_expert_query", # cogvlm ), - + MODEL_TENSOR.VISEXP_ATTN_K: ( "model.layers.{bid}.self_attn.vision_expert_key", # cogvlm ), @@ -1433,7 +1433,7 @@ class TensorNameMap: MODEL_TENSOR.V_TOK_BOI: ( "model.vision.boi", # cogvlm ), - + MODEL_TENSOR.V_TOK_EOI: ( "model.vision.eoi", # cogvlm ), diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9c9548876b154..3c3f178ece3af 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1661,7 +1661,7 @@ struct clip_graph { // Apply silu gate = ggml_silu_inplace(ctx0, gate); - + // Multiply together cur = ggml_mul(ctx0, gate, h_to_4h); From c65f5aa900a186dd3d889252d470b2d2e8d3785d Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Sun, 10 Aug 2025 19:12:52 +0000 Subject: [PATCH 12/18] Changes in convert script according to comments --- convert_hf_to_gguf.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2c9bc36061ad9..fc6d97ddc3b8c 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1346,7 +1346,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"])) self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"])) self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys)) - self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"])) + self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads", "num_heads"])) # preprocessor config image_mean = DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"] @@ -8696,9 +8696,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("CogVLMForCausalLM") class CogVLMVisionModel(MmprojModel): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.hparams_vision['num_attention_heads'] = self.hparams['num_heads'] def set_gguf_parameters(self): super().set_gguf_parameters() @@ -8727,9 +8724,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter class CogVLMModel(LlamaModel): model_arch = gguf.MODEL_ARCH.COGVLM - def set_gguf_parameters(self): - super().set_gguf_parameters() - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused From 00af7ee55f1e6aebb9ccb945657272267856c183 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Sun, 10 Aug 2025 19:33:39 +0000 Subject: [PATCH 13/18] Switch CogVLM LLM graph to merged QKV tensor --- convert_hf_to_gguf.py | 9 --------- gguf-py/gguf/constants.py | 16 ++++------------ 
gguf-py/gguf/tensor_mapping.py | 16 +++------------- src/llama-arch.cpp | 12 +++--------- src/llama-arch.h | 4 +--- src/llama-model.cpp | 35 +++++++++++++--------------------- src/llama-model.h | 4 +--- 7 files changed, 25 insertions(+), 71 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index fc6d97ddc3b8c..f48830ab6c92b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8731,15 +8731,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.startswith("model.vision."): return [] - if "query_key_value.weight" in name: - # Slice tensor into three along first axis - q, k, v = data_torch.split(data_torch.shape[0] // 3, dim=0) - return [ - (self.map_tensor_name(name.replace("query_key_value", "query")), q), - (self.map_tensor_name(name.replace("query_key_value", "key")), k), - (self.map_tensor_name(name.replace("query_key_value", "value")), v), - ] - return [(self.map_tensor_name(name), data_torch)] ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c2b23870bae7a..e0ed54ffb22b4 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -569,9 +569,7 @@ class MODEL_TENSOR(IntEnum): SHORTCONV_CONV = auto() SHORTCONV_INPROJ = auto() SHORTCONV_OUTPROJ = auto() - VISEXP_ATTN_Q = auto() - VISEXP_ATTN_K = auto() - VISEXP_ATTN_V = auto() + VISEXP_ATTN_QKV = auto() VISEXP_ATTN_OUT = auto() VISEXP_GATE = auto() VISEXP_DOWN = auto() @@ -919,9 +917,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SHORTCONV_CONV: "blk.{bid}.shortconv.conv", MODEL_TENSOR.SHORTCONV_INPROJ: "blk.{bid}.shortconv.in_proj", MODEL_TENSOR.SHORTCONV_OUTPROJ: "blk.{bid}.shortconv.out_proj", - MODEL_TENSOR.VISEXP_ATTN_Q: "blk.{bid}.vis_attn_q", - MODEL_TENSOR.VISEXP_ATTN_K: "blk.{bid}.vis_attn_k", - MODEL_TENSOR.VISEXP_ATTN_V: "blk.{bid}.vis_attn_v", + MODEL_TENSOR.VISEXP_ATTN_QKV: "blk.{bid}.vis_attn_qkv", MODEL_TENSOR.VISEXP_ATTN_OUT: "blk.{bid}.vis_attn_output", MODEL_TENSOR.VISEXP_GATE: "blk.{bid}.vis_gate", MODEL_TENSOR.VISEXP_DOWN: "blk.{bid}.vis_down", @@ -2724,17 +2720,13 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.OUTPUT_NORM, MODEL_TENSOR.OUTPUT, MODEL_TENSOR.ATTN_NORM, - MODEL_TENSOR.ATTN_Q, - MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_QKV, MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, - MODEL_TENSOR.VISEXP_ATTN_Q, - MODEL_TENSOR.VISEXP_ATTN_K, - MODEL_TENSOR.VISEXP_ATTN_V, + MODEL_TENSOR.VISEXP_ATTN_QKV, MODEL_TENSOR.VISEXP_ATTN_OUT, MODEL_TENSOR.VISEXP_GATE, MODEL_TENSOR.VISEXP_UP, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index d28b992596d24..c3912a347d68c 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -178,6 +178,7 @@ class TensorNameMap: "encoder.layers.{bid}.self_attention.query_key_value", # chatglm "transformer.layers.{bid}.attn.qkv_proj", # openelm "transformer_encoder.{bid}.qkv", # neobert + "model.layers.{bid}.self_attn.language_expert_query_key_value", # cogvlm ), # Attention query @@ -197,7 +198,6 @@ class TensorNameMap: "model.transformer.blocks.{bid}.q_proj", # llada "layers.{bid}.self_attn.q_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.q_proj", # nemotron-h - "model.layers.{bid}.self_attn.language_expert_query", # cogvlm ), # Attention key @@ -218,7 +218,6 @@ class TensorNameMap: "model.transformer.blocks.{bid}.k_proj", # llada "layers.{bid}.self_attn.k_proj", # qwen3-embedding 
"backbone.layers.{bid}.mixer.k_proj", # nemotron-h - "model.layers.{bid}.self_attn.language_expert_key", # cogvlm ), # Attention value @@ -238,7 +237,6 @@ class TensorNameMap: "model.transformer.blocks.{bid}.v_proj", # llada "layers.{bid}.self_attn.v_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.v_proj", # nemotron-h - "model.layers.{bid}.self_attn.language_expert_value", # cogvlm ), # Attention output @@ -1039,16 +1037,8 @@ class TensorNameMap: "model.layers.{bid}.self_attn.vision_expert_dense", # cogvlm ), - MODEL_TENSOR.VISEXP_ATTN_Q: ( - "model.layers.{bid}.self_attn.vision_expert_query", # cogvlm - ), - - MODEL_TENSOR.VISEXP_ATTN_K: ( - "model.layers.{bid}.self_attn.vision_expert_key", # cogvlm - ), - - MODEL_TENSOR.VISEXP_ATTN_V: ( - "model.layers.{bid}.self_attn.vision_expert_value", # cogvlm + MODEL_TENSOR.VISEXP_ATTN_QKV: ( + "model.layers.{bid}.self_attn.vision_expert_query_key_value", # cogvlm ), ############################################################################ diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 36eaa20f6f845..8a4af69d5acb0 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -2160,17 +2160,13 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, - { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, - { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, - { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, - { LLM_TENSOR_VISEXP_ATTN_WQ, "blk.%d.vis_attn_q" }, - { LLM_TENSOR_VISEXP_ATTN_WK, "blk.%d.vis_attn_k" }, - { LLM_TENSOR_VISEXP_ATTN_WV, "blk.%d.vis_attn_v" }, + { LLM_TENSOR_VISEXP_ATTN_QKV, "blk.%d.vis_attn_qkv" }, { LLM_TENSOR_VISEXP_ATTN_OUT, "blk.%d.vis_attn_output" }, { LLM_TENSOR_VISEXP_FFN_GATE, "blk.%d.vis_gate" }, { LLM_TENSOR_VISEXP_FFN_DOWN, "blk.%d.vis_down" }, @@ -2348,9 +2344,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}}, {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_VISEXP_ATTN_WQ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_VISEXP_ATTN_WK, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_VISEXP_ATTN_WV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_VISEXP_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_VISEXP_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_VISEXP_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_VISEXP_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index c863308ce3be0..116142fcc7527 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -421,9 +421,7 @@ enum llm_tensor { LLM_TENSOR_SHORTCONV_CONV, LLM_TENSOR_SHORTCONV_INPROJ, LLM_TENSOR_SHORTCONV_OUTPROJ, - LLM_TENSOR_VISEXP_ATTN_WQ, - LLM_TENSOR_VISEXP_ATTN_WK, - LLM_TENSOR_VISEXP_ATTN_WV, + LLM_TENSOR_VISEXP_ATTN_QKV, LLM_TENSOR_VISEXP_ATTN_OUT, LLM_TENSOR_VISEXP_FFN_GATE, LLM_TENSOR_VISEXP_FFN_DOWN, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e6ee50686e3bd..85e68aef4ba76 100644 --- a/src/llama-model.cpp +++ 
b/src/llama-model.cpp @@ -5744,14 +5744,10 @@ bool llama_model::load_tensors(llama_model_loader & ml) { auto & layer = layers[i]; layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); - layer.visexp_attn_wq = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WQ, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.visexp_attn_wk = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WK, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); - layer.visexp_attn_wv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_WV, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.visexp_attn_wqkv = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_QKV, "weight", i), {n_embd, n_embd_head_k * n_head * 3}, 0); layer.visexp_attn_wo = create_tensor(tn(LLM_TENSOR_VISEXP_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0)); @@ -18637,21 +18633,17 @@ struct llm_build_cogvlm : public llm_graph_context { for (int il = 0; il < n_layer; ++il) { // get either the text or image weight tensors - ggml_tensor * wq, * wk, * wv, * wo; + ggml_tensor * wqkv, * wo; ggml_tensor * ffn_gate, * ffn_down, * ffn_up; if (is_text) { - wq = model.layers[il].wq; - wk = model.layers[il].wk; - wv = model.layers[il].wv; + wqkv = model.layers[il].wqkv; wo = model.layers[il].wo; ffn_gate = model.layers[il].ffn_gate; ffn_down = model.layers[il].ffn_down; ffn_up = model.layers[il].ffn_up; } else { - wq = model.layers[il].visexp_attn_wq; - wk = model.layers[il].visexp_attn_wk; - wv = model.layers[il].visexp_attn_wv; + wqkv = model.layers[il].visexp_attn_wqkv; wo = model.layers[il].visexp_attn_wo; ffn_gate = model.layers[il].visexp_ffn_gate; ffn_down = model.layers[il].visexp_ffn_down; @@ -18663,17 +18655,16 @@ struct llm_build_cogvlm : public llm_graph_context { // build self attention { - ggml_tensor * Qcur = build_lora_mm(wq, cur); - cb(Qcur, "Qcur", il); + ggml_tensor * qkv = build_lora_mm(wqkv, cur); - ggml_tensor * Kcur = build_lora_mm(wk, cur); - cb(Kcur, "Kcur", il); + // split qkv into Q, K, V along the first dimension + ggml_tensor * Qcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), + qkv->nb[1], 0); + ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), + qkv->nb[1], n_embd * ggml_element_size(qkv)); + ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens, + qkv->nb[1], 2 * n_embd * ggml_element_size(qkv))); - ggml_tensor * Vcur = build_lora_mm(wv, cur); - cb(Vcur, "Vcur", il); - - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // TODO: Check Rope because this might not be the same as cogvlm diff --git a/src/llama-model.h b/src/llama-model.h index 7514a5cce7773..b3e4fb98ede2c 100644 --- 
a/src/llama-model.h +++ b/src/llama-model.h @@ -371,9 +371,7 @@ struct llama_layer { struct ggml_tensor * attn_sinks = nullptr; // cogvlm - struct ggml_tensor * visexp_attn_wq = nullptr; - struct ggml_tensor * visexp_attn_wk = nullptr; - struct ggml_tensor * visexp_attn_wv = nullptr; + struct ggml_tensor * visexp_attn_wqkv = nullptr; struct ggml_tensor * visexp_attn_wo = nullptr; struct ggml_tensor * visexp_ffn_gate = nullptr; struct ggml_tensor * visexp_ffn_down = nullptr; From 79e5640cc48a1f744608f7cc65cdba7272cd4918 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Sun, 10 Aug 2025 19:57:09 +0000 Subject: [PATCH 14/18] Use rope_type variable instead of direct definition --- src/llama-model.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 85e68aef4ba76..c406600c0f689 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -18667,9 +18667,8 @@ struct llm_build_cogvlm : public llm_graph_context { Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - // TODO: Check Rope because this might not be the same as cogvlm - Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, GGML_ROPE_TYPE_NEOX); - Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, GGML_ROPE_TYPE_NEOX); + Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, rope_type); + Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, rope_type); cur = build_attn(inp_attn, wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); @@ -18681,12 +18680,6 @@ struct llm_build_cogvlm : public llm_graph_context { cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "ffn_norm", il); - // Make a standard ffn without the build_ffn function - //ggml_tensor * tmp = build_lora_mm(ffn_up, cur); - //ggml_tensor * gate = build_lora_mm(ffn_gate, cur); - //gate = ggml_silu(ctx0, gate); - //cur = ggml_mul(ctx0, gate, tmp); - //cur = build_lora_mm(ffn_down, cur); cur = build_ffn(cur, ffn_up, NULL, NULL, ffn_gate, NULL, NULL, @@ -19386,7 +19379,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ARCEE: case LLM_ARCH_ERNIE4_5: case LLM_ARCH_ERNIE4_5_MOE: - case LLM_ARCH_COGVLM: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 @@ -19435,6 +19427,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_SMALLTHINKER: case LLM_ARCH_GLM4_MOE: case LLM_ARCH_SEED_OSS: + case LLM_ARCH_COGVLM: return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: From 86a10bc10bf97d86acee98a505f4884ea24be22e Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Sun, 10 Aug 2025 20:01:57 +0000 Subject: [PATCH 15/18] Change CogVLM CLIP encoder to use SWIGLU --- tools/mtmd/clip.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 3c3f178ece3af..b9b3a0c7cbc09 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1650,7 +1650,6 @@ struct clip_graph { cur = build_norm(cur, model.mm_post_fc_norm_w, model.mm_post_fc_norm_b, NORM_TYPE_NORMAL, 1e-5, -1); // Apply GELU - // TODO: Not 100% sure about gelu and silu configuration cur = ggml_gelu_inplace(ctx0, cur); // Branch 1: multiply with mm_h_to_4h_w @@ -1660,16 +1659,12 @@ struct clip_graph { ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_gate_w, cur); // Apply silu - gate = ggml_silu_inplace(ctx0, gate); - - // Multiply together - cur = ggml_mul(ctx0, gate, h_to_4h); + gate = ggml_swiglu_split(ctx0, gate, h_to_4h); 
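The hunk above replaces the separate silu + mul pair with a single ggml_swiglu_split call, so the activation and the elementwise product become one graph op. As a rough reference for what that fused op computes per element (assuming the usual SwiGLU definition, which is exactly what the removed silu/mul code did), a standalone C++ sketch, not ggml code:

    // silu(gate) * up, elementwise: the semantics ggml_swiglu_split is expected
    // to implement here; plain reference only.
    #include <cmath>
    #include <cstdio>
    #include <vector>

    static float silu(float x) { return x / (1.0f + std::exp(-x)); }

    int main() {
        std::vector<float> gate = {0.5f, -1.0f, 2.0f};
        std::vector<float> up   = {1.0f,  3.0f, 0.5f};
        for (size_t i = 0; i < gate.size(); ++i) {
            std::printf("%f\n", silu(gate[i]) * up[i]);
        }
        return 0;
    }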
// Apply mm_4h_to_h_w - cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, cur); + cur = ggml_mul_mat(ctx0, model.mm_4h_to_h_w, gate); // Concatenate with boi and eoi - // TODO: The shape may be incorrect cur = ggml_concat(ctx0, model.mm_boi, cur, 1); cur = ggml_concat(ctx0, cur, model.mm_eoi, 1); From a959a1fa71c7553f70b55e6f3ff4f8276f961f61 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Sun, 10 Aug 2025 20:57:39 +0000 Subject: [PATCH 16/18] Switch CogVLM CLIP to use merged QKV --- convert_hf_to_gguf.py | 9 --- gguf-py/gguf/constants.py | 3 + gguf-py/gguf/tensor_mapping.py | 7 ++- tools/mtmd/clip-impl.h | 1 + tools/mtmd/clip.cpp | 106 +++++++++++++++++++++------------ 5 files changed, 77 insertions(+), 49 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index f48830ab6c92b..3f0b160746c52 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -8708,15 +8708,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if not name.startswith("model.vision."): return [] - if "query_key_value" in name: - # Split tensor into three along first axis - q, k, v = data_torch.split(data_torch.shape[0] // 3, dim=0) - return [ - (self.map_tensor_name(name.replace("query_key_value", "query")), q), - (self.map_tensor_name(name.replace("query_key_value", "key")), k), - (self.map_tensor_name(name.replace("query_key_value", "value")), v), - ] - return [(self.map_tensor_name(name), data_torch)] diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index e0ed54ffb22b4..b45b8a13b3c0f 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -583,6 +583,7 @@ class MODEL_TENSOR(IntEnum): V_ENC_EMBD_PATCH = auto() V_ENC_EMBD_POS = auto() V_ENC_INPUT_NORM = auto() + V_ENC_ATTN_QKV = auto() V_ENC_ATTN_Q = auto() V_ENC_ATTN_Q_NORM = auto() V_ENC_ATTN_K = auto() @@ -930,6 +931,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd", MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd", MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd", + MODEL_TENSOR.V_ENC_ATTN_QKV: "v.blk.{bid}.attn_qkv", MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q", MODEL_TENSOR.V_ENC_ATTN_Q_NORM: "v.blk.{bid}.attn_q_norm", MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k", @@ -1005,6 +1007,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_ENC_EMBD_PATCH, MODEL_TENSOR.V_ENC_EMBD_POS, MODEL_TENSOR.V_ENC_INPUT_NORM, + MODEL_TENSOR.V_ENC_ATTN_QKV, MODEL_TENSOR.V_ENC_ATTN_Q, MODEL_TENSOR.V_ENC_ATTN_Q_NORM, MODEL_TENSOR.V_ENC_ATTN_K, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index c3912a347d68c..c1a2d07b73ca8 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1185,6 +1185,10 @@ class TensorNameMap: "model.vision.patch_embedding.position_embedding", # cogvlm ), + MODEL_TENSOR.V_ENC_ATTN_QKV: ( + "model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm + ), + MODEL_TENSOR.V_ENC_ATTN_Q: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1 @@ -1195,7 +1199,6 @@ class TensorNameMap: "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral "visual.blocks.{bid}.attn.q", # qwen2vl, generated "vision_tower.encoder.blocks.{bid}.wq", # kimi-vl, generated - "model.vision.transformer.layers.{bid}.attention.query", # cogvlm ), MODEL_TENSOR.V_ENC_ATTN_Q_NORM: ( @@ -1213,7 +1216,6 @@ class TensorNameMap: "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral 
"visual.blocks.{bid}.attn.k", # qwen2vl, generated "vision_tower.encoder.blocks.{bid}.wk", # kimi-vl, generated - "model.vision.transformer.layers.{bid}.attention.key", # cogvlm ), MODEL_TENSOR.V_ENC_ATTN_K_NORM: ( @@ -1231,7 +1233,6 @@ class TensorNameMap: "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral "visual.blocks.{bid}.attn.v", # qwen2vl, generated "vision_tower.encoder.blocks.{bid}.wv", # kimi-vl, generated - "model.vision.transformer.layers.{bid}.attention.value", # cogvlm ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 26ae031934c5a..32bd79c9424c2 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -60,6 +60,7 @@ #define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat #define TN_PATCH_EMBD_1 "v.patch_embd.weight.1" #define TN_PATCH_BIAS "v.patch_embd.bias" +#define TN_ATTN_QKV "%s.blk.%d.attn_qkv.%s" #define TN_ATTN_K "%s.blk.%d.attn_k.%s" #define TN_ATTN_Q "%s.blk.%d.attn_q.%s" #define TN_ATTN_V "%s.blk.%d.attn_v.%s" diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index b9b3a0c7cbc09..8911a3450a271 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -212,6 +212,8 @@ struct clip_layer { ggml_tensor * q_b = nullptr; ggml_tensor * v_w = nullptr; ggml_tensor * v_b = nullptr; + ggml_tensor * qkv_w = nullptr; + ggml_tensor * qkv_b = nullptr; ggml_tensor * o_w = nullptr; ggml_tensor * o_b = nullptr; @@ -1630,18 +1632,65 @@ struct clip_graph { ggml_tensor * inp = build_inp(); inp = ggml_concat(ctx0, inp, model.class_embedding, 1); - // build ViT transformer - ggml_tensor * cur = build_vit( - inp, n_pos, - NORM_TYPE_NORMAL, - hparams.ffn_op, - model.position_embeddings, - nullptr); + inp = ggml_add(ctx0, inp, model.position_embeddings); + cb(inp, "inp_pos", -1); + + ggml_tensor * inpL = inp; + + for (int il = 0; il < n_layer; il++) { + auto & layer = model.layers[il]; + ggml_tensor * cur = inpL; + + cur = ggml_mul_mat(ctx0, layer.qkv_w, cur); + + cur = ggml_add(ctx0, cur, layer.qkv_b); + + ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_pos, + cur->nb[1], 0)); + ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_pos, + cur->nb[1], n_embd * sizeof(float))); + ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_pos, + cur->nb[1], 2 * n_embd * sizeof(float))); + + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(layer.o_w, layer.o_b, + Qcur, Kcur, Vcur, nullptr, kq_scale, il); + cb(cur, "attn_out", il); + + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "attn_post_norm", il); + + cur = ggml_add(ctx0, cur, inpL); + inpL = cur; + + cur = build_ffn(cur, + layer.ff_up_w, layer.ff_up_b, + layer.ff_gate_w, layer.ff_gate_b, + layer.ff_down_w, layer.ff_down_b, + hparams.ffn_op, il); + + cb(cur, "ffn_out", il); + + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, NORM_TYPE_NORMAL, eps, il); + cb(cur, "ffn_post_norm", il); + + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "layer_out", il); + inpL = cur; + + } // remove CLS token (like build_llama4 does) - cur = ggml_view_2d(ctx0, cur, + ggml_tensor * cur = ggml_view_2d(ctx0, inpL, n_embd, n_patches, - ggml_row_size(cur->type, n_embd), 0); + ggml_row_size(inpL->type, n_embd), 0); 
// Multiply with mm_model_proj cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur); @@ -1742,14 +1791,9 @@ struct clip_graph { auto & layer = model.layers[il]; ggml_tensor * cur = inpL; // inpL = residual, cur = hidden_states - // Check if this is COGVLM projector type for post-norm layernorm order - const bool is_cogvlm = ctx->proj_type() == PROJECTOR_TYPE_COGVLM; - - // layernorm1 (only for non-COGVLM) - if (!is_cogvlm) { - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); - cb(cur, "layer_inp_normed", il); - } + // layernorm1 + cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); + cb(cur, "layer_inp_normed", il); // self-attention { @@ -1803,12 +1847,6 @@ struct clip_graph { cb(cur, "attn_out_scaled", il); } - // Apply layernorm AFTER attention for COGVLM (post-norm) - if (is_cogvlm) { - cur = build_norm(cur, layer.ln_1_w, layer.ln_1_b, norm_t, eps, il); - cb(cur, "attn_post_norm", il); - } - // re-add the layer input, e.g., residual cur = ggml_add(ctx0, cur, inpL); @@ -1816,11 +1854,9 @@ struct clip_graph { cb(cur, "ffn_inp", il); - // layernorm2 (only for non-COGVLM) - if (!is_cogvlm) { - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); - cb(cur, "ffn_inp_normed", il); - } + // layernorm2 + cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); + cb(cur, "ffn_inp_normed", il); // ffn cur = build_ffn(cur, @@ -1836,12 +1872,6 @@ struct clip_graph { cb(cur, "ffn_out_scaled", il); } - // Apply layernorm AFTER MLP for COGVLM (post-norm) - if (is_cogvlm) { - cur = build_norm(cur, layer.ln_2_w, layer.ln_2_b, norm_t, eps, il); - cb(cur, "ffn_post_norm", il); - } - // residual 2 cur = ggml_add(ctx0, inpL, cur); cb(cur, "layer_out", il); @@ -2601,10 +2631,11 @@ struct clip_model_loader { model.layers.resize(hparams.n_layer); for (int il = 0; il < hparams.n_layer; ++il) { auto & layer = model.layers[il]; - layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight")); - layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight")); - layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight")); + layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"), false); + layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"), false); + layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"), false); layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight")); + layer.qkv_w = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "weight"), false); layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false); layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false); layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false); @@ -2616,6 +2647,7 @@ struct clip_model_loader { layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false); layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false); layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false); + layer.qkv_b = get_tensor(string_format(TN_ATTN_QKV, prefix, il, "bias"), false); layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false); layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false); From 06a07194b176fddebbcae9e6ce7bfbdb23b25fb7 Mon Sep 17 00:00:00 2001 From: Tianyue-Zhao Date: Tue, 9 Sep 2025 00:12:21 +0000 Subject: [PATCH 17/18] Apply rebase edits and remove ggml_cont call that is now unnecessary --- src/llama-model.cpp | 10 
++++------ tools/mtmd/clip.cpp | 18 +++++++----------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c406600c0f689..1af38c15b552a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -18620,7 +18620,7 @@ struct llm_build_cogvlm : public llm_graph_context { ggml_tensor * inp_pos = build_inp_pos(); - auto * inp_attn = build_attn_inp_kv_unified(); + auto * inp_attn = build_attn_inp_kv(); // check ubatch to see if we have input tokens (text) // or an input embedding vector (image) @@ -18662,15 +18662,13 @@ struct llm_build_cogvlm : public llm_graph_context { qkv->nb[1], 0); ggml_tensor * Kcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), qkv->nb[1], n_embd * ggml_element_size(qkv)); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd, n_tokens, - qkv->nb[1], 2 * n_embd * ggml_element_size(qkv))); - - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + ggml_tensor * Vcur = ggml_view_3d(ctx0, qkv, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), + qkv->nb[1], 2 * n_embd * ggml_element_size(qkv)); Qcur = ggml_rope(ctx0, Qcur, inp_pos, n_embd_head, rope_type); Kcur = ggml_rope(ctx0, Kcur, inp_pos, n_embd_head, rope_type); - cur = build_attn(inp_attn, wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); + cur = build_attn(inp_attn, wo, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); } diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 8911a3450a271..1c0af0ba5779a 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1645,16 +1645,12 @@ struct clip_graph { cur = ggml_add(ctx0, cur, layer.qkv_b); - ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_pos, - cur->nb[1], 0)); - ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_pos, - cur->nb[1], n_embd * sizeof(float))); - ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_pos, - cur->nb[1], 2 * n_embd * sizeof(float))); - - Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); - Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); - Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + ggml_tensor * Qcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], 0); + ggml_tensor * Kcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], n_embd * sizeof(float)); + ggml_tensor * Vcur = ggml_view_3d(ctx0, cur, d_head, n_head, n_pos, d_head*sizeof(float), + cur->nb[1], 2 * n_embd * sizeof(float)); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); @@ -3968,7 +3964,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } break; case PROJECTOR_TYPE_COGVLM: { - n_patches_sq += 2; + n_patches += 2; } break; default: GGML_ABORT("unsupported projector type"); From 85764bd8a8bec76609f46c2b7722547115659226 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 30 Oct 2025 00:07:27 +0100 Subject: [PATCH 18/18] clean up --- tools/mtmd/clip.cpp | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 11fb86432aebd..a135ba0a2b353 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -288,8 +288,6 @@ struct clip_model { // GLMV-Edge projection ggml_tensor * mm_model_adapter_conv_w = nullptr; ggml_tensor * mm_model_adapter_conv_b = nullptr; - ggml_tensor * mm_glm_tok_boi = 
nullptr; - ggml_tensor * mm_glm_tok_eoi = nullptr; // MobileVLM projection ggml_tensor * mm_model_mlp_1_w = nullptr; @@ -1505,8 +1503,8 @@ struct clip_graph { // note: these embeddings are not present in text model, hence we cannot process them as text tokens // see: https://huggingface.co/THUDM/glm-edge-v-2b/blob/main/siglip.py#L53 { - embeddings = ggml_concat(ctx0, model.mm_glm_tok_boi, embeddings, 1); // BOI - embeddings = ggml_concat(ctx0, embeddings, model.mm_glm_tok_eoi, 1); // EOI + embeddings = ggml_concat(ctx0, model.mm_boi, embeddings, 1); // BOI + embeddings = ggml_concat(ctx0, embeddings, model.mm_eoi, 1); // EOI } } @@ -2797,8 +2795,8 @@ struct clip_model_loader { model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H, "weight")); model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE, "weight")); model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H, "weight")); - model.mm_glm_tok_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight")); - model.mm_glm_tok_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight")); + model.mm_boi = get_tensor(string_format(TN_TOK_GLM_BOI, "weight")); + model.mm_eoi = get_tensor(string_format(TN_TOK_GLM_EOI, "weight")); } break; case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: @@ -2894,14 +2892,14 @@ struct clip_model_loader { } break; case PROJECTOR_TYPE_COGVLM: { - model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); + model.mm_model_proj = get_tensor(TN_MM_PROJECTOR); model.mm_post_fc_norm_w = get_tensor(string_format(TN_MM_POST_FC_NORM, "weight")); model.mm_post_fc_norm_b = get_tensor(string_format(TN_MM_POST_FC_NORM, "bias")); - model.mm_h_to_4h_w = get_tensor(string_format(TN_MM_H_TO_4H, "weight")); - model.mm_gate_w = get_tensor(string_format(TN_MM_GATE, "weight")); - model.mm_4h_to_h_w = get_tensor(string_format(TN_MM_4H_TO_H, "weight")); - model.mm_boi = get_tensor(TN_TOK_BOI); - model.mm_eoi = get_tensor(TN_TOK_EOI); + model.mm_h_to_4h_w = get_tensor(string_format(TN_MM_H_TO_4H, "weight")); + model.mm_gate_w = get_tensor(string_format(TN_MM_GATE, "weight")); + model.mm_4h_to_h_w = get_tensor(string_format(TN_MM_4H_TO_H, "weight")); + model.mm_boi = get_tensor(TN_TOK_BOI); + model.mm_eoi = get_tensor(TN_TOK_EOI); } break; default: GGML_ASSERT(false && "unknown projector type"); @@ -3951,7 +3949,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im case PROJECTOR_TYPE_GLM_EDGE: { n_patches /= 4; - if (ctx->model.mm_glm_tok_boi) { + if (ctx->model.mm_boi) { n_patches += 2; // for BOI and EOI token embeddings } } break; @@ -4043,7 +4041,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im } break; case PROJECTOR_TYPE_COGVLM: { - n_patches += 2; + n_patches += 2; // for BOI and EOI token embeddings } break; default: GGML_ABORT("unsupported projector type");
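Both the LLM and CLIP graphs now slice the fused QKV projection with strided views: for every token (row) the merged output stores Q, K and V back to back, so each sub-tensor is just a byte offset into the same buffer with the original row stride (qkv->nb[1] / cur->nb[1]), and the earlier ggml_cont copy of V is no longer needed. A toy, non-ggml C++ sketch of that layout (sizes and names are illustrative only):

    #include <cstdio>
    #include <vector>

    int main() {
        const int n_embd = 4, n_tokens = 2, row = 3 * n_embd; // Q|K|V per row
        std::vector<float> qkv(row * n_tokens);
        for (int t = 0; t < n_tokens; ++t)
            for (int i = 0; i < row; ++i)
                qkv[t * row + i] = 100.0f * t + i;

        // a "view" is only a base offset inside the row plus the unchanged row stride
        const int q_off = 0, k_off = n_embd, v_off = 2 * n_embd;
        auto at = [&](int off, int t, int i) { return qkv[t * row + off + i]; };

        for (int t = 0; t < n_tokens; ++t)
            std::printf("token %d: q0=%g k0=%g v0=%g\n", t,
                        at(q_off, t, 0), at(k_off, t, 0), at(v_off, t, 0));
        return 0;
    }

In ggml terms these are the three ggml_view_3d calls above with element offsets 0, n_embd and 2*n_embd (times the element size) and the row stride taken from the fused tensor, which is why the reshape/cont step for V could be dropped.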