Merged
Changes from 1 commit
Commits
38 commits
64add82
First attempt
pwilkin Sep 7, 2025
86cfc18
No permute during convert (fixes qk tensors), proper norm application.
pwilkin Sep 8, 2025
8c762a3
RoPE = NeoX
pwilkin Sep 8, 2025
ffdfd1d
Coherence!
pwilkin Sep 8, 2025
74dcf89
Merge branch 'ggml-org:master' into apertus-implementation
pwilkin Sep 9, 2025
ab11d94
Migrate xielu params from tensors to hyperparameters
pwilkin Sep 9, 2025
1b18472
Simple CUDA kernel
pwilkin Sep 13, 2025
1606a3c
Revert stupid LLM refactorings
pwilkin Sep 14, 2025
eec384f
Chat template support
pwilkin Sep 14, 2025
b2a92d0
configchecker / flake8 errors
pwilkin Sep 14, 2025
ef9ef66
Reorder unary.cu
pwilkin Sep 15, 2025
d009194
I do conclude that LLMs are, in fact, stupid.
pwilkin Sep 15, 2025
73bd64f
Merge branch 'master' into apertus-implementation
pwilkin Sep 16, 2025
28f9086
Fix after merge
pwilkin Sep 16, 2025
2f68c03
Final newline
pwilkin Sep 16, 2025
4294dbf
Make xIELU an UNARY_OP
pwilkin Sep 17, 2025
b2aa4fb
Final newline
pwilkin Sep 17, 2025
86a239c
Correctly account for parameter shift
pwilkin Sep 17, 2025
bd19026
Argh.
pwilkin Sep 17, 2025
13cc3be
Update ggml/src/ggml-cpu/unary-ops.cpp
pwilkin Sep 18, 2025
40f2f80
Refactor: remove unused methods, inline and factorize softplus, add c…
pwilkin Sep 18, 2025
dbe0ccb
Merge branch 'master' into apertus-implementation
pwilkin Sep 23, 2025
0d36139
Revert CUDA changes, implement xIELU as a separate OP
pwilkin Sep 23, 2025
fc58d3b
Pesky newline
pwilkin Sep 23, 2025
db9eb29
Add float2half / half2float for F16 inputs/outputs
pwilkin Sep 23, 2025
dc1e4d5
CUDA variants, attempt 2
pwilkin Sep 24, 2025
6c843ce
Actually, attempt 3
pwilkin Sep 24, 2025
f11ab3c
Update ggml/src/ggml-cuda/unary.cu
pwilkin Sep 25, 2025
57e2263
Missing convert header
pwilkin Sep 25, 2025
62401d8
Proper formula and reference for xIELU in the comments.
pwilkin Sep 25, 2025
58e6e0f
Modify unary-ops.cpp to add the functor-based logic besides the templ…
pwilkin Sep 25, 2025
5a02bd4
Apply suggestions from code review
pwilkin Sep 25, 2025
90f052d
Add tensor mappings for Apertus to global list instead
pwilkin Sep 25, 2025
6972404
Fix lazy on scalars
pwilkin Sep 25, 2025
d29cb45
Merge remote-tracking branch 'pwilkin/master' into apertus-implementa…
pwilkin Sep 25, 2025
2ca353b
Update ggml/src/ggml-cuda/unary.cu
pwilkin Sep 25, 2025
66f37c0
Add comment about the constraints on positive/negative alpha
pwilkin Oct 2, 2025
78ac06b
Change `softplus` to `ggml_softplus`
pwilkin Oct 2, 2025
Apply suggestions from code review
Co-authored-by: Sigbjørn Skjæret <[email protected]>
pwilkin and CISC authored Sep 25, 2025
commit 5a02bd486ba7c6c458ddf06e90c9bebb5a3ed6ca
2 changes: 1 addition & 1 deletion convert_hf_to_gguf.py
@@ -8772,7 +8772,7 @@ class ApertusModel(LlamaModel):
 
     def modify_tensors(self, data_torch, name, bid):
         # Handle xIELU activation parameters
-        n_layers = self.hparams.get("num_hidden_layers")
+        n_layers = self.hparams["num_hidden_layers"]
         if name.endswith(".act_fn.alpha_n"):
             self._alpha_n[bid] = data_torch.to("cpu").float().item()
             if (len(self._alpha_n) == n_layers):
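A minimal sketch of why the indexing change matters, using a stand-in `hparams` dict rather than the converter's real config object: `dict.get` silently returns `None` when the key is missing, so the later `len(self._alpha_n) == n_layers` comparison would never fire, whereas direct indexing fails immediately with a `KeyError`.

```python
# Minimal sketch with a stand-in hparams dict (not the converter's real config).
hparams = {"num_hidden_layers": 32}

n_layers = hparams["num_hidden_layers"]          # raises KeyError if the key is absent
n_layers_opt = hparams.get("num_hidden_layers")  # would silently return None instead

# The per-layer xIELU scalars are accumulated until one value per layer is seen.
alpha_n: dict[int, float] = {}
alpha_n[0] = 0.8  # illustrative value only
if len(alpha_n) == n_layers:
    print("collected alpha_n for every layer")
```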
12 changes: 4 additions & 8 deletions gguf-py/gguf/constants.py
@@ -295,10 +295,10 @@ class Diffusion:
         SHIFT_LOGITS = "diffusion.shift_logits"
 
     class xIELU:
-        XIELU_ALPHA_P = "xielu.alpha_p"
-        XIELU_ALPHA_N = "xielu.alpha_n"
-        XIELU_BETA = "xielu.beta"
-        XIELU_EPS = "xielu.eps"
+        ALPHA_P = "xielu.alpha_p"
+        ALPHA_N = "xielu.alpha_n"
+        BETA = "xielu.beta"
+        EPS = "xielu.eps"
 
 
 #
@@ -458,10 +458,6 @@ class MODEL_TENSOR(IntEnum):
     FFN_GATE_SHEXP = auto()
     FFN_DOWN_SHEXP = auto()
     FFN_UP_SHEXP = auto()
-    FFN_ACT_ALPHA_N = auto()
-    FFN_ACT_ALPHA_P = auto()
-    FFN_ACT_BETA = auto()
-    FFN_ACT_EPS = auto()
     FFN_EXP_PROBS_B = auto()
     ATTN_Q_NORM = auto()
     ATTN_K_NORM = auto()
16 changes: 8 additions & 8 deletions gguf-py/gguf/gguf_writer.py
@@ -1075,17 +1075,17 @@ def add_audio_num_mel_bins(self, value: int) -> None:
     def add_audio_stack_factor(self, value: int) -> None:
         self.add_uint32(Keys.ClipAudio.Projector.STACK_FACTOR, value)
 
-    def add_xielu_alpha_p(self, value: Sequence[float]):
-        self.add_array(Keys.xIELU.XIELU_ALPHA_P, value)
+    def add_xielu_alpha_p(self, values: Sequence[float]):
+        self.add_array(Keys.xIELU.ALPHA_P, values)
 
-    def add_xielu_alpha_n(self, value: Sequence[float]):
-        self.add_array(Keys.xIELU.XIELU_ALPHA_N, value)
+    def add_xielu_alpha_n(self, values: Sequence[float]):
+        self.add_array(Keys.xIELU.ALPHA_N, values)
 
-    def add_xielu_beta(self, value: Sequence[float]):
-        self.add_array(Keys.xIELU.XIELU_BETA, value)
+    def add_xielu_beta(self, values: Sequence[float]):
+        self.add_array(Keys.xIELU.BETA, values)
 
-    def add_xielu_eps(self, value: Sequence[float]):
-        self.add_array(Keys.xIELU.XIELU_EPS, value)
+    def add_xielu_eps(self, values: Sequence[float]):
+        self.add_array(Keys.xIELU.EPS, values)
 
     # diffusion models
 
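A hedged usage sketch of the renamed writer helpers: each one stores a whole per-layer float array under the corresponding `xielu.*` key via `add_array`. The output path, architecture string, and all values below are purely illustrative, not taken from the PR.

```python
import gguf

# Hypothetical file name and illustrative per-layer values; a real conversion
# takes these from the model's act_fn tensors during convert_hf_to_gguf.py.
writer = gguf.GGUFWriter("apertus-8b.gguf", "apertus")

n_layers = 32
writer.add_xielu_alpha_p([0.8] * n_layers)    # -> "xielu.alpha_p"
writer.add_xielu_alpha_n([0.8] * n_layers)    # -> "xielu.alpha_n"
writer.add_xielu_beta([0.5] * n_layers)       # -> "xielu.beta"
writer.add_xielu_eps([-1e-6] * n_layers)      # -> "xielu.eps"
```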
44 changes: 0 additions & 44 deletions gguf-py/gguf/tensor_mapping.py
@@ -451,22 +451,6 @@ class TensorNameMap:
             "layers.{bid}.mlp.gate_proj",            # qwen3-embedding
         ),
 
-        MODEL_TENSOR.FFN_ACT_ALPHA_N: (
-            "model.layers.{bid}.mlp.act_fn.alpha_n", # apertus xIELU
-        ),
-
-        MODEL_TENSOR.FFN_ACT_ALPHA_P: (
-            "model.layers.{bid}.mlp.act_fn.alpha_p", # apertus xIELU
-        ),
-
-        MODEL_TENSOR.FFN_ACT_BETA: (
-            "model.layers.{bid}.mlp.act_fn.beta",    # apertus xIELU
-        ),
-
-        MODEL_TENSOR.FFN_ACT_EPS: (
-            "model.layers.{bid}.mlp.act_fn.eps",     # apertus xIELU
-        ),
-
         MODEL_TENSOR.FFN_GATE_EXP: (
             "layers.{bid}.feed_forward.experts.w1",        # mixtral (merged)
             "transformer.decoder_layer.{bid}.moe.linear",  # Grok (merged)
@@ -1491,34 +1475,6 @@ class TensorNameMap:
                 "model.layers.{bid}.post_attention_layernorm",
             ),
         },
-        MODEL_ARCH.APERTUS: {
-            MODEL_TENSOR.ATTN_NORM: (
-                "model.layers.{bid}.attention_layernorm",
-            ),
-            MODEL_TENSOR.ATTN_Q_NORM: (
-                "model.layers.{bid}.attention.query_layernorm",
-                "model.layers.{bid}.self_attn.q_norm",
-            ),
-            MODEL_TENSOR.ATTN_K_NORM: (
-                "model.layers.{bid}.attention.key_layernorm",
-                "model.layers.{bid}.self_attn.k_norm",
-            ),
-            MODEL_TENSOR.FFN_NORM: (
-                "model.layers.{bid}.feedforward_layernorm",
-            ),
-            MODEL_TENSOR.FFN_ACT_ALPHA_N: (
-                "model.layers.{bid}.mlp.act_fn.alpha_n",
-            ),
-            MODEL_TENSOR.FFN_ACT_ALPHA_P: (
-                "model.layers.{bid}.mlp.act_fn.alpha_p",
-            ),
-            MODEL_TENSOR.FFN_ACT_BETA: (
-                "model.layers.{bid}.mlp.act_fn.beta",
-            ),
-            MODEL_TENSOR.FFN_ACT_EPS: (
-                "model.layers.{bid}.mlp.act_fn.eps",
-            ),
-        },
     }
 
     mapping: dict[str, tuple[MODEL_TENSOR, str]]
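Per the earlier commit "Add tensor mappings for Apertus to global list instead", the per-architecture override block above is dropped in favor of entries in the global mapping table. As a rough, simplified sketch of how a `{bid}`-templated entry resolves a checkpoint tensor name (this is not the real `TensorNameMap` internals, which precompute a flat dict of concrete names per architecture):

```python
# Simplified sketch of {bid}-templated lookup; names and layer count are illustrative.
TEMPLATES = {
    "model.layers.{bid}.attention.query_layernorm": "ATTN_Q_NORM",
    "model.layers.{bid}.self_attn.q_norm": "ATTN_Q_NORM",
}

def resolve(name: str, n_layers: int) -> tuple[str, int] | None:
    # Try every layer index against every template until one matches.
    for bid in range(n_layers):
        for template, tensor in TEMPLATES.items():
            if name == template.format(bid=bid):
                return tensor, bid
    return None

print(resolve("model.layers.3.self_attn.q_norm", 32))  # ('ATTN_Q_NORM', 3)
```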
27 changes: 13 additions & 14 deletions src/llama-model.cpp
@@ -513,10 +513,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     std::fill(hparams.rope_sections.begin(), hparams.rope_sections.end(), 0);
     std::fill(hparams.swa_layers.begin(), hparams.swa_layers.end(), 0);
 
-    std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0);
-    std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0);
-    std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0);
-    std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0);
+    std::fill(hparams.xielu_alpha_n.begin(), hparams.xielu_alpha_n.end(), 0.0f);
+    std::fill(hparams.xielu_alpha_p.begin(), hparams.xielu_alpha_p.end(), 0.0f);
+    std::fill(hparams.xielu_beta.begin(), hparams.xielu_beta.end(), 0.0f);
+    std::fill(hparams.xielu_eps.begin(), hparams.xielu_eps.end(), 0.0f);
 
     ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff_arr,   hparams.n_layer, false);
     ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false);
@@ -2014,10 +2014,10 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         case LLM_ARCH_APERTUS:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
-                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer);
-                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer);
-                ml.get_key_or_arr(LLM_KV_XIELU_BETA, hparams.xielu_beta, hparams.n_layer);
-                ml.get_key_or_arr(LLM_KV_XIELU_EPS, hparams.xielu_eps, hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_N, hparams.xielu_alpha_n, hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_XIELU_ALPHA_P, hparams.xielu_alpha_p, hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_XIELU_BETA,    hparams.xielu_beta,    hparams.n_layer);
+                ml.get_key_or_arr(LLM_KV_XIELU_EPS,     hparams.xielu_eps,     hparams.n_layer);
 
                 switch (hparams.n_layer) {
                     case 32: type = LLM_TYPE_8B; break;
@@ -5858,19 +5858,18 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, 0);
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
 
                         layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, 0);
 
                         if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
-                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
-                        }
-                        else {
-                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                            layer.rope_long  = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG,  "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                            layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
+                        } else {
+                            layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), { n_rot/2 }, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                         }
 
                         layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), { n_embd, n_embd_head_k * n_head }, 0);
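For context on what the `xielu_*` hyperparameters loaded above parameterize, here is a hedged, reference-only reading of the xIELU activation in plain Python, reconstructed from the commit messages about `softplus` and the alpha constraints; the actual ggml op, and in particular the clamping against `eps`, may differ in detail.

```python
import math

def softplus(x: float) -> float:
    # Plain softplus; the ggml_softplus helper may use a more numerically careful form.
    return math.log1p(math.exp(x))

def xielu(x: float, alpha_n: float, alpha_p: float, beta: float, eps: float) -> float:
    # Hedged sketch: quadratic-plus-linear branch for x > 0, saturating
    # exponential branch for x <= 0. softplus keeps the effective alphas
    # positive, which is the constraint the code-review comments refer to.
    a_p = softplus(alpha_p)
    a_n = beta + softplus(alpha_n)
    if x > 0.0:
        return a_p * x * x + beta * x
    return a_n * (math.expm1(min(x, eps)) - x) + beta * x
```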