Merged
Changes from 1 commit
38 commits
64add82
First attempt
pwilkin Sep 7, 2025
86cfc18
No permute during convert (fixes qk tensors), proper norm application.
pwilkin Sep 8, 2025
8c762a3
RoPE = NeoX
pwilkin Sep 8, 2025
ffdfd1d
Coherence!
pwilkin Sep 8, 2025
74dcf89
Merge branch 'ggml-org:master' into apertus-implementation
pwilkin Sep 9, 2025
ab11d94
Migrate xielu params from tensors to hyperparameters
pwilkin Sep 9, 2025
1b18472
Simple CUDA kernel
pwilkin Sep 13, 2025
1606a3c
Revert stupid LLM refactorings
pwilkin Sep 14, 2025
eec384f
Chat template support
pwilkin Sep 14, 2025
b2a92d0
configchecker / flake8 errors
pwilkin Sep 14, 2025
ef9ef66
Reorder unary.cu
pwilkin Sep 15, 2025
d009194
I do conclude that LLMs are, in fact, stupid.
pwilkin Sep 15, 2025
73bd64f
Merge branch 'master' into apertus-implementation
pwilkin Sep 16, 2025
28f9086
Fix after merge
pwilkin Sep 16, 2025
2f68c03
Final newline
pwilkin Sep 16, 2025
4294dbf
Make xIELU an UNARY_OP
pwilkin Sep 17, 2025
b2aa4fb
Final newline
pwilkin Sep 17, 2025
86a239c
Correctly account for parameter shift
pwilkin Sep 17, 2025
bd19026
Argh.
pwilkin Sep 17, 2025
13cc3be
Update ggml/src/ggml-cpu/unary-ops.cpp
pwilkin Sep 18, 2025
40f2f80
Refactor: remove unused methods, inline and factorize softplus, add c…
pwilkin Sep 18, 2025
dbe0ccb
Merge branch 'master' into apertus-implementation
pwilkin Sep 23, 2025
0d36139
Revert CUDA changes, implement xIELU as a separate OP
pwilkin Sep 23, 2025
fc58d3b
Pesky newline
pwilkin Sep 23, 2025
db9eb29
Add float2half / half2float for F16 inputs/outputs
pwilkin Sep 23, 2025
dc1e4d5
CUDA variants, attempt 2
pwilkin Sep 24, 2025
6c843ce
Actually, attempt 3
pwilkin Sep 24, 2025
f11ab3c
Update ggml/src/ggml-cuda/unary.cu
pwilkin Sep 25, 2025
57e2263
Missing convert header
pwilkin Sep 25, 2025
62401d8
Proper formula and reference for xIELU in the comments.
pwilkin Sep 25, 2025
58e6e0f
Modify unary-ops.cpp to add the functor-based logic besides the templ…
pwilkin Sep 25, 2025
5a02bd4
Apply suggestions from code review
pwilkin Sep 25, 2025
90f052d
Add tensor mappings for Apertus to global list instead
pwilkin Sep 25, 2025
6972404
Fix lazy on scalars
pwilkin Sep 25, 2025
d29cb45
Merge remote-tracking branch 'pwilkin/master' into apertus-implementa…
pwilkin Sep 25, 2025
2ca353b
Update ggml/src/ggml-cuda/unary.cu
pwilkin Sep 25, 2025
66f37c0
Add comment about the constraints on positive/negative alpha
pwilkin Oct 2, 2025
78ac06b
Change `softplus` to `ggml_softplus`
pwilkin Oct 2, 2025
CUDA variants, attempt 2
pwilkin committed Sep 24, 2025
commit dc1e4d5f879287fcff348546e52fd7da4820f531
26 changes: 2 additions & 24 deletions ggml/src/ggml-cuda/unary.cu
@@ -375,28 +375,6 @@ void ggml_cuda_op_swiglu_oai(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
swiglu_oai_cuda(src0_p, src1_p, (float *)dst_d, ggml_nelements(dst), nc, src0_o / sizeof(float), src1_o / sizeof(float), alpha, limit, stream);
}

-/* xIELU */
-struct op_xielu_functor {
-    float alpha_n, alpha_p, beta, eps;
-
-    __host__ __device__ __forceinline__ op_xielu_functor(float a_n, float a_p, float b, float e)
-        : alpha_n(a_n), alpha_p(a_p), beta(b), eps(e) {}
-
-    __device__ __forceinline__ float operator()(float x) const {
-        const float gate_pos = (x > 0.0f); // positive branch gate
-
-        // Positive branch: alpha_p * v^2 + beta * v
-        const float y_pos = alpha_p * x * x + beta * x;
-
-        // Negative branch:
-        const float min_v_eps = fminf(x, eps); // works fine even if eps < 0
-        const float y_neg = (expm1f(min_v_eps) - x) * alpha_n + beta * x;
-
-        // Select the appropriate branch based on the gate
-        return gate_pos * y_pos + (1.0f - gate_pos) * y_neg;
-    }
-};
-
/* CUDA kernel + launcher for xIELU */

template <typename T>
@@ -407,7 +385,7 @@ static __global__ void xielu_kernel(const T * x, T * dst, const int k, float alp
return;
}

-    const float xi = x->type == GGML_TYPE_F32 ? (float) x[i] : __half2float(x[i]);
+    const float xi = sizeof(x[i]) == sizeof(half) ? __half2float(x[i]) : (float) x[i];
const float gate_pos = (xi > 0.0f);

const float y_pos = alpha_p * xi * xi + beta * xi;
@@ -417,7 +395,7 @@

const float out = gate_pos * y_pos + (1.0f - gate_pos) * y_neg;

-    dst[i] = (T) (dst->type == GGML_TYPE_F32 ? out : __float2half(out));
+    dst[i] = (T) (sizeof(dst[i]) == sizeof(float) ? out : __float2half(out));
}

template <typename T>
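
A note on the type handling above: because `xielu_kernel` is templated on the element type `T`, the F16/F32 conversions can be selected at compile time rather than through the `sizeof` comparisons this commit uses. The sketch below is a minimal illustration of that alternative, not code from the PR; the kernel body, index computation, and the name `xielu_kernel_sketch` are assumptions reconstructed from the visible hunks.

// Minimal sketch, assuming the launcher passes k = ggml_nelements(dst).
// Same math as xielu_kernel, with the half/float conversion resolved via
// `if constexpr` (C++17) so the untaken branch is discarded at compile time.
#include <cuda_fp16.h>
#include <type_traits>

template <typename T>
static __global__ void xielu_kernel_sketch(const T * x, T * dst, const int k,
                                           float alpha_n, float alpha_p, float beta, float eps) {
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }

    // Load the element as float whether T is half or float.
    float xi;
    if constexpr (std::is_same_v<T, half>) {
        xi = __half2float(x[i]);
    } else {
        xi = (float) x[i];
    }

    const float gate_pos = (xi > 0.0f);                   // 1.0f when x > 0, else 0.0f
    const float y_pos    = alpha_p * xi * xi + beta * xi; // positive branch
    const float y_neg    = (expm1f(fminf(xi, eps)) - xi) * alpha_n + beta * xi; // negative branch
    const float out      = gate_pos * y_pos + (1.0f - gate_pos) * y_neg;

    // Store back in the element's own storage type.
    if constexpr (std::is_same_v<T, half>) {
        dst[i] = __float2half(out);
    } else {
        dst[i] = (T) out;
    }
}

With a plain ternary on a `sizeof` test, both conversion expressions must still type-check for every instantiation of `T`; `if constexpr` removes the untaken branch entirely, avoiding exactly the kind of mixed-type pitfall seen in the `dst[i]` line above.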