1
1
#include " ggml_extra.h"
2
+ #include " ggml.h"
2
3
3
4
#include < limits>
4
5
#include < vector>
@@ -27,8 +28,7 @@ inline int toNearestInt(float fval) {
27
28
// Adapted from PR #835, function quantize_row_q4_0_rmse()
28
29
//
29
30
// I absolutely cannot reproduce the rmse = 0.00185915 reported in #835.
30
- // Instead, I get rmse = 0.00197 with the original and rmse = 0.00192
31
- // with the modification that determines the scale actually minimizing
31
+ // Instead, I get rmse = 0.00197 with the original and rmse = 0.00192 with the modification that determines the scale actually minimizing
32
32
// the rmse.
33
33
//
34
34
// Do I have a bug? I don't see it.
@@ -79,12 +79,58 @@ float quanizeRmse(int n, const float* X, int8_t* L) {
79
79
// return 1/bestScale;
80
80
}
81
81
82
+ float quanizeRmseK (int n, const float * X, int8_t * L,
83
+ int nCandidates, const float * candidates, int nmin, int nmax) {
84
+ float max = 0 ;
85
+ for (int i=0 ; i<n; ++i) max = std::max (max, std::abs (X[i]));
86
+ if (!max) { // all zero
87
+ for (int i=0 ; i<n; ++i) L[i] = 0 ;
88
+ return 1 .f ;
89
+ }
90
+ float best = 0 , bestScale = 0 ;
91
+ for (int si=0 ; si<nCandidates; ++si) {
92
+ float iscale = candidates[si]/max;
93
+ float sumlx = 0 ; int suml2 = 0 ;
94
+ for (int i=0 ; i<n; ++i) {
95
+ int l = std::max (nmin, std::min (nmax, toNearestInt (iscale*X[i])));
96
+ sumlx += X[i]*l; suml2 += l*l;
97
+ }
98
+ if (sumlx*sumlx > best*suml2) {
99
+ best = sumlx*sumlx/suml2; bestScale = iscale;
100
+ }
101
+ }
102
+ float sumlx = 0 ; int suml2 = 0 ;
103
+ for (int i=0 ; i<n; ++i) {
104
+ int l = std::max (nmin, std::min (nmax, toNearestInt (bestScale*X[i])));
105
+ sumlx += X[i]*l; suml2 += l*l;
106
+ L[i] = l;
107
+ }
108
+ return sumlx/suml2;
109
+ }
82
110
// The following improves the above.
83
111
// It gives RMSE = 0.00185228 for the 7B model.
84
- float quanizeRmseK (int n, const float * X, int8_t * L) {
112
+ float quanizeRmseK7 (int n, const float * X, int8_t * L) {
85
113
constexpr int kCandiateCount = 20 ;
86
114
static const float candidates[kCandiateCount ] = { -8 .7f , -8 .5f , -8 .3f , -8 .1f , -7 .9f , -7 .7f , -7 .2f , -7 .0f , -6 .3f , -5 .7f ,
87
115
+8 .7f , +8 .5f , +8 .3f , +8 .1f , +7 .9f , +7 .7f , +7 .2f , +7 .0f , +6 .3f , +5 .7f };
116
+ return quanizeRmseK (n, X, L, kCandiateCount , candidates, -8 , 7 );
117
+ }
118
+
119
+ float quanizeRmseK15 (int n, const float * X, int8_t * L) {
120
+ constexpr int kCandiateCount = 16 ;
121
+ static const float candidates[kCandiateCount ] = {
122
+ +17 .75f , +17 .25f , +16 .75f , +16 .25f , +15 .75f , +15 .25f , +14 .75f , +14 .25f , +13 .75f , +13 .25f , +12 .75f , +12.25 , +11 .75f ,
123
+ +11 .25f , +10 .75f , +10 .25f
124
+ };
125
+ return quanizeRmseK (n, X, L, kCandiateCount , candidates, 0 , 15 );
126
+ }
127
+
128
+ // Fast (much faster than doing the optimization), but not very good.
129
+ float quanizeRmseFast (int n, const float * X, int8_t * L) {
130
+ // constexpr int kCandiateCount = 3;
131
+ // static const float candidates[kCandiateCount] = { +8.3f, +7.2f, +5.7f};
132
+ constexpr int kCandiateCount = 4 ;
133
+ static const float candidates[kCandiateCount ] = { +8 .7f , +7 .9f , +7 .2f , +5 .7f };
88
134
float max = 0 ;
89
135
for (int i=0 ; i<n; ++i) max = std::max (max, std::abs (X[i]));
90
136
if (!max) { // all zero
@@ -94,13 +140,25 @@ float quanizeRmseK(int n, const float* X, int8_t* L) {
94
140
float best = 0 , bestScale = 0 ;
95
141
for (int si=0 ; si<kCandiateCount ; ++si) {
96
142
float iscale = candidates[si]/max;
97
- float sumlx = 0 ; int suml2 = 0 ;
143
+ float sumxlp = 0 , sumxlm = 0 ;
144
+ int suml2p = 0 , suml2m = 0 ;
98
145
for (int i=0 ; i<n; ++i) {
99
- int l = std::max (-8 , std::min (7 , toNearestInt (iscale*X[i])));
100
- sumlx += X[i]*l; suml2 += l*l;
146
+ float x = X[i];
147
+ float sx = iscale*x;
148
+ int lx = toNearestInt (sx);
149
+ int lp = std::max (-8 , std::min (7 , +lx));
150
+ int lm = std::max (-8 , std::min (7 , -lx));
151
+ sumxlp += x*lp; sumxlm += x*lm;
152
+ suml2p += lp*lp; suml2m += lm*lm;
101
153
}
102
- if (sumlx*sumlx > best*suml2) {
103
- best = sumlx*sumlx/suml2; bestScale = iscale;
154
+ if (sumxlp*sumxlp*suml2m >= sumxlm*sumxlm*suml2p) {
155
+ if (sumxlp*sumxlp > best*suml2p) {
156
+ best = sumxlp*sumxlp/suml2p; bestScale = iscale;
157
+ }
158
+ } else {
159
+ if (sumxlm*sumxlm > best*suml2m) {
160
+ best = sumxlm*sumxlm/suml2m; bestScale = -iscale;
161
+ }
104
162
}
105
163
}
106
164
float sumlx = 0 ; int suml2 = 0 ;
@@ -112,6 +170,40 @@ float quanizeRmseK(int n, const float* X, int8_t* L) {
112
170
return sumlx/suml2;
113
171
}
114
172
173
+ float quanizeRmseOpt (int n, const float * X, int8_t * L, std::vector<std::pair<float ,int >>& work) {
174
+ work.clear ();
175
+ work.reserve (n*17 );
176
+ for (int l=-8 ; l<=8 ; ++l) {
177
+ float scale = l - 0 .4999f ;
178
+ for (int i=0 ; i<n; ++i) {
179
+ if (X[i]) work.push_back ({scale/std::abs (X[i]), i});
180
+ }
181
+ }
182
+ for (int i=0 ; i<n; ++i) L[i] = 0 ;
183
+ if (work.empty ()) return 1 .f ; // all values are zero
184
+ std::sort (work.begin (), work.end ());
185
+ float best = 0 , bestScale = 0 , lasts = work.front ().first - 1 ;
186
+ double sumlx = 0 ; int suml2 = 0 ;
187
+ for (int k=0 ; k<int (work.size ()); ++k) {
188
+ float s = work[k].first ; int i = work[k].second ;
189
+ int l = std::max (-8 , std::min (7 , toNearestInt (s*X[i])));
190
+ if (l != L[i]) {
191
+ sumlx += X[i]*(l-L[i]); suml2 += l*l - L[i]*L[i];
192
+ L[i] = l;
193
+ if ((s != lasts || k == int (work.size ())-1 ) && suml2 > 0 && sumlx*sumlx > best*suml2) {
194
+ best = sumlx*sumlx/suml2; bestScale = s;
195
+ }
196
+ }
197
+ }
198
+ sumlx = 0 ; suml2 = 0 ;
199
+ for (int i=0 ; i<n; ++i) {
200
+ int l = std::max (-8 , std::min (7 , toNearestInt (bestScale*X[i])));
201
+ sumlx += X[i]*l; suml2 += l*l;
202
+ L[i] = l;
203
+ }
204
+ return sumlx/suml2;
205
+ }
206
+
115
207
std::pair<float , float > kQuantize0 (int n, const float * X, int8_t * L, std::vector<std::pair<float ,int >>& work, int nmin, int nmax) {
116
208
work.clear ();
117
209
work.reserve (n*(nmax+2 ));
@@ -200,9 +292,10 @@ std::pair<float, float> kQuantize1(int n, const float* X, int8_t* L, std::vector
200
292
return {min, 1 .f };
201
293
}
202
294
if (int (tmpX.size ()) < n) tmpX.resize (n);
203
- double a = min, b;
204
- for (int itry=0 ; itry<3 ; ++itry) {
295
+ double a = min, b = 0 ;
296
+ for (int itry=0 ; itry<5 ; ++itry) {
205
297
for (int i=0 ; i<n; ++i) tmpX[i] = X[i] - a;
298
+ // quanizeRmseK15(n, tmpX.data(), L);
206
299
kQuantize0 (n, tmpX.data (), L, work, 0 , 2 *nmax+1 );
207
300
double sumlx = 0 , sumx = 0 ;
208
301
int suml2 = 0 , suml = 0 ;
@@ -214,9 +307,37 @@ std::pair<float, float> kQuantize1(int n, const float* X, int8_t* L, std::vector
214
307
sumx += X[i];
215
308
}
216
309
int64_t D = suml2*n - suml*suml;
310
+ auto aold = a, bold = b;
217
311
a = (sumx*suml2 - sumlx*suml)/D;
218
312
b = (sumlx*n - sumx*suml)/D;
313
+ if (itry > 0 && std::abs (a - aold) < 1e-6 *std::abs (aold) && std::abs (b - bold ) < 1e-6 *std::abs (bold )) break ;
314
+ }
315
+ return {a, b};
316
+ }
317
+
318
+ std::pair<float , float > kQuantize1Fast (int n, const float * X, int8_t * L, int nmax) {
319
+ float min = X[0 ], max = X[1 ];
320
+ for (int i=1 ; i<n; ++i) {
321
+ min = std::min (min, X[i]); max = std::max (max, X[i]);
322
+ }
323
+ if (max == min) {
324
+ for (int i=0 ; i<n; ++i) L[i] = 0 ;
325
+ return {min, 1 .f };
326
+ }
327
+ float scale = (nmax - 0 .499f )/(max - min);
328
+ double sumlx = 0 , sumx = 0 ;
329
+ int suml2 = 0 , suml = 0 ;
330
+ for (int i=0 ; i<n; ++i) {
331
+ int l = toNearestInt (scale*(X[i] - min));
332
+ L[i] = l;
333
+ sumlx += X[i]*l;
334
+ suml2 += l*l;
335
+ suml += l;
336
+ sumx += X[i];
219
337
}
338
+ int64_t D = suml2*n - suml*suml;
339
+ double a = (sumx*suml2 - sumlx*suml)/D;
340
+ double b = (sumlx*n - sumx*suml)/D;
220
341
return {a, b};
221
342
}
222
343
@@ -226,7 +347,9 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) {
226
347
auto processOne = [type] (const float * X, int8_t * L, char * y, std::vector<std::pair<float , int >>& work, std::vector<float >& tmpX) {
227
348
auto q = (uint8_t *)y;
228
349
if (type == 0 ) {
229
- auto scale = quanizeRmseK (QK, X, L);
350
+ auto scale = quanizeRmseK7 (QK, X, L);
351
+ // auto scale = quanizeRmseFast(QK, X, L);
352
+ // auto scale = quanizeRmseOpt(QK, X, L, work);
230
353
// The following is not quite as good as quanizeRmseK() and it is slower too.
231
354
// if (int(tmpX.size()) < QK) tmpX.resize(QK);
232
355
// auto r1 = kQuantize0(QK, X, L, work, -8, 7);
@@ -241,11 +364,29 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) {
241
364
// //float scale = kQuantize0(QK, X, L, work, -7, 7);
242
365
std::memcpy (q, &scale, sizeof (scale)); q += sizeof (scale);
243
366
for (int k=0 ; k<QK/2 ; ++k) q[k] = (L[2 *k] + 8 ) | ((L[2 *k+1 ] + 8 ) << 4 );
244
- } else {
367
+ } else if (type == 1 ) {
245
368
auto result = kQuantize1 (QK, X, L, tmpX, work, 7 );
246
369
std::memcpy (q, &result.second , sizeof (result.second )); q += sizeof (result.second );
247
370
std::memcpy (q, &result.first , sizeof (result.first )); q += sizeof (result.first );
248
371
for (int k=0 ; k<QK/2 ; ++k) q[k] = L[2 *k] | (L[2 *k+1 ] << 4 );
372
+ } else {
373
+ auto result = type == 2 ? kQuantize1 (QK, X, L, tmpX, work, 15 ) : kQuantize1Fast (QK, X, L, 31 );
374
+ auto afp16 = ggml_fp32_to_fp16 (result.first );
375
+ auto bfp16 = ggml_fp32_to_fp16 (result.second );
376
+ std::memcpy (q, &afp16, sizeof (afp16)); q += sizeof (afp16);
377
+ std::memcpy (q, &bfp16, sizeof (bfp16)); q += sizeof (bfp16);
378
+ auto u = (uint32_t *)q;
379
+ *u = 0 ;
380
+ q += sizeof (uint32_t );
381
+ uint32_t m = 1u ;
382
+ for (int k=0 ; k<QK/2 ; ++k) {
383
+ auto l1 = L[2 *k], l2 = L[2 *k+1 ];
384
+ if (l1 > 15 ) { l1 -= 16 ; *u |= m; }
385
+ m <<= 1 ;
386
+ if (l2 > 15 ) { l2 -= 16 ; *u |= m; }
387
+ m <<= 1 ;
388
+ q[k] = l1 | (l2 << 4 );
389
+ }
249
390
}
250
391
};
251
392
@@ -318,6 +459,14 @@ void kQuantizeQ4_1(const float* x, void* buffer, int k) {
318
459
kQuantizeQ4 (x, buffer, k, 1 );
319
460
}
320
461
462
+ void kQuantizeQ5_1 (const float * x, void * buffer, int k) {
463
+ kQuantizeQ4 (x, buffer, k, 2 );
464
+ }
465
+
466
+ void kQuantizeQ5_1_Fast (const float * x, void * buffer, int k) {
467
+ kQuantizeQ4 (x, buffer, k, 3 );
468
+ }
469
+
321
470
size_t kQuantizeQ4_0H (const float * x, void * buffer, int k, int64_t * hist) {
322
471
kQuantizeQ4 (x, buffer, k, 0 );
323
472
collectHisto (k, buffer, hist, 0 );
@@ -330,4 +479,42 @@ size_t kQuantizeQ4_1H(const float* x, void* buffer, int k, int64_t* hist) {
330
479
return (k / QK) * kBucketSize1 ;
331
480
}
332
481
482
+ size_t kQuantizeQ5_1H (const float * x, void * buffer, int k, int64_t * hist) {
483
+ kQuantizeQ4 (x, buffer, k, 2 );
484
+ collectHisto (k, buffer, hist, 1 );
485
+ return (k / QK) * kBucketSize1 ;
486
+ }
487
+
488
+ size_t kQuantizeQ5_1H_Fast (const float * x, void * buffer, int k, int64_t * hist) {
489
+ kQuantizeQ4 (x, buffer, k, 3 );
490
+ collectHisto (k, buffer, hist, 1 );
491
+ return (k / QK) * kBucketSize1 ;
492
+ }
493
+
494
+ void kDequantizeQ5_1 (const void * x, float * y, int k) {
495
+ assert (k % QK == 0 );
496
+ int n = k / QK;
497
+ auto data = (const uint8_t *)x;
498
+ for (int i=0 ; i<n; ++i) {
499
+ ggml_fp16_t afp16, bfp16;
500
+ std::memcpy (&afp16, data, sizeof (afp16)); data += sizeof (afp16);
501
+ std::memcpy (&bfp16, data, sizeof (bfp16)); data += sizeof (bfp16);
502
+ auto a = ggml_fp16_to_fp32 (afp16);
503
+ auto b = ggml_fp16_to_fp32 (bfp16);
504
+ uint32_t u;
505
+ std::memcpy (&u, data, sizeof (u)); data += sizeof (u);
506
+ uint32_t m = 1u ;
507
+ for (int k=0 ; k<16 ; ++k) {
508
+ auto l1 = data[k] & 15 , l2 = data[k] >> 4 ;
509
+ if (u & m) l1 += 16 ;
510
+ m <<= 1 ;
511
+ if (u & m) l2 += 16 ;
512
+ m <<= 1 ;
513
+ *y++ = a + b*l1;
514
+ *y++ = a + b*l2;
515
+ }
516
+ data += 16 ;
517
+ }
518
+ }
519
+
333
520
}
0 commit comments