Thanks to visit codestin.com
Credit goes to github.com

Skip to content

Commit 29b83e5

Browse files
committed
Various experiments, including 5-bit quantization
1 parent 6bfb00a commit 29b83e5

File tree

3 files changed

+211
-14
lines changed

3 files changed

+211
-14
lines changed

examples/quantize-stats/quantize-stats.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,13 +306,17 @@ int main(int argc, char ** argv) {
306306
std::vector<float> output_scratch(SCRATCH_ELEMENTS);
307307

308308
// loop through quantization types
309-
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
309+
//for (int i = 0; i < GGML_TYPE_COUNT; i++) {
310+
for (int i = 1; i < 2; i++) {
310311
if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
311312
continue;
312313
}
313314
quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
314315
if (i < 2 && checkNewQuantization) {
315-
qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1;
316+
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ4_1;
317+
//qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1;
318+
qfns.quantize_row_q = i == 0 ? kQuantizeQ4_0 : kQuantizeQ5_1_Fast;
319+
if (i == 1) qfns.dequantize_row_q = kDequantizeQ5_1;
316320
}
317321
if (qfns.quantize_row_q && qfns.dequantize_row_q) {
318322
if (params.verbose) {

ggml_extra.cpp

Lines changed: 199 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "ggml_extra.h"
2+
#include "ggml.h"
23

34
#include <limits>
45
#include <vector>
@@ -27,8 +28,7 @@ inline int toNearestInt(float fval) {
2728
// Adapted from PR #835, function quantize_row_q4_0_rmse()
2829
//
2930
// I absolutely cannot reproduce the rmse = 0.00185915 reported in #835.
30-
// Instead, I get rmse = 0.00197 with the original and rmse = 0.00192
31-
// with the modification that determines the scale actually minimizing
31+
// Instead, I get rmse = 0.00197 with the original and rmse = 0.00192 with the modification that determines the scale actually minimizing
3232
// the rmse.
3333
//
3434
// Do I have a bug? I don't see it.
@@ -79,12 +79,58 @@ float quanizeRmse(int n, const float* X, int8_t* L) {
7979
//return 1/bestScale;
8080
}
8181

82+
float quanizeRmseK(int n, const float* X, int8_t* L,
83+
int nCandidates, const float* candidates, int nmin, int nmax) {
84+
float max = 0;
85+
for (int i=0; i<n; ++i) max = std::max(max, std::abs(X[i]));
86+
if (!max) { // all zero
87+
for (int i=0; i<n; ++i) L[i] = 0;
88+
return 1.f;
89+
}
90+
float best = 0, bestScale = 0;
91+
for (int si=0; si<nCandidates; ++si) {
92+
float iscale = candidates[si]/max;
93+
float sumlx = 0; int suml2 = 0;
94+
for (int i=0; i<n; ++i) {
95+
int l = std::max(nmin, std::min(nmax, toNearestInt(iscale*X[i])));
96+
sumlx += X[i]*l; suml2 += l*l;
97+
}
98+
if (sumlx*sumlx > best*suml2) {
99+
best = sumlx*sumlx/suml2; bestScale = iscale;
100+
}
101+
}
102+
float sumlx = 0; int suml2 = 0;
103+
for (int i=0; i<n; ++i) {
104+
int l = std::max(nmin, std::min(nmax, toNearestInt(bestScale*X[i])));
105+
sumlx += X[i]*l; suml2 += l*l;
106+
L[i] = l;
107+
}
108+
return sumlx/suml2;
109+
}
82110
// The following improves the above.
83111
// It gives RMSE = 0.00185228 for the 7B model.
84-
float quanizeRmseK(int n, const float* X, int8_t* L) {
112+
float quanizeRmseK7(int n, const float* X, int8_t* L) {
85113
constexpr int kCandiateCount = 20;
86114
static const float candidates[kCandiateCount] = { -8.7f, -8.5f, -8.3f, -8.1f, -7.9f, -7.7f, -7.2f, -7.0f, -6.3f, -5.7f,
87115
+8.7f, +8.5f, +8.3f, +8.1f, +7.9f, +7.7f, +7.2f, +7.0f, +6.3f, +5.7f};
116+
return quanizeRmseK(n, X, L, kCandiateCount, candidates, -8, 7);
117+
}
118+
119+
float quanizeRmseK15(int n, const float* X, int8_t* L) {
120+
constexpr int kCandiateCount = 16;
121+
static const float candidates[kCandiateCount] = {
122+
+17.75f, +17.25f, +16.75f, +16.25f, +15.75f, +15.25f, +14.75f, +14.25f, +13.75f, +13.25f, +12.75f, +12.25, +11.75f,
123+
+11.25f, +10.75f, +10.25f
124+
};
125+
return quanizeRmseK(n, X, L, kCandiateCount, candidates, 0, 15);
126+
}
127+
128+
// Fast (much faster than doing the optimization), but not very good.
129+
float quanizeRmseFast(int n, const float* X, int8_t* L) {
130+
//constexpr int kCandiateCount = 3;
131+
//static const float candidates[kCandiateCount] = { +8.3f, +7.2f, +5.7f};
132+
constexpr int kCandiateCount = 4;
133+
static const float candidates[kCandiateCount] = { +8.7f, +7.9f, +7.2f, +5.7f};
88134
float max = 0;
89135
for (int i=0; i<n; ++i) max = std::max(max, std::abs(X[i]));
90136
if (!max) { // all zero
@@ -94,13 +140,25 @@ float quanizeRmseK(int n, const float* X, int8_t* L) {
94140
float best = 0, bestScale = 0;
95141
for (int si=0; si<kCandiateCount; ++si) {
96142
float iscale = candidates[si]/max;
97-
float sumlx = 0; int suml2 = 0;
143+
float sumxlp = 0, sumxlm = 0;
144+
int suml2p = 0, suml2m = 0;
98145
for (int i=0; i<n; ++i) {
99-
int l = std::max(-8, std::min(7, toNearestInt(iscale*X[i])));
100-
sumlx += X[i]*l; suml2 += l*l;
146+
float x = X[i];
147+
float sx = iscale*x;
148+
int lx = toNearestInt(sx);
149+
int lp = std::max(-8, std::min(7, +lx));
150+
int lm = std::max(-8, std::min(7, -lx));
151+
sumxlp += x*lp; sumxlm += x*lm;
152+
suml2p += lp*lp; suml2m += lm*lm;
101153
}
102-
if (sumlx*sumlx > best*suml2) {
103-
best = sumlx*sumlx/suml2; bestScale = iscale;
154+
if (sumxlp*sumxlp*suml2m >= sumxlm*sumxlm*suml2p) {
155+
if (sumxlp*sumxlp > best*suml2p) {
156+
best = sumxlp*sumxlp/suml2p; bestScale = iscale;
157+
}
158+
} else {
159+
if (sumxlm*sumxlm > best*suml2m) {
160+
best = sumxlm*sumxlm/suml2m; bestScale = -iscale;
161+
}
104162
}
105163
}
106164
float sumlx = 0; int suml2 = 0;
@@ -112,6 +170,40 @@ float quanizeRmseK(int n, const float* X, int8_t* L) {
112170
return sumlx/suml2;
113171
}
114172

173+
// Searches for the scale that best fits X[0..n) with integer levels in [-8, 7]
// by sweeping a sorted list of candidate scales instead of a fixed table.
//   n    - number of values in X and of levels written to L
//   X    - input values
//   L    - output levels, each clamped to [-8, 7]
//   work - caller-provided scratch vector; cleared and refilled on every call
// Returns sumlx/suml2 for the final levels (the multiplier d minimizing
// sum (X[i] - d*L[i])^2), or 1.f when every X[i] is zero.
float quanizeRmseOpt(int n, const float* X, int8_t* L, std::vector<std::pair<float,int>>& work) {
174+
work.clear();
175+
work.reserve(n*17);
176+
// One candidate per (l in -8..8, non-zero element) pair: the scale
// (l - 0.4999)/|X[i]|, tagged with the element index i.
for (int l=-8; l<=8; ++l) {
177+
float scale = l - 0.4999f;
178+
for (int i=0; i<n; ++i) {
179+
if (X[i]) work.push_back({scale/std::abs(X[i]), i});
180+
}
181+
}
182+
for (int i=0; i<n; ++i) L[i] = 0;
183+
if (work.empty()) return 1.f; // all values are zero
184+
// std::pair orders by scale first, then by element index.
std::sort(work.begin(), work.end());
185+
// NOTE(review): lasts is initialised here and never assigned again in this
// function, so `s != lasts` below does not track the previous scale - confirm intent.
float best = 0, bestScale = 0, lasts = work.front().first - 1;
186+
double sumlx = 0; int suml2 = 0;
187+
for (int k=0; k<int(work.size()); ++k) {
188+
float s = work[k].first; int i = work[k].second;
189+
int l = std::max(-8, std::min(7, toNearestInt(s*X[i])));
190+
if (l != L[i]) {
191+
// Only element i changed level: update the running sums by the difference.
sumlx += X[i]*(l-L[i]); suml2 += l*l - L[i]*L[i];
192+
L[i] = l;
193+
// Score is sumlx^2/suml2; compared without dividing.
if ((s != lasts || k == int(work.size())-1) && suml2 > 0 && sumlx*sumlx > best*suml2) {
194+
best = sumlx*sumlx/suml2; bestScale = s;
195+
}
196+
}
197+
}
198+
// Re-quantize every element at the winning scale and rebuild the sums from scratch.
sumlx = 0; suml2 = 0;
199+
for (int i=0; i<n; ++i) {
200+
int l = std::max(-8, std::min(7, toNearestInt(bestScale*X[i])));
201+
sumlx += X[i]*l; suml2 += l*l;
202+
L[i] = l;
203+
}
204+
return sumlx/suml2;
205+
}
206+
115207
std::pair<float, float> kQuantize0(int n, const float* X, int8_t* L, std::vector<std::pair<float,int>>& work, int nmin, int nmax) {
116208
work.clear();
117209
work.reserve(n*(nmax+2));
@@ -200,9 +292,10 @@ std::pair<float, float> kQuantize1(int n, const float* X, int8_t* L, std::vector
200292
return {min, 1.f};
201293
}
202294
if (int(tmpX.size()) < n) tmpX.resize(n);
203-
double a = min, b;
204-
for (int itry=0; itry<3; ++itry) {
295+
double a = min, b = 0;
296+
for (int itry=0; itry<5; ++itry) {
205297
for (int i=0; i<n; ++i) tmpX[i] = X[i] - a;
298+
//quanizeRmseK15(n, tmpX.data(), L);
206299
kQuantize0(n, tmpX.data(), L, work, 0, 2*nmax+1);
207300
double sumlx = 0, sumx = 0;
208301
int suml2 = 0, suml = 0;
@@ -214,9 +307,37 @@ std::pair<float, float> kQuantize1(int n, const float* X, int8_t* L, std::vector
214307
sumx += X[i];
215308
}
216309
int64_t D = suml2*n - suml*suml;
310+
auto aold = a, bold = b;
217311
a = (sumx*suml2 - sumlx*suml)/D;
218312
b = (sumlx*n - sumx*suml)/D;
313+
if (itry > 0 && std::abs(a - aold) < 1e-6*std::abs(aold) && std::abs(b - bold) < 1e-6*std::abs(bold)) break;
314+
}
315+
return {a, b};
316+
}
317+
318+
std::pair<float, float> kQuantize1Fast(int n, const float* X, int8_t* L, int nmax) {
319+
float min = X[0], max = X[1];
320+
for (int i=1; i<n; ++i) {
321+
min = std::min(min, X[i]); max = std::max(max, X[i]);
322+
}
323+
if (max == min) {
324+
for (int i=0; i<n; ++i) L[i] = 0;
325+
return {min, 1.f};
326+
}
327+
float scale = (nmax - 0.499f)/(max - min);
328+
double sumlx = 0, sumx = 0;
329+
int suml2 = 0, suml = 0;
330+
for (int i=0; i<n; ++i) {
331+
int l = toNearestInt(scale*(X[i] - min));
332+
L[i] = l;
333+
sumlx += X[i]*l;
334+
suml2 += l*l;
335+
suml += l;
336+
sumx += X[i];
219337
}
338+
int64_t D = suml2*n - suml*suml;
339+
double a = (sumx*suml2 - sumlx*suml)/D;
340+
double b = (sumlx*n - sumx*suml)/D;
220341
return {a, b};
221342
}
222343

@@ -226,7 +347,9 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) {
226347
auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector<std::pair<float, int>>& work, std::vector<float>& tmpX) {
227348
auto q = (uint8_t*)y;
228349
if (type == 0) {
229-
auto scale = quanizeRmseK(QK, X, L);
350+
auto scale = quanizeRmseK7(QK, X, L);
351+
//auto scale = quanizeRmseFast(QK, X, L);
352+
//auto scale = quanizeRmseOpt(QK, X, L, work);
230353
// The following is not quite as good as quanizeRmseK() and it is slower too.
231354
//if (int(tmpX.size()) < QK) tmpX.resize(QK);
232355
//auto r1 = kQuantize0(QK, X, L, work, -8, 7);
@@ -241,11 +364,29 @@ void kQuantizeQ4(const float* X, void* buffer, int k, int type) {
241364
////float scale = kQuantize0(QK, X, L, work, -7, 7);
242365
std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale);
243366
for (int k=0; k<QK/2; ++k) q[k] = (L[2*k] + 8) | ((L[2*k+1] + 8) << 4);
244-
} else {
367+
} else if (type == 1) {
245368
auto result = kQuantize1(QK, X, L, tmpX, work, 7);
246369
std::memcpy(q, &result.second, sizeof(result.second)); q += sizeof(result.second);
247370
std::memcpy(q, &result.first, sizeof(result.first)); q += sizeof(result.first);
248371
for (int k=0; k<QK/2; ++k) q[k] = L[2*k] | (L[2*k+1] << 4);
372+
} else {
373+
auto result = type == 2 ? kQuantize1(QK, X, L, tmpX, work, 15) : kQuantize1Fast(QK, X, L, 31);
374+
auto afp16 = ggml_fp32_to_fp16(result.first);
375+
auto bfp16 = ggml_fp32_to_fp16(result.second);
376+
std::memcpy(q, &afp16, sizeof(afp16)); q += sizeof(afp16);
377+
std::memcpy(q, &bfp16, sizeof(bfp16)); q += sizeof(bfp16);
378+
auto u = (uint32_t*)q;
379+
*u = 0;
380+
q += sizeof(uint32_t);
381+
uint32_t m = 1u;
382+
for (int k=0; k<QK/2; ++k) {
383+
auto l1 = L[2*k], l2 = L[2*k+1];
384+
if (l1 > 15) { l1 -= 16; *u |= m; }
385+
m <<= 1;
386+
if (l2 > 15) { l2 -= 16; *u |= m; }
387+
m <<= 1;
388+
q[k] = l1 | (l2 << 4);
389+
}
249390
}
250391
};
251392

@@ -318,6 +459,14 @@ void kQuantizeQ4_1(const float* x, void* buffer, int k) {
318459
kQuantizeQ4(x, buffer, k, 1);
319460
}
320461

462+
// Quantizes k floats from x into buffer by forwarding to kQuantizeQ4 with
// type code 2 (the branch that uses the iterative kQuantize1 fit).
void kQuantizeQ5_1(const float* x, void* buffer, int k) {
    constexpr int kTypeQ5_1 = 2;
    kQuantizeQ4(x, buffer, k, kTypeQ5_1);
}
465+
466+
// Quantizes k floats from x into buffer by forwarding to kQuantizeQ4 with
// type code 3 (the branch that uses the single-pass kQuantize1Fast fit).
void kQuantizeQ5_1_Fast(const float* x, void* buffer, int k) {
    constexpr int kTypeQ5_1Fast = 3;
    kQuantizeQ4(x, buffer, k, kTypeQ5_1Fast);
}
469+
321470
size_t kQuantizeQ4_0H(const float* x, void* buffer, int k, int64_t* hist) {
322471
kQuantizeQ4(x, buffer, k, 0);
323472
collectHisto(k, buffer, hist, 0);
@@ -330,4 +479,42 @@ size_t kQuantizeQ4_1H(const float* x, void* buffer, int k, int64_t* hist) {
330479
return (k / QK) * kBucketSize1;
331480
}
332481

482+
size_t kQuantizeQ5_1H(const float* x, void* buffer, int k, int64_t* hist) {
483+
kQuantizeQ4(x, buffer, k, 2);
484+
collectHisto(k, buffer, hist, 1);
485+
return (k / QK) * kBucketSize1;
486+
}
487+
488+
size_t kQuantizeQ5_1H_Fast(const float* x, void* buffer, int k, int64_t* hist) {
489+
kQuantizeQ4(x, buffer, k, 3);
490+
collectHisto(k, buffer, hist, 1);
491+
return (k / QK) * kBucketSize1;
492+
}
493+
494+
void kDequantizeQ5_1(const void* x, float* y, int k) {
495+
assert(k % QK == 0);
496+
int n = k / QK;
497+
auto data = (const uint8_t*)x;
498+
for (int i=0; i<n; ++i) {
499+
ggml_fp16_t afp16, bfp16;
500+
std::memcpy(&afp16, data, sizeof(afp16)); data += sizeof(afp16);
501+
std::memcpy(&bfp16, data, sizeof(bfp16)); data += sizeof(bfp16);
502+
auto a = ggml_fp16_to_fp32(afp16);
503+
auto b = ggml_fp16_to_fp32(bfp16);
504+
uint32_t u;
505+
std::memcpy(&u, data, sizeof(u)); data += sizeof(u);
506+
uint32_t m = 1u;
507+
for (int k=0; k<16; ++k) {
508+
auto l1 = data[k] & 15, l2 = data[k] >> 4;
509+
if (u & m) l1 += 16;
510+
m <<= 1;
511+
if (u & m) l2 += 16;
512+
m <<= 1;
513+
*y++ = a + b*l1;
514+
*y++ = a + b*l2;
515+
}
516+
data += 16;
517+
}
518+
}
519+
333520
}

ggml_extra.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@ size_t kQuantizeQ4_0H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k
2222
void kQuantizeQ4_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
2323
size_t kQuantizeQ4_1H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
2424

25+
// 5-bit quantization of k floats from x into y, using the iterative fit.
void kQuantizeQ5_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
26+
// As kQuantizeQ5_1, and additionally accumulates a histogram into hist; returns a size_t byte count.
size_t kQuantizeQ5_1H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
27+
// 5-bit quantization of k floats from x into y, using the faster single-pass fit.
void kQuantizeQ5_1_Fast(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
28+
// As kQuantizeQ5_1_Fast, and additionally accumulates a histogram into hist; returns a size_t byte count.
size_t kQuantizeQ5_1H_Fast(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
29+
// Expands k values quantized by the Q5_1 routines above from x into floats at y.
void kDequantizeQ5_1(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
30+
2531
#ifdef __cplusplus
2632
}
2733
#endif

0 commit comments

Comments
 (0)