From 40ebf819b0fd367e65c97c6d9cef3863dd54f882 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Fri, 7 Apr 2023 13:49:51 +0200 Subject: [PATCH 1/3] Q4_0 scale selection using RMSE --- Makefile | 2 +- SHA256SUMS | 4 + examples/quantize-stats/quantize-stats.cpp | 103 ++++++++------ examples/quantize/scale.py | 76 ++++++++++ ggml.c | 154 +++++++++++++++++---- ggml.h | 22 +-- llama.cpp | 25 +++- llama.h | 1 + tests/test-quantize.c | 13 +- 9 files changed, 304 insertions(+), 96 deletions(-) create mode 100644 examples/quantize/scale.py diff --git a/Makefile b/Makefile index c55338e18ae03..65101e62a7078 100644 --- a/Makefile +++ b/Makefile @@ -133,7 +133,7 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: main quantize perplexity embedding +default: main quantize quantize-stats perplexity embedding # # Build library diff --git a/SHA256SUMS b/SHA256SUMS index 63fac21ae1bef..ae43724a5464b 100644 --- a/SHA256SUMS +++ b/SHA256SUMS @@ -1,7 +1,11 @@ 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth +0cc0b0a3dc8cd29f005946f8364ac2bbce797e792a40c0fb4114615e4f825976 models/7B/ggml-model-f16.bin +5dec1979849d73e361a8bcc10bc8f53237cbbe435a572882dc87629e011e24b3 models/7B/ggml-model-q4_0.bin 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth +7da75a2a164a8fb4cfbdd4823111f3545c690c5d75c345a2419a9f1e2d24080f models/13B/ggml-model-f16.bin +4c5a285985bac6b8dcc56a97752b8ab70687ce0584daa6bb418ee458d91126e8 models/13B/ggml-model-q4_0.bin 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index af1e6272e80b2..6a2fe61161399 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -17,12 +17,15 @@ static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" }; static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list"); +static const char * impl_strs[] = { "simd", "reference", "rmse" }; +static_assert(sizeof(impl_strs) == GGML_QUANTIZE_IMPL_COUNT * sizeof(char *), "Incomplete implementation list"); + struct quantize_stats_params { std::string model = "models/7B/ggml-model-f16.bin"; bool verbose = false; bool per_layer_stats = false; bool print_histogram = false; - bool reference = false; + std::vector<ggml_quantize_impl_t> include_impl; std::vector<std::string> include_layers; std::vector<std::string> exclude_layers; std::vector<int> include_types; @@ -48,8 +51,8 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) { fprintf(stderr, " -h, --help show this help message and exit\n"); fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, " -r, --reference\n"); - fprintf(stderr, " use reference implementation (default: false)\n"); + fprintf(stderr, " -i, --implementation\n"); + fprintf(stderr, " select implementation (simd, reference, rmse)\n"); fprintf(stderr, " -v, --verbose\n"); fprintf(stderr, " verbose output (default: false)\n"); fprintf(stderr, " -p, 
--per-layer-stats\n"); @@ -104,11 +107,12 @@ double find_quantile(const error_stats & stats, double quantile) { return INFINITY; } -void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) { +void print_error_stats(const std::string & name, ggml_quantize_impl_t impl, const error_stats & stats, bool print_histogram) { double rmse = sqrt(stats.total_error / (double) stats.num_samples); double median = find_quantile(stats, .5); double pct95 = find_quantile(stats, .95); - printf("%-50s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), rmse, stats.max_error, pct95, median); + printf("%-4s %-10s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", + name.c_str(), impl_strs[impl], rmse, stats.max_error, pct95, median); if (print_histogram) { printf("Error distribution:\n"); for (size_t i = 0; i < HISTOGRAM_BUCKETS; i++) { @@ -136,7 +140,7 @@ void test_roundtrip_on_layer( std::string & name, bool print_layer_stats, const quantize_fns_t & qfns, - bool use_reference, + ggml_quantize_impl_t impl, const ggml_tensor * layer, float * input_scratch, char *quantized_scratch, @@ -158,11 +162,7 @@ void test_roundtrip_on_layer( input_scratch = ggml_get_data_f32(layer) + offset; } - if (use_reference) { - qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size); - } else { - qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size); - } + qfns.quantize_row_q[impl](input_scratch, quantized_scratch, chunk_size); qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size); update_error_stats(chunk_size, input_scratch, output_scratch, total_error); @@ -171,7 +171,7 @@ } } if (print_layer_stats) { - print_error_stats(name, layer_error, false); + print_error_stats(name, impl, layer_error, false); } } @@ -190,8 +190,21 @@ int main(int argc, char ** argv) { if (arg == "-h" || arg == "--help") { quantize_stats_print_usage(argc, argv); exit(0); - } else if (arg == "-r" || arg == "--reference") { - params.reference = true; + } else if (arg == "-i" || arg == "--implementation") { + if (++i >= argc) { + invalid_param = true; + break; + } + int j; + for (j = 0; j < GGML_QUANTIZE_IMPL_COUNT && strcmp(argv[i], impl_strs[j]) != 0; j++) { + // find match + } + if (j < GGML_QUANTIZE_IMPL_COUNT) { + params.include_impl.push_back((ggml_quantize_impl_t)j); + } else { + fprintf(stderr, "error: %s not in list of implementations\n", argv[i]); + invalid_param = true; + } } else if (arg == "-v") { params.verbose = true; } else if (arg == "-p" || arg == "--per-layer-stats") { @@ -302,42 +315,48 @@ int main(int argc, char ** argv) { std::vector<char> quantized_scratch(SCRATCH_ELEMENTS*4); std::vector<float> output_scratch(SCRATCH_ELEMENTS); - // loop throught quantization types - for (int i = 0; i < GGML_TYPE_COUNT; i++) { - if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) { + // loop through quantization types + for (int type = 0; type < GGML_TYPE_COUNT; type++) { + if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), type) == params.include_types.end()) { continue; } - quantize_fns_t qfns = ggml_internal_get_quantize_fn(i); + quantize_fns_t qfns = ggml_internal_get_quantize_fn(type); if (qfns.quantize_row_q && qfns.dequantize_row_q) { - if (params.verbose) { - printf("testing %s ...\n", type_strs[i]); - } - - error_stats global_stats {}; - - for (const auto& kv_tensor : tensors_sorted) { 
- if (!layer_included(params, kv_tensor.first)) { + for (int impl = 0; impl < GGML_QUANTIZE_IMPL_COUNT; impl++) { + if (!params.include_impl.empty() && std::find(params.include_impl.begin(), params.include_impl.end(), impl) == params.include_impl.end()) { continue; } + if (params.verbose) { - printf(" %s ...\n", kv_tensor.first.c_str()); + printf("testing %s %s ...\n", type_strs[type], impl_strs[impl]); } - std::string layer_name { type_strs[i] }; - layer_name += "::" + kv_tensor.first; - test_roundtrip_on_layer( - layer_name, - params.per_layer_stats, - qfns, - params.reference, - kv_tensor.second, - input_scratch.data(), - quantized_scratch.data(), - output_scratch.data(), - global_stats - ); - } - print_error_stats(type_strs[i], global_stats, params.print_histogram); + error_stats global_stats {}; + + for (const auto& kv_tensor : tensors_sorted) { + if (!layer_included(params, kv_tensor.first)) { + continue; + } + if (params.verbose) { + printf(" %s ...\n", kv_tensor.first.c_str()); + } + std::string layer_name { type_strs[type] }; + layer_name += "::" + kv_tensor.first; + test_roundtrip_on_layer( + layer_name, + params.per_layer_stats, + qfns, + (ggml_quantize_impl_t)impl, + kv_tensor.second, + input_scratch.data(), + quantized_scratch.data(), + output_scratch.data(), + global_stats + ); + } + + print_error_stats(type_strs[type], (ggml_quantize_impl_t)impl, global_stats, params.print_histogram); + } } } diff --git a/examples/quantize/scale.py b/examples/quantize/scale.py new file mode 100644 index 0000000000000..89028f9e4a0a2 --- /dev/null +++ b/examples/quantize/scale.py @@ -0,0 +1,76 @@ +import matplotlib.pyplot as plt + +# Generated by quantizing the entire 7B model with the first element of each tuple as the scale factor. +# The second element of the tuple is the number of q4_0 blocks for which that scale factor has lowest RMSE. +data = ( + (-10.0, 0), + (-9.9, 1), + (-9.8, 3), + (-9.7, 65), + (-9.6, 738), + (-9.5, 5779), + (-9.4, 30880), + (-9.3, 121078), + (-9.2, 375674), + (-9.1, 941350), + (-9.0, 1990278), + (-8.9, 3635317), + (-8.8, 5891752), + (-8.7, 8678748), + (-8.6, 11771759), + (-8.5, 14873993), + (-8.4, 17594260), + (-8.3, 19553100), + (-8.2, 20415428), + (-8.1, 20017134), + (-8.0, 18357204), + (-7.9, 15597612), + (-7.8, 11993688), + (-7.7, 7842970), + (-7.6, 2880878), + (-7.5, 3478), + (-7.4, 2648437), + (-7.3, 5641970), + (-7.2, 5935890), + (-7.1, 4910790), + (-7.0, 3425891), + (-6.9, 2068250), + (-6.8, 1089883), + (-6.7, 502462), + (-6.6, 156356), + (-6.5, 205), + (-6.4, 163500), + (-6.3, 386291), + (-6.2, 423018), + (-6.1, 319360), + (-6.0, 180783), + (-5.9, 78822), + (-5.8, 28254), + (-5.7, 8698), + (-5.6, 1969), + (-5.5, 0), + (-5.4, 2069), + (-5.3, 5722), + (-5.2, 7107), + (-5.1, 5113), + (-5.0, 2332), + (-4.9, 636), + (-4.8, 130), + (-4.7, 12), + (-4.6, 1), + (-4.5, 0), + (-4.4, 3), + (-4.3, 4), + (-4.2, 8), + (-4.1, 8), + (-4.0, 27), +) +x, y = zip(*data) + +fig, ax = plt.subplots() +b = ax.bar(x, y, 0.1, bottom=1) +ax.set_yscale("log") +ax.set_xlabel("scale") +ax.set_ylabel("N") +plt.title("Quantization scale factor with lowest RMS error") +plt.show() diff --git a/ggml.c b/ggml.c index dc084e6b6a8a9..d8c6583b92c57 100644 --- a/ggml.c +++ b/ggml.c @@ -73,11 +73,15 @@ static int sched_yield (void) { Sleep (0); return 0; } + +#define __attribute__(...) #else #include <pthread.h> #include <stdatomic.h> typedef void* thread_ret_t; + +#define __declspec(...) 
#endif // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512 @@ -517,39 +521,128 @@ typedef struct { static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK / 2, "wrong q4_1 block size/padding"); // reference implementation for deterministic creation of model files +static inline void quantize_block_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, float scale) { + uint8_t pp[QK/2]; + + float amax = 0.0f; // absolute max + float max = 0.0f; + + for (int l = 0; l < QK; l++) { + const float v = x[l]; + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } + } + + const float d = max / scale; + const float id = d ? 1.0f/d : 0.0f; + + y->d = d; + + for (int l = 0; l < QK; l += 2) { + const float v0 = x[l + 0]*id; + const float v1 = x[l + 1]*id; + + int8_t vs0 = roundf(v0); + int8_t vs1 = roundf(v1); + + vs0 = MIN(MAX(0 - 8, vs0), 15 - 8); + vs1 = MIN(MAX(0 - 8, vs1), 15 - 8); + + const uint8_t vi0 = vs0 + 8; // guaranteed to fit into 4 bits + const uint8_t vi1 = vs1 + 8; // thanks to the clamping of the signed values above + + pp[l/2] = vi0 | (vi1 << 4); + } + + memcpy(y->qs, pp, sizeof(pp)); +} + static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { assert(k % QK == 0); const int nb = k / QK; + for (int i = 0; i < nb; i++) { + quantize_block_q4_0_reference(x + i*QK, y + i, 7); + } +} - uint8_t pp[QK/2]; +static void quantize_row_q4_0_rmse(const float * restrict x, block_q4_0 * restrict y, int k) { + // For each q4_0 block, we try the following values to scale the shared float value + // and pick the one with lowest RMS error. We could do a more involved search, + // but this is a trade-off with speed of model generation and simplicity of the code. + // Operating on 8 values can reasonably be loop-unrolled or vectorized, but that is not + // manually done here. + // Values hand-picked according to histogram in examples/quantize/scale.py + // Include the value +7 of the old method to ensure we don't regress on RMSE on any block. + #define Q4_0_SCALE_CANDIDATE_COUNT 8 + static const float candidates[Q4_0_SCALE_CANDIDATE_COUNT] = { -8.7f, -8.5f, -8.3f, -8.1f, -7.9f, -7.7f, -7.2f, +7.0f }; + + assert(k % QK == 0); + const int nb = k / QK; for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max + float max = 0.0f; for (int l = 0; l < QK; l++) { const float v = x[i*QK + l]; - amax = MAX(amax, fabsf(v)); + if (amax < fabsf(v)) { + amax = fabsf(v); + max = v; + } } - const float d = amax / ((1 << 3) - 1); - const float id = d ? 1.0f/d : 0.0f; - - y[i].d = d; - - for (int l = 0; l < QK; l += 2) { - const float v0 = x[i*QK + l + 0]*id; - const float v1 = x[i*QK + l + 1]*id; - - const uint8_t vi0 = (int8_t)roundf(v0) + 8; - const uint8_t vi1 = (int8_t)roundf(v1) + 8; - - assert(vi0 < 16); - assert(vi1 < 16); + // find scale with lowest sum of squared errors, equivalent to lowest RMS error + float best_sqerr = +INFINITY; + float best_scale = NAN; + + for (int si = 0; si < Q4_0_SCALE_CANDIDATE_COUNT; si++) { + const float scale = candidates[si]; + const float d = max / scale; + const float id = d ? 
1.0f / d : 0.0f; + float sqe_acc = 0.f; +#ifdef __AVX2__ + const __m256 clamp_lo = _mm256_set1_ps( 0 - 8); + const __m256 clamp_hi = _mm256_set1_ps(15 - 8); + const __m256 id256 = _mm256_set1_ps(id); + for (int l = 0; l < QK; l += 8) { + // TODO: use _mm256_load_ps once the quantize loader uses mmap + __m256 v = _mm256_loadu_ps(&x[i * QK + l]); + v = _mm256_mul_ps(v, id256); + __m256 vs = _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + vs = _mm256_min_ps(_mm256_max_ps(clamp_lo, vs), clamp_hi); + const __m256 err = _mm256_sub_ps(vs, v); + const __m256 sqe = _mm256_mul_ps(err, err); + + // this is far from optimal speed-wise, but ensures identical results to scalar implementation + // we have to add the floats in sqe to sqe_acc separately and in the correct order + // 8x _mm_add_ps(,_mm_permute_ps()) would work but isn't faster than this: + __declspec(align(32)) float out[8] __attribute__((aligned(32))); + _mm256_store_ps(out, sqe); + for (int ei= 0; ei < 8; ei++) { + sqe_acc += out[ei]; + } + } +#else + for (int l = 0; l < QK; l++) { + const float v = x[i * QK + l] * id; + int8_t vs = roundf(v); + vs = MIN(MAX(0 - 8, vs), 15 - 8); + sqe_acc += (vs - v) * (vs - v); + } +#endif + // the square error sum is calculated on un-scaled q's inside the inner loop + sqe_acc *= d * d; - pp[l/2] = vi0 | (vi1 << 4); + if (best_sqerr > sqe_acc) { + best_sqerr = sqe_acc; + best_scale = scale; + } } - - memcpy(y[i].qs, pp, sizeof(pp)); + assert(isfinite(best_sqerr)); + assert(isfinite(best_scale)); + quantize_block_q4_0_reference(x + i * QK, y + i, best_scale); } } @@ -6564,17 +6657,28 @@ static void ggml_compute_forward_mul_mat_f16_f32( //} } +static void quantize_row_q_missing(const float * x, void * y, int k) { + (void)x; (void)y; (void)k; + assert(false); +} + static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = { .dequantize_row_q = dequantize_row_q4_0, - .quantize_row_q = quantize_row_q4_0, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_0_reference, + .quantize_row_q = { + [GGML_QUANTIZE_IMPL_SIMD] = quantize_row_q4_0, + [GGML_QUANTIZE_IMPL_REFERENCE] = (quantize_row_q_t)quantize_row_q4_0_reference, + [GGML_QUANTIZE_IMPL_RMSE] = (quantize_row_q_t)quantize_row_q4_0_rmse, + }, .vec_dot_q = ggml_vec_dot_q4_0, }, [GGML_TYPE_Q4_1] = { .dequantize_row_q = dequantize_row_q4_1, - .quantize_row_q = quantize_row_q4_1, - .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_1_reference, + .quantize_row_q = { + [GGML_QUANTIZE_IMPL_SIMD] = quantize_row_q4_1, + [GGML_QUANTIZE_IMPL_REFERENCE] = quantize_row_q4_1_reference, + [GGML_QUANTIZE_IMPL_RMSE] = quantize_row_q_missing, + }, .vec_dot_q = ggml_vec_dot_q4_1, }, }; @@ -6632,7 +6736,7 @@ static void ggml_compute_forward_mul_mat_q_f32( GGML_ASSERT(ne3 == ne13); const enum ggml_type type = src0->type; - quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q; + quantize_row_q_t const quantize_row_q = quantize_fns[type].quantize_row_q[GGML_QUANTIZE_IMPL_SIMD]; vec_dot_q_t const vec_dot_q = quantize_fns[type].vec_dot_q; // we don't support permuted src0 or src1 @@ -10602,7 +10706,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * for (int j = 0; j < n; j += k) { block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK; - quantize_row_q4_0_reference(src + j, y, k); + quantize_row_q4_0_rmse(src + j, y, k); for (int i = 0; i < nb; i++) { for (int l = 0; l < QK; l += 2) { diff --git a/ggml.h b/ggml.h index 2c636c2a9bf74..9c28f781a8588 100644 --- 
a/ggml.h +++ b/ggml.h @@ -788,20 +788,20 @@ int ggml_cpu_has_vsx(void); // Internal types and functions exposed for tests and benchmarks // -#ifdef __cplusplus -// restrict not standard in C++ -#define GGML_RESTRICT -#else -#define GGML_RESTRICT restrict -#endif -typedef void (*dequantize_row_q_t)(const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -typedef void (*quantize_row_q_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -typedef void (*vec_dot_q_t)(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y); +typedef void (*dequantize_row_q_t)(const void * x, float * y, int k); +typedef void (*quantize_row_q_t)(const float * x, void * y, int k); +typedef void (*vec_dot_q_t)(const int n, float * s, const void * x, const void * y); + +typedef enum { + GGML_QUANTIZE_IMPL_SIMD, + GGML_QUANTIZE_IMPL_REFERENCE, + GGML_QUANTIZE_IMPL_RMSE, + GGML_QUANTIZE_IMPL_COUNT +} ggml_quantize_impl_t; typedef struct { dequantize_row_q_t dequantize_row_q; - quantize_row_q_t quantize_row_q; - quantize_row_q_t quantize_row_q_reference; + quantize_row_q_t quantize_row_q[GGML_QUANTIZE_IMPL_COUNT]; vec_dot_q_t vec_dot_q; } quantize_fns_t; diff --git a/llama.cpp b/llama.cpp index fc6f43afed81f..12a6b480c547c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -644,7 +644,7 @@ static bool llama_model_load( size_t total_size = 0; model.n_loaded = 0; - while (true) { + while (size_t(fin.tellg()) + 12 < file_size) { int32_t n_dims; int32_t length; int32_t ftype; fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims)); fin.read(reinterpret_cast<char *>(&length), sizeof(length)); fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype)); - if (fin.eof()) { - break; - } - int32_t nelements = 1; int32_t ne[2] = { 1, 1 }; for (int i = 0; i < n_dims; ++i) { @@ -707,6 +703,10 @@ offset = (offset + 31) & -32; tensor->data = mm_addr + offset; fin.seekg(offset + tensor_data_size); + if (fin.eof()) { + fprintf(stderr, "%s: Truncated file?\n", __func__); + return false; + } total_size += tensor_data_size; model.n_loaded++; @@ -717,6 +717,15 @@ } } + uint32_t version_minor = 0; + fin.read((char *)&version_minor, sizeof(version_minor)); + if (fin.eof() || version_minor < LLAMA_FILE_VERSION_MINOR) { + static_assert(LLAMA_FILE_VERSION_MINOR == 1, "Provide a helpful message that explains why the user may want to update their files"); + if (model.hparams.f16 == 2) { + fprintf(stderr, "%s: WARN no minor version detected - your file will work but consider re-creating it for better quantization\n", __func__); + } + } + fin.close(); fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, model.n_loaded); @@ -1572,6 +1581,12 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s } } + static_assert(LLAMA_FILE_VERSION_MINOR == 1, "Check if this condition needs updating for minimal model checksum changes"); + if ((LLAMA_FILE_VERSION_MINOR > 1) || (itype == 2)) { + uint32_t version_minor = LLAMA_FILE_VERSION_MINOR; + fout.write((char *)&version_minor, sizeof(version_minor)); + } + finp.close(); fout.close(); diff --git a/llama.h b/llama.h index deb09fe53959d..ab6040dcc3f84 100644 --- a/llama.h +++ b/llama.h @@ -20,6 +20,7 @@ #endif #define LLAMA_FILE_VERSION 1 +#define LLAMA_FILE_VERSION_MINOR 1 // for backward-compatible changes #define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex #define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files diff --git 
a/tests/test-quantize.c b/tests/test-quantize.c index 993e9dcc3cef3..2d2566dcc954d 100644 --- a/tests/test-quantize.c +++ b/tests/test-quantize.c @@ -13,18 +13,7 @@ int main(void) { src[i] = (float)(i + 1); } - size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist); - assert(size == 20); - float max_result = ((float *)dst)[0]; - float max_expected = src[31] / ((1 << 3) - 1); - assert(max_result == max_expected); - for (int i = 0; i < QK; i++) { - uint8_t q4_result = (i % 2) ? (dst[sizeof(float) + i/2] >> 4) : (dst[sizeof(float) + i/2] & 0xF); - uint8_t q4_expected = roundf(src[i] / max_expected) + 8; - assert(q4_result == q4_expected); - } - - size = ggml_quantize_q4_1(src, dst, QK, QK, hist); + size_t size = ggml_quantize_q4_1(src, dst, QK, QK, hist); assert(size == 24); float delta_result = ((float *)dst)[0]; float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1); From 4dc62e78d885754c661fd5ee6bf9cb5e7fcc5275 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5kon=20H=2E=20Hitland?= Date: Fri, 7 Apr 2023 17:13:29 +0200 Subject: [PATCH 2/3] Really slow RMS "optimal" scaling for q4_0 Use a sweep line approach to scan all configurations of quantization, examining every changeover point where a quantized value changes, and find the optimal scaling for each configuration analytically. --- ggml.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 133 insertions(+) diff --git a/ggml.c b/ggml.c index d8c6583b92c57..4171ad804c20a 100644 --- a/ggml.c +++ b/ggml.c @@ -646,6 +646,139 @@ static void quantize_row_q4_0_rmse(const float * restrict x, block_q4_0 * restri } } +static int comparefloat(const void * f1p, const void * f2p) { + float f1 = *(const float *) f1p; + float f2 = *(const float *) f2p; + return (f1 > f2) - (f1 < f2); +} + +// Find the optimal quantization scaling for a set of values using a sweep line approach +// Returns the final scaling value, and writes the quantized indices as bytes to qi +static float find_optimal_scale(const float * restrict x, uint8_t * restrict qi) { + // The quantization shape is a set of values that will be scaled linearly with a value 'd' to produce a set of values to choose from. + // The input values will then be rounded to the nearest of the scaled values. + // The shape can contain any set of values, e.g. to fit a non-linear distribution, but must be in sorted order and have exactly one '0' + const float shape[16] = {-8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7}; + // Precalculate the inverse of the midpoint between adjacent values in the shape. + float inv_midpoints[15] = {0}; + for (int i = 0; i < 15; i++) { + inv_midpoints[i] = 2/(shape[i] + shape[i+1]); + } + int zero_i; + for (zero_i = 0; shape[zero_i] != 0.0f; zero_i++) { + // find zero index + }; + + // Each event represents a value of d where one value in x changes its quantization + struct event { + float d; + uint8_t x_i; + uint8_t new_shape_i; + }; + // Each input value will go through each of the 16 quantization values + struct event events[16*QK]; + int nevents = 0; + for (int i = 0; i < QK; i++) { + if (x[i] == 0.0f) { + // We ignore the scaling of zero valued elements + continue; + } + for (int j = 0; j < 15; j++) { + // Positive valued elements sweep backwards from zero, negative elements sweep forward from zero, + // both will wrap around and end up back at zero + int forwardi = (x[i] > 0) ? 
j : j+1; + events[nevents++] = (struct event) { + .d = x[i] * inv_midpoints[j], + .x_i = i, + .new_shape_i = forwardi, + }; + } + // Add a wrap-around event at 0 + events[nevents++] = (struct event) { + .d = 0, + .x_i = i, + .new_shape_i = (x[i] > 0) ? 15 : 0 + }; + } + + // Order the events in increasing order of scaling factor d + qsort(events, nevents, sizeof(struct event), comparefloat); + + // We will keep track of our sum-of-squared-error score as we loop through the scales, which is + // sum(x_i^2) + d^2*sum(q_i^2) - 2*d*sum(x_i*q_i) + // sum(q_i^2) + float qv_sqr_sum = 0; + // sum(x_i*q_i) + float x_mul_qv_sum = 0; + + // Start scaling at negative infinity + float best_score = INFINITY; + float best_d = 0; + int best_i = 0; + for (int i = 0; i < QK; i++) { + qi[i] = zero_i; + } + + for (int i = 0; i < nevents; i++) { + struct event ev = events[i]; + // Update loop values + const int old_i = qi[ev.x_i]; + const float old_val = shape[old_i]; + const float new_val = shape[ev.new_shape_i]; + qv_sqr_sum -= old_val*old_val; + qv_sqr_sum += new_val*new_val; + x_mul_qv_sum -= x[ev.x_i] * old_val; + x_mul_qv_sum += x[ev.x_i] * new_val; + qi[ev.x_i] = ev.new_shape_i; + + if (ev.d == 0.0f || qv_sqr_sum == 0.0f) { + continue; + } + + // squared error score at best_d, omitting the constant sum(x_i^2) term + const float local_score = -(x_mul_qv_sum * x_mul_qv_sum) / qv_sqr_sum; + + if (local_score < best_score) { + // find the optimal scaling factor d for the current quantization assignments, + // solve for minima of d^2*sum(q_i^2) - 2*d*sum(x_i*q_i) + best_d = x_mul_qv_sum / qv_sqr_sum; + best_score = local_score; + best_i = i; + } + } + // restore the qi values to their state just after the best event + for (int i = 0; i < QK; i++) { + qi[i] = zero_i; + } + for (int i = 0; i <= best_i; i++) { + qi[events[i].x_i] = events[i].new_shape_i; + } + + return best_d; +} + +// Slow implementation of q4_0 that optimizes for RMSE +static void quantize_row_q4_0_slow(const float * restrict x, block_q4_0 * restrict y, int k) { + assert(k % QK == 0); + const int nb = k / QK; + + uint8_t pp[QK/2]; + + for (int i = 0; i < nb; i++) { + uint8_t qi[QK]; + y[i].d = find_optimal_scale(&x[i*QK], &qi[0]); + + for (int l = 0; l < QK; l += 2) { + assert(qi[l] < 16); + assert(qi[l+1] < 16); + + pp[l/2] = qi[l] | (qi[l+1] << 4); + } + + memcpy(y[i].qs, pp, sizeof(pp)); + } +} + static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int k) { assert(k % QK == 0); const int nb = k / QK; From 678e1389701109842b39ea1c3415ef85e212836b Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Sat, 8 Apr 2023 10:46:49 +0200 Subject: [PATCH 3/3] Update stats tool for unbounded's method --- examples/quantize-stats/quantize-stats.cpp | 16 ++++++++-------- ggml.c | 17 ++++++----------- ggml.h | 3 ++- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 6a2fe61161399..051e1961c295e 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -17,7 +17,7 @@ static const char * type_strs[] = { "q4_0", "q4_1", "i8", "i16", "i32", "f16", "f32" }; static_assert(sizeof(type_strs) == GGML_TYPE_COUNT * sizeof(char *), "Incomplete type list"); -static const char * impl_strs[] = { "simd", "reference", "rmse" }; +static const char * impl_strs[] = { "simd", "reference", "rmse-sw", "rmse-unbounded" }; static_assert(sizeof(impl_strs) == GGML_QUANTIZE_IMPL_COUNT * sizeof(char *), "Incomplete implementation 
list"); struct quantize_stats_params { @@ -52,7 +52,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) { fprintf(stderr, " -m FNAME, --model FNAME\n"); fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); fprintf(stderr, " -i, --implementation\n"); - fprintf(stderr, " select implementation (simd, reference, rmse)\n"); + fprintf(stderr, " select implementation (simd, reference, rmse-sw, rmse-unbounded)\n"); fprintf(stderr, " -v, --verbose\n"); fprintf(stderr, " verbose output (default: false)\n"); fprintf(stderr, " -p, --per-layer-stats\n"); @@ -111,7 +111,7 @@ void print_error_stats(const std::string & name, ggml_quantize_impl_t impl, cons double rmse = sqrt(stats.total_error / (double) stats.num_samples); double median = find_quantile(stats, .5); double pct95 = find_quantile(stats, .95); - printf("%-4s %-10s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", + printf("%-4s %-15s: rmse %.8f, maxerr %.8f, 95pct<%.4f, median<%.4f\n", name.c_str(), impl_strs[impl], rmse, stats.max_error, pct95, median); if (print_histogram) { printf("Error distribution:\n"); @@ -321,12 +321,12 @@ int main(int argc, char ** argv) { continue; } quantize_fns_t qfns = ggml_internal_get_quantize_fn(type); - if (qfns.quantize_row_q && qfns.dequantize_row_q) { - for (int impl = 0; impl < GGML_QUANTIZE_IMPL_COUNT; impl++) { - if (!params.include_impl.empty() && std::find(params.include_impl.begin(), params.include_impl.end(), impl) == params.include_impl.end()) { - continue; - } + for (int impl = 0; impl < GGML_QUANTIZE_IMPL_COUNT; impl++) { + if (!params.include_impl.empty() && std::find(params.include_impl.begin(), params.include_impl.end(), impl) == params.include_impl.end()) { + continue; + } + if (qfns.quantize_row_q[impl] && qfns.dequantize_row_q) { if (params.verbose) { printf("testing %s %s ...\n", type_strs[type], impl_strs[impl]); } diff --git a/ggml.c b/ggml.c index 4171ad804c20a..b301c9772bd37 100644 --- a/ggml.c +++ b/ggml.c @@ -6790,27 +6790,22 @@ static void ggml_compute_forward_mul_mat_f16_f32( //} } -static void quantize_row_q_missing(const float * x, void * y, int k) { - (void)x; (void)y; (void)k; - assert(false); -} - static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = { [GGML_TYPE_Q4_0] = { .dequantize_row_q = dequantize_row_q4_0, .quantize_row_q = { - [GGML_QUANTIZE_IMPL_SIMD] = quantize_row_q4_0, - [GGML_QUANTIZE_IMPL_REFERENCE] = (quantize_row_q_t)quantize_row_q4_0_reference, - [GGML_QUANTIZE_IMPL_RMSE] = (quantize_row_q_t)quantize_row_q4_0_rmse, + [GGML_QUANTIZE_IMPL_SIMD] = quantize_row_q4_0, + [GGML_QUANTIZE_IMPL_REFERENCE] = (quantize_row_q_t)quantize_row_q4_0_reference, + [GGML_QUANTIZE_IMPL_RMSE_SW] = (quantize_row_q_t)quantize_row_q4_0_rmse, + [GGML_QUANTIZE_IMPL_RMSE_UNBOUNDED] = (quantize_row_q_t)quantize_row_q4_0_slow, }, .vec_dot_q = ggml_vec_dot_q4_0, }, [GGML_TYPE_Q4_1] = { .dequantize_row_q = dequantize_row_q4_1, .quantize_row_q = { - [GGML_QUANTIZE_IMPL_SIMD] = quantize_row_q4_1, - [GGML_QUANTIZE_IMPL_REFERENCE] = quantize_row_q4_1_reference, - [GGML_QUANTIZE_IMPL_RMSE] = quantize_row_q_missing, + [GGML_QUANTIZE_IMPL_SIMD] = quantize_row_q4_1, + [GGML_QUANTIZE_IMPL_REFERENCE] = quantize_row_q4_1_reference, }, .vec_dot_q = ggml_vec_dot_q4_1, }, diff --git a/ggml.h b/ggml.h index 9c28f781a8588..63c198a4f9806 100644 --- a/ggml.h +++ b/ggml.h @@ -795,7 +795,8 @@ typedef void (*vec_dot_q_t)(const int n, float * s, const void * x, const void * typedef enum { GGML_QUANTIZE_IMPL_SIMD, GGML_QUANTIZE_IMPL_REFERENCE, - GGML_QUANTIZE_IMPL_RMSE, + 
GGML_QUANTIZE_IMPL_RMSE_SW, + GGML_QUANTIZE_IMPL_RMSE_UNBOUNDED, GGML_QUANTIZE_IMPL_COUNT } ggml_quantize_impl_t;
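
Note on the math both RMSE patches rely on (an illustrative sketch, not part of the diffs above): once a block's integer assignments q_l are fixed, the squared error sum((x_l - d*q_l)^2) is quadratic in d, so the best scale for that assignment is d = sum(x_l*q_l) / sum(q_l^2). Patch 1 approximates this by trying a short list of candidate scales; patch 2's find_optimal_scale evaluates this closed form at every changeover point of the assignments. The standalone C sketch below demonstrates the identity on a single q4_0-style block. It assumes QK == 32 as in ggml.c; rmse_for_scale, scale_sketch.c, and the synthetic input are names invented for the example.

/* scale_sketch.c - build with: cc -O2 scale_sketch.c -lm && ./a.out */
#include <math.h>
#include <stdio.h>

#define QK 32

/* RMSE of quantizing one block with scale d: round x/d into the 4-bit range [-8, 7] */
static float rmse_for_scale(const float * x, float d) {
    const float id = d ? 1.0f/d : 0.0f;
    float sqe = 0.0f;
    for (int l = 0; l < QK; l++) {
        const float q = fminf(fmaxf(roundf(x[l]*id), -8.0f), 7.0f);
        const float err = x[l] - d*q;
        sqe += err*err;
    }
    return sqrtf(sqe / QK);
}

int main(void) {
    float x[QK];
    for (int l = 0; l < QK; l++) {
        x[l] = sinf(0.37f*(float)(l + 1)); // arbitrary synthetic block
    }
    float amax = 0.0f, max = 0.0f; // signed value of the absolute maximum, as in the patches
    for (int l = 0; l < QK; l++) {
        if (amax < fabsf(x[l])) { amax = fabsf(x[l]); max = x[l]; }
    }
    const float d0 = max / -8.5f; // one of patch 1's candidate scales
    printf("d = max/+7.0 : rmse %.6f\n", rmse_for_scale(x, max / 7.0f));
    printf("d = max/-8.5 : rmse %.6f\n", rmse_for_scale(x, d0));

    /* closed form: for the assignment induced by d0, the optimal scale is sum(x*q)/sum(q*q) */
    float xq = 0.0f, qq = 0.0f;
    for (int l = 0; l < QK; l++) {
        const float q = fminf(fmaxf(roundf(x[l]/d0), -8.0f), 7.0f);
        xq += x[l]*q;
        qq += q*q;
    }
    printf("refined d    : rmse %.6f\n", rmse_for_scale(x, xq/qq));
    return 0;
}

Because re-rounding at the refined scale can only lower each element's error, the refined RMSE is never worse than that of the candidate scale it started from; the sweep in patch 2 repeats this refinement across every reachable assignment rather than only those induced by a handful of fixed candidates.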