Multi-threaded quantization #1075

Merged 6 commits on Apr 20, 2023
Changes from all commits
135 changes: 101 additions & 34 deletions examples/quantize-stats/quantize-stats.cpp
@@ -15,6 +15,8 @@
#include <string>
#include <unordered_map>
#include <vector>
#include <thread>
#include <mutex>

struct quantize_stats_params {
std::string model = "models/7B/ggml-model-f16.bin";
@@ -27,7 +29,6 @@ struct quantize_stats_params {
std::vector<enum ggml_type> include_types;
};

// (previously: const int64_t SCRATCH_ELEMENTS = 32*32; scratch buffers are now sized per layer)
const size_t HISTOGRAM_BUCKETS = 150;
const double HISTOGRAM_RANGE = 0.03;

@@ -90,6 +91,13 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
stats.num_samples += nelements;
}

void combine_error_stats(error_stats & into, const error_stats & from) {
into.num_samples += from.num_samples;
into.total_error += from.total_error;
if (from.max_error > into.max_error) into.max_error = from.max_error;
for (size_t i=0; i<HISTOGRAM_BUCKETS; ++i) into.error_histogram[i] += from.error_histogram[i];
}

double find_quantile(const error_stats & stats, double quantile) {
double sum = std::accumulate(std::begin(stats.error_histogram), std::end(stats.error_histogram), 0.0);

@@ -130,47 +138,98 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
}

void test_roundtrip_on_chunk(
const ggml_tensor * layer,
int64_t offset,
int64_t chunk_size,
const quantize_fns_t & qfns,
bool use_reference,
float * input_scratch,
char * quantized_scratch,
float * output_scratch,
error_stats & stats) {

if (layer->type == GGML_TYPE_F16) {
for (int i = 0; i < chunk_size; i++) {
input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
}
} else {
input_scratch = ggml_get_data_f32(layer) + offset;
}

if (use_reference) {
qfns.quantize_row_q_reference(input_scratch, quantized_scratch, chunk_size);
} else {
qfns.quantize_row_q(input_scratch, quantized_scratch, chunk_size);
}
qfns.dequantize_row_q(quantized_scratch, output_scratch, chunk_size);

update_error_stats(chunk_size, input_scratch, output_scratch, stats);
}


// Run the quantization function for a single layer and update error stats;
// scratch buffers are resizable vectors, and the work is split into chunks across up to max_thread threads
void test_roundtrip_on_layer(
std::string & name,
bool print_layer_stats,
const quantize_fns_t & qfns,
bool use_reference,
const ggml_tensor * layer,
// (previously: float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & total_error)
std::vector<float> & input_scratch,
std::vector<char> & quantized_scratch,
std::vector<float> & output_scratch,
error_stats & total_error,
int max_thread = 0) {

assert(tensor_is_contiguous(layer));
error_stats layer_error {};
uint64_t nelements = ggml_nelements(layer);
// (previously: a loop over fixed SCRATCH_ELEMENTS-sized chunks that copied F16 values into input_scratch)

float* input_scratch_ptr = nullptr;
if (layer->type == GGML_TYPE_F16) {
if (input_scratch.size() < nelements) input_scratch.resize(nelements);
input_scratch_ptr = input_scratch.data();
}
if (quantized_scratch.size() < 4*nelements) quantized_scratch.resize(4*nelements);
if (output_scratch.size() < nelements) output_scratch.resize(nelements);

if (max_thread < 1) max_thread = std::thread::hardware_concurrency();
int chunk_size = 32*512;
int num_chunks = (nelements + chunk_size - 1)/chunk_size;

if (num_chunks < 2 || max_thread < 2) {
test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
output_scratch.data(), print_layer_stats ? layer_error : total_error);
} else {
auto & stats = print_layer_stats ? layer_error : total_error;
std::mutex mutex;
uint64_t counter = 0;
auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
&quantized_scratch, &output_scratch, chunk_size] () {
error_stats local_stats {};
while (true) {
std::unique_lock<std::mutex> lock(mutex);
uint64_t offset = counter; counter += chunk_size;
if (offset >= nelements) {
combine_error_stats(stats, local_stats);
break;
}
lock.unlock();
uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
}
// (previously, the serial loop body continued here: pick the raw F32 data pointer, run
//  quantize_row_q or quantize_row_q_reference and dequantize_row_q, then update_error_stats;
//  that work is now done per chunk by test_roundtrip_on_chunk above)
};
int nthread = std::min(num_chunks, max_thread);
std::vector<std::thread> workers(nthread-1);
for (auto& w : workers) w = std::thread(compute);
compute();
for (auto& w : workers) w.join();
}

if (print_layer_stats) {
print_error_stats(name, layer_error, false);
combine_error_stats(total_error, layer_error);
}
}

@@ -181,6 +240,7 @@ int main(int argc, char ** argv) {

// read command line

int max_thread = 0;
bool invalid_param = false;
std::string arg;
for (int i = 1; i < argc; i++) {
@@ -230,6 +290,12 @@ int main(int argc, char ** argv) {
fprintf(stderr, "error: %s not in list of types\n", argv[i]);
invalid_param = true;
}
} else if (arg == "-n" || arg == "--num-threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
max_thread = atoi(argv[i]);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
quantize_stats_print_usage(argc, argv);
@@ -295,9 +361,9 @@ int main(int argc, char ** argv) {
}
printf("testing %d layers with max size %" PRId64 "\n", included_layers, max_nelements);
// allocate scratch space
// (previously sized to SCRATCH_ELEMENTS up front; now resized per layer inside test_roundtrip_on_layer)
std::vector<float> input_scratch;
std::vector<char> quantized_scratch;
std::vector<float> output_scratch;

// loop through quantization types
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
@@ -328,10 +394,11 @@ int main(int argc, char ** argv) {
qfns,
params.reference,
kv_tensor.second,
// (previously: input_scratch.data(), quantized_scratch.data(), output_scratch.data(), global_stats)
input_scratch,
quantized_scratch,
output_scratch,
global_stats,
max_thread
);
}

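The worker loop added to test_roundtrip_on_layer hands out fixed 32*512-element chunks from a shared counter guarded by a mutex, accumulates into a thread-local error_stats, and merges the local stats into the shared ones once the counter runs past the tensor. Below is a rough standalone sketch of just that scheduling pattern; the simplified stats_t struct, the combine() helper, the element count, and the dummy per-chunk workload are placeholders for this illustration, not code from the PR.

#include <algorithm>
#include <cstdint>
#include <mutex>
#include <thread>
#include <vector>

// Simplified stand-in for error_stats: a sample count and a running sum.
struct stats_t { uint64_t num_samples = 0; double total = 0.0; };

static void combine(stats_t & into, const stats_t & from) {
    into.num_samples += from.num_samples;
    into.total       += from.total;
}

int main() {
    const uint64_t nelements  = 1000000;   // total number of work items (placeholder)
    const uint64_t chunk_size = 32 * 512;  // same chunk granularity as the PR
    stats_t    global_stats;
    std::mutex mutex;
    uint64_t   counter = 0;                // next unclaimed offset, guarded by mutex

    auto compute = [&]() {
        stats_t local;                     // per-thread accumulator, merged once at the end
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            uint64_t first = counter; counter += chunk_size;
            if (first >= nelements) {      // no work left: merge local stats and exit
                combine(global_stats, local);
                break;
            }
            lock.unlock();
            uint64_t n = std::min(chunk_size, nelements - first);
            // dummy workload standing in for test_roundtrip_on_chunk(...)
            local.num_samples += n;
            local.total       += double(n);
        }
    };

    int nthread = std::max(1u, std::thread::hardware_concurrency());
    std::vector<std::thread> workers(nthread - 1);
    for (auto & w : workers) w = std::thread(compute);
    compute();                             // the calling thread works too
    for (auto & w : workers) w.join();
    return global_stats.num_samples == nelements ? 0 : 1;
}
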
7 changes: 4 additions & 3 deletions examples/quantize/quantize.cpp
@@ -10,8 +10,8 @@
int main(int argc, char ** argv) {
ggml_time_init();

// (previously: if (argc != 4), with a usage string that had no [nthread] argument)
if (argc < 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
fprintf(stderr, " type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
fprintf(stderr, " type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
fprintf(stderr, " type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
@@ -30,6 +30,7 @@ int main(int argc, char ** argv) {
const std::string fname_out = argv[2];

const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
int nthread = argc > 4 ? atoi(argv[4]) : 0;

const int64_t t_main_start_us = ggml_time_us();

@@ -39,7 +40,7 @@ int main(int argc, char ** argv) {
{
const int64_t t_start_us = ggml_time_us();

// (previously called without the nthread argument)
if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), ftype, nthread)) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
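On the library side, the extra command-line argument maps to the new nthread parameter of llama_model_quantize; passing 0 lets the implementation fall back to std::thread::hardware_concurrency(), as the llama.cpp hunk further below shows. The following is a hedged sketch of the updated call from application code; it assumes llama.h declares the four-argument signature shown in that hunk (the header change itself is not part of this excerpt), and the file paths simply reuse the models/7B defaults seen elsewhere in the repo.

#include <cstdio>

#include "llama.h"

int main() {
    // Quantize to q4_0 with 8 threads; pass 0 as the last argument to let the
    // library pick the hardware thread count itself.
    const int ret = llama_model_quantize(
            "models/7B/ggml-model-f16.bin",   // input  (f16)
            "models/7B/ggml-model-q4_0.bin",  // output (quantized)
            LLAMA_FTYPE_MOSTLY_Q4_0,
            /*nthread =*/ 8);
    if (ret != 0) {
        fprintf(stderr, "quantization failed\n");
        return 1;
    }
    return 0;
}
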
27 changes: 27 additions & 0 deletions ggml.c
@@ -12189,6 +12189,33 @@ size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t *
return (n/QK4_3*sizeof(block_q4_3));
}

size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
size_t result = 0;
switch (type) {
case GGML_TYPE_Q4_0:
{
GGML_ASSERT(start % QK4_0 == 0);
block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
result = ggml_quantize_q4_0(src + start, block, n, n, hist);
} break;
case GGML_TYPE_Q4_1:
{
GGML_ASSERT(start % QK4_1 == 0);
block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
result = ggml_quantize_q4_1(src + start, block, n, n, hist);
} break;
case GGML_TYPE_Q4_2:
{
GGML_ASSERT(start % QK4_2 == 0);
block_q4_2 * block = (block_q4_2*)dst + start / QK4_2;
result = ggml_quantize_q4_2(src + start, block, n, n, hist);
} break;
default:
assert(false);
}
return result;
}

////////////////////////////////////////////////////////////////////////////////

int ggml_cpu_has_avx(void) {
2 changes: 2 additions & 0 deletions ggml.h
@@ -813,6 +813,8 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t *
size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist);

size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

//
// system info
//
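ggml_quantize_chunk quantizes an arbitrary [start, start + n) slice of a contiguous float buffer, which is what allows llama.cpp below to hand different slices of the same tensor to different threads. Here is a rough illustration of that usage with two threads; quantize_in_two_halves is a made-up helper for this example, and it assumes nelements is a multiple of 2*QK4_0 so both halves start on a block boundary, as the GGML_ASSERTs in ggml.c require.

#include <cstdint>
#include <thread>
#include <vector>

#include "ggml.h"

// Quantize nelements floats to Q4_0 with two threads, one half of the buffer each.
static size_t quantize_in_two_halves(const float * src, void * dst, int nelements) {
    const int half = nelements / 2;
    std::vector<int64_t> hist_a(1 << 4, 0), hist_b(1 << 4, 0);  // per-thread histograms

    size_t size_a = 0, size_b = 0;
    std::thread worker([&] {
        size_a = ggml_quantize_chunk(GGML_TYPE_Q4_0, src, dst, 0, half, hist_a.data());
    });
    size_b = ggml_quantize_chunk(GGML_TYPE_Q4_0, src, dst, half, half, hist_b.data());
    worker.join();

    // merge the per-thread histograms, the same way llama.cpp folds local_hist into hist_cur
    for (int i = 0; i < (1 << 4); ++i) hist_a[i] += hist_b[i];

    return size_a + size_b;  // total bytes written into dst
}
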
67 changes: 45 additions & 22 deletions llama.cpp
@@ -24,6 +24,9 @@
#include <memory>
#include <algorithm>
#include <initializer_list>
#include <thread>
#include <atomic>
#include <mutex>

#define LLAMA_USE_SCRATCH
#define LLAMA_MAX_SCRATCH_BUFFERS 16
@@ -1572,7 +1575,7 @@ static llama_vocab::id llama_sample_top_p_top_k(
// quantization
//

// (previously declared without the nthread parameter)
static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, enum llama_ftype ftype, int nthread) {
ggml_type quantized_type;
switch (ftype) {
case LLAMA_FTYPE_MOSTLY_Q4_0: quantized_type = GGML_TYPE_Q4_0; break;
@@ -1582,6 +1585,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
default: throw format("invalid output file type %d\n", ftype);
};

if (nthread <= 0) {
nthread = std::thread::hardware_concurrency();
}

std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp.c_str(), /*use_mmap*/ false,
/*vocab_only*/ false));
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loaders.at(0).get(), ftype);
@@ -1590,6 +1597,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
size_t total_size_new = 0;
std::vector<int64_t> hist_all(1 << 4, 0);

std::vector<std::thread> workers;
std::mutex mutex;

size_t idx = 0;
for (llama_load_tensor & tensor : model_loader->tensors_map.tensors) {
llama_buffer read_data;
@@ -1643,25 +1653,37 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
new_data = work.addr;
std::vector<int64_t> hist_cur(1 << 4, 0);

// (previously: a per-type switch that called ggml_quantize_q4_0 / q4_1 / q4_2 / q4_3 directly
//  on the whole tensor; quantization now goes through ggml_quantize_chunk, chunk by chunk)
int chunk_size = 32 * 512;
const int nchunk = (nelements + chunk_size - 1)/chunk_size;
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
if (nthread_use < 2) {
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
} else {
size_t counter = 0;
new_size = 0;
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
std::vector<int64_t> local_hist;
size_t local_size = 0;
while (true) {
std::unique_lock<std::mutex> lock(mutex);
size_t first = counter; counter += chunk_size;
if (first >= nelements) {
if (!local_hist.empty()) {
for (int j=0; j<int(local_hist.size()); ++j) hist_cur[j] += local_hist[j];
new_size += local_size;
}
break;
}
lock.unlock();
size_t last = std::min(nelements, first + chunk_size);
if (local_hist.empty()) local_hist.resize(hist_cur.size(), 0);
local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
}
};
if (int(workers.size()) < nthread_use - 1) workers.resize(nthread_use - 1);
for (int it = 0; it < nthread_use - 1; ++it) workers[it] = std::thread(compute);
compute();
for (int it = 0; it < nthread_use - 1; ++it) workers[it].join();
}

printf("size = %8.2f MB -> %8.2f MB | hist: ", tensor.size/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -1783,9 +1805,10 @@ void llama_free(struct llama_context * ctx) {
int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
// (previously: enum llama_ftype ftype, with no nthread parameter)
enum llama_ftype ftype,
int nthread) {
try {
// (previously called without the nthread argument)
llama_model_quantize_internal(fname_inp, fname_out, ftype, nthread);
return 0;
} catch (const std::string & err) {
fprintf(stderr, "%s: failed to quantize: %s\n", __func__, err.c_str());