quantize : use map to assign quantization type from string #1191

Merged · 1 commit · Apr 26, 2023
2 changes: 1 addition & 1 deletion .devops/tools.sh
@@ -23,7 +23,7 @@ elif [[ $arg1 == '--all-in-one' || $arg1 == '-a' ]]; then
echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
else
echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
./quantize "$i" "${i/f16/q4_0}" 2
./quantize "$i" "${i/f16/q4_0}" q4_0
fi
done
else
4 changes: 2 additions & 2 deletions README.md
@@ -203,8 +203,8 @@ python3 -m pip install -r requirements.txt
 # convert the 7B model to ggml FP16 format
 python3 convert.py models/7B/

-# quantize the model to 4-bits (using method 2 = q4_0)
-./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2
+# quantize the model to 4-bits (using q4_0 method)
+./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0

 # run the inference
 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
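
Note that the atoi() fallback (see quantize.cpp below) keeps the old numeric ids working, so after this change the two invocations below should be equivalent, 2 being the value of LLAMA_FTYPE_MOSTLY_Q4_0 per the old README line above:

./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin 2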
30 changes: 24 additions & 6 deletions examples/quantize/quantize.cpp
@@ -2,8 +2,17 @@
#include "llama.h"

#include <cstdio>
#include <map>
#include <string>

static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
{"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
{"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
{"q4_2", LLAMA_FTYPE_MOSTLY_Q4_2},
{"q4_3", LLAMA_FTYPE_MOSTLY_Q4_3},
{"q8_0", LLAMA_FTYPE_MOSTLY_Q8_0},
};

// usage:
// ./quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type
//
@@ -12,11 +21,9 @@ int main(int argc, char ** argv) {

     if (argc < 4) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type [nthread]\n", argv[0]);
-        fprintf(stderr, "  type = %d - q4_0\n", LLAMA_FTYPE_MOSTLY_Q4_0);
-        fprintf(stderr, "  type = %d - q4_1\n", LLAMA_FTYPE_MOSTLY_Q4_1);
-        fprintf(stderr, "  type = %d - q4_2\n", LLAMA_FTYPE_MOSTLY_Q4_2);
-        fprintf(stderr, "  type = %d - q4_3\n", LLAMA_FTYPE_MOSTLY_Q4_3);
-        fprintf(stderr, "  type = %d - q8_0\n", LLAMA_FTYPE_MOSTLY_Q8_0);
+        for (auto it = LLAMA_FTYPE_MAP.begin(); it != LLAMA_FTYPE_MAP.end(); it++) {
+            fprintf(stderr, "  type = \"%s\" or %d\n", it->first.c_str(), it->second);
+        }
         return 1;
     }

@@ -30,7 +37,18 @@ int main(int argc, char ** argv) {
     const std::string fname_inp = argv[1];
     const std::string fname_out = argv[2];

-    const enum llama_ftype ftype = (enum llama_ftype)atoi(argv[3]);
+    enum llama_ftype ftype;
+    if (argv[3][0] == 'q') {
+        auto it = LLAMA_FTYPE_MAP.find(argv[3]);
+        if (it == LLAMA_FTYPE_MAP.end()) {
+            fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, argv[3]);
+            return 1;
+        }
+        ftype = it->second;
+    } else {
+        ftype = (enum llama_ftype)atoi(argv[3]);
+    }

     int nthread = argc > 4 ? atoi(argv[4]) : 0;

     const int64_t t_main_start_us = ggml_time_us();
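
For reference, a minimal self-contained sketch of the same string-or-number parsing pattern, compilable on its own. The stub enum values are stand-ins: Q4_0 = 2 matches the old README ("method 2 = q4_0"), the Q4_1 value is illustrative; the real definitions live in llama.h.

// Minimal, self-contained sketch of the string-or-number ftype parsing
// introduced above. The enum values here are stand-ins: Q4_0 = 2 matches
// the old README ("method 2 = q4_0"); the others are illustrative only.
#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>

enum llama_ftype {
    LLAMA_FTYPE_MOSTLY_Q4_0 = 2,
    LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // illustrative value
};

static const std::map<std::string, enum llama_ftype> LLAMA_FTYPE_MAP = {
    {"q4_0", LLAMA_FTYPE_MOSTLY_Q4_0},
    {"q4_1", LLAMA_FTYPE_MOSTLY_Q4_1},
};

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s type\n", argv[0]);
        return 1;
    }
    enum llama_ftype ftype;
    if (argv[1][0] == 'q') {
        // name form: look it up in the map, reject unknown names
        auto it = LLAMA_FTYPE_MAP.find(argv[1]);
        if (it == LLAMA_FTYPE_MAP.end()) {
            fprintf(stderr, "unknown ftype '%s'\n", argv[1]);
            return 1;
        }
        ftype = it->second;
    } else {
        // numeric form: old behavior, kept for backward compatibility
        ftype = (enum llama_ftype)atoi(argv[1]);
    }
    printf("ftype = %d\n", ftype);
    return 0;
}

One design note implied by the patch: routing every argument that starts with 'q' through the map means a misspelled name such as "q4_o" fails with an explicit error, rather than falling through to atoi(), which would silently return 0 and select an unintended ftype.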