diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 7e601170e925..9803c6a2aa3c 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -173,6 +173,11 @@ def write_tensors(self): def write(self): self.write_tensors() + + if self.model_arch == gguf.MODEL_ARCH.COMMAND_R_PLUS: + sys.stderr.write(""" + Warning: The 'command-r-plus' architecture will be removed and only exists in Noeda branch. GGUFs in this arch will be incompatible. Please use the branch in https://github.com/ggerganov/llama.cpp/pull/6491 or main 'llama.cpp' branch once it is merged. If you distribute the files you are about to create, know that they will not be compatible with mainline llama.cpp.\n""") + self.gguf_writer.write_header_to_file() self.gguf_writer.write_kv_data_to_file() self.gguf_writer.write_tensors_to_file() @@ -2340,11 +2345,26 @@ class CommandR2Model(Model): model_arch = gguf.MODEL_ARCH.COMMAND_R def __init__(self, *args, **kwargs): + # Hack: Command-R+ (not to be confused with Command-R) uses the same + # transformers class CohereForCausalLM, but has a new use_qk_norm + # parameter. We can use that to detect it and use the Command-R+ model + # arch instead. 
+ dir_model = None + if 'dir_model' in kwargs: + dir_model = kwargs['dir_model'] + else: + dir_model = args[0] + hparams = Model.load_hparams(dir_model) + if 'use_qk_norm' in hparams and hparams['use_qk_norm']: + self.model_arch = gguf.MODEL_ARCH.COMMAND_R_PLUS + del hparams + super().__init__(*args, **kwargs) # max_position_embeddings = 8192 in config.json but model was actually # trained on 128k context length - self.hparams["max_position_embeddings"] = self.hparams["model_max_length"] + if 'model_max_length' in self.hparams: + self.hparams["max_position_embeddings"] = self.hparams["model_max_length"] def set_gguf_parameters(self): super().set_gguf_parameters() @@ -2423,6 +2443,8 @@ def main() -> None: model_class = Model.from_model_architecture(hparams["architectures"][0]) model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian) + print(f"Model architecture: {model_instance.model_arch.name}") + print("Set model parameters") model_instance.set_gguf_parameters() diff --git a/ggml.c b/ggml.c index c9b0a6a0ef77..d93ad8d9e73b 100644 --- a/ggml.c +++ b/ggml.c @@ -3238,6 +3238,10 @@ void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * } } +int64_t ggml_ravel_index(const struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) { + return i0 + i1*tensor->ne[0] + i2*tensor->ne[1]*tensor->ne[0] + i3*tensor->ne[2]*tensor->ne[1]*tensor->ne[0]; +} + int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { if (!ggml_is_contiguous(tensor)) { int64_t id[4] = { 0, 0, 0, 0 }; @@ -11539,6 +11543,8 @@ static void ggml_compute_forward_scale( } break; default: { + fprintf(stderr, "%s\n", ggml_get_name(src0)); + fprintf(stderr, "%s\n", ggml_get_name(dst)); GGML_ASSERT(false); } break; } @@ -11655,7 +11661,12 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ2_S: default: { - GGML_ASSERT(false); + if (src0->type >= 0 && src0->type < GGML_TYPE_COUNT) { + fprintf(stderr, "Unknown type for 
tensor %s: %d (%s)\n", ggml_get_name(dst), src0->type, ggml_type_name(src0->type)); + } else { + fprintf(stderr, "Unknown type for tensor %s: %d (out of GGML_TYPE_COUNT range)\n", ggml_get_name(dst), src0->type); + } + GGML_ASSERT(false && "unknown type"); } break; } } @@ -20335,12 +20346,19 @@ size_t ggml_quantize_chunk( int nrows, int n_per_row, const float * imatrix) { - const int n = nrows * n_per_row; + const size_t n = (size_t) nrows * n_per_row; if (ggml_quantize_requires_imatrix(type)) { GGML_ASSERT(imatrix != NULL); } + // TODO: remove when we know we handle tensors bigger than 2**31-1 + // properly. A lot of quant code uses 'int's to count rather than + // size_t or int64_t etc. (n is printed with %zu: a cast to long would truncate on platforms where long is 32-bit.) + if (n > INT_MAX) { + fprintf(stderr, "warning: a tensor has more than 2**31-1 elements: %zu this can cause silent corruption.\n", n); + } + GGML_ASSERT(start % type_traits[type].blck_size == 0); GGML_ASSERT(start % n_per_row == 0); diff --git a/ggml.h b/ggml.h index 5cef45c0ba4a..108cc82f055a 100644 --- a/ggml.h +++ b/ggml.h @@ -826,6 +826,7 @@ extern "C" { // Converts a flat index into coordinates GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3); + GGML_API int64_t ggml_ravel_index(const struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3); GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 5214764a9ea9..ec84a0d60d4e 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -97,34 +97,35 @@ class Tokenizer: class MODEL_ARCH(IntEnum): - LLAMA = auto() - FALCON = auto() - BAICHUAN = auto() - GROK = auto() - GPT2 = auto() - GPTJ = auto() - GPTNEOX = auto() - MPT = auto() - STARCODER = auto() - PERSIMMON = auto() - REFACT = auto() - BERT = auto() - NOMIC_BERT = 
auto() - BLOOM = auto() - STABLELM = auto() - QWEN = auto() - QWEN2 = auto() - PHI2 = auto() - PLAMO = auto() - CODESHELL = auto() - ORION = auto() - INTERNLM2 = auto() - MINICPM = auto() - GEMMA = auto() - STARCODER2 = auto() - MAMBA = auto() - XVERSE = auto() - COMMAND_R = auto() + LLAMA = auto() + FALCON = auto() + BAICHUAN = auto() + GROK = auto() + GPT2 = auto() + GPTJ = auto() + GPTNEOX = auto() + MPT = auto() + STARCODER = auto() + PERSIMMON = auto() + REFACT = auto() + BERT = auto() + NOMIC_BERT = auto() + BLOOM = auto() + STABLELM = auto() + QWEN = auto() + QWEN2 = auto() + PHI2 = auto() + PLAMO = auto() + CODESHELL = auto() + ORION = auto() + INTERNLM2 = auto() + MINICPM = auto() + GEMMA = auto() + STARCODER2 = auto() + MAMBA = auto() + XVERSE = auto() + COMMAND_R = auto() + COMMAND_R_PLUS = auto() class MODEL_TENSOR(IntEnum): @@ -194,6 +195,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.MAMBA: "mamba", MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.COMMAND_R: "command-r", + MODEL_ARCH.COMMAND_R_PLUS: "command-r-plus", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -639,6 +641,20 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.COMMAND_R_PLUS: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 345b1b0c7221..18d9352f6cfa 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -286,12 +286,14 @@ class TensorNameMap: "language_model.encoder.layers.{bid}.self_attention.q_layernorm", "model.layers.{bid}.self_attn.q_layernorm", # persimmon "transformer.blocks.{bid}.attn.q_ln", # sea-lion + "model.layers.{bid}.self_attn.q_norm", # command-r+ ), 
MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", "model.layers.{bid}.self_attn.k_layernorm", # persimmon "transformer.blocks.{bid}.attn.k_ln", # sea-lion + "model.layers.{bid}.self_attn.k_norm", # command-r+ ), MODEL_TENSOR.ROPE_FREQS: ( diff --git a/llama.cpp b/llama.cpp index 9a1c11043b94..438763da0181 100644 --- a/llama.cpp +++ b/llama.cpp @@ -220,39 +220,41 @@ enum llm_arch { LLM_ARCH_MAMBA, LLM_ARCH_XVERSE, LLM_ARCH_COMMAND_R, + LLM_ARCH_COMMAND_R_PLUS, LLM_ARCH_UNKNOWN, }; static const std::map LLM_ARCH_NAMES = { - { LLM_ARCH_LLAMA, "llama" }, - { LLM_ARCH_FALCON, "falcon" }, - { LLM_ARCH_GROK, "grok" }, - { LLM_ARCH_GPT2, "gpt2" }, - { LLM_ARCH_GPTJ, "gptj" }, - { LLM_ARCH_GPTNEOX, "gptneox" }, - { LLM_ARCH_MPT, "mpt" }, - { LLM_ARCH_BAICHUAN, "baichuan" }, - { LLM_ARCH_STARCODER, "starcoder" }, - { LLM_ARCH_PERSIMMON, "persimmon" }, - { LLM_ARCH_REFACT, "refact" }, - { LLM_ARCH_BERT, "bert" }, - { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, - { LLM_ARCH_BLOOM, "bloom" }, - { LLM_ARCH_STABLELM, "stablelm" }, - { LLM_ARCH_QWEN, "qwen" }, - { LLM_ARCH_QWEN2, "qwen2" }, - { LLM_ARCH_PHI2, "phi2" }, - { LLM_ARCH_PLAMO, "plamo" }, - { LLM_ARCH_CODESHELL, "codeshell" }, - { LLM_ARCH_ORION, "orion" }, - { LLM_ARCH_INTERNLM2, "internlm2" }, - { LLM_ARCH_MINICPM, "minicpm" }, - { LLM_ARCH_GEMMA, "gemma" }, - { LLM_ARCH_STARCODER2, "starcoder2" }, - { LLM_ARCH_MAMBA, "mamba" }, - { LLM_ARCH_XVERSE, "xverse" }, - { LLM_ARCH_COMMAND_R, "command-r" }, - { LLM_ARCH_UNKNOWN, "(unknown)" }, + { LLM_ARCH_LLAMA, "llama" }, + { LLM_ARCH_FALCON, "falcon" }, + { LLM_ARCH_GROK, "grok" }, + { LLM_ARCH_GPT2, "gpt2" }, + { LLM_ARCH_GPTJ, "gptj" }, + { LLM_ARCH_GPTNEOX, "gptneox" }, + { LLM_ARCH_MPT, "mpt" }, + { LLM_ARCH_BAICHUAN, "baichuan" }, + { LLM_ARCH_STARCODER, "starcoder" }, + { LLM_ARCH_PERSIMMON, "persimmon" }, + { LLM_ARCH_REFACT, "refact" }, + { LLM_ARCH_BERT, "bert" }, + { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, + { LLM_ARCH_BLOOM, 
"bloom" }, + { LLM_ARCH_STABLELM, "stablelm" }, + { LLM_ARCH_QWEN, "qwen" }, + { LLM_ARCH_QWEN2, "qwen2" }, + { LLM_ARCH_PHI2, "phi2" }, + { LLM_ARCH_PLAMO, "plamo" }, + { LLM_ARCH_CODESHELL, "codeshell" }, + { LLM_ARCH_ORION, "orion" }, + { LLM_ARCH_INTERNLM2, "internlm2" }, + { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_GEMMA, "gemma" }, + { LLM_ARCH_STARCODER2, "starcoder2" }, + { LLM_ARCH_MAMBA, "mamba" }, + { LLM_ARCH_XVERSE, "xverse" }, + { LLM_ARCH_COMMAND_R, "command-r" }, + { LLM_ARCH_COMMAND_R_PLUS, "command-r-plus" }, + { LLM_ARCH_UNKNOWN, "(unknown)" }, }; enum llm_kv { @@ -926,6 +928,23 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_COMMAND_R_PLUS, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"}, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"}, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1692,6 +1711,7 @@ enum e_model { MODEL_40B, MODEL_65B, MODEL_70B, + MODEL_104B, MODEL_314B, MODEL_SMALL, MODEL_MEDIUM, @@ -2928,6 +2948,11 @@ struct llama_model_loader { get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); + + if (arch_name == "command-r-plus") { + fprintf(stderr, "Warning: The 'command-r-plus' architecture will be removed and only exists in Noeda branch. GGUFs in this arch will be incompatible. Please use the branch in https://github.com/ggerganov/llama.cpp/pull/6491 or main 'llama.cpp' branch once it is merged.\n"); + } + // Save tensors data offset of the main file. 
// For subsidiary files, `meta` tensor data offset must not be used, // so we build a unified tensors index for weights. @@ -3543,6 +3568,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_40B: return "40B"; case MODEL_65B: return "65B"; case MODEL_70B: return "70B"; + case MODEL_104B: return "104B"; case MODEL_314B: return "314B"; case MODEL_SMALL: return "0.1B"; case MODEL_MEDIUM: return "0.4B"; @@ -3953,11 +3979,13 @@ static void llm_load_hparams( } } break; case LLM_ARCH_COMMAND_R: + case LLM_ARCH_COMMAND_R_PLUS: { ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); switch (hparams.n_layer) { case 40: model.type = e_model::MODEL_35B; break; + case 64: model.type = e_model::MODEL_104B; break; default: model.type = e_model::MODEL_UNKNOWN; } } break; @@ -5384,6 +5412,7 @@ static bool llm_load_tensors( } } break; case LLM_ARCH_COMMAND_R: + case LLM_ARCH_COMMAND_R_PLUS: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -5402,8 +5431,17 @@ static bool llm_load_tensors( auto & layer = model.layers[i]; + const int64_t n_embd_head_k = hparams.n_embd_head_k; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + if (model.arch == LLM_ARCH_COMMAND_R_PLUS) { + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head}); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv}); + } + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); @@ -5726,6 
+5764,283 @@ static void llm_build_kv_store( ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view)); } +static struct ggml_tensor * llama_stitch_2d_grid_of_tensors_together( + struct ggml_context * ctx, + struct ggml_tensor ** tensors, + size_t n_a, + size_t n_b) +{ + // (ab)use ggml_concat to stitch together a grid of 2D tensors into one + // big 1D tensor. + // + // ggml_concat only supports concatenating on the third dimension. + // + // 1. For each row in the 2D grid: + // Reshape 2D dimension to 3D dimension + // Concatenate along the third dimension + // Reshape back to 2D dimension + // 2. This gets us N tensors, where N is number of rows, concatenated. + // Do the same thing: + // Transpose the tensor + // Reshape 2D to 3D + // Concat + // Reshape back + + ggml_tensor * concatted_whole = nullptr; + for (size_t i = 0; i < n_a; ++i) { + ggml_tensor * concatted_row = nullptr; + for (size_t j = 0; j < n_b; ++j) { + struct ggml_tensor * cur = tensors[i * n_b + j]; + if (concatted_row == nullptr) { + concatted_row = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]); + } else { + concatted_row = ggml_concat(ctx, concatted_row, ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1])); + } + } + concatted_row = ggml_cont(ctx, concatted_row); + //printf("concatted row dims: %d %d %d %d\n", concatted_row->ne[0], concatted_row->ne[1], concatted_row->ne[2], concatted_row->ne[3]); + if (concatted_whole == nullptr) { + concatted_row = ggml_reshape_3d(ctx, concatted_row, concatted_row->ne[0], concatted_row->ne[2], 1); + concatted_row = ggml_transpose(ctx, concatted_row); + concatted_row = ggml_cont(ctx, concatted_row); + concatted_whole = ggml_reshape_3d(ctx, concatted_row, concatted_row->ne[0], 1, concatted_row->ne[1]); + } else { + concatted_row = ggml_reshape_3d(ctx, concatted_row, concatted_row->ne[0], concatted_row->ne[2], 1); + concatted_row = ggml_transpose(ctx, concatted_row); + concatted_row = ggml_cont(ctx, concatted_row); + concatted_whole = 
ggml_concat(ctx, concatted_whole, ggml_reshape_3d(ctx, concatted_row, concatted_row->ne[0], 1, concatted_row->ne[1])); + } + //printf("concatted whole dims: %d %d %d %d\n", concatted_whole->ne[0], concatted_whole->ne[1], concatted_whole->ne[2], concatted_whole->ne[3]); + } + concatted_whole = ggml_reshape_3d(ctx, concatted_whole, concatted_whole->ne[0], concatted_whole->ne[2], 1); + concatted_whole = ggml_cont(ctx, ggml_transpose(ctx, concatted_whole)); + //printf("final concatted whole dims: %d %d %d %d\n", concatted_whole->ne[0], concatted_whole->ne[1], concatted_whole->ne[2], concatted_whole->ne[3]); + return concatted_whole; +} + +static struct ggml_tensor * llama_build_mat_mul_blocked_computation( + /* + * Does (almost) the same thing as ggml_mul_mat mathematically speaking, + * but splits the computation into chunks. + * + * Why would you want to do this? As part of Command-R+ coding, we + * discovered that quite a bit of the GPU code is not prepared for + * matrices with more than 2**31-1 elements (~2 billion). + * + * Some context: + * https://github.com/ggerganov/llama.cpp/pull/6491 + * + * This function has a limit (set to 2B) that if any constituent parts + * of it (input, output, result) would go over that limit byte-wise, + * it'll use the split computation. This is based on the idea that + * this minimizes the chance that somewhere downstream in GPU code, be + * it MPS or CUDA, has something like: int x = y * z; where the values + * of y and z overflow the multiplication and then silently (or not so + * silently) does something weird. At the time of writing (2024-04-05), + * it seems that CUDA code outright crashes and MPS silently gives bad + * results. + * + * This is a band-aid workaround. The ideal state of the world is that + * this function does nothing but "return ggml_mul_mat(ctx, a, b)". + * + * The last argument (forced_block_size) is for debugging. You can + * force a certain block size to use with the computation. 
If zero + * (default) then the block size is determined on the fly. Production + * code should always have it zero; and only set it to a non-zero value + * for debugging and testing. + */ + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + const llm_build_cb & cb, + int64_t il, + size_t forced_block_size) +{ + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + if (forced_block_size != 0) { + //fprintf(stderr, "warning: llama_build_mat_mul_blocked_computation() forced block size: %zu\n", forced_block_size); + } + + const size_t MAX_BYTES_BEFORE_SPLIT = 2000000000; + + // the actual ggml_mul_mat supports batching. But this one doesn't. + GGML_ASSERT(a->ne[2] == 1 && b->ne[2] == 1); + GGML_ASSERT(a->ne[3] == 1 && b->ne[3] == 1); + + // bail out if the number of elements would be zero. + // nicer than getting a segfault. + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + GGML_ASSERT(a->ne[i] > 0 && "Matrix multiplication with a 0-side length matrix ('a')."); + GGML_ASSERT(b->ne[i] > 0 && "Matrix multiplication with a 0-side length matrix ('b')."); + } + + // Use the max size of: a, b, result size + const size_t a_rows = a->ne[1]; + const size_t a_cols = a->ne[0]; + + // b is transposed + const size_t b_rows = b->ne[0]; + const size_t b_cols = b->ne[1]; + + const size_t c_rows = a_rows; + const size_t c_cols = b_cols; + + // determine a size of a block that's as big as possible. + // we start with block size of the maximum size, and if that passes, + // then we just use ggml_mul_mat() (NOTE(review): block_size is halved once more after the search loop below, so the unsplit path looks unreachable - confirm intent) + // + // the block is square. 
+ size_t cand_block_size = a_rows; + if (a_cols > cand_block_size) { cand_block_size = a_cols; } + if (b_rows > cand_block_size) { cand_block_size = b_rows; } + if (b_cols > cand_block_size) { cand_block_size = b_cols; } + if (c_rows > cand_block_size) { cand_block_size = c_rows; } + if (c_cols > cand_block_size) { cand_block_size = c_cols; } + + size_t block_size = 1; + while (block_size < cand_block_size) { + block_size <<= 1; + } + + if (forced_block_size != 0) { + block_size = forced_block_size; + } else { + // figure out what is largest block_size we can use that will never + // have an intermediate result bigger than + // MAX_BYTES_BEFORE_SPLIT + bool ok = true; + while (block_size > 0) { + ok = true; + + // keep the byte calculations in sync with the blocked code in + // the computation part. + + // Criteria: + // 1. result block size + { + const size_t i_min = 0; + const size_t j_min = 0; + size_t i_max = i_min + block_size; + size_t j_max = j_min + block_size; + if (i_max > a_rows) { i_max = a_rows; } + if (j_max > b_cols) { j_max = b_cols; } + + const size_t bytes_size = sizeof(float) * (i_max - i_min) * (j_max - j_min); + if (bytes_size > MAX_BYTES_BEFORE_SPLIT) { + ok = false; + } + } + // 2. and 3. 
+ // Block size from 'a' and 'b' + { + const size_t i_min = 0; + const size_t j_min = 0; + const size_t k_min = 0; + + size_t i_max = i_min + block_size; + size_t j_max = j_min + block_size; + size_t k_max = k_min + block_size; + + if (i_max > a_rows) { i_max = a_rows; } + if (j_max > b_cols) { j_max = b_cols; } + if (k_max > a_cols) { k_max = a_cols; } + + const size_t bytes_size_a = sizeof(float) * (k_max - k_min) * (i_max - i_min); + const size_t bytes_size_b = sizeof(float) * (k_max - k_min) * (j_max - j_min); + + if (bytes_size_a > MAX_BYTES_BEFORE_SPLIT || bytes_size_b > MAX_BYTES_BEFORE_SPLIT) { + ok = false; + } + } + + if (!ok) { + block_size /= 2; + continue; + } + break; + } + block_size /= 2; + GGML_ASSERT(block_size > 0); + } + + + // O(N^3) nested loop, where N is number of blocks on one of the + // constituent parts. + size_t nb_A = (a_rows + block_size - 1) / block_size; + size_t nb_B = (b_cols + block_size - 1) / block_size; + size_t nb_A2 = (a_cols + block_size - 1) / block_size; + + // make placeholder tensors for each block results. 
+ // 2D: (row, col) -> offset is: (x, y) -> x * nb_B + y (NOTE(review): the calloc() result below is indexed without a NULL check - confirm whether an assert is wanted) + struct ggml_tensor ** result_blocks = (struct ggml_tensor **) calloc(nb_A * nb_B, sizeof(struct ggml_tensor *)); + for (size_t i = 0; i < nb_A; ++i) { + for (size_t j = 0; j < nb_B; ++j) { + for (size_t k = 0; k < nb_A2; ++k) { + const size_t i_min = i * block_size; + const size_t j_min = j * block_size; + const size_t k_min = k * block_size; + + size_t i_max = i_min + block_size; + size_t j_max = j_min + block_size; + size_t k_max = k_min + block_size; + if (i_max > a_rows) { i_max = a_rows; } + if (j_max > b_cols) { j_max = b_cols; } + if (k_max > a_cols) { k_max = a_cols; } + + const size_t blck_size_a = (const size_t) ggml_blck_size(a->type); + const size_t blck_size_b = (const size_t) ggml_blck_size(b->type); + const size_t type_size_a = ggml_type_size(a->type); + const size_t type_size_b = ggml_type_size(b->type); + + GGML_ASSERT(k_min * type_size_a % blck_size_a == 0); + GGML_ASSERT(k_min * type_size_b % blck_size_b == 0); + + struct ggml_tensor * a_slice = ggml_cont(ctx, ggml_view_2d( + ctx, a, + k_max - k_min, + i_max - i_min, + ggml_row_size(a->type, a->ne[0]), + ggml_row_size(a->type, a->ne[0]) * i_min + k_min * type_size_a / blck_size_a)); + + cb(a_slice, "a_slice", il); + + struct ggml_tensor * b_slice = ggml_cont(ctx, ggml_view_2d( + ctx, b, + k_max - k_min, + j_max - j_min, + ggml_row_size(b->type, b->ne[0]), + ggml_row_size(b->type, b->ne[0]) * j_min + k_min * type_size_b / blck_size_b)); + + cb(b_slice, "b_slice", il); + + struct ggml_tensor * mm_result = ggml_mul_mat(ctx, a_slice, b_slice); + cb(mm_result, "mm_result", il); + + if (result_blocks[i * nb_B + j] == nullptr) { + result_blocks[i * nb_B + j] = mm_result; + } else { + result_blocks[i * nb_B + j] = ggml_add_inplace(ctx, result_blocks[i * nb_B + j], mm_result); + } + + cb(result_blocks[i * nb_B + j], "result_slice", il); + } + } + } + + // concatenate the results into one chonky tensor. 
+ struct ggml_tensor * result = llama_stitch_2d_grid_of_tensors_together( + ctx, + result_blocks, + nb_A, + nb_B); + cb(result, "result-stitched", il); + + free(result_blocks); + + return result; +} + static struct ggml_tensor * llm_build_norm( struct ggml_context * ctx, struct ggml_tensor * cur, @@ -9405,9 +9720,11 @@ struct llm_build_context { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + const int64_t n_head = hparams.n_head; const float f_logit_scale = hparams.f_logit_scale; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -9432,26 +9749,64 @@ struct llm_build_context { { // compute Q and K and RoPE them struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); - cb(Qcur, "Qcur", il); + cb(Qcur, "Qcur-first", il); if (model.layers[il].bq) { Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); cb(Qcur, "Qcur", il); } struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); - cb(Kcur, "Kcur", il); + cb(Kcur, "Kcur-first", il); if (model.layers[il].bk) { Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); cb(Kcur, "Kcur", il); } struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); - cb(Vcur, "Vcur", il); + cb(Vcur, "Vcur-first", il); if (model.layers[il].bv) { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); } + if (model.layers[il].attn_q_norm) + { + // Noeda hacks; TODO: remove everything extra you don't + // actually need. If you see this comment in a PR then + // someone forgot to clean up the hacks. 
+ struct ggml_tensor * attn_q_norm = model.layers[il].attn_q_norm; + struct ggml_tensor * attn_k_norm = model.layers[il].attn_k_norm; + + // CPU did not like F16, so cast to F32 + attn_q_norm = ggml_cast(ctx0, attn_q_norm, GGML_TYPE_F32); + cb(attn_q_norm, "attn_q_norm_cast_F32", il); + attn_k_norm = ggml_cast(ctx0, attn_k_norm, GGML_TYPE_F32); + cb(attn_k_norm, "attn_k_norm_cast_F32", il); + + Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, + ggml_row_size(Qcur->type, n_embd_head), + ggml_row_size(Qcur->type, n_embd_head) * n_head, + 0); + cb(Qcur, "Qcur", il); + Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens, + ggml_row_size(Kcur->type, n_embd_head), + ggml_row_size(Kcur->type, n_embd_head) * n_head_kv, + 0); + cb(Kcur, "Kcur", il); + + Qcur = llm_build_norm(ctx0, Qcur, hparams, + attn_q_norm, + NULL, + LLM_NORM, cb, il); + cb(Qcur, "Qcur-normed", il); + + Kcur = llm_build_norm(ctx0, Kcur, hparams, + attn_k_norm, + NULL, + LLM_NORM, cb, il); + cb(Kcur, "Kcur-normed", il); + } + Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, @@ -9509,8 +9864,8 @@ struct llm_build_context { cb(cur, "result_norm", -1); // lm_head - cur = ggml_mul_mat(ctx0, model.output, cur); - + //cur = ggml_mul_mat(ctx0, model.output, cur); + cur = llama_build_mat_mul_blocked_computation(ctx0, model.output, cur, cb, -1, 0); if (f_logit_scale) { cur = ggml_scale(ctx0, cur, f_logit_scale); } @@ -9716,6 +10071,7 @@ static struct ggml_cgraph * llama_build_graph( result = llm.build_xverse(); } break; case LLM_ARCH_COMMAND_R: + case LLM_ARCH_COMMAND_R_PLUS: { result = llm.build_command_r(); } break; @@ -10267,6 +10623,7 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); ggml_backend_sched_reset(lctx.sched); + ggml_backend_sched_set_eval_callback(lctx.sched, 
lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); ggml_cgraph * gf = llama_build_graph(lctx, u_batch, false); @@ -13062,9 +13419,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n return std::make_pair(i_layer, n_layer); }; + // Command-R+ has such a large embedding weight tensor it overflows + // 32-bit signed integers. This is band-aid until quants can deal with + // that. + if (name == "token_embd.weight" && arch == LLM_ARCH_COMMAND_R_PLUS) { + new_type = GGML_TYPE_F16; + } // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings // with the quantization of the output tensor - if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) { + else if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) { if (qs.params->output_tensor_type < GGML_TYPE_COUNT) { new_type = qs.params->output_tensor_type; } else { @@ -13096,6 +13459,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n new_type = GGML_TYPE_IQ3_S; } } + } else if ((arch == LLM_ARCH_COMMAND_R || + arch == LLM_ARCH_COMMAND_R_PLUS) && + (name.find("q_norm") != std::string::npos || + name.find("k_norm") != std::string::npos)) { + new_type = GGML_TYPE_F32; + } else if (arch == LLM_ARCH_COMMAND_R_PLUS && name.find("attn_k") != std::string::npos) { + new_type = GGML_TYPE_F16; + } else if (arch == LLM_ARCH_COMMAND_R_PLUS && name.find("attn_v") != std::string::npos) { + new_type = GGML_TYPE_F16; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (name.find("attn_v.weight") != std::string::npos) { @@ -14253,6 +14625,10 @@ struct llama_context * llama_new_context_with_model( type_v = 
GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states } + if (model->arch == LLM_ARCH_COMMAND_R_PLUS) { + fprintf(stderr, "Warning: The 'command-r-plus' architecture will be removed and only exists in Noeda branch. GGUFs in this arch will be incompatible. Please use the branch in https://github.com/ggerganov/llama.cpp/pull/6491 or main 'llama.cpp' branch once it is merged.\n"); + } + GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); @@ -14525,6 +14901,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_MINICPM: case LLM_ARCH_XVERSE: case LLM_ARCH_COMMAND_R: + case LLM_ARCH_COMMAND_R_PLUS: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2