diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 7e601170e925..9803c6a2aa3c 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -173,6 +173,11 @@ def write_tensors(self):
 
     def write(self):
         self.write_tensors()
+
+        if self.model_arch == gguf.MODEL_ARCH.COMMAND_R_PLUS:
+            sys.stderr.write("""
+                Warning: The 'command-r-plus' architecture will be removed and only exists in Noeda branch. GGUFs in this arch will be incompatible. Please use the branch in https://github.com/ggerganov/llama.cpp/pull/6491 or main 'llama.cpp' branch once it is merged. If you distribute the files you are about to create, know that they will not be compatible with mainline llama.cpp.\n""")
+
         self.gguf_writer.write_header_to_file()
         self.gguf_writer.write_kv_data_to_file()
         self.gguf_writer.write_tensors_to_file()
@@ -2340,11 +2345,26 @@ class CommandR2Model(Model):
     model_arch = gguf.MODEL_ARCH.COMMAND_R
 
     def __init__(self, *args, **kwargs):
+        # Command-R+ (a different model from Command-R) is published under the
+        # same transformers class, CohereForCausalLM. What sets it apart is the
+        # new use_qk_norm hparam, so peek at the hparams before the base class
+        # is initialised and switch to the Command-R+ model arch when it is on.
+        dir_model = kwargs['dir_model'] if 'dir_model' in kwargs else args[0]
+        peeked = Model.load_hparams(dir_model)
+        try:
+            qk_norm_enabled = bool(peeked['use_qk_norm'])
+        except KeyError:
+            qk_norm_enabled = False
+        del peeked
+        if qk_norm_enabled:
+            self.model_arch = gguf.MODEL_ARCH.COMMAND_R_PLUS
+
         super().__init__(*args, **kwargs)
 
         # max_position_embeddings = 8192 in config.json but model was actually
         # trained on 128k context length
-        self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
+        if 'model_max_length' in self.hparams:
+            self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -2423,6 +2443,8 @@ def main() -> None:
         model_class = Model.from_model_architecture(hparams["architectures"][0])
         model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian)
 
+        print(f"Model architecture: {model_instance.model_arch.name}")
+
         print("Set model parameters")
         model_instance.set_gguf_parameters()
 
diff --git a/ggml.c b/ggml.c
index c9b0a6a0ef77..d93ad8d9e73b 100644
--- a/ggml.c
+++ b/ggml.c
@@ -3238,6 +3238,10 @@ void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t *
     }
 }
 
+int64_t ggml_ravel_index(const struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+    return i0 + tensor->ne[0]*(i1 + tensor->ne[1]*(i2 + tensor->ne[2]*i3)); // flat element index; dimension 0 varies fastest
+}
+
 int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {
     if (!ggml_is_contiguous(tensor)) {
         int64_t id[4] = { 0, 0, 0, 0 };
@@ -11539,6 +11543,8 @@ static void ggml_compute_forward_scale(
             } break;
         default:
             {
+                fprintf(stderr, "%s\n", ggml_get_name(src0));
+                fprintf(stderr, "%s\n", ggml_get_name(dst));
                 GGML_ASSERT(false);
             } break;
     }
@@ -11655,7 +11661,12 @@ static void ggml_compute_forward_set(
         case GGML_TYPE_IQ2_S:
         default:
             {
-                GGML_ASSERT(false);
+                if (src0->type >= 0 && src0->type < GGML_TYPE_COUNT) {
+                    fprintf(stderr, "Unknown type for tensor %s: %d (%s)\n", ggml_get_name(dst), src0->type, ggml_type_name(src0->type));
+                } else {
+                    fprintf(stderr, "Unknown type for tensor %s: %d (out of GGML_TYPE_COUNT range)\n", ggml_get_name(dst), src0->type);
+                }
+                GGML_ASSERT(false && "unknown type");
             } break;
     }
 }
@@ -20335,12 +20346,19 @@ size_t ggml_quantize_chunk(
                    int   nrows,
                    int   n_per_row,
            const float * imatrix) {
-    const int n = nrows * n_per_row;
+    const size_t n = (size_t) nrows * n_per_row;
 
     if (ggml_quantize_requires_imatrix(type)) {
         GGML_ASSERT(imatrix != NULL);
     }
 
+    // TODO: remove when we know we handle tensors bigger than 2**31-1
+    // properly. A lot of quant code uses 'int's to count rather than
+    // size_t or int64_t etc.
+    if (n > INT_MAX) {
+        fprintf(stderr, "warning: a tensor has more than 2**31-1 elements: %ld this can cause silent corruption.\n", (long int) n);
+    }
+
     GGML_ASSERT(start % type_traits[type].blck_size == 0);
     GGML_ASSERT(start % n_per_row == 0);
 
diff --git a/ggml.h b/ggml.h
index 5cef45c0ba4a..108cc82f055a 100644
--- a/ggml.h
+++ b/ggml.h
@@ -826,6 +826,7 @@ extern "C" {
 
     // Converts a flat index into coordinates
     GGML_API void    ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
+    GGML_API int64_t ggml_ravel_index(const struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, int64_t i3);
 
     GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
     GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 5214764a9ea9..ec84a0d60d4e 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -97,34 +97,35 @@ class Tokenizer:
 
 
 class MODEL_ARCH(IntEnum):
-    LLAMA      = auto()
-    FALCON     = auto()
-    BAICHUAN   = auto()
-    GROK       = auto()
-    GPT2       = auto()
-    GPTJ       = auto()
-    GPTNEOX    = auto()
-    MPT        = auto()
-    STARCODER  = auto()
-    PERSIMMON  = auto()
-    REFACT     = auto()
-    BERT       = auto()
-    NOMIC_BERT = auto()
-    BLOOM      = auto()
-    STABLELM   = auto()
-    QWEN       = auto()
-    QWEN2      = auto()
-    PHI2       = auto()
-    PLAMO      = auto()
-    CODESHELL  = auto()
-    ORION      = auto()
-    INTERNLM2  = auto()
-    MINICPM    = auto()
-    GEMMA      = auto()
-    STARCODER2 = auto()
-    MAMBA      = auto()
-    XVERSE     = auto()
-    COMMAND_R  = auto()
+    LLAMA           = auto()
+    FALCON          = auto()
+    BAICHUAN        = auto()
+    GROK            = auto()
+    GPT2            = auto()
+    GPTJ            = auto()
+    GPTNEOX         = auto()
+    MPT             = auto()
+    STARCODER       = auto()
+    PERSIMMON       = auto()
+    REFACT          = auto()
+    BERT            = auto()
+    NOMIC_BERT      = auto()
+    BLOOM           = auto()
+    STABLELM        = auto()
+    QWEN            = auto()
+    QWEN2           = auto()
+    PHI2            = auto()
+    PLAMO           = auto()
+    CODESHELL       = auto()
+    ORION           = auto()
+    INTERNLM2       = auto()
+    MINICPM         = auto()
+    GEMMA           = auto()
+    STARCODER2      = auto()
+    MAMBA           = auto()
+    XVERSE          = auto()
+    COMMAND_R       = auto()
+    COMMAND_R_PLUS  = auto()  # temporary: convert-hf-to-gguf.py warns this arch will be removed
 
 
 class MODEL_TENSOR(IntEnum):
@@ -194,6 +195,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.MAMBA:          "mamba",
     MODEL_ARCH.XVERSE:         "xverse",
     MODEL_ARCH.COMMAND_R:      "command-r",
+    MODEL_ARCH.COMMAND_R_PLUS: "command-r-plus",
 }
 
 TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
@@ -639,6 +641,20 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.COMMAND_R_PLUS: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q_NORM,
+        MODEL_TENSOR.ATTN_K_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 345b1b0c7221..18d9352f6cfa 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -286,12 +286,14 @@ class TensorNameMap:
             "language_model.encoder.layers.{bid}.self_attention.q_layernorm",
             "model.layers.{bid}.self_attn.q_layernorm",                       # persimmon
             "transformer.blocks.{bid}.attn.q_ln",                             # sea-lion
+            "model.layers.{bid}.self_attn.q_norm",                            # command-r+
         ),
 
         MODEL_TENSOR.ATTN_K_NORM: (
             "language_model.encoder.layers.{bid}.self_attention.k_layernorm",
             "model.layers.{bid}.self_attn.k_layernorm",                       # persimmon
             "transformer.blocks.{bid}.attn.k_ln",                             # sea-lion
+            "model.layers.{bid}.self_attn.k_norm",                            # command-r+
         ),
 
         MODEL_TENSOR.ROPE_FREQS: (
diff --git a/llama.cpp b/llama.cpp
index 9a1c11043b94..438763da0181 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -220,39 +220,41 @@ enum llm_arch {
     LLM_ARCH_MAMBA,
     LLM_ARCH_XVERSE,
     LLM_ARCH_COMMAND_R,
+    LLM_ARCH_COMMAND_R_PLUS,
     LLM_ARCH_UNKNOWN,
 };
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
-    { LLM_ARCH_LLAMA,           "llama"      },
-    { LLM_ARCH_FALCON,          "falcon"     },
-    { LLM_ARCH_GROK,            "grok"       },
-    { LLM_ARCH_GPT2,            "gpt2"       },
-    { LLM_ARCH_GPTJ,            "gptj"       },
-    { LLM_ARCH_GPTNEOX,         "gptneox"    },
-    { LLM_ARCH_MPT,             "mpt"        },
-    { LLM_ARCH_BAICHUAN,        "baichuan"   },
-    { LLM_ARCH_STARCODER,       "starcoder"  },
-    { LLM_ARCH_PERSIMMON,       "persimmon"  },
-    { LLM_ARCH_REFACT,          "refact"     },
-    { LLM_ARCH_BERT,            "bert"       },
-    { LLM_ARCH_NOMIC_BERT,      "nomic-bert" },
-    { LLM_ARCH_BLOOM,           "bloom"      },
-    { LLM_ARCH_STABLELM,        "stablelm"   },
-    { LLM_ARCH_QWEN,            "qwen"       },
-    { LLM_ARCH_QWEN2,           "qwen2"      },
-    { LLM_ARCH_PHI2,            "phi2"       },
-    { LLM_ARCH_PLAMO,           "plamo"      },
-    { LLM_ARCH_CODESHELL,       "codeshell"  },
-    { LLM_ARCH_ORION,           "orion"      },
-    { LLM_ARCH_INTERNLM2,       "internlm2"  },
-    { LLM_ARCH_MINICPM,         "minicpm"    },
-    { LLM_ARCH_GEMMA,           "gemma"      },
-    { LLM_ARCH_STARCODER2,      "starcoder2" },
-    { LLM_ARCH_MAMBA,           "mamba"      },
-    { LLM_ARCH_XVERSE,          "xverse"     },
-    { LLM_ARCH_COMMAND_R,       "command-r"  },
-    { LLM_ARCH_UNKNOWN,         "(unknown)"  },
+    { LLM_ARCH_LLAMA,           "llama"           },
+    { LLM_ARCH_FALCON,          "falcon"          },
+    { LLM_ARCH_GROK,            "grok"            },
+    { LLM_ARCH_GPT2,            "gpt2"            },
+    { LLM_ARCH_GPTJ,            "gptj"            },
+    { LLM_ARCH_GPTNEOX,         "gptneox"         },
+    { LLM_ARCH_MPT,             "mpt"             },
+    { LLM_ARCH_BAICHUAN,        "baichuan"        },
+    { LLM_ARCH_STARCODER,       "starcoder"       },
+    { LLM_ARCH_PERSIMMON,       "persimmon"       },
+    { LLM_ARCH_REFACT,          "refact"          },
+    { LLM_ARCH_BERT,            "bert"            },
+    { LLM_ARCH_NOMIC_BERT,      "nomic-bert"      },
+    { LLM_ARCH_BLOOM,           "bloom"           },
+    { LLM_ARCH_STABLELM,        "stablelm"        },
+    { LLM_ARCH_QWEN,            "qwen"            },
+    { LLM_ARCH_QWEN2,           "qwen2"           },
+    { LLM_ARCH_PHI2,            "phi2"            },
+    { LLM_ARCH_PLAMO,           "plamo"           },
+    { LLM_ARCH_CODESHELL,       "codeshell"       },
+    { LLM_ARCH_ORION,           "orion"           },
+    { LLM_ARCH_INTERNLM2,       "internlm2"       },
+    { LLM_ARCH_MINICPM,         "minicpm"         },
+    { LLM_ARCH_GEMMA,           "gemma"           },
+    { LLM_ARCH_STARCODER2,      "starcoder2"      },
+    { LLM_ARCH_MAMBA,           "mamba"           },
+    { LLM_ARCH_XVERSE,          "xverse"          },
+    { LLM_ARCH_COMMAND_R,       "command-r"       },
+    { LLM_ARCH_COMMAND_R_PLUS,  "command-r-plus"  }, // temporary: the model loader warns this arch will be removed
+    { LLM_ARCH_UNKNOWN,         "(unknown)"       },
 };
 
 enum llm_kv {
@@ -926,6 +928,23 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NA
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_COMMAND_R_PLUS,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q_NORM,     "blk.%d.attn_q_norm"},
+            { LLM_TENSOR_ATTN_K_NORM,     "blk.%d.attn_k_norm"},
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1692,6 +1711,7 @@ enum e_model {
     MODEL_40B,
     MODEL_65B,
     MODEL_70B,
+    MODEL_104B,
     MODEL_314B,
     MODEL_SMALL,
     MODEL_MEDIUM,
@@ -2928,6 +2948,11 @@ struct llama_model_loader {
         get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
         llm_kv = LLM_KV(llm_arch_from_string(arch_name));
 
+
+        if (arch_name == "command-r-plus") {
+            fprintf(stderr, "Warning: The 'command-r-plus' architecture will be removed and only exists in Noeda branch. GGUFs in this arch will be incompatible. Please use the branch in https://github.com/ggerganov/llama.cpp/pull/6491 or main 'llama.cpp' branch once it is merged.\n");
+        }
+
         // Save tensors data offset of the main file.
         // For subsidiary files, `meta` tensor data offset must not be used,
         // so we build a unified tensors index for weights.
@@ -3543,6 +3568,7 @@ static const char * llama_model_type_name(e_model type) {
         case MODEL_40B:    return "40B";
         case MODEL_65B:    return "65B";
         case MODEL_70B:    return "70B";
+        case MODEL_104B:   return "104B";
         case MODEL_314B:   return "314B";
         case MODEL_SMALL:  return "0.1B";
         case MODEL_MEDIUM: return "0.4B";
@@ -3953,11 +3979,13 @@ static void llm_load_hparams(
                 }
             } break;
         case LLM_ARCH_COMMAND_R:
+        case LLM_ARCH_COMMAND_R_PLUS:
             {
                 ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 switch (hparams.n_layer) {
                     case 40: model.type = e_model::MODEL_35B; break;
+                    case 64: model.type = e_model::MODEL_104B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
@@ -5384,6 +5412,7 @@ static bool llm_load_tensors(
                     }
                 } break;
             case LLM_ARCH_COMMAND_R:
+            case LLM_ARCH_COMMAND_R_PLUS:
                 {
                     model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
@@ -5402,8 +5431,17 @@ static bool llm_load_tensors(
 
                         auto & layer = model.layers[i];
 
+                        const int64_t n_embd_head_k = hparams.n_embd_head_k;
+                        const int64_t n_head = hparams.n_head;
+                        const int64_t n_head_kv = hparams.n_head_kv;
+
                         layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 
+                        if (model.arch == LLM_ARCH_COMMAND_R_PLUS) {
+                            layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k, n_head});
+                            layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k, n_head_kv});
+                        }
+
                         layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,   "weight", i), {n_embd, n_embd});
                         layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K,   "weight", i), {n_embd, n_embd_gqa});
                         layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa});
@@ -5726,6 +5764,283 @@ static void llm_build_kv_store(
     ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view));
 }
 
+static struct ggml_tensor * llama_stitch_2d_grid_of_tensors_together(
+    struct     ggml_context  * ctx,
+    struct     ggml_tensor  ** tensors,
+    size_t                     n_a,
+    size_t                     n_b)
+{
+    // (ab)use ggml_concat to stitch together a grid of 2D tensors into one
+    // big 2D tensor. Grid cell (i, j) is tensors[i * n_b + j].
+    //
+    // ggml_concat only supports concatenating on the third dimension.
+    //
+    // 1. For each row in the 2D grid:
+    //      Reshape 2D dimension to 3D dimension
+    //      Concatenate along the third dimension
+    //      Reshape back to 2D dimension
+    // 2. This gets us N tensors, where N is number of rows, concatenated.
+    //      Do the same thing:
+    //        Transpose the tensor
+    //        Reshape 2D to 3D
+    //        Concat
+    //        Reshape back
+
+    ggml_tensor * concatted_whole = nullptr; // stays nullptr when n_a == 0, and is dereferenced after the loop: the grid must be non-empty
+    for (size_t i = 0; i < n_a; ++i) {
+        ggml_tensor * concatted_row = nullptr;
+        for (size_t j = 0; j < n_b; ++j) {
+            struct ggml_tensor * cur = tensors[i * n_b + j];
+            if (concatted_row == nullptr) {
+                concatted_row = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]);
+            } else {
+                concatted_row = ggml_concat(ctx, concatted_row, ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]));
+            }
+        }
+        concatted_row = ggml_cont(ctx, concatted_row);
+        // concatted_row now holds the whole grid row as (ne0, 1, summed ne1 of the row's blocks), given the concat behaviour noted above.
+        if (concatted_whole == nullptr) {
+            concatted_row = ggml_reshape_3d(ctx, concatted_row, concatted_row->ne[0], concatted_row->ne[2], 1);
+            concatted_row = ggml_transpose(ctx, concatted_row);
+            concatted_row = ggml_cont(ctx, concatted_row);
+            concatted_whole = ggml_reshape_3d(ctx, concatted_row, concatted_row->ne[0], 1, concatted_row->ne[1]);
+        } else {
+            concatted_row = ggml_reshape_3d(ctx, concatted_row, concatted_row->ne[0], concatted_row->ne[2], 1);
+            concatted_row = ggml_transpose(ctx, concatted_row);
+            concatted_row = ggml_cont(ctx, concatted_row);
+            concatted_whole = ggml_concat(ctx, concatted_whole, ggml_reshape_3d(ctx, concatted_row, concatted_row->ne[0], 1, concatted_row->ne[1]));
+        }
+        // concatted_whole accumulates the transposed grid rows, again joined through the third dimension.
+    }
+    concatted_whole = ggml_reshape_3d(ctx, concatted_whole, concatted_whole->ne[0], concatted_whole->ne[2], 1);
+    concatted_whole = ggml_cont(ctx, ggml_transpose(ctx, concatted_whole));
+    // after this last transpose, blocks from different grid rows sit next to each other along ne[0] and blocks from different grid columns along ne[1].
+    return concatted_whole;
+}
+
+static struct ggml_tensor * llama_build_mat_mul_blocked_computation(
+    /*
+     * Does (almost) the same thing as ggml_mul_mat mathematically speaking,
+     * but splits the computation into blocks and stitches the block results
+     * back together into one tensor. Returns that stitched tensor.
+     *
+     * Why would you want to do this? As part of Command-R+ coding, we
+     * discovered that quite a bit of the GPU code is not prepared for
+     * matrices with more than 2**31-1 elements (~2 billion).
+     *
+     * Some context:
+     * https://github.com/ggerganov/llama.cpp/pull/6491
+     *
+     * The block size is chosen so that no block of 'a', 'b' or the result
+     * exceeds a byte limit (MAX_BYTES_BEFORE_SPLIT, set to 2B). The idea is
+     * to minimize the chance that somewhere downstream in GPU code, be it
+     * MPS or CUDA, something like "int x = y * z;" overflows and then
+     * silently (or not so silently) does something weird. At the time of
+     * writing (2024-04-05) it seems that CUDA code outright crashes and MPS
+     * silently gives bad results.
+     *
+     * This is a band-aid workaround. The ideal state of the world is that
+     * this function does nothing but "return ggml_mul_mat(ctx, a, b)".
+     *
+     * a, b: the operands; both must be plain 2D (ne[2] == ne[3] == 1).
+     * cb, il: the intermediate tensors are reported via cb(tensor, name, il).
+     * forced_block_size: for debugging and testing only. A non-zero value
+     * is used as the block size as-is; zero lets the function determine
+     * the block size on the fly. Production code should always pass
+     * zero.
+     */
+    struct     ggml_context * ctx,
+    struct     ggml_tensor  * a,
+    struct     ggml_tensor  * b,
+    const     llm_build_cb  & cb,
+    int64_t                   il,
+    size_t                    forced_block_size)
+{
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    if (forced_block_size != 0) {
+        //fprintf(stderr, "warning: llama_build_mat_mul_blocked_computation() forced block size: %zu\n", forced_block_size);
+    }
+
+    const size_t MAX_BYTES_BEFORE_SPLIT = 2000000000;
+
+    // the actual ggml_mul_mat supports batching. But this one doesn't.
+    GGML_ASSERT(a->ne[2] == 1 && b->ne[2] == 1);
+    GGML_ASSERT(a->ne[3] == 1 && b->ne[3] == 1);
+
+    // bail out if the number of elements would be zero.
+    // nicer than getting a segfault.
+    for (int i = 0; i < GGML_MAX_DIMS; ++i) {
+        GGML_ASSERT(a->ne[i] > 0 && "Matrix multiplication with a 0-side length matrix ('a').");
+        GGML_ASSERT(b->ne[i] > 0 && "Matrix multiplication with a 0-side length matrix ('b').");
+    }
+
+    // Use the max size of: a, b, result size
+    const size_t a_rows = a->ne[1];
+    const size_t a_cols = a->ne[0];
+
+    // b is transposed
+    const size_t b_rows = b->ne[0];
+    const size_t b_cols = b->ne[1];
+
+    const size_t c_rows = a_rows;
+    const size_t c_cols = b_cols;
+
+    // determine a size of a block that's as big as possible.
+    // we start from the smallest power of two that covers the largest
+    // dimension and shrink it until the size checks below pass.
+    // NOTE(review): the blocked path is always taken; there is no plain ggml_mul_mat() shortcut.
+    // the block is square.
+    size_t cand_block_size = a_rows;
+    if (a_cols > cand_block_size) { cand_block_size = a_cols; }
+    if (b_rows > cand_block_size) { cand_block_size = b_rows; }
+    if (b_cols > cand_block_size) { cand_block_size = b_cols; }
+    if (c_rows > cand_block_size) { cand_block_size = c_rows; }
+    if (c_cols > cand_block_size) { cand_block_size = c_cols; }
+
+    size_t block_size = 1;
+    while (block_size < cand_block_size) {
+        block_size <<= 1;
+    }
+
+    if (forced_block_size != 0) {
+        block_size = forced_block_size;
+    } else {
+        // figure out what is largest block_size we can use that will never
+        // have an intermediate result bigger than
+        // MAX_BYTES_BEFORE_SPLIT
+        bool ok = true;
+        while (block_size > 0) {
+            ok = true;
+
+            // keep the byte calculations in sync with the blocked code in
+            // the computation part.
+
+            // Criteria:
+            // 1. result block size
+            {
+                const size_t i_min = 0;
+                const size_t j_min = 0;
+                size_t i_max = i_min + block_size;
+                size_t j_max = j_min + block_size;
+                if (i_max > a_rows) { i_max = a_rows; }
+                if (j_max > b_cols) { j_max = b_cols; }
+
+                const size_t bytes_size = sizeof(float) * (i_max - i_min) * (j_max - j_min); // estimated at sizeof(float) per element
+                if (bytes_size > MAX_BYTES_BEFORE_SPLIT) {
+                    ok = false;
+                }
+            }
+            // 2. and 3.
+            // Block size from 'a' and 'b'
+            {
+                const size_t i_min = 0;
+                const size_t j_min = 0;
+                const size_t k_min = 0;
+
+                size_t i_max = i_min + block_size;
+                size_t j_max = j_min + block_size;
+                size_t k_max = k_min + block_size;
+
+                if (i_max > a_rows) { i_max = a_rows; }
+                if (j_max > b_cols) { j_max = b_cols; }
+                if (k_max > a_cols) { k_max = a_cols; }
+
+                const size_t bytes_size_a = sizeof(float) * (k_max - k_min) * (i_max - i_min); // sizeof(float) is used here, not a->type's size
+                const size_t bytes_size_b = sizeof(float) * (k_max - k_min) * (j_max - j_min); // sizeof(float) is used here, not b->type's size
+
+                if (bytes_size_a > MAX_BYTES_BEFORE_SPLIT || bytes_size_b > MAX_BYTES_BEFORE_SPLIT) {
+                    ok = false;
+                }
+            }
+
+            if (!ok) {
+                block_size /= 2;
+                continue;
+            }
+            break;
+        }
+        block_size /= 2; // NOTE(review): halves once more after the search; a block_size of 1 becomes 0 here and trips the assert below
+        GGML_ASSERT(block_size > 0);
+    }
+
+
+    // O(N^3) nested loop, where N is number of blocks on one of the
+    // constituent parts.
+    size_t nb_A = (a_rows + block_size - 1) / block_size;
+    size_t nb_B = (b_cols + block_size - 1) / block_size;
+    size_t nb_A2 = (a_cols + block_size - 1) / block_size;
+
+    // make placeholder tensors for each block results.
+    // 2D: (row, col) -> offset is: (x, y) -> x * nb_B + y
+    struct ggml_tensor ** result_blocks = (struct ggml_tensor **) calloc(nb_A * nb_B, sizeof(struct ggml_tensor *)); // NOTE(review): the calloc result is not checked for NULL
+    for (size_t i = 0; i < nb_A; ++i) {
+        for (size_t j = 0; j < nb_B; ++j) {
+            for (size_t k = 0; k < nb_A2; ++k) {
+                const size_t i_min = i * block_size;
+                const size_t j_min = j * block_size;
+                const size_t k_min = k * block_size;
+
+                size_t i_max = i_min + block_size;
+                size_t j_max = j_min + block_size;
+                size_t k_max = k_min + block_size;
+                if (i_max > a_rows) { i_max = a_rows; }
+                if (j_max > b_cols) { j_max = b_cols; }
+                if (k_max > a_cols) { k_max = a_cols; }
+
+                const size_t blck_size_a = (const size_t) ggml_blck_size(a->type);
+                const size_t blck_size_b = (const size_t) ggml_blck_size(b->type);
+                const size_t type_size_a = ggml_type_size(a->type);
+                const size_t type_size_b = ggml_type_size(b->type);
+
+                GGML_ASSERT(k_min * type_size_a % blck_size_a == 0);
+                GGML_ASSERT(k_min * type_size_b % blck_size_b == 0);
+
+                struct ggml_tensor * a_slice = ggml_cont(ctx, ggml_view_2d(
+                        ctx, a,
+                        k_max - k_min,
+                        i_max - i_min,
+                        ggml_row_size(a->type, a->ne[0]),
+                        ggml_row_size(a->type, a->ne[0]) * i_min + k_min * type_size_a / blck_size_a));
+
+                cb(a_slice, "a_slice", il);
+
+                struct ggml_tensor * b_slice = ggml_cont(ctx, ggml_view_2d(
+                        ctx, b,
+                        k_max - k_min,
+                        j_max - j_min,
+                        ggml_row_size(b->type, b->ne[0]),
+                        ggml_row_size(b->type, b->ne[0]) * j_min + k_min * type_size_b / blck_size_b));
+
+                cb(b_slice, "b_slice", il);
+
+                struct ggml_tensor * mm_result = ggml_mul_mat(ctx, a_slice, b_slice);
+                cb(mm_result, "mm_result", il);
+
+                if (result_blocks[i * nb_B + j] == nullptr) {
+                    result_blocks[i * nb_B + j] = mm_result;
+                } else {
+                    result_blocks[i * nb_B + j] = ggml_add_inplace(ctx, result_blocks[i * nb_B + j], mm_result);
+                }
+
+                cb(result_blocks[i * nb_B + j], "result_slice", il);
+            }
+        }
+    }
+
+    // concatenate the results into one chonky tensor.
+    struct ggml_tensor * result = llama_stitch_2d_grid_of_tensors_together(
+            ctx,
+            result_blocks,
+            nb_A,
+            nb_B);
+    cb(result, "result-stitched", il);
+
+    free(result_blocks);
+
+    return result;
+}
+
 static struct ggml_tensor * llm_build_norm(
         struct ggml_context * ctx,
          struct ggml_tensor * cur,
@@ -9405,9 +9720,11 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+        const int64_t n_head = hparams.n_head;
         const float f_logit_scale = hparams.f_logit_scale;
 
+        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
 
@@ -9432,26 +9749,64 @@ struct llm_build_context {
             {
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
-                cb(Qcur, "Qcur", il);
+                cb(Qcur, "Qcur-first", il);
                 if (model.layers[il].bq) {
                     Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                     cb(Qcur, "Qcur", il);
                 }
 
                 struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
-                cb(Kcur, "Kcur", il);
+                cb(Kcur, "Kcur-first", il);
                 if (model.layers[il].bk) {
                     Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                     cb(Kcur, "Kcur", il);
                 }
 
                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
-                cb(Vcur, "Vcur", il);
+                cb(Vcur, "Vcur-first", il);
                 if (model.layers[il].bv) {
                     Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                     cb(Vcur, "Vcur", il);
                 }
 
+                if (model.layers[il].attn_q_norm)
+                {
+                    // Noeda hacks; TODO: remove everything extra you don't
+                    // actually need. If you see this comment in a PR then
+                    // someone forgot to clean up the hacks.
+                    struct ggml_tensor * attn_q_norm = model.layers[il].attn_q_norm;
+                    struct ggml_tensor * attn_k_norm = model.layers[il].attn_k_norm;
+
+                    // CPU did not like F16, so cast to F32
+                    attn_q_norm = ggml_cast(ctx0, attn_q_norm, GGML_TYPE_F32);
+                    cb(attn_q_norm, "attn_q_norm_cast_F32", il);
+                    attn_k_norm = ggml_cast(ctx0, attn_k_norm, GGML_TYPE_F32);
+                    cb(attn_k_norm, "attn_k_norm_cast_F32", il);
+
+                    Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens,
+                                ggml_row_size(Qcur->type, n_embd_head),
+                                ggml_row_size(Qcur->type, n_embd_head) * n_head,
+                                0);
+                    cb(Qcur, "Qcur", il);
+                    Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens,
+                                ggml_row_size(Kcur->type, n_embd_head),
+                                ggml_row_size(Kcur->type, n_embd_head) * n_head_kv,
+                                0);
+                    cb(Kcur, "Kcur", il);
+
+                    Qcur = llm_build_norm(ctx0, Qcur, hparams,
+                                attn_q_norm,
+                                NULL,
+                                LLM_NORM, cb, il);
+                    cb(Qcur, "Qcur-normed", il);
+
+                    Kcur = llm_build_norm(ctx0, Kcur, hparams,
+                            attn_k_norm,
+                            NULL,
+                            LLM_NORM, cb, il);
+                    cb(Kcur, "Kcur-normed", il);
+                }
+
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
                     n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
@@ -9509,8 +9864,8 @@ struct llm_build_context {
         cb(cur, "result_norm", -1);
 
         // lm_head
-        cur = ggml_mul_mat(ctx0, model.output, cur);
-
+        //cur = ggml_mul_mat(ctx0, model.output, cur);
+        cur = llama_build_mat_mul_blocked_computation(ctx0, model.output, cur, cb, -1, 0);
         if (f_logit_scale) {
             cur = ggml_scale(ctx0, cur, f_logit_scale);
         }
@@ -9716,6 +10071,7 @@ static struct ggml_cgraph * llama_build_graph(
                 result = llm.build_xverse();
             } break;
         case LLM_ARCH_COMMAND_R:
+        case LLM_ARCH_COMMAND_R_PLUS:
             {
                 result = llm.build_command_r();
             } break;
@@ -10267,6 +10623,7 @@ static int llama_decode_internal(
         //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
         ggml_backend_sched_reset(lctx.sched);
+
         ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data);
 
         ggml_cgraph * gf = llama_build_graph(lctx, u_batch, false);
@@ -13062,9 +13419,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         return std::make_pair(i_layer, n_layer);
     };
 
+    // Command-R+ has such a large embedding weight tensor that it overflows
+    // 32-bit signed integers. This is a band-aid until quants can deal with
+    // that.
+    if (name == "token_embd.weight" && arch == LLM_ARCH_COMMAND_R_PLUS) {
+        new_type = GGML_TYPE_F16;
+    }
     // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
     // with the quantization of the output tensor
-    if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
+    else if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) {
         if (qs.params->output_tensor_type < GGML_TYPE_COUNT) {
             new_type = qs.params->output_tensor_type;
         } else {
@@ -13096,6 +13459,15 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
                 new_type = GGML_TYPE_IQ3_S;
             }
         }
+    } else if ((arch == LLM_ARCH_COMMAND_R ||
+                arch == LLM_ARCH_COMMAND_R_PLUS) &&
+               (name.find("q_norm") != std::string::npos ||
+                name.find("k_norm") != std::string::npos)) {
+        new_type = GGML_TYPE_F32;
+    } else if (arch == LLM_ARCH_COMMAND_R_PLUS && name.find("attn_k") != std::string::npos) {
+        new_type = GGML_TYPE_F16;
+    } else if (arch == LLM_ARCH_COMMAND_R_PLUS && name.find("attn_v") != std::string::npos) {
+        new_type = GGML_TYPE_F16;
     } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
                ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M    || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) {
         if (name.find("attn_v.weight") != std::string::npos) {
@@ -14253,6 +14625,10 @@ struct llama_context * llama_new_context_with_model(
         type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states
     }
 
+    if (model->arch == LLM_ARCH_COMMAND_R_PLUS) {
+            fprintf(stderr, "Warning: The 'command-r-plus' architecture will be removed and only exists in Noeda branch. GGUFs in this arch will be incompatible. Please use the branch in https://github.com/ggerganov/llama.cpp/pull/6491 or main 'llama.cpp' branch once it is merged.\n");
+    }
+
     GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
     GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
 
@@ -14525,6 +14901,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_XVERSE:
         case LLM_ARCH_COMMAND_R:
+        case LLM_ARCH_COMMAND_R_PLUS:
             return LLAMA_ROPE_TYPE_NORM;
 
         // the pairs of head values are offset by n_rot/2