diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3fef12a6..17923928 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -22,11 +22,16 @@ jobs:
# cmake should figure out OS and ARCH automatically when running build.sh (but we need mvn compile for it)
run: |
mvn compile
- .github/build.sh
+ .github/build.sh -DLLAMA_VERBOSE=ON
- name: Download model
run: curl -L ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
- name: Run tests
run: mvn test
+ - if: failure()
+ uses: actions/upload-artifact@v3
+ with:
+ path: ${{ github.workspace }}/hs_err_pid*.log
+ if-no-files-found: warn
build-and-test-macos:
name: ${{ matrix.target.runner }}
@@ -37,11 +42,11 @@ jobs:
target:
- {
runner: macos-13,
- cmake: '-DLLAMA_METAL=OFF'
+ cmake: '-DLLAMA_METAL=OFF -DLLAMA_VERBOSE=ON'
}
- {
runner: macos-14,
- cmake: '-DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_METAL=OFF'
+ cmake: '-DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_METAL=OFF -DLLAMA_VERBOSE=ON'
}
steps:
- uses: actions/checkout@v4
@@ -70,8 +75,13 @@ jobs:
- name: Build libraries
run: |
mvn compile
- .github\build.bat
+ .github\build.bat -DLLAMA_VERBOSE=ON
- name: Download model
run: curl -L $env:MODEL_URL --create-dirs -o models/$env:MODEL_NAME
- name: Run tests
run: mvn test
+ - if: failure()
+ uses: actions/upload-artifact@v3
+ with:
+ path: ${{ github.workspace }}\hs_err_pid*.log
+ if-no-files-found: warn
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index fc88d112..7d01ef41 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -137,25 +137,24 @@ jobs:
- name: Run tests
run: mvn test
- # disabled for now, we don't have access to a macos arm64 runner and testing on x86_64 doesn't work
-# test-macos:
-# name: Test Mac
-# needs: build-macos-native
-# runs-on: macos-latest
-# steps:
-# - uses: actions/checkout@v4
-# - uses: actions/download-artifact@v3
-# with:
-# name: artifacts
-# path: ${{ github.workspace }}/src/main/resources/de/kherud/llama/
-# - name: Download model
-# run: curl -L ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
-# - uses: actions/setup-java@v4
-# with:
-# distribution: 'zulu'
-# java-version: '11'
-# - name: Run tests
-# run: mvn test
+ test-macos:
+ name: Test Mac
+ needs: build-macos-native
+ runs-on: macos-14
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/download-artifact@v3
+ with:
+ name: artifacts
+ path: ${{ github.workspace }}/src/main/resources/de/kherud/llama/
+ - name: Download model
+ run: curl -L ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
+ - uses: actions/setup-java@v4
+ with:
+ distribution: 'zulu'
+ java-version: '11'
+ - name: Run tests
+ run: mvn test
test-windows:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 550759f2..43a0c725 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,6 @@ project(jllama CXX)
include(FetchContent)
set(BUILD_SHARED_LIBS ON)
-set(LLAMA_STATIC OFF)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
option(LLAMA_VERBOSE "llama: verbose output" OFF)
@@ -24,7 +23,7 @@ FetchContent_MakeAvailable(json)
FetchContent_Declare(
llama.cpp
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
- GIT_TAG b3008
+ GIT_TAG b3534
)
FetchContent_MakeAvailable(llama.cpp)
@@ -98,11 +97,12 @@ target_compile_definitions(jllama PRIVATE
)
if(OS_NAME STREQUAL "Windows")
- set_target_properties(jllama llama PROPERTIES
+ set_target_properties(jllama llama ggml PROPERTIES
+ RUNTIME_OUTPUT_DIRECTORY_DEBUG ${JLLAMA_DIR}
RUNTIME_OUTPUT_DIRECTORY_RELEASE ${JLLAMA_DIR}
)
else()
- set_target_properties(jllama llama PROPERTIES
+ set_target_properties(jllama llama ggml PROPERTIES
LIBRARY_OUTPUT_DIRECTORY ${JLLAMA_DIR}
)
endif()
diff --git a/README.md b/README.md
index 2f2d2dfd..60a1dcec 100644
--- a/README.md
+++ b/README.md
@@ -3,8 +3,7 @@
# Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp)
-The main goal of llama.cpp is to run the LLaMA model using 4-bit integer quantization on a MacBook.
-This repository provides Java bindings for the C++ library.
+Inference of Meta's LLaMA model (and others) in pure C/C++.
**You are welcome to contribute**
@@ -32,14 +31,14 @@ Access this library via Maven:
```
-There are multiple [examples](src/test/java/examples):
+There are multiple [examples](src/test/java/examples).
### No Setup required
We support CPU inference for the following platforms out of the box:
- Linux x86-64, aarch64
-- MacOS x86-64, aarch64 (M1)
+- MacOS x86-64, aarch64 (M-series)
- Windows x86-64, x64, arm (32 bit)
If any of these match your platform, you can include the Maven dependency and get started.
@@ -47,82 +46,53 @@ If any of these match your platform, you can include the Maven dependency and ge
### Setup required
If none of the above listed platforms matches yours, currently you have to compile the library yourself (also if you
-want GPU acceleration, see below).
+want GPU acceleration).
-This requires:
+This consists of two steps: 1) compiling the libraries and 2) putting them in the right location.
-- Git
-- A C++11 conforming compiler
-- The [cmake](https://www.cmake.org/) build system
-- Java, Maven, and setting [JAVA_HOME](https://www.baeldung.com/java-home-on-windows-7-8-10-mac-os-x-linux)
+#### Library Compilation
-Make sure everything works by running
-
-```
-g++ -v # depending on your compiler
-java -version
-mvn -v
-echo $JAVA_HOME # for linux/macos
-echo %JAVA_HOME% # for windows
-```
-
-Then, checkout [llama.cpp](https://github.com/ggerganov/llama.cpp) to know which build arguments to use (e.g. for CUDA support).
-Finally, you have to run following commands in the directory of this repository (java-llama.cpp).
-Remember to add your build arguments in the fourth line (`cmake ..`):
+First, have a look at [llama.cpp](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to know which build arguments to use (e.g. for CUDA support).
+Any build option of llama.cpp works equivalently for this project.
+You then have to run the following commands in the directory of this repository (java-llama.cpp):
```shell
-mvn compile
-mkdir build
-cd build
-cmake .. # add any other arguments for your backend
-cmake --build . --config Release
+mvn compile # don't forget this line
+cmake -B build # add any other arguments for your backend, e.g. -DGGML_CUDA=ON
+cmake --build build --config Release
```
> [!TIP]
-> Use `-DLLAMA_CURL=ON` to download models via Java code using `ModelParameters#setModelUrl(String)`.
+> Use `-DGGML_CURL=ON` to download models via Java code using `ModelParameters#setModelUrl(String)`.
-All required files will be put in a resources directory matching your platform, which will appear in the cmake output. For example something like:
+All compiled libraries will be put in a resources directory matching your platform, which will appear in the cmake output. For example, something like:
```shell
-- Installing files to /java-llama.cpp/src/main/resources/de/kherud/llama/Linux/x86_64
```
-This includes:
-
-- Linux: `libllama.so`, `libjllama.so`
-- MacOS: `libllama.dylib`, `libjllama.dylib`, `ggml-metal.metal`
-- Windows: `llama.dll`, `jllama.dll`
-
-If you then compile your own JAR from this directory, you are ready to go. Otherwise, if you still want to use the library
-as a Maven dependency, see below how to set the necessary paths in order for Java to find your compiled libraries.
+#### Library Location
-### Custom llama.cpp Setup (GPU acceleration)
+This project has to load three shared libraries:
-This repository provides default support for CPU based inference. You can compile `llama.cpp` any way you want, however (see [Setup Required](#setup-required)).
-In order to use your self-compiled library, set either of the [JVM options](https://www.jetbrains.com/help/idea/tuning-the-ide.html#configure-jvm-options):
+- ggml
+- llama
+- jllama
-- `de.kherud.llama.lib.path`, for example `-Dde.kherud.llama.lib.path=/directory/containing/lib`
-- `java.library.path`, for example `-Djava.library.path=/directory/containing/lib`
+Note that the file names vary between operating systems, e.g., `ggml.dll` on Windows, `libggml.so` on Linux, and `libggml.dylib` on macOS.
-This repository uses [`System#mapLibraryName`](https://docs.oracle.com/javase%2F7%2Fdocs%2Fapi%2F%2F/java/lang/System.html) to determine the name of the shared library for you platform.
-If for any reason your library has a different name, you can set it with
-
-- `de.kherud.llama.lib.name`, for example `-Dde.kherud.llama.lib.name=myname.so`
-
-For compiling `llama.cpp`, refer to the official [readme](https://github.com/ggerganov/llama.cpp#build) for details.
-The library can be built with the `llama.cpp` project:
-
-```shell
-mkdir build
-cd build
-cmake .. -DBUILD_SHARED_LIBS=ON # add any other arguments for your backend
-cmake --build . --config Release
-```
+The application searches the following locations, in this order:
-Look for the shared library in `build`.
+- In **de.kherud.llama.lib.path**: Use this option if you want a custom location for your shared libraries, i.e., set VM option `-Dde.kherud.llama.lib.path=/path/to/directory`.
+- In **java.library.path**: These are predefined locations for each OS, e.g., `/usr/java/packages/lib:/usr/lib64:/lib64:/lib:/usr/lib` on Linux.
+ You can find out the locations using `System.out.println(System.getProperty("java.library.path"))`.
+ Use this option if you want to install the shared libraries as system libraries.
+- From the **JAR**: If a library hasn't been found in any of the previous locations, the application will try to extract a prebuilt shared library from the JAR.
+  This of course only works for the [supported platforms](#no-setup-required). A short Java sketch for inspecting the relevant properties follows this list.
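+
+A minimal sketch for inspecting both properties (the class name is only an illustration, everything else is plain Java):
+
+```java
+public class LibraryPathCheck {
+    public static void main(String[] args) {
+        // custom directory passed via -Dde.kherud.llama.lib.path (null if the option is not set)
+        System.out.println(System.getProperty("de.kherud.llama.lib.path"));
+        // OS-specific default locations searched by the JVM
+        System.out.println(System.getProperty("java.library.path"));
+    }
+}
+```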
-> [!IMPORTANT]
-> If you are running MacOS with Metal, you have to put the file `ggml-metal.metal` from `build/bin` in the same directory as the shared library.
+Not all libraries have to be in the same location.
+For example, if you already have a compiled llama.cpp and ggml, you can install them as system libraries and rely on the jllama library from the JAR.
+This way, you don't have to compile anything.
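+
+As a sketch, you can also point the loader at a custom directory programmatically instead of passing a JVM option (the class name, directory, and model path below are placeholders; this assumes the property is set before the first model is created, since the loader reads it during initialization):
+
+```java
+import de.kherud.llama.LlamaModel;
+import de.kherud.llama.ModelParameters;
+
+public class CustomLibraryPath {
+    public static void main(String[] args) {
+        // must happen before the first LlamaModel is constructed
+        System.setProperty("de.kherud.llama.lib.path", "/path/to/directory");
+        LlamaModel model = new LlamaModel(new ModelParameters().setModelFilePath("models/model.gguf"));
+        // ... use the model ...
+        model.close(); // free the native resources when done
+    }
+}
+```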
## Documentation
diff --git a/pom.xml b/pom.xml
index 79f10350..5b00bb42 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,7 +4,7 @@
de.kherud
llama
- 3.2.1
+ 3.3.0
jar
${project.groupId}:${project.artifactId}
diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp
index 2298c190..d59f3b77 100644
--- a/src/main/cpp/jllama.cpp
+++ b/src/main/cpp/jllama.cpp
@@ -355,13 +355,12 @@ JNIEXPORT void JNICALL JNI_OnUnload(JavaVM *vm, void *reserved)
JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jobject obj, jstring jparams)
{
gpt_params params;
- server_params sparams;
auto *ctx_server = new server_context();
std::string c_params = parse_jstring(env, jparams);
json json_params = json::parse(c_params);
- server_params_parse(json_params, sparams, params);
+ server_params_parse(json_params, params);
if (json_value(json_params, "disable_log", false))
{
@@ -372,9 +371,9 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo
log_enable();
}
- if (!sparams.system_prompt.empty())
+ if (!params.system_prompt.empty())
{
- ctx_server->system_prompt_set(sparams.system_prompt);
+ ctx_server->system_prompt_set(params.system_prompt);
}
if (params.model_alias == "unknown")
@@ -395,6 +394,9 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo
     std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
+ // Necessary similarity of prompt for slot selection
+ ctx_server->slot_prompt_similarity = params.slot_prompt_similarity;
+
// load the model
if (!ctx_server->load_model(params))
{
@@ -411,32 +413,36 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo
const auto model_meta = ctx_server->model_meta();
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
- if (sparams.chat_template.empty())
+ if (params.chat_template.empty())
{
if (!ctx_server->validate_model_chat_template())
{
LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This "
"may cause the model to output suboptimal responses",
{});
- sparams.chat_template = "chatml";
+ params.chat_template = "chatml";
}
}
-    ctx_server->chat_template = sparams.chat_template;
     // print sample chat example to make it clear which template is used
     {
-        json chat;
-        chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}});
-        chat.push_back({{"role", "user"}, {"content", "Hello"}});
-        chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
-        chat.push_back({{"role", "user"}, {"content", "How are you?"}});
-
-        const std::string chat_example = format_chat(ctx_server->model, sparams.chat_template, chat);
-        LOG_INFO("chat template", {
-            {"chat_example", chat_example},
-            {"built_in", sparams.chat_template.empty()},
-        });
+        LOG_INFO("chat template",
+                 {
+                     {"chat_example", llama_chat_format_example(ctx_server->model, params.chat_template)},
+                     {"built_in", params.chat_template.empty()},
+                 });
     }
ctx_server->queue_tasks.on_new_task(
@@ -480,7 +486,7 @@ JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestCompletion(JNIEnv
json chat;
chat.push_back({{"role", "system"}, {"content", ctx_server->system_prompt}});
chat.push_back({{"role", "user"}, {"content", json_params["prompt"]}});
- json_params["prompt"] = format_chat(ctx_server->model, ctx_server->chat_template, chat);
+ json_params["prompt"] = format_chat(ctx_server->model, ctx_server->params.chat_template, chat);
}
const int id_task = ctx_server->queue_tasks.get_new_id();
diff --git a/src/main/cpp/server.hpp b/src/main/cpp/server.hpp
index d3d4750a..0601dac4 100644
--- a/src/main/cpp/server.hpp
+++ b/src/main/cpp/server.hpp
@@ -103,12 +103,6 @@ struct slot_params
json input_suffix;
};
-struct server_params
-{
- std::string chat_template = "";
- std::string system_prompt = "";
-};
-
struct server_slot
{
int id;
@@ -686,11 +680,6 @@ struct server_context
std::string system_prompt;
     std::vector<llama_token> system_tokens;
- std::string name_user; // this should be the antiprompt
- std::string name_assistant;
-
- std::string chat_template;
-
// slots / clients
     std::vector<server_slot> slots;
json default_generation_settings_for_props;
@@ -700,6 +689,9 @@ struct server_context
server_metrics metrics;
+ // Necessary similarity of prompt for slot selection
+ float slot_prompt_similarity = 0.0f;
+
~server_context()
{
if (ctx)
@@ -733,7 +725,10 @@ struct server_context
// dedicate one sequence to the system prompt
params.n_parallel += 1;
- std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+ model = llama_init.model;
+ ctx = llama_init.context;
params.n_parallel -= 1; // but be sneaky about it
if (model == nullptr)
{
@@ -791,6 +786,8 @@ struct server_context
slot.ga_n = ga_n;
slot.ga_w = ga_w;
+ slot.sparams = params.sparams;
+
slot.reset();
slots.push_back(slot);
@@ -866,34 +863,111 @@ struct server_context
return prompt_tokens;
}
- server_slot *get_slot(int id)
+ server_slot *get_slot_by_id(int id)
{
- int64_t t_last = ggml_time_us();
-
- server_slot *last_used = nullptr;
-
for (server_slot &slot : slots)
{
- if (slot.id == id && slot.available())
+ if (slot.id == id)
{
return &slot;
}
+ }
+
+ return nullptr;
+ }
+
+ server_slot *get_available_slot(const std::string &prompt)
+ {
+ server_slot *ret = nullptr;
+
+ // find the slot that has at least n% prompt similarity
+ if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty())
+ {
+ int max_lcp_len = 0;
+ float similarity = 0;
+
+ for (server_slot &slot : slots)
+ {
+ // skip the slot if it is not available
+ if (!slot.available())
+ {
+ continue;
+ }
+
+                // skip the slot if it does not contain a prompt
+ if (!slot.prompt.is_string())
+ {
+ continue;
+ }
+
+ // current slot's prompt
+                std::string slot_prompt = slot.prompt.get<std::string>();
+
+ // length of the current slot's prompt
+ int slot_prompt_len = slot_prompt.size();
+
+ // length of the Longest Common Prefix between the current slot's prompt and the input prompt
+ int lcp_len = common_part(slot_prompt, prompt);
+
+ // fraction of the common substring length compared to the current slot's prompt length
+                similarity = static_cast<float>(lcp_len) / slot_prompt_len;
+
+ // select the current slot if the criteria match
+ if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity)
+ {
+ max_lcp_len = lcp_len;
+ ret = &slot;
+ }
+ }
+
+ if (ret != nullptr)
+ {
+ LOG_VERBOSE("selected slot by lcp similarity", {
+ {"id_slot", ret->id},
+ {"max_lcp_len", max_lcp_len},
+ {"similarity", similarity},
+ });
+ }
+ }
+
+ // find the slot that has been least recently used
+ if (ret == nullptr)
+ {
+ int64_t t_last = ggml_time_us();
+ for (server_slot &slot : slots)
+ {
+ // skip the slot if it is not available
+ if (!slot.available())
+ {
+ continue;
+ }
+
+ // select the current slot if the criteria match
+ if (slot.t_last_used < t_last)
+ {
+ t_last = slot.t_last_used;
+ ret = &slot;
+ }
+ }
- // among all available slots, find the one that has been least recently used
- if (slot.available() && slot.t_last_used < t_last)
+ if (ret != nullptr)
{
- last_used = &slot;
- t_last = slot.t_last_used;
+ LOG_VERBOSE("selected slot by lru", {
+ {"id_slot", ret->id},
+ {"t_last", t_last},
+ });
}
}
- return last_used;
+ return ret;
}
bool launch_slot_with_task(server_slot &slot, const server_task &task)
{
slot_params default_params;
- llama_sampling_params default_sparams;
+ // Sampling parameter defaults are loaded from the global server context (but individual requests can still
+ // override them)
+ llama_sampling_params default_sparams = params.sparams;
auto &data = task.data;
slot.oaicompat = false;
@@ -901,7 +975,7 @@ struct server_context
slot.params.stream = json_value(data, "stream", false);
slot.params.cache_prompt = json_value(data, "cache_prompt", false);
- slot.params.n_predict = json_value(data, "n_predict", default_params.n_predict);
+ slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
@@ -947,19 +1021,23 @@ struct server_context
slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
// get prompt
+ if (!task.infill)
{
const auto &prompt = data.find("prompt");
if (prompt == data.end())
{
- send_error(task, R"(Either "prompt" or "messages" must be provided)", ERROR_TYPE_INVALID_REQUEST);
+ send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
return false;
}
- slot.prompt = *prompt;
-
- if (slot.prompt.is_array() && slot.prompt.empty())
+ if ((prompt->is_string()) || (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) ||
+ (prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer()))
+ {
+ slot.prompt = *prompt;
+ }
+ else
{
- send_error(task, "\"prompt\" cannot be an empty array", ERROR_TYPE_INVALID_REQUEST);
+ send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST);
return false;
}
}
@@ -1215,7 +1293,7 @@ struct server_context
bool process_token(completion_token_output &result, server_slot &slot)
{
// remember which tokens were sampled - used for repetition penalties during sampling
- const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
+ const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special);
slot.sampled = result.tok;
// search stop word and delete it
@@ -1546,12 +1624,12 @@ struct server_context
}
const float *embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
- if (embd == nullptr)
+ if (embd == NULL)
{
embd = llama_get_embeddings_ith(ctx, i);
}
- if (embd == nullptr)
+ if (embd == NULL)
{
LOG_ERROR("failed to get embeddings", {{"token", batch.token[i]}, {"seq_id", batch.seq_id[i][0]}});
@@ -1663,7 +1741,25 @@ struct server_context
switch (task.type)
{
case SERVER_TASK_TYPE_COMPLETION: {
- server_slot *slot = get_slot(json_value(task.data, "id_slot", -1));
+ const int id_slot = json_value(task.data, "id_slot", -1);
+
+ server_slot *slot;
+
+ if (id_slot != -1)
+ {
+ slot = get_slot_by_id(id_slot);
+ }
+ else
+ {
+ std::string prompt;
+ if (task.data.contains("prompt") && task.data.at("prompt").is_string())
+ {
+ prompt = json_value(task.data, "prompt", std::string());
+ }
+
+ slot = get_available_slot(prompt);
+ }
+
if (slot == nullptr)
{
// if no slot is available, we defer this task for processing later
@@ -1671,6 +1767,13 @@ struct server_context
queue_tasks.defer(task);
break;
}
+ if (!slot->available())
+ {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }
if (task.data.contains("system_prompt"))
{
@@ -1790,12 +1893,19 @@ struct server_context
break;
case SERVER_TASK_TYPE_SLOT_SAVE: {
int id_slot = task.data.at("id_slot");
- server_slot *slot = get_slot(id_slot);
+ server_slot *slot = get_slot_by_id(id_slot);
if (slot == nullptr)
{
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break;
}
+ if (!slot->available())
+ {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }
const size_t token_count = slot->cache_tokens.size();
const int64_t t_start = ggml_time_us();
@@ -1823,12 +1933,19 @@ struct server_context
break;
case SERVER_TASK_TYPE_SLOT_RESTORE: {
int id_slot = task.data.at("id_slot");
- server_slot *slot = get_slot(id_slot);
+ server_slot *slot = get_slot_by_id(id_slot);
if (slot == nullptr)
{
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break;
}
+ if (!slot->available())
+ {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }
const int64_t t_start = ggml_time_us();
@@ -1865,12 +1982,19 @@ struct server_context
break;
case SERVER_TASK_TYPE_SLOT_ERASE: {
int id_slot = task.data.at("id_slot");
- server_slot *slot = get_slot(id_slot);
+ server_slot *slot = get_slot_by_id(id_slot);
if (slot == nullptr)
{
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break;
}
+ if (!slot->available())
+ {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }
// Erase token cache
const size_t n_erased = slot->cache_tokens.size();
@@ -2054,6 +2178,11 @@ struct server_context
int32_t n_batch = llama_n_batch(ctx);
int32_t n_ubatch = llama_n_ubatch(ctx);
+ // track if this is an embedding or non-embedding batch
+ // if we've added sampled tokens above, we are in non-embedding mode
+ // -1: none, 0: non-embedding, 1: embedding
+ int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
+
// next, batch any pending prompts without exceeding n_batch
if (params.cont_batching || batch.n_tokens == 0)
{
@@ -2074,6 +2203,7 @@ struct server_context
if (slot.infill)
{
+ const bool add_bos = llama_should_add_bos_token(model);
bool suff_rm_leading_spc = true;
if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1)
{
@@ -2091,11 +2221,23 @@ struct server_context
}
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
- prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
- prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
- prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
- prefix_tokens.push_back(llama_token_middle(model));
- prompt_tokens = prefix_tokens;
+ suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
+
+ auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
+ auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
+ if (add_bos)
+ {
+ embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+ }
+ embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+
+ const llama_token middle_token = llama_token_middle(model);
+ if (middle_token >= 0)
+ {
+ embd_inp.push_back(middle_token);
+ }
+
+ prompt_tokens = embd_inp;
}
else
{
@@ -2138,7 +2280,6 @@ struct server_context
slot.state = SLOT_STATE_PROCESSING;
slot.command = SLOT_COMMAND_NONE;
slot.release();
- slot.print_timings();
send_error(slot, "input is too large to process. increase the physical batch size",
ERROR_TYPE_SERVER);
continue;
@@ -2236,6 +2377,17 @@ struct server_context
}
}
+ // check that we are in the right batch_type, if not defer the slot
+ bool slot_type = slot.embedding ? 1 : 0;
+ if (batch_type == -1)
+ {
+ batch_type = slot_type;
+ }
+ else if (batch_type != slot_type)
+ {
+ continue;
+ }
+
// keep only the common part
int p0 = (int)system_tokens.size() + slot.n_past;
if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1))
@@ -2344,6 +2496,9 @@ struct server_context
{"n_tokens", batch.n_tokens},
});
+ // make sure we're in the right embedding mode
+ llama_set_embeddings(ctx, batch_type == 1);
+
// process the created batch of tokens
for (int32_t i = 0; i < batch.n_tokens; i += n_batch)
{
@@ -2531,10 +2686,9 @@ struct server_context
};
// parse the given jparams (see de.kherud.llama.args.ModelParameters#toString()) from JSON to the required C++ struct.
-static void server_params_parse(json jparams, server_params &sparams, gpt_params &params)
+static void server_params_parse(json jparams, gpt_params &params)
{
gpt_params default_params;
- server_params default_sparams;
params.seed = json_value(jparams, "seed", default_params.seed);
params.n_threads = json_value(jparams, "n_threads", default_params.n_threads);
@@ -2551,7 +2705,6 @@ static void server_params_parse(json jparams, server_params &sparams, gpt_params
params.n_parallel = json_value(jparams, "n_parallel", default_params.n_parallel);
params.n_sequences = json_value(jparams, "n_sequences", default_params.n_sequences);
params.p_split = json_value(jparams, "p_split", default_params.p_split);
- params.n_beams = json_value(jparams, "n_beams", default_params.n_beams);
params.grp_attn_n = json_value(jparams, "grp_attn_n", default_params.grp_attn_n);
params.grp_attn_w = json_value(jparams, "grp_attn_w", default_params.grp_attn_w);
params.n_print = json_value(jparams, "n_print", default_params.n_print);
@@ -2582,7 +2735,6 @@ static void server_params_parse(json jparams, server_params &sparams, gpt_params
params.lookup_cache_dynamic = json_value(jparams, "lookup_cache_dynamic", default_params.lookup_cache_dynamic);
params.logits_file = json_value(jparams, "logits_file", default_params.logits_file);
params.lora_adapter = json_value(jparams, "lora_adapter", default_params.lora_adapter);
- params.lora_base = json_value(jparams, "lora_base", default_params.lora_base);
params.embedding = json_value(jparams, "embedding", default_params.embedding);
params.escape = json_value(jparams, "escape", default_params.escape);
params.cont_batching = json_value(jparams, "cont_batching", default_params.cont_batching);
@@ -2592,8 +2744,8 @@ static void server_params_parse(json jparams, server_params &sparams, gpt_params
params.use_mmap = json_value(jparams, "use_mmap", default_params.use_mmap);
params.use_mlock = json_value(jparams, "use_mlock", default_params.use_mlock);
params.no_kv_offload = json_value(jparams, "no_kv_offload", default_params.no_kv_offload);
- sparams.system_prompt = json_value(jparams, "system_prompt", default_sparams.system_prompt);
- sparams.chat_template = json_value(jparams, "chat_template", default_sparams.chat_template);
+ params.system_prompt = json_value(jparams, "system_prompt", default_params.system_prompt);
+ params.chat_template = json_value(jparams, "chat_template", default_params.chat_template);
if (jparams.contains("n_gpu_layers"))
{
diff --git a/src/main/cpp/utils.hpp b/src/main/cpp/utils.hpp
index ad7198c1..7de7eac4 100644
--- a/src/main/cpp/utils.hpp
+++ b/src/main/cpp/utils.hpp
@@ -97,10 +97,7 @@ static inline void server_log(ggml_log_level level, const char *function, int li
json log = json{
{"msg", message},
#if SERVER_VERBOSE
- {"ts", time(nullptr)},
- {"level", log_level_to_string(level)},
- {"tid", ss_tid.str()},
- {"function", function},
+ {"ts", time(nullptr)}, {"level", log_level_to_string(level)}, {"tid", ss_tid.str()}, {"function", function},
{"line", line},
#endif
};
@@ -135,9 +132,7 @@ static inline void server_log(ggml_log_level level, const char *function, int li
}
#if SERVER_VERBOSE
- ss << " | ts " << time(nullptr)
- << " | tid " << ss_tid.str()
- << " | " << function << " line " << line;
+ ss << " | ts " << time(nullptr) << " | tid " << ss_tid.str() << " | " << function << " line " << line;
#endif
const std::string str = ss.str();
@@ -157,50 +152,51 @@ static inline void server_log(ggml_log_level level, const char *function, int li
// chat template utils
//
-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-inline bool verify_custom_template(const std::string &tmpl)
-{
- llama_chat_message chat[] = {{"user", "test"}};
- int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
- return res >= 0;
-}
-
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model *model, const std::string &tmpl,
                                const std::vector<json> &messages)
{
- size_t alloc_size = 0;
- // vector holding all allocated string to be passed to llama_chat_apply_template
-    std::vector<std::string> str(messages.size() * 2);
-    std::vector<llama_chat_message> chat(messages.size());
+    std::vector<llama_chat_msg> chat;
for (size_t i = 0; i < messages.size(); ++i)
{
const auto &curr_msg = messages[i];
- str[i * 2 + 0] = json_value(curr_msg, "role", std::string(""));
- str[i * 2 + 1] = json_value(curr_msg, "content", std::string(""));
- alloc_size += str[i * 2 + 1].length();
- chat[i].role = str[i * 2 + 0].c_str();
- chat[i].content = str[i * 2 + 1].c_str();
- }
- const char *ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
-    std::vector<char> buf(alloc_size * 2);
+ std::string role = json_value(curr_msg, "role", std::string(""));
- // run the first time to get the total output length
- int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
+ std::string content;
+ if (curr_msg.contains("content"))
+ {
+ if (curr_msg["content"].is_string())
+ {
+                content = curr_msg["content"].get<std::string>();
+ }
+ else if (curr_msg["content"].is_array())
+ {
+ for (const auto &part : curr_msg["content"])
+ {
+ if (part.contains("text"))
+ {
+                        content += "\n" + part["text"].get<std::string>();
+ }
+ }
+ }
+ else
+ {
+ throw std::runtime_error(
+ "Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
+ }
+ }
+ else
+ {
+ throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
+ }
- // if it turns out that our buffer is too small, we resize it
- if ((size_t)res > buf.size())
- {
- buf.resize(res);
- res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
+ chat.push_back({role, content});
}
- const std::string formatted_chat(buf.data(), res);
-
+ auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
-
return formatted_chat;
}
@@ -322,6 +318,16 @@ static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
     return i;
 }
 
+static size_t common_part(const std::string &a, const std::string &b)
+{
+    size_t i;
+    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
+    {
+    }
+
+    return i;
+}
+
 static bool ends_with(const std::string &str, const std::string &suffix)
 {
     return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
@@ -432,24 +438,6 @@ static json oaicompat_completion_params_parse(const struct llama_model *model,
llama_params["__oaicompat"] = true;
- // Map OpenAI parameters to llama.cpp parameters
- //
- // For parameters that are defined by the OpenAI documentation (e.g.
- // temperature), we explicitly specify OpenAI's intended default; we
- // need to do that because sometimes OpenAI disagrees with llama.cpp
- //
- // https://platform.openai.com/docs/api-reference/chat/create
- llama_sampling_params default_sparams;
- llama_params["model"] = json_value(body, "model", std::string("unknown"));
- llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
- llama_params["logit_bias"] = json_value(body, "logit_bias", json::object());
- llama_params["n_predict"] = json_value(body, "max_tokens", -1);
- llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
- llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
- llama_params["stream"] = json_value(body, "stream", false);
- llama_params["temperature"] = json_value(body, "temperature", 1.0);
- llama_params["top_p"] = json_value(body, "top_p", 1.0);
-
// Apply chat template to the list of messages
llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
@@ -674,7 +662,7 @@ static json format_embeddings_response_oaicompat(const json &request, const json
{
json data = json::array();
int i = 0;
- for (const auto &elem : embeddings)
+ for (auto &elem : embeddings)
{
data.push_back(
json{{"embedding", json_value(elem, "embedding", json::array())}, {"index", i++}, {"object", "embedding"}});
diff --git a/src/main/java/de/kherud/llama/LlamaLoader.java b/src/main/java/de/kherud/llama/LlamaLoader.java
index 5aa84001..a0239d20 100644
--- a/src/main/java/de/kherud/llama/LlamaLoader.java
+++ b/src/main/java/de/kherud/llama/LlamaLoader.java
@@ -62,6 +62,7 @@ static synchronized void initialize() throws UnsatisfiedLinkError {
System.err.println("'ggml-metal.metal' not found");
}
}
+ loadNativeLibrary("ggml");
loadNativeLibrary("llama");
loadNativeLibrary("jllama");
extracted = true;
@@ -96,12 +97,7 @@ private static void cleanPath(Path path) {
private static void loadNativeLibrary(String name) {
         List<String> triedPaths = new LinkedList<>();
- // Try loading library from de.kherud.llama.lib.path library path
- String nativeLibName = System.getProperty("de.kherud.llama.lib.name");
- if (nativeLibName == null) {
- nativeLibName = System.mapLibraryName(name);
- }
-
+ String nativeLibName = System.mapLibraryName(name);
String nativeLibPath = System.getProperty("de.kherud.llama.lib.path");
if (nativeLibPath != null) {
Path path = Paths.get(nativeLibPath, nativeLibName);
@@ -125,21 +121,7 @@ private static void loadNativeLibrary(String name) {
}
}
- // Load the os-dependent library from the jar file
- nativeLibPath = getNativeResourcePath();
- if (hasNativeLib(nativeLibPath, nativeLibName)) {
- // temporary library folder
- String tempFolder = getTempDir().getAbsolutePath();
- // Try extracting the library from jar
- if (extractAndLoadLibraryFile(nativeLibPath, nativeLibName, tempFolder)) {
- return;
- }
- else {
- triedPaths.add(nativeLibPath);
- }
- }
-
- // As a last resort try from java.library.path
+ // Try to load the library from java.library.path
String javaLibraryPath = System.getProperty("java.library.path", "");
for (String ldPath : javaLibraryPath.split(File.pathSeparator)) {
if (ldPath.isEmpty()) {
@@ -154,6 +136,20 @@ private static void loadNativeLibrary(String name) {
}
}
+ // As a last resort try load the os-dependent library from the jar file
+ nativeLibPath = getNativeResourcePath();
+ if (hasNativeLib(nativeLibPath, nativeLibName)) {
+ // temporary library folder
+ String tempFolder = getTempDir().getAbsolutePath();
+ // Try extracting the library from jar
+ if (extractAndLoadLibraryFile(nativeLibPath, nativeLibName, tempFolder)) {
+ return;
+ }
+ else {
+ triedPaths.add(nativeLibPath);
+ }
+ }
+
throw new UnsatisfiedLinkError(
String.format(
"No native library found for os.name=%s, os.arch=%s, paths=[%s]",
diff --git a/src/main/java/de/kherud/llama/ModelParameters.java b/src/main/java/de/kherud/llama/ModelParameters.java
index 1cbb6973..3b34d3f3 100644
--- a/src/main/java/de/kherud/llama/ModelParameters.java
+++ b/src/main/java/de/kherud/llama/ModelParameters.java
@@ -32,7 +32,6 @@ public final class ModelParameters extends JsonParameters {
private static final String PARAM_SPLIT_MODE = "split_mode";
private static final String PARAM_MAIN_GPU = "main_gpu";
private static final String PARAM_TENSOR_SPLIT = "tensor_split";
- private static final String PARAM_N_BEAMS = "n_beams";
private static final String PARAM_GRP_ATTN_N = "grp_attn_n";
private static final String PARAM_GRP_ATTN_W = "grp_attn_w";
private static final String PARAM_ROPE_FREQ_BASE = "rope_freq_base";
@@ -55,7 +54,6 @@ public final class ModelParameters extends JsonParameters {
private static final String PARAM_LOOKUP_CACHE_STATIC = "lookup_cache_static";
private static final String PARAM_LOOKUP_CACHE_DYNAMIC = "lookup_cache_dynamic";
private static final String PARAM_LORA_ADAPTER = "lora_adapter";
- private static final String PARAM_LORA_BASE = "lora_base";
private static final String PARAM_EMBEDDING = "embedding";
private static final String PARAM_CONT_BATCHING = "cont_batching";
private static final String PARAM_FLASH_ATTENTION = "flash_attn";
@@ -244,14 +242,6 @@ public ModelParameters setTensorSplit(float[] tensorSplit) {
return this;
}
- /**
- * Set usage of beam search of given width if non-zero.
- */
- public ModelParameters setNBeams(int nBeams) {
- parameters.put(PARAM_N_BEAMS, String.valueOf(nBeams));
- return this;
- }
-
/**
* Set the group-attention factor (default: 1)
*/
@@ -484,14 +474,6 @@ public ModelParameters setLoraAdapters(Map<String, Float> loraAdapters) {
return this;
}
- /**
- * Set an optional model to use as a base for the layers modified by the LoRA adapter
- */
- public ModelParameters setLoraBase(String loraBase) {
- parameters.put(PARAM_LORA_BASE, toJsonString(loraBase));
- return this;
- }
-
/**
* Whether to load model with embedding support
*/
diff --git a/src/test/java/de/kherud/llama/LlamaModelTest.java b/src/test/java/de/kherud/llama/LlamaModelTest.java
index a5454c59..c7ece673 100644
--- a/src/test/java/de/kherud/llama/LlamaModelTest.java
+++ b/src/test/java/de/kherud/llama/LlamaModelTest.java
@@ -1,7 +1,6 @@
package de.kherud.llama;
import java.io.*;
-import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Pattern;
@@ -24,6 +23,7 @@ public static void setup() {
// LlamaModel.setLogger(LogFormat.TEXT, (level, msg) -> System.out.println(level + ": " + msg));
model = new LlamaModel(
new ModelParameters()
+ .setNCtx(128)
.setModelFilePath("models/codellama-7b.Q2_K.gguf")
// .setModelUrl("https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/resolve/main/codellama-7b.Q2_K.gguf")
.setNGpuLayers(43)