diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 3fef12a6..17923928 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -22,11 +22,16 @@ jobs:
# cmake should figure out OS and ARCH automatically when running build.sh (but we need mvn compile for it)
run: |
mvn compile
- .github/build.sh
+ .github/build.sh -DLLAMA_VERBOSE=ON
- name: Download model
run: curl -L ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
- name: Run tests
run: mvn test
+ - if: failure()
+ uses: actions/upload-artifact@v3
+ with:
+ path: ${{ github.workspace }}/hs_err_pid*.log
+ if-no-files-found: warn
build-and-test-macos:
name: ${{ matrix.target.runner }}
@@ -37,11 +42,11 @@ jobs:
target:
- {
runner: macos-13,
- cmake: '-DLLAMA_METAL=OFF'
+ cmake: '-DLLAMA_METAL=OFF -DLLAMA_VERBOSE=ON'
}
- {
runner: macos-14,
- cmake: '-DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_METAL=OFF'
+ cmake: '-DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_METAL=OFF -DLLAMA_VERBOSE=ON'
}
steps:
- uses: actions/checkout@v4
@@ -70,8 +75,13 @@ jobs:
- name: Build libraries
run: |
mvn compile
- .github\build.bat
+ .github\build.bat -DLLAMA_VERBOSE=ON
- name: Download model
run: curl -L $env:MODEL_URL --create-dirs -o models/$env:MODEL_NAME
- name: Run tests
run: mvn test
+ - if: failure()
+ uses: actions/upload-artifact@v3
+ with:
+ path: ${{ github.workspace }}\hs_err_pid*.log
+ if-no-files-found: warn
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index fc88d112..7d01ef41 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -137,25 +137,24 @@ jobs:
- name: Run tests
run: mvn test
- # disabled for now, we don't have access to a macos arm64 runner and testing on x86_64 doesn't work
-# test-macos:
-# name: Test Mac
-# needs: build-macos-native
-# runs-on: macos-latest
-# steps:
-# - uses: actions/checkout@v4
-# - uses: actions/download-artifact@v3
-# with:
-# name: artifacts
-# path: ${{ github.workspace }}/src/main/resources/de/kherud/llama/
-# - name: Download model
-# run: curl -L ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
-# - uses: actions/setup-java@v4
-# with:
-# distribution: 'zulu'
-# java-version: '11'
-# - name: Run tests
-# run: mvn test
+ test-macos:
+ name: Test Mac
+ needs: build-macos-native
+ runs-on: macos-14
+ steps:
+ - uses: actions/checkout@v4
+ - uses: actions/download-artifact@v3
+ with:
+ name: artifacts
+ path: ${{ github.workspace }}/src/main/resources/de/kherud/llama/
+ - name: Download model
+ run: curl -L ${MODEL_URL} --create-dirs -o models/${MODEL_NAME}
+ - uses: actions/setup-java@v4
+ with:
+ distribution: 'zulu'
+ java-version: '11'
+ - name: Run tests
+ run: mvn test
test-windows:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 550759f2..43a0c725 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -5,7 +5,6 @@ project(jllama CXX)
include(FetchContent)
set(BUILD_SHARED_LIBS ON)
-set(LLAMA_STATIC OFF)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
option(LLAMA_VERBOSE "llama: verbose output" OFF)
@@ -24,7 +23,7 @@ FetchContent_MakeAvailable(json)
FetchContent_Declare(
llama.cpp
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
- GIT_TAG b3008
+ GIT_TAG b3534
)
FetchContent_MakeAvailable(llama.cpp)
@@ -98,11 +97,12 @@ target_compile_definitions(jllama PRIVATE
)
if(OS_NAME STREQUAL "Windows")
- set_target_properties(jllama llama PROPERTIES
+ set_target_properties(jllama llama ggml PROPERTIES
+ RUNTIME_OUTPUT_DIRECTORY_DEBUG ${JLLAMA_DIR}
RUNTIME_OUTPUT_DIRECTORY_RELEASE ${JLLAMA_DIR}
)
else()
- set_target_properties(jllama llama PROPERTIES
+ set_target_properties(jllama llama ggml PROPERTIES
LIBRARY_OUTPUT_DIRECTORY ${JLLAMA_DIR}
)
endif()
diff --git a/README.md b/README.md
index 2f2d2dfd..60a1dcec 100644
--- a/README.md
+++ b/README.md
@@ -3,8 +3,7 @@
# Java Bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp)
-The main goal of llama.cpp is to run the LLaMA model using 4-bit integer quantization on a MacBook.
-This repository provides Java bindings for the C++ library.
+Inference of Meta's LLaMA model (and others) in pure C/C++.
**You are welcome to contribute**
@@ -32,14 +31,14 @@ Access this library via Maven:
```
-There are multiple [examples](src/test/java/examples):
+There are multiple [examples](src/test/java/examples).
### No Setup required
We support CPU inference for the following platforms out of the box:
- Linux x86-64, aarch64
-- MacOS x86-64, aarch64 (M1)
+- MacOS x86-64, aarch64 (M-series)
- Windows x86-64, x64, arm (32 bit)
If any of these match your platform, you can include the Maven dependency and get started.
@@ -47,82 +46,53 @@ If any of these match your platform, you can include the Maven dependency and ge
### Setup required
If none of the above listed platforms matches yours, currently you have to compile the library yourself (also if you
-want GPU acceleration, see below).
+want GPU acceleration).
-This requires:
+This consists of two steps: 1) compiling the libraries and 2) putting them in the right location.
-- Git
-- A C++11 conforming compiler
-- The [cmake](https://www.cmake.org/) build system
-- Java, Maven, and setting [JAVA_HOME](https://www.baeldung.com/java-home-on-windows-7-8-10-mac-os-x-linux)
+#### Library Compilation
-Make sure everything works by running
-
-```
-g++ -v # depending on your compiler
-java -version
-mvn -v
-echo $JAVA_HOME # for linux/macos
-echo %JAVA_HOME% # for windows
-```
-
-Then, checkout [llama.cpp](https://github.com/ggerganov/llama.cpp) to know which build arguments to use (e.g. for CUDA support).
-Finally, you have to run following commands in the directory of this repository (java-llama.cpp).
-Remember to add your build arguments in the fourth line (`cmake ..`):
+First, have a look at [llama.cpp](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to know which build arguments to use (e.g. for CUDA support).
+Any build option of llama.cpp works equivalently for this project.
+You then have to run the following commands in the directory of this repository (java-llama.cpp):
```shell
-mvn compile
-mkdir build
-cd build
-cmake .. # add any other arguments for your backend
-cmake --build . --config Release
+mvn compile # don't forget this line
+cmake -B build # add any other arguments for your backend, e.g. -DGGML_CUDA=ON
+cmake --build build --config Release
```
> [!TIP]
-> Use `-DLLAMA_CURL=ON` to download models via Java code using `ModelParameters#setModelUrl(String)`.
+> Use `-DGGML_CURL=ON` to download models via Java code using `ModelParameters#setModelUrl(String)`.
-All required files will be put in a resources directory matching your platform, which will appear in the cmake output. For example something like:
+All compiled libraries will be put in a resources directory matching your platform, which will appear in the cmake output. For example, something like:
```shell
-- Installing files to /java-llama.cpp/src/main/resources/de/kherud/llama/Linux/x86_64
```
-This includes:
-
-- Linux: `libllama.so`, `libjllama.so`
-- MacOS: `libllama.dylib`, `libjllama.dylib`, `ggml-metal.metal`
-- Windows: `llama.dll`, `jllama.dll`
-
-If you then compile your own JAR from this directory, you are ready to go. Otherwise, if you still want to use the library
-as a Maven dependency, see below how to set the necessary paths in order for Java to find your compiled libraries.
+#### Library Location
-### Custom llama.cpp Setup (GPU acceleration)
+This project has to load three shared libraries:
-This repository provides default support for CPU based inference. You can compile `llama.cpp` any way you want, however (see [Setup Required](#setup-required)).
-In order to use your self-compiled library, set either of the [JVM options](https://www.jetbrains.com/help/idea/tuning-the-ide.html#configure-jvm-options):
+- ggml
+- llama
+- jllama
-- `de.kherud.llama.lib.path`, for example `-Dde.kherud.llama.lib.path=/directory/containing/lib`
-- `java.library.path`, for example `-Djava.library.path=/directory/containing/lib`
+Note that the file names vary between operating systems, e.g., `ggml.dll` on Windows, `libggml.so` on Linux, and `libggml.dylib` on macOS.
-This repository uses [`System#mapLibraryName`](https://docs.oracle.com/javase%2F7%2Fdocs%2Fapi%2F%2F/java/lang/System.html) to determine the name of the shared library for you platform.
-If for any reason your library has a different name, you can set it with
-
-- `de.kherud.llama.lib.name`, for example `-Dde.kherud.llama.lib.name=myname.so`
-
-For compiling `llama.cpp`, refer to the official [readme](https://github.com/ggerganov/llama.cpp#build) for details.
-The library can be built with the `llama.cpp` project:
-
-```shell
-mkdir build
-cd build
-cmake .. -DBUILD_SHARED_LIBS=ON # add any other arguments for your backend
-cmake --build . --config Release
-```
+The application searches the following locations, in this order:
-Look for the shared library in `build`.
+- In **de.kherud.llama.lib.path**: Use this option if you want a custom location for your shared libraries, i.e., set VM option `-Dde.kherud.llama.lib.path=/path/to/directory`.
+- In **java.library.path**: These are predefined locations for each OS, e.g., `/usr/java/packages/lib:/usr/lib64:/lib64:/lib:/usr/lib` on Linux.
+ You can find out the locations using `System.out.println(System.getProperty("java.library.path"))`.
+ Use this option if you want to install the shared libraries as system libraries.
+- From the **JAR**: If a library hasn't been found in any of the previous locations, the application will try to extract a prebuilt shared library from the JAR.
+  This of course only works for the [supported platforms](#no-setup-required). A short Java sketch for inspecting the relevant properties follows this list.
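+
+A minimal sketch for inspecting both properties (the class name is only an illustration, everything else is plain Java):
+
+```java
+public class LibraryPathCheck {
+    public static void main(String[] args) {
+        // custom directory passed via -Dde.kherud.llama.lib.path (null if the option is not set)
+        System.out.println(System.getProperty("de.kherud.llama.lib.path"));
+        // OS-specific default locations searched by the JVM
+        System.out.println(System.getProperty("java.library.path"));
+    }
+}
+```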
-> [!IMPORTANT]
-> If you are running MacOS with Metal, you have to put the file `ggml-metal.metal` from `build/bin` in the same directory as the shared library.
+Not all libraries have to be in the same location.
+For example, if you already have a compiled llama.cpp and ggml, you can install them as system libraries and rely on the jllama library from the JAR.
+This way, you don't have to compile anything.
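+
+As a sketch, you can also point the loader at a custom directory programmatically instead of passing a JVM option (the class name, directory, and model path below are placeholders; this assumes the property is set before the first model is created, since the loader reads it during initialization):
+
+```java
+import de.kherud.llama.LlamaModel;
+import de.kherud.llama.ModelParameters;
+
+public class CustomLibraryPath {
+    public static void main(String[] args) {
+        // must happen before the first LlamaModel is constructed
+        System.setProperty("de.kherud.llama.lib.path", "/path/to/directory");
+        LlamaModel model = new LlamaModel(new ModelParameters().setModelFilePath("models/model.gguf"));
+        // ... use the model ...
+        model.close(); // free the native resources when done
+    }
+}
+```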
## Documentation
diff --git a/pom.xml b/pom.xml
index 79f10350..5b00bb42 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,7 +4,7 @@
de.kherud
llama
- 3.2.1
+ 3.3.0
jar
${project.groupId}:${project.artifactId}
diff --git a/src/main/cpp/jllama.cpp b/src/main/cpp/jllama.cpp
index 2298c190..d59f3b77 100644
--- a/src/main/cpp/jllama.cpp
+++ b/src/main/cpp/jllama.cpp
@@ -355,13 +355,12 @@ JNIEXPORT void JNICALL JNI_OnUnload(JavaVM *vm, void *reserved)
JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jobject obj, jstring jparams)
{
gpt_params params;
- server_params sparams;
auto *ctx_server = new server_context();
std::string c_params = parse_jstring(env, jparams);
json json_params = json::parse(c_params);
- server_params_parse(json_params, sparams, params);
+ server_params_parse(json_params, params);
if (json_value(json_params, "disable_log", false))
{
@@ -372,9 +371,9 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo
log_enable();
}
- if (!sparams.system_prompt.empty())
+ if (!params.system_prompt.empty())
{
- ctx_server->system_prompt_set(sparams.system_prompt);
+ ctx_server->system_prompt_set(params.system_prompt);
}
if (params.model_alias == "unknown")
@@ -395,6 +394,9 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo
     std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
+ // Necessary similarity of prompt for slot selection
+ ctx_server->slot_prompt_similarity = params.slot_prompt_similarity;
+
// load the model
if (!ctx_server->load_model(params))
{
@@ -411,32 +413,36 @@ JNIEXPORT void JNICALL Java_de_kherud_llama_LlamaModel_loadModel(JNIEnv *env, jo
const auto model_meta = ctx_server->model_meta();
// if a custom chat template is not supplied, we will use the one that comes with the model (if any)
- if (sparams.chat_template.empty())
+ if (params.chat_template.empty())
{
if (!ctx_server->validate_model_chat_template())
{
LOG_ERROR("The chat template that comes with this model is not yet supported, falling back to chatml. This "
"may cause the model to output suboptimal responses",
{});
- sparams.chat_template = "chatml";
+ params.chat_template = "chatml";
}
}
-    ctx_server->chat_template = sparams.chat_template;
     // print sample chat example to make it clear which template is used
     {
-        json chat;
-        chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}});
-        chat.push_back({{"role", "user"}, {"content", "Hello"}});
-        chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
-        chat.push_back({{"role", "user"}, {"content", "How are you?"}});
-
-        const std::string chat_example = format_chat(ctx_server->model, sparams.chat_template, chat);
-        LOG_INFO("chat template", {
-            {"chat_example", chat_example},
-            {"built_in", sparams.chat_template.empty()},
-        });
+        LOG_INFO("chat template",
+                 {
+                     {"chat_example", llama_chat_format_example(ctx_server->model, params.chat_template)},
+                     {"built_in", params.chat_template.empty()},
+                 });
     }
ctx_server->queue_tasks.on_new_task(
@@ -480,7 +486,7 @@ JNIEXPORT jint JNICALL Java_de_kherud_llama_LlamaModel_requestCompletion(JNIEnv
json chat;
chat.push_back({{"role", "system"}, {"content", ctx_server->system_prompt}});
chat.push_back({{"role", "user"}, {"content", json_params["prompt"]}});
- json_params["prompt"] = format_chat(ctx_server->model, ctx_server->chat_template, chat);
+ json_params["prompt"] = format_chat(ctx_server->model, ctx_server->params.chat_template, chat);
}
const int id_task = ctx_server->queue_tasks.get_new_id();
diff --git a/src/main/cpp/server.hpp b/src/main/cpp/server.hpp
index d3d4750a..0601dac4 100644
--- a/src/main/cpp/server.hpp
+++ b/src/main/cpp/server.hpp
@@ -103,12 +103,6 @@ struct slot_params
json input_suffix;
};
-struct server_params
-{
- std::string chat_template = "";
- std::string system_prompt = "";
-};
-
struct server_slot
{
int id;
@@ -686,11 +680,6 @@ struct server_context
std::string system_prompt;
     std::vector<llama_token> system_tokens;
- std::string name_user; // this should be the antiprompt
- std::string name_assistant;
-
- std::string chat_template;
-
// slots / clients
     std::vector<server_slot> slots;
json default_generation_settings_for_props;
@@ -700,6 +689,9 @@ struct server_context
server_metrics metrics;
+ // Necessary similarity of prompt for slot selection
+ float slot_prompt_similarity = 0.0f;
+
~server_context()
{
if (ctx)
@@ -733,7 +725,10 @@ struct server_context
// dedicate one sequence to the system prompt
params.n_parallel += 1;
- std::tie(model, ctx) = llama_init_from_gpt_params(params);
+ llama_init_result llama_init = llama_init_from_gpt_params(params);
+
+ model = llama_init.model;
+ ctx = llama_init.context;
params.n_parallel -= 1; // but be sneaky about it
if (model == nullptr)
{
@@ -791,6 +786,8 @@ struct server_context
slot.ga_n = ga_n;
slot.ga_w = ga_w;
+ slot.sparams = params.sparams;
+
slot.reset();
slots.push_back(slot);
@@ -866,34 +863,111 @@ struct server_context
return prompt_tokens;
}
- server_slot *get_slot(int id)
+ server_slot *get_slot_by_id(int id)
{
- int64_t t_last = ggml_time_us();
-
- server_slot *last_used = nullptr;
-
for (server_slot &slot : slots)
{
- if (slot.id == id && slot.available())
+ if (slot.id == id)
{
return &slot;
}
+ }
+
+ return nullptr;
+ }
+
+ server_slot *get_available_slot(const std::string &prompt)
+ {
+ server_slot *ret = nullptr;
+
+ // find the slot that has at least n% prompt similarity
+ if (ret == nullptr && slot_prompt_similarity != 0.0f && !prompt.empty())
+ {
+ int max_lcp_len = 0;
+ float similarity = 0;
+
+ for (server_slot &slot : slots)
+ {
+ // skip the slot if it is not available
+ if (!slot.available())
+ {
+ continue;
+ }
+
+                // skip the slot if it does not contain a prompt
+ if (!slot.prompt.is_string())
+ {
+ continue;
+ }
+
+ // current slot's prompt
+                std::string slot_prompt = slot.prompt.get<std::string>();
+
+ // length of the current slot's prompt
+ int slot_prompt_len = slot_prompt.size();
+
+ // length of the Longest Common Prefix between the current slot's prompt and the input prompt
+ int lcp_len = common_part(slot_prompt, prompt);
+
+ // fraction of the common substring length compared to the current slot's prompt length
+                similarity = static_cast<float>(lcp_len) / slot_prompt_len;
+
+ // select the current slot if the criteria match
+ if (lcp_len > max_lcp_len && similarity > slot_prompt_similarity)
+ {
+ max_lcp_len = lcp_len;
+ ret = &slot;
+ }
+ }
+
+ if (ret != nullptr)
+ {
+ LOG_VERBOSE("selected slot by lcp similarity", {
+ {"id_slot", ret->id},
+ {"max_lcp_len", max_lcp_len},
+ {"similarity", similarity},
+ });
+ }
+ }
+
+ // find the slot that has been least recently used
+ if (ret == nullptr)
+ {
+ int64_t t_last = ggml_time_us();
+ for (server_slot &slot : slots)
+ {
+ // skip the slot if it is not available
+ if (!slot.available())
+ {
+ continue;
+ }
+
+ // select the current slot if the criteria match
+ if (slot.t_last_used < t_last)
+ {
+ t_last = slot.t_last_used;
+ ret = &slot;
+ }
+ }
- // among all available slots, find the one that has been least recently used
- if (slot.available() && slot.t_last_used < t_last)
+ if (ret != nullptr)
{
- last_used = &slot;
- t_last = slot.t_last_used;
+ LOG_VERBOSE("selected slot by lru", {
+ {"id_slot", ret->id},
+ {"t_last", t_last},
+ });
}
}
- return last_used;
+ return ret;
}
bool launch_slot_with_task(server_slot &slot, const server_task &task)
{
slot_params default_params;
- llama_sampling_params default_sparams;
+ // Sampling parameter defaults are loaded from the global server context (but individual requests can still
+ // override them)
+ llama_sampling_params default_sparams = params.sparams;
auto &data = task.data;
slot.oaicompat = false;
@@ -901,7 +975,7 @@ struct server_context
slot.params.stream = json_value(data, "stream", false);
slot.params.cache_prompt = json_value(data, "cache_prompt", false);
- slot.params.n_predict = json_value(data, "n_predict", default_params.n_predict);
+ slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", default_params.n_predict));
slot.sparams.top_k = json_value(data, "top_k", default_sparams.top_k);
slot.sparams.top_p = json_value(data, "top_p", default_sparams.top_p);
slot.sparams.min_p = json_value(data, "min_p", default_sparams.min_p);
@@ -947,19 +1021,23 @@ struct server_context
slot.params.input_suffix = json_value(data, "input_suffix", default_params.input_suffix);
// get prompt
+ if (!task.infill)
{
const auto &prompt = data.find("prompt");
if (prompt == data.end())
{
- send_error(task, R"(Either "prompt" or "messages" must be provided)", ERROR_TYPE_INVALID_REQUEST);
+ send_error(task, "\"prompt\" must be provided", ERROR_TYPE_INVALID_REQUEST);
return false;
}
- slot.prompt = *prompt;
-
- if (slot.prompt.is_array() && slot.prompt.empty())
+ if ((prompt->is_string()) || (prompt->is_array() && prompt->size() == 1 && prompt->at(0).is_string()) ||
+ (prompt->is_array() && !prompt->empty() && prompt->at(0).is_number_integer()))
+ {
+ slot.prompt = *prompt;
+ }
+ else
{
- send_error(task, "\"prompt\" cannot be an empty array", ERROR_TYPE_INVALID_REQUEST);
+ send_error(task, "\"prompt\" must be a string or an array of integers", ERROR_TYPE_INVALID_REQUEST);
return false;
}
}
@@ -1215,7 +1293,7 @@ struct server_context
bool process_token(completion_token_output &result, server_slot &slot)
{
// remember which tokens were sampled - used for repetition penalties during sampling
- const std::string token_str = llama_token_to_piece(ctx, result.tok, false);
+ const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special);
slot.sampled = result.tok;
// search stop word and delete it
@@ -1546,12 +1624,12 @@ struct server_context
}
const float *embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
- if (embd == nullptr)
+ if (embd == NULL)
{
embd = llama_get_embeddings_ith(ctx, i);
}
- if (embd == nullptr)
+ if (embd == NULL)
{
LOG_ERROR("failed to get embeddings", {{"token", batch.token[i]}, {"seq_id", batch.seq_id[i][0]}});
@@ -1663,7 +1741,25 @@ struct server_context
switch (task.type)
{
case SERVER_TASK_TYPE_COMPLETION: {
- server_slot *slot = get_slot(json_value(task.data, "id_slot", -1));
+ const int id_slot = json_value(task.data, "id_slot", -1);
+
+ server_slot *slot;
+
+ if (id_slot != -1)
+ {
+ slot = get_slot_by_id(id_slot);
+ }
+ else
+ {
+ std::string prompt;
+ if (task.data.contains("prompt") && task.data.at("prompt").is_string())
+ {
+ prompt = json_value(task.data, "prompt", std::string());
+ }
+
+ slot = get_available_slot(prompt);
+ }
+
if (slot == nullptr)
{
// if no slot is available, we defer this task for processing later
@@ -1671,6 +1767,13 @@ struct server_context
queue_tasks.defer(task);
break;
}
+ if (!slot->available())
+ {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }
if (task.data.contains("system_prompt"))
{
@@ -1790,12 +1893,19 @@ struct server_context
break;
case SERVER_TASK_TYPE_SLOT_SAVE: {
int id_slot = task.data.at("id_slot");
- server_slot *slot = get_slot(id_slot);
+ server_slot *slot = get_slot_by_id(id_slot);
if (slot == nullptr)
{
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break;
}
+ if (!slot->available())
+ {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }
const size_t token_count = slot->cache_tokens.size();
const int64_t t_start = ggml_time_us();
@@ -1823,12 +1933,19 @@ struct server_context
break;
case SERVER_TASK_TYPE_SLOT_RESTORE: {
int id_slot = task.data.at("id_slot");
- server_slot *slot = get_slot(id_slot);
+ server_slot *slot = get_slot_by_id(id_slot);
if (slot == nullptr)
{
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break;
}
+ if (!slot->available())
+ {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }
const int64_t t_start = ggml_time_us();
@@ -1865,12 +1982,19 @@ struct server_context
break;
case SERVER_TASK_TYPE_SLOT_ERASE: {
int id_slot = task.data.at("id_slot");
- server_slot *slot = get_slot(id_slot);
+ server_slot *slot = get_slot_by_id(id_slot);
if (slot == nullptr)
{
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
break;
}
+ if (!slot->available())
+ {
+ // if requested slot is unavailable, we defer this task for processing later
+ LOG_VERBOSE("requested slot is unavailable", {{"id_task", task.id}});
+ queue_tasks.defer(task);
+ break;
+ }
// Erase token cache
const size_t n_erased = slot->cache_tokens.size();
@@ -2054,6 +2178,11 @@ struct server_context
int32_t n_batch = llama_n_batch(ctx);
int32_t n_ubatch = llama_n_ubatch(ctx);
+ // track if this is an embedding or non-embedding batch
+ // if we've added sampled tokens above, we are in non-embedding mode
+ // -1: none, 0: non-embedding, 1: embedding
+ int32_t batch_type = batch.n_tokens > 0 ? 0 : -1;
+
// next, batch any pending prompts without exceeding n_batch
if (params.cont_batching || batch.n_tokens == 0)
{
@@ -2074,6 +2203,7 @@ struct server_context
if (slot.infill)
{
+ const bool add_bos = llama_should_add_bos_token(model);
bool suff_rm_leading_spc = true;
if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1)
{
@@ -2091,11 +2221,23 @@ struct server_context
}
prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
- prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS
- prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model));
- prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end());
- prefix_tokens.push_back(llama_token_middle(model));
- prompt_tokens = prefix_tokens;
+ suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
+
+ auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
+ auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
+ if (add_bos)
+ {
+ embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+ }
+ embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+
+ const llama_token middle_token = llama_token_middle(model);
+ if (middle_token >= 0)
+ {
+ embd_inp.push_back(middle_token);
+ }
+
+ prompt_tokens = embd_inp;
}
else
{
@@ -2138,7 +2280,6 @@ struct server_context
slot.state = SLOT_STATE_PROCESSING;
slot.command = SLOT_COMMAND_NONE;
slot.release();
- slot.print_timings();
send_error(slot, "input is too large to process. increase the physical batch size",
ERROR_TYPE_SERVER);
continue;
@@ -2236,6 +2377,17 @@ struct server_context
}
}
+ // check that we are in the right batch_type, if not defer the slot
+ bool slot_type = slot.embedding ? 1 : 0;
+ if (batch_type == -1)
+ {
+ batch_type = slot_type;
+ }
+ else if (batch_type != slot_type)
+ {
+ continue;
+ }
+
// keep only the common part
int p0 = (int)system_tokens.size() + slot.n_past;
if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1))
@@ -2344,6 +2496,9 @@ struct server_context
{"n_tokens", batch.n_tokens},
});
+ // make sure we're in the right embedding mode
+ llama_set_embeddings(ctx, batch_type == 1);
+
// process the created batch of tokens
for (int32_t i = 0; i < batch.n_tokens; i += n_batch)
{
@@ -2531,10 +2686,9 @@ struct server_context
};
// parse the given jparams (see de.kherud.llama.args.ModelParameters#toString()) from JSON to the required C++ struct.
-static void server_params_parse(json jparams, server_params &sparams, gpt_params &params)
+static void server_params_parse(json jparams, gpt_params &params)
{
gpt_params default_params;
- server_params default_sparams;
params.seed = json_value(jparams, "seed", default_params.seed);
params.n_threads = json_value(jparams, "n_threads", default_params.n_threads);
@@ -2551,7 +2705,6 @@ static void server_params_parse(json jparams, server_params &sparams, gpt_params
params.n_parallel = json_value(jparams, "n_parallel", default_params.n_parallel);
params.n_sequences = json_value(jparams, "n_sequences", default_params.n_sequences);
params.p_split = json_value(jparams, "p_split", default_params.p_split);
- params.n_beams = json_value(jparams, "n_beams", default_params.n_beams);
params.grp_attn_n = json_value(jparams, "grp_attn_n", default_params.grp_attn_n);
params.grp_attn_w = json_value(jparams, "grp_attn_w", default_params.grp_attn_w);
params.n_print = json_value(jparams, "n_print", default_params.n_print);
@@ -2582,7 +2735,6 @@ static void server_params_parse(json jparams, server_params &sparams, gpt_params
params.lookup_cache_dynamic = json_value(jparams, "lookup_cache_dynamic", default_params.lookup_cache_dynamic);
params.logits_file = json_value(jparams, "logits_file", default_params.logits_file);
params.lora_adapter = json_value(jparams, "lora_adapter", default_params.lora_adapter);
- params.lora_base = json_value(jparams, "lora_base", default_params.lora_base);
params.embedding = json_value(jparams, "embedding", default_params.embedding);
params.escape = json_value(jparams, "escape", default_params.escape);
params.cont_batching = json_value(jparams, "cont_batching", default_params.cont_batching);
@@ -2592,8 +2744,8 @@ static void server_params_parse(json jparams, server_params &sparams, gpt_params
params.use_mmap = json_value(jparams, "use_mmap", default_params.use_mmap);
params.use_mlock = json_value(jparams, "use_mlock", default_params.use_mlock);
params.no_kv_offload = json_value(jparams, "no_kv_offload", default_params.no_kv_offload);
- sparams.system_prompt = json_value(jparams, "system_prompt", default_sparams.system_prompt);
- sparams.chat_template = json_value(jparams, "chat_template", default_sparams.chat_template);
+ params.system_prompt = json_value(jparams, "system_prompt", default_params.system_prompt);
+ params.chat_template = json_value(jparams, "chat_template", default_params.chat_template);
if (jparams.contains("n_gpu_layers"))
{
diff --git a/src/main/cpp/utils.hpp b/src/main/cpp/utils.hpp
index ad7198c1..7de7eac4 100644
--- a/src/main/cpp/utils.hpp
+++ b/src/main/cpp/utils.hpp
@@ -97,10 +97,7 @@ static inline void server_log(ggml_log_level level, const char *function, int li
json log = json{
{"msg", message},
#if SERVER_VERBOSE
- {"ts", time(nullptr)},
- {"level", log_level_to_string(level)},
- {"tid", ss_tid.str()},
- {"function", function},
+ {"ts", time(nullptr)}, {"level", log_level_to_string(level)}, {"tid", ss_tid.str()}, {"function", function},
{"line", line},
#endif
};
@@ -135,9 +132,7 @@ static inline void server_log(ggml_log_level level, const char *function, int li
}
#if SERVER_VERBOSE
- ss << " | ts " << time(nullptr)
- << " | tid " << ss_tid.str()
- << " | " << function << " line " << line;
+ ss << " | ts " << time(nullptr) << " | tid " << ss_tid.str() << " | " << function << " line " << line;
#endif
const std::string str = ss.str();
@@ -157,50 +152,51 @@ static inline void server_log(ggml_log_level level, const char *function, int li
// chat template utils
//
-// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-inline bool verify_custom_template(const std::string &tmpl)
-{
- llama_chat_message chat[] = {{"user", "test"}};
- int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
- return res >= 0;
-}
-
// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model *model, const std::string &tmpl,
                                const std::vector<json> &messages)
{
- size_t alloc_size = 0;
- // vector holding all allocated string to be passed to llama_chat_apply_template
-    std::vector<std::string> str(messages.size() * 2);
-    std::vector<llama_chat_message> chat(messages.size());
+    std::vector<llama_chat_msg> chat;
for (size_t i = 0; i < messages.size(); ++i)
{
const auto &curr_msg = messages[i];
- str[i * 2 + 0] = json_value(curr_msg, "role", std::string(""));
- str[i * 2 + 1] = json_value(curr_msg, "content", std::string(""));
- alloc_size += str[i * 2 + 1].length();
- chat[i].role = str[i * 2 + 0].c_str();
- chat[i].content = str[i * 2 + 1].c_str();
- }
- const char *ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
-    std::vector<char> buf(alloc_size * 2);
+ std::string role = json_value(curr_msg, "role", std::string(""));
- // run the first time to get the total output length
- int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
+ std::string content;
+ if (curr_msg.contains("content"))
+ {
+ if (curr_msg["content"].is_string())
+ {
+                content = curr_msg["content"].get<std::string>();
+ }
+ else if (curr_msg["content"].is_array())
+ {
+ for (const auto &part : curr_msg["content"])
+ {
+ if (part.contains("text"))
+ {
+                        content += "\n" + part["text"].get<std::string>();
+ }
+ }
+ }
+ else
+ {
+ throw std::runtime_error(
+ "Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
+ }
+ }
+ else
+ {
+ throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
+ }
- // if it turns out that our buffer is too small, we resize it
- if ((size_t)res > buf.size())
- {
- buf.resize(res);
- res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
+ chat.push_back({role, content});
}
- const std::string formatted_chat(buf.data(), res);
-
+ auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
-
return formatted_chat;
}
@@ -322,6 +318,16 @@ static size_t common_part(const std::vector<llama_token> &a, const std::vector<llama_token> &b)
     return i;
 }
 
+static size_t common_part(const std::string &a, const std::string &b)
+{
+    size_t i;
+    for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++)
+    {
+    }
+
+    return i;
+}
+
 static bool ends_with(const std::string &str, const std::string &suffix)
 {
     return str.size() >= suffix.size() && 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
@@ -432,24 +438,6 @@ static json oaicompat_completion_params_parse(const struct llama_model *model,
llama_params["__oaicompat"] = true;
- // Map OpenAI parameters to llama.cpp parameters
- //
- // For parameters that are defined by the OpenAI documentation (e.g.
- // temperature), we explicitly specify OpenAI's intended default; we
- // need to do that because sometimes OpenAI disagrees with llama.cpp
- //
- // https://platform.openai.com/docs/api-reference/chat/create
- llama_sampling_params default_sparams;
- llama_params["model"] = json_value(body, "model", std::string("unknown"));
- llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
- llama_params["logit_bias"] = json_value(body, "logit_bias", json::object());
- llama_params["n_predict"] = json_value(body, "max_tokens", -1);
- llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
- llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
- llama_params["stream"] = json_value(body, "stream", false);
- llama_params["temperature"] = json_value(body, "temperature", 1.0);
- llama_params["top_p"] = json_value(body, "top_p", 1.0);
-
// Apply chat template to the list of messages
llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
@@ -674,7 +662,7 @@ static json format_embeddings_response_oaicompat(const json &request, const json
{
json data = json::array();
int i = 0;
- for (const auto &elem : embeddings)
+ for (auto &elem : embeddings)
{
data.push_back(
json{{"embedding", json_value(elem, "embedding", json::array())}, {"index", i++}, {"object", "embedding"}});
diff --git a/src/main/java/de/kherud/llama/LlamaLoader.java b/src/main/java/de/kherud/llama/LlamaLoader.java
index 5aa84001..a0239d20 100644
--- a/src/main/java/de/kherud/llama/LlamaLoader.java
+++ b/src/main/java/de/kherud/llama/LlamaLoader.java
@@ -62,6 +62,7 @@ static synchronized void initialize() throws UnsatisfiedLinkError {
System.err.println("'ggml-metal.metal' not found");
}
}
+ loadNativeLibrary("ggml");
loadNativeLibrary("llama");
loadNativeLibrary("jllama");
extracted = true;
@@ -96,12 +97,7 @@ private static void cleanPath(Path path) {
private static void loadNativeLibrary(String name) {
         List<String> triedPaths = new LinkedList<>();
- // Try loading library from de.kherud.llama.lib.path library path
- String nativeLibName = System.getProperty("de.kherud.llama.lib.name");
- if (nativeLibName == null) {
- nativeLibName = System.mapLibraryName(name);
- }
-
+ String nativeLibName = System.mapLibraryName(name);
String nativeLibPath = System.getProperty("de.kherud.llama.lib.path");
if (nativeLibPath != null) {
Path path = Paths.get(nativeLibPath, nativeLibName);
@@ -125,21 +121,7 @@ private static void loadNativeLibrary(String name) {
}
}
- // Load the os-dependent library from the jar file
- nativeLibPath = getNativeResourcePath();
- if (hasNativeLib(nativeLibPath, nativeLibName)) {
- // temporary library folder
- String tempFolder = getTempDir().getAbsolutePath();
- // Try extracting the library from jar
- if (extractAndLoadLibraryFile(nativeLibPath, nativeLibName, tempFolder)) {
- return;
- }
- else {
- triedPaths.add(nativeLibPath);
- }
- }
-
- // As a last resort try from java.library.path
+ // Try to load the library from java.library.path
String javaLibraryPath = System.getProperty("java.library.path", "");
for (String ldPath : javaLibraryPath.split(File.pathSeparator)) {
if (ldPath.isEmpty()) {
@@ -154,6 +136,20 @@ private static void loadNativeLibrary(String name) {
}
}
+ // As a last resort try load the os-dependent library from the jar file
+ nativeLibPath = getNativeResourcePath();
+ if (hasNativeLib(nativeLibPath, nativeLibName)) {
+ // temporary library folder
+ String tempFolder = getTempDir().getAbsolutePath();
+ // Try extracting the library from jar
+ if (extractAndLoadLibraryFile(nativeLibPath, nativeLibName, tempFolder)) {
+ return;
+ }
+ else {
+ triedPaths.add(nativeLibPath);
+ }
+ }
+
throw new UnsatisfiedLinkError(
String.format(
"No native library found for os.name=%s, os.arch=%s, paths=[%s]",
diff --git a/src/main/java/de/kherud/llama/ModelParameters.java b/src/main/java/de/kherud/llama/ModelParameters.java
index 1cbb6973..3b34d3f3 100644
--- a/src/main/java/de/kherud/llama/ModelParameters.java
+++ b/src/main/java/de/kherud/llama/ModelParameters.java
@@ -32,7 +32,6 @@ public final class ModelParameters extends JsonParameters {
private static final String PARAM_SPLIT_MODE = "split_mode";
private static final String PARAM_MAIN_GPU = "main_gpu";
private static final String PARAM_TENSOR_SPLIT = "tensor_split";
- private static final String PARAM_N_BEAMS = "n_beams";
private static final String PARAM_GRP_ATTN_N = "grp_attn_n";
private static final String PARAM_GRP_ATTN_W = "grp_attn_w";
private static final String PARAM_ROPE_FREQ_BASE = "rope_freq_base";
@@ -55,7 +54,6 @@ public final class ModelParameters extends JsonParameters {
private static final String PARAM_LOOKUP_CACHE_STATIC = "lookup_cache_static";
private static final String PARAM_LOOKUP_CACHE_DYNAMIC = "lookup_cache_dynamic";
private static final String PARAM_LORA_ADAPTER = "lora_adapter";
- private static final String PARAM_LORA_BASE = "lora_base";
private static final String PARAM_EMBEDDING = "embedding";
private static final String PARAM_CONT_BATCHING = "cont_batching";
private static final String PARAM_FLASH_ATTENTION = "flash_attn";
@@ -244,14 +242,6 @@ public ModelParameters setTensorSplit(float[] tensorSplit) {
return this;
}
- /**
- * Set usage of beam search of given width if non-zero.
- */
- public ModelParameters setNBeams(int nBeams) {
- parameters.put(PARAM_N_BEAMS, String.valueOf(nBeams));
- return this;
- }
-
/**
* Set the group-attention factor (default: 1)
*/
@@ -484,14 +474,6 @@ public ModelParameters setLoraAdapters(Map<String, Float> loraAdapters) {
return this;
}
- /**
- * Set an optional model to use as a base for the layers modified by the LoRA adapter
- */
- public ModelParameters setLoraBase(String loraBase) {
- parameters.put(PARAM_LORA_BASE, toJsonString(loraBase));
- return this;
- }
-
/**
* Whether to load model with embedding support
*/
diff --git a/src/test/java/de/kherud/llama/LlamaModelTest.java b/src/test/java/de/kherud/llama/LlamaModelTest.java
index a5454c59..c7ece673 100644
--- a/src/test/java/de/kherud/llama/LlamaModelTest.java
+++ b/src/test/java/de/kherud/llama/LlamaModelTest.java
@@ -1,7 +1,6 @@
package de.kherud.llama;
import java.io.*;
-import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Pattern;
@@ -24,6 +23,7 @@ public static void setup() {
// LlamaModel.setLogger(LogFormat.TEXT, (level, msg) -> System.out.println(level + ": " + msg));
model = new LlamaModel(
new ModelParameters()
+ .setNCtx(128)
.setModelFilePath("models/codellama-7b.Q2_K.gguf")
// .setModelUrl("https://huggingface.co/TheBloke/CodeLlama-7B-GGUF/resolve/main/codellama-7b.Q2_K.gguf")
.setNGpuLayers(43)