From e8b4f32da3e2fec87d737da6a45c79cafd93a895 Mon Sep 17 00:00:00 2001
From: hongruichen
Date: Tue, 21 May 2024 22:41:43 +0800
Subject: [PATCH 1/8] passthru rpc_servers params wip

---
 llama_cpp/llama.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 6d872e3e6..1c7bfaec1 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -71,6 +71,7 @@ def __init__(
         split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER,
         main_gpu: int = 0,
         tensor_split: Optional[List[float]] = None,
+        rpc_servers: Optional[str] = None,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
@@ -149,6 +150,7 @@ def __init__(
             split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
             main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored
             tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
+            rpc_servers: Comma separated list of RPC servers to use for offloading
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
@@ -220,6 +222,7 @@ def __init__(
         )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
         self.model_params.split_mode = split_mode
         self.model_params.main_gpu = main_gpu
+        self.model_params.rpc_servers = rpc_servers
         self.tensor_split = tensor_split
         self._c_tensor_split = None
         if self.tensor_split is not None:

From 9e1d80f1d08b9c0d25071df77323ce8d07a3f208 Mon Sep 17 00:00:00 2001
From: hongruichen
Date: Tue, 21 May 2024 22:42:38 +0800
Subject: [PATCH 2/8] enable llama rpc by default

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4b9508d40..5d3373970 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,7 @@ option(LLAVA_BUILD "Build llava shared library and install alongside python pack
 
 if (LLAMA_BUILD)
     set(BUILD_SHARED_LIBS "On")
+    set(LLAMA_RPC "On" CACHE BOOL "llama: use RPC" FORCE)
 
     # Building llama
     if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")

From fd7bcc951deca85085c04da63f2ddf19751aaab4 Mon Sep 17 00:00:00 2001
From: hongruichen
Date: Tue, 21 May 2024 23:18:04 +0800
Subject: [PATCH 3/8] convert string to byte

---
 llama_cpp/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 1c7bfaec1..c09c6ed8f 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -222,7 +222,7 @@ def __init__(
         )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
         self.model_params.split_mode = split_mode
         self.model_params.main_gpu = main_gpu
-        self.model_params.rpc_servers = rpc_servers
+        self.model_params.rpc_servers = rpc_servers.encode('utf-8')
         self.tensor_split = tensor_split
         self._c_tensor_split = None
         if self.tensor_split is not None:

From 2f7f83e1215950e7e22f1691e3063259623d4f44 Mon Sep 17 00:00:00 2001
From: hongruichen
Date: Wed, 22 May 2024 21:12:43 +0800
Subject: [PATCH 4/8] add rpc package

---
 Makefile | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Makefile b/Makefile
index 3796d17e3..d8fb0cc27 100644
--- a/Makefile
+++ b/Makefile
@@ -45,6 +45,9 @@ build.kompute:
 build.sycl:
 	CMAKE_ARGS="-DLLAMA_SYCL=on" python3 -m pip install --verbose -e .
 
+build.rpc:
+	CMAKE_ARGS="-DLLAMA_RPC=on" python3 -m pip install --verbose -e .
+
 build.sdist:
 	python3 -m build --sdist
 

From aeebfba8600dc3d617b1da8eedb5156b020577a3 Mon Sep 17 00:00:00 2001
From: hongruichen
Date: Thu, 23 May 2024 10:09:27 +0800
Subject: [PATCH 5/8] Revert "enable llama rpc by default"

This reverts commit 832c6dd56c979514cec5df224bf2d2014dccd790.
---
 CMakeLists.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5d3373970..4b9508d40 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,7 +7,6 @@ option(LLAVA_BUILD "Build llava shared library and install alongside python pack
 
 if (LLAMA_BUILD)
     set(BUILD_SHARED_LIBS "On")
-    set(LLAMA_RPC "On" CACHE BOOL "llama: use RPC" FORCE)
 
     # Building llama
     if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")

From ff88fcbc2041ac38b3ac401062779cabd3559294 Mon Sep 17 00:00:00 2001
From: hongruichen
Date: Wed, 29 May 2024 12:18:51 +0800
Subject: [PATCH 6/8] update readme

---
 README.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/README.md b/README.md
index 342e924a0..0f7abfb28 100644
--- a/README.md
+++ b/README.md
@@ -221,6 +221,17 @@ CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pi
 ```
 </details>
 
+<details>
+<summary>RPC</summary>
+
+To install with RPC support, set the `LLAMA_RPC=on` environment variable before installing:
+
+```bash
+source /opt/intel/oneapi/setvars.sh
+CMAKE_ARGS="-DLLAMA_RPC=on" pip install llama-cpp-python
+```
+</details>
+
 ### Windows Notes
 
 <details>

From 1e42468a271c81db242c5d6471826b6052351804 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 4 Jun 2024 10:37:02 -0400
Subject: [PATCH 7/8] Only set rpc_servers when provided

---
 llama_cpp/llama.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index c09c6ed8f..536a43515 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -222,7 +222,11 @@ def __init__(
         )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
         self.model_params.split_mode = split_mode
         self.model_params.main_gpu = main_gpu
-        self.model_params.rpc_servers = rpc_servers.encode('utf-8')
+        if rpc_servers is not None:
+            self.model_params.rpc_servers = rpc_servers.encode('utf-8')
+            self._rpc_servers = rpc_servers
+        else:
+            self._rpc_servers = None
         self.tensor_split = tensor_split
         self._c_tensor_split = None
         if self.tensor_split is not None:

From 2b5438d71bd82e6d9c93ba34dfd90cd4e4eb4b12 Mon Sep 17 00:00:00 2001
From: Andrei Betlen
Date: Tue, 4 Jun 2024 10:37:20 -0400
Subject: [PATCH 8/8] Add rpc servers to server options

---
 llama_cpp/server/model.py    | 1 +
 llama_cpp/server/settings.py | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py
index 4f8371668..d4d4acbe3 100644
--- a/llama_cpp/server/model.py
+++ b/llama_cpp/server/model.py
@@ -226,6 +226,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama:
         use_mmap=settings.use_mmap,
         use_mlock=settings.use_mlock,
         kv_overrides=kv_overrides,
+        rpc_servers=settings.rpc_servers,
         # Context Params
         seed=settings.seed,
         n_ctx=settings.n_ctx,
diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py
index a3e185007..4d924f337 100644
--- a/llama_cpp/server/settings.py
+++ b/llama_cpp/server/settings.py
@@ -58,6 +58,10 @@ class ModelSettings(BaseSettings):
         default=None,
         description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.",
     )
+    rpc_servers: Optional[str] = Field(
+        default=None,
+        description="comma seperated list of rpc servers for offloading",
+    )
     # Context Params
     seed: int = Field(
         default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."
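
Usage sketch (not part of the patches above): a minimal example of the `rpc_servers` parameter introduced in this series. It assumes llama-cpp-python was built with `CMAKE_ARGS="-DLLAMA_RPC=on"` and that llama.cpp rpc-server instances are already listening at the addresses shown; the model path and host:port values are placeholders.

    # Offload layers to remote RPC backends via the new rpc_servers parameter.
    from llama_cpp import Llama

    llm = Llama(
        model_path="./models/model-q4_0.gguf",                # placeholder model path
        n_gpu_layers=-1,                                      # offload all layers to the backends
        rpc_servers="192.168.1.10:50052,192.168.1.11:50052",  # comma separated host:port list (placeholders)
    )

    out = llm("Q: Name the planets in the solar system. A:", max_tokens=32)
    print(out["choices"][0]["text"])

The same comma separated list can be supplied to the OpenAI-compatible server through the `rpc_servers` setting added in PATCH 8/8.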