diff --git a/Makefile b/Makefile
index 3796d17e3..d8fb0cc27 100644
--- a/Makefile
+++ b/Makefile
@@ -45,6 +45,9 @@ build.kompute:
 build.sycl:
 	CMAKE_ARGS="-DLLAMA_SYCL=on" python3 -m pip install --verbose -e .
 
+build.rpc:
+	CMAKE_ARGS="-DLLAMA_RPC=on" python3 -m pip install --verbose -e .
+
 build.sdist:
 	python3 -m build --sdist
 
diff --git a/README.md b/README.md
index 342e924a0..0f7abfb28 100644
--- a/README.md
+++ b/README.md
@@ -221,6 +221,16 @@ CMAKE_ARGS="-DLLAMA_SYCL=on -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx" pi
 ```
 </details>
 
+<details>
+<summary>RPC</summary>
+
+To install with RPC support, set the `LLAMA_RPC=on` environment variable before installing:
+
+```bash
+CMAKE_ARGS="-DLLAMA_RPC=on" pip install llama-cpp-python
+```
+</details>
+ ### Windows Notes diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 6d872e3e6..536a43515 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -71,6 +71,7 @@ def __init__( split_mode: int = llama_cpp.LLAMA_SPLIT_MODE_LAYER, main_gpu: int = 0, tensor_split: Optional[List[float]] = None, + rpc_servers: Optional[str] = None, vocab_only: bool = False, use_mmap: bool = True, use_mlock: bool = False, @@ -149,6 +150,7 @@ def __init__( split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options. main_gpu: main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split. + rpc_servers: Comma separated list of RPC servers to use for offloading vocab_only: Only load the vocabulary no weights. use_mmap: Use mmap if possible. use_mlock: Force the system to keep the model in RAM. @@ -220,6 +222,11 @@ def __init__( ) # 0x7FFFFFFF is INT32 max, will be auto set to all layers self.model_params.split_mode = split_mode self.model_params.main_gpu = main_gpu + if rpc_servers is not None: + self.model_params.rpc_servers = rpc_servers.encode('utf-8') + self._rpc_servers = rpc_servers + else: + self._rpc_servers = None self.tensor_split = tensor_split self._c_tensor_split = None if self.tensor_split is not None: diff --git a/llama_cpp/server/model.py b/llama_cpp/server/model.py index 4f8371668..d4d4acbe3 100644 --- a/llama_cpp/server/model.py +++ b/llama_cpp/server/model.py @@ -226,6 +226,7 @@ def load_llama_from_model_settings(settings: ModelSettings) -> llama_cpp.Llama: use_mmap=settings.use_mmap, use_mlock=settings.use_mlock, kv_overrides=kv_overrides, + rpc_servers=settings.rpc_servers, # Context Params seed=settings.seed, n_ctx=settings.n_ctx, diff --git a/llama_cpp/server/settings.py b/llama_cpp/server/settings.py index a3e185007..4d924f337 100644 --- a/llama_cpp/server/settings.py +++ b/llama_cpp/server/settings.py @@ -58,6 +58,10 @@ class ModelSettings(BaseSettings): default=None, description="List of model kv overrides in the format key=type:value where type is one of (bool, int, float). Valid true values are (true, TRUE, 1), otherwise false.", ) + rpc_servers: Optional[str] = Field( + default=None, + description="comma seperated list of rpc servers for offloading", + ) # Context Params seed: int = Field( default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random."