Commit 9f14fd2

Merge branch 'main' into remove-unwanted-bos

2 parents: a6e5917 + 951e39c

18 files changed: +262 −81 lines

.github/workflows/build-and-release.yaml

Lines changed: 2 additions & 2 deletions

@@ -29,7 +29,7 @@ jobs:
           python -m pip install -e .[all]
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.17.0
+        uses: pypa/cibuildwheel@v2.18.1
         env:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""
@@ -56,7 +56,7 @@ jobs:
           platforms: linux/arm64
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.17.0
+        uses: pypa/cibuildwheel@v2.18.1
         env:
           CIBW_SKIP: "*musllinux* pp*"
           CIBW_REPAIR_WHEEL_COMMAND: ""

CHANGELOG.md

Lines changed: 37 additions & 0 deletions

@@ -7,6 +7,43 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.2.77]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@bde7cd3cd949c1a85d3a199498ac98e78039d46f
+- fix: string value kv_overrides by @abetlen in df45a4b3fe46e72664bda87301b318210c6d4782
+- fix: Fix typo in Llama3VisionAlphaChatHandler by @abetlen in 165b4dc6c188f8fda2fc616154e111f710484eba
+- fix: Use numpy recarray for candidates data, fixes bug with temp < 0 by @abetlen in af3ed503e9ce60fe6b5365031abad4176a3536b3
+- fix: Disable Windows+CUDA workaround when compiling for HIPBLAS by Engininja2 in #1493
+
+## [0.2.76]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@0df0aa8e43c3378975269a51f9b876c8692e70da
+- feat: Improve Llama.eval performance by avoiding list conversion by @thoughtp0lice in #1476
+- example: LLM inference with Ray Serve by @rgerganov in #1465
+
+## [0.2.75]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@13ad16af1231ab2d245d35df3295bcfa23de1305
+- fix: segfault for models without eos / bos tokens by @abetlen in d99a6ba607a4885fb00e63e967964aa41bdbbbcb
+- feat: add MinTokensLogitProcessor and min_tokens argument to server by @twaka in #1333
+- misc: Remove unnecessary metadata lookups by @CISC in #1448
+
+## [0.2.74]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@b228aba91ac2cd9eb90e9d423ba1d0d20e0117e2
+- fix: Enable CUDA backend for llava by @abetlen in 7f59856fa6f3e23f07e12fc15aeb9359dc6c3bb4
+- docs: Fix typo in README.md by @yupbank in #1444
+
+## [0.2.73]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@25c6e82e7a1ad25a42b0894e87d9b5c557409516
+- fix: Clear kv cache at beginning of image chat formats to avoid bug when image is evaluated first by @abetlen in ac55d0a175115d1e719672ce1cb1bec776c738b1
+
+## [0.2.72]
+
+- fix(security): Remote Code Execution by Server-Side Template Injection in Model Metadata by @retr0reg in b454f40a9a1787b2b5659cd2cb00819d983185df
+- fix(security): Update remaining jinja chat templates to use immutable sandbox by @CISC in #1441
+
 ## [0.2.71]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@911b3900dded9a1cfe0f0e41b82c7a29baf3a217

CMakeLists.txt

Lines changed: 12 additions & 9 deletions

@@ -41,18 +41,21 @@ if (LLAMA_BUILD)
         RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
     )
     # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563
-    install(
-        FILES $<TARGET_RUNTIME_DLLS:llama>
-        DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
-    )
-    install(
-        FILES $<TARGET_RUNTIME_DLLS:llama>
-        DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
-    )
+    if (WIN32 AND (LLAMA_CUDA OR LLAMA_CUBLAS))
+        install(
+            FILES $<TARGET_RUNTIME_DLLS:llama>
+            DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
+        )
+        install(
+            FILES $<TARGET_RUNTIME_DLLS:llama>
+            DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+        )
+    endif()
 
     if (LLAVA_BUILD)
-        if (LLAMA_CUBLAS)
+        if (LLAMA_CUBLAS OR LLAMA_CUDA)
             add_compile_definitions(GGML_USE_CUBLAS)
+            add_compile_definitions(GGML_USE_CUDA)
         endif()
 
         if (LLAMA_METAL)

Makefile

Lines changed: 8 additions & 2 deletions

@@ -13,10 +13,16 @@ build:
 	python3 -m pip install --verbose -e .
 
 build.debug:
-	CMAKE_ARGS="-DCMAKE_BUILD_TYPE=Debug" python3 -m pip install --verbose --config-settings=cmake.verbose=true --config-settings=logging.level=INFO --config-settings=install.strip=false --editable .
+	python3 -m pip install \
+		--verbose \
+		--config-settings=cmake.verbose=true \
+		--config-settings=logging.level=INFO \
+		--config-settings=install.strip=false \
+		--config-settings=cmake.args="-DCMAKE_BUILD_TYPE=Debug;-DCMAKE_C_FLAGS='-ggdb -O0';-DCMAKE_CXX_FLAGS='-ggdb -O0'" \
+		--editable .
 
 build.cuda:
-	CMAKE_ARGS="-DLLAMA_CUBLAS=on" python3 -m pip install --verbose -e .
+	CMAKE_ARGS="-DLLAMA_CUDA=on" python3 -m pip install --verbose -e .
 
 build.opencl:
 	CMAKE_ARGS="-DLLAMA_CLBLAST=on" python3 -m pip install --verbose -e .

README.md

Lines changed: 10 additions & 7 deletions

@@ -499,13 +499,16 @@ llm = Llama.from_pretrained(
 
 `llama-cpp-python` supports such as llava1.5 which allow the language model to read information from both text and images.
 
-You'll first need to download one of the available multi-modal models in GGUF format:
+Below are the supported multi-modal models and their respective chat handlers (Python API) and chat formats (Server API).
 
-- [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b)
-- [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)
-- [bakllava-1-7b](https://huggingface.co/mys/ggml_bakllava-1)
-- [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf)
-- [moondream2](https://huggingface.co/vikhyatk/moondream2)
+| Model | `LlamaChatHandler` | `chat_format` |
+|:--- |:--- |:--- |
+| [llava-v1.5-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) | `Llava15ChatHandler` | `llava-1-5` |
+| [llava-v1.5-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) | `Llava15ChatHandler` | `llava-1-5` |
+| [llava-v1.6-34b](https://huggingface.co/cjpais/llava-v1.6-34B-gguf) | `Llava16ChatHandler` | `llava-1-6` |
+| [moondream2](https://huggingface.co/vikhyatk/moondream2) | `MoondreamChatHandler` | `moondream2` |
+| [nanollava](https://huggingface.co/abetlen/nanollava-gguf) | `NanollavaChatHandler` | `nanollava` |
+| [llama-3-vision-alpha](https://huggingface.co/abetlen/llama-3-vision-alpha-gguf) | `Llama3VisionAlphaChatHandler` | `llama-3-vision-alpha` |
 
 Then you'll need to use a custom chat handler to load the clip model and process the chat messages and images.
 
@@ -550,7 +553,7 @@ llm = Llama.from_pretrained(
     n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
 )
 
-respoonse = llm.create_chat_completion(
+response = llm.create_chat_completion(
     messages = [
         {
             "role": "user",

examples/ray/README.md

Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
+This is an example of doing LLM inference with [Ray](https://docs.ray.io/en/latest/index.html) and [Ray Serve](https://docs.ray.io/en/latest/serve/index.html).
+
+First, install the requirements:
+
+```bash
+$ pip install -r requirements.txt
+```
+
+Deploy a GGUF model to Ray Serve with the following command:
+
+```bash
+$ serve run llm:llm_builder model_path='../models/mistral-7b-instruct-v0.2.Q4_K_M.gguf'
+```
+
+This will start an API endpoint at `http://localhost:8000/`. You can query the model like this:
+
+```bash
+$ curl -k -d '{"prompt": "tell me a joke", "max_tokens": 128}' -X POST http://localhost:8000
+```
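The same request can be issued from Python instead of `curl`; the sketch below assumes the `requests` package is installed (it is not listed in the example's requirements.txt).

```python
import requests

# POST the prompt to the Ray Serve endpoint started by `serve run` above.
response = requests.post(
    "http://localhost:8000",
    json={"prompt": "tell me a joke", "max_tokens": 128},
)
print(response.json())
```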

examples/ray/llm.py

Lines changed: 20 additions & 0 deletions

@@ -0,0 +1,20 @@
+from starlette.requests import Request
+from typing import Dict
+from ray import serve
+from ray.serve import Application
+from llama_cpp import Llama
+
+@serve.deployment
+class LlamaDeployment:
+    def __init__(self, model_path: str):
+        self._llm = Llama(model_path=model_path)
+
+    async def __call__(self, http_request: Request) -> Dict:
+        input_json = await http_request.json()
+        prompt = input_json["prompt"]
+        max_tokens = input_json.get("max_tokens", 64)
+        return self._llm(prompt, max_tokens=max_tokens)
+
+
+def llm_builder(args: Dict[str, str]) -> Application:
+    return LlamaDeployment.bind(args["model_path"])

examples/ray/requirements.txt

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+ray[serve]
+--extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu
+llama-cpp-python

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.2.71"
+__version__ = "0.2.77"

llama_cpp/_internals.py

Lines changed: 9 additions & 13 deletions

@@ -553,13 +553,12 @@ def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool):
 class _LlamaTokenDataArray:
     def __init__(self, *, n_vocab: int):
         self.n_vocab = n_vocab
-        self.candidates_data = np.array(
-            [],
+        self.candidates_data = np.recarray(
+            (self.n_vocab,),
             dtype=np.dtype(
                 [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
             ),
         )
-        self.candidates_data.resize(3, self.n_vocab, refcheck=False)
         self.candidates = llama_cpp.llama_token_data_array(
             data=self.candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p),
             size=self.n_vocab,
@@ -569,14 +568,11 @@ def __init__(self, *, n_vocab: int):
         self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single)
 
     def copy_logits(self, logits: npt.NDArray[np.single]):
-        self.candidates_data["id"][:] = self.default_candidates_data_id
-        self.candidates_data["logit"][:] = logits
-        self.candidates_data["p"][:] = self.default_candidates_data_p
-        self.candidates.data = self.candidates_data.ctypes.data_as(
-            llama_cpp.llama_token_data_p
-        )
-        self.candidates.sorted = ctypes.c_bool(False)
-        self.candidates.size = ctypes.c_size_t(self.n_vocab)
+        self.candidates_data.id[:] = self.default_candidates_data_id
+        self.candidates_data.logit[:] = logits
+        self.candidates_data.p[:] = self.default_candidates_data_p
+        self.candidates.sorted = False
+        self.candidates.size = self.n_vocab
 
 
 # Python wrappers over common/common
@@ -767,14 +763,14 @@ def sample(
             self.params.penalty_present,
         )
         if not self.params.penalize_nl:
-            token_data_array.candidates_data["logit"][nl_token] = nl_logit
+            token_data_array.candidates_data.logit[nl_token] = nl_logit
 
         if self.grammar is not None:
             ctx_main.sample_grammar(token_data_array, self.grammar)
 
         if self.params.temp < 0:
             ctx_main.sample_softmax(token_data_array)
-            id = token_data_array.candidates_data["id"][0]
+            id = token_data_array.candidates_data.id[0]
         elif self.params.temp == 0:
             id = ctx_main.sample_token_greedy(token_data_array)
         else:
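The `_LlamaTokenDataArray` change above swaps a resized structured array for an `np.recarray`, so fields are addressed as attributes (`.id`, `.logit`, `.p`) over a single contiguous buffer. A standalone sketch of that layout, with a made-up vocabulary size:

```python
import numpy as np

# Same record layout as llama_token_data: one contiguous record per candidate token.
n_vocab = 8  # made-up size for illustration
candidates = np.recarray(
    (n_vocab,),
    dtype=np.dtype(
        [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
    ),
)

# Attribute-style field access writes into the shared buffer in place.
candidates.id[:] = np.arange(n_vocab, dtype=np.intc)
candidates.logit[:] = np.zeros(n_vocab, dtype=np.single)
candidates.p[:] = 0.0

print(candidates.id, candidates.logit.dtype)  # [0 1 2 3 4 5 6 7] float32
```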

llama_cpp/llama.py

Lines changed: 33 additions & 14 deletions

@@ -6,6 +6,7 @@
 import time
 import json
 import ctypes
+import typing
 import fnmatch
 import multiprocessing
 
@@ -249,24 +250,26 @@ def __init__(
             self._kv_overrides_array[i].key = k.encode("utf-8")
             if isinstance(v, bool):
                 self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
-                self._kv_overrides_array[i].value.bool_value = v
+                self._kv_overrides_array[i].value.val_bool = v
             elif isinstance(v, int):
                 self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT
-                self._kv_overrides_array[i].value.int_value = v
+                self._kv_overrides_array[i].value.val_i64 = v
             elif isinstance(v, float):
                 self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
-                self._kv_overrides_array[i].value.float_value = v
+                self._kv_overrides_array[i].value.val_f64 = v
             elif isinstance(v, str):  # type: ignore
                 v_bytes = v.encode("utf-8")
                 if len(v_bytes) > 128:  # TODO: Make this a constant
                     raise ValueError(f"Value for {k} is too long: {v}")
                 v_bytes = v_bytes.ljust(128, b"\0")
                 self._kv_overrides_array[i].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
                 # copy min(v_bytes, 128) to str_value
+                address = typing.cast(int, ctypes.addressof(self._kv_overrides_array[i].value) + llama_cpp.llama_model_kv_override_value.val_str.offset)
+                buffer_start = ctypes.cast(address, ctypes.POINTER(ctypes.c_char))
                 ctypes.memmove(
-                    self._kv_overrides_array[i].value.str_value,
+                    buffer_start,
                     v_bytes,
-                    min(len(v_bytes), 128),
+                    128,
                 )
             else:
                 raise ValueError(f"Unknown value type for {k}: {v}")
@@ -410,11 +413,11 @@ def __init__(
         if self.verbose:
             print(f"Model metadata: {self.metadata}", file=sys.stderr)
 
-        eos_token_id = int(self.metadata.get("tokenizer.ggml.eos_token_id", self.token_eos()))
-        bos_token_id = int(self.metadata.get("tokenizer.ggml.bos_token_id", self.token_bos()))
+        eos_token_id = self.token_eos()
+        bos_token_id = self.token_bos()
 
-        eos_token = self._model.token_get_text(eos_token_id)
-        bos_token = self._model.token_get_text(bos_token_id)
+        eos_token = self._model.token_get_text(eos_token_id) if eos_token_id != -1 else ""
+        bos_token = self._model.token_get_text(bos_token_id) if bos_token_id != -1 else ""
 
         # Unfortunately the llama.cpp API does not return metadata arrays, so we can't get template names from tokenizer.chat_templates
         template_choices = dict((name[10:], template) for name, template in self.metadata.items() if name.startswith("tokenizer.chat_template."))
@@ -562,12 +565,12 @@ def eval(self, tokens: Sequence[int]):
         if self.context_params.logits_all:
             rows = n_tokens
             cols = self._n_vocab
-            logits = self._ctx.get_logits()[: rows * cols]
+            logits = np.ctypeslib.as_array(self._ctx.get_logits(), shape=(rows * cols, ))
             self.scores[n_past : n_past + n_tokens, :].reshape(-1)[: :] = logits
         else:
             rows = 1
             cols = self._n_vocab
-            logits = self._ctx.get_logits()[: rows * cols]
+            logits = np.ctypeslib.as_array(self._ctx.get_logits(), shape=(rows * cols, ))
             self.scores[n_past + n_tokens - 1, :].reshape(-1)[: :] = logits
         # Update n_tokens
         self.n_tokens += n_tokens
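The `eval` change above replaces a ctypes slice (which copies element by element into a Python list) with `np.ctypeslib.as_array`, which views the C buffer directly. A minimal standalone sketch of that pattern, using a made-up buffer in place of `get_logits()`:

```python
import ctypes
import numpy as np

# Made-up stand-in for the C logits buffer returned by get_logits().
rows, cols = 1, 4
buf = (ctypes.c_float * (rows * cols))(0.1, 0.2, 0.3, 0.4)
ptr = ctypes.cast(buf, ctypes.POINTER(ctypes.c_float))

# Zero-copy view over the C memory; slicing the pointer would have copied into a list.
logits = np.ctypeslib.as_array(ptr, shape=(rows * cols,))
print(logits.dtype, logits)  # float32, approximately [0.1 0.2 0.3 0.4]
```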
@@ -961,9 +964,9 @@ def _create_completion(
 
         completion_id: str = f"cmpl-{str(uuid.uuid4())}"
         created: int = int(time.time())
-        prefix_token_id: int = int(self.metadata.get("tokenizer.ggml.prefix_token_id", self._model.token_prefix()))
-        middle_token_id: int = int(self.metadata.get("tokenizer.ggml.middle_token_id", self._model.token_middle()))
-        suffix_token_id: int = int(self.metadata.get("tokenizer.ggml.suffix_token_id", self._model.token_suffix()))
+        prefix_token_id: int = self._model.token_prefix()
+        middle_token_id: int = self._model.token_middle()
+        suffix_token_id: int = self._model.token_suffix()
         # If prompt is empty, initialize completion with BOS token to avoid
         # detokenization including a space at the beginning of the completion
         completion_tokens: List[int] = [] if len(prompt) > 0 else [self.token_bos()]
@@ -2087,3 +2090,19 @@ def __call__(
         self, input_ids: npt.NDArray[np.intc], logits: npt.NDArray[np.single]
     ) -> bool:
         return any([stopping_criteria(input_ids, logits) for stopping_criteria in self])
+
+
+class MinTokensLogitsProcessor(LogitsProcessor):
+    def __init__(self, min_tokens: int, token_eos: int):
+        self.min_tokens = min_tokens
+        self.token_eos = token_eos
+        self.prompt_tokens = None
+
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single]
+    ) -> npt.NDArray[np.single]:
+        if self.prompt_tokens is None:
+            self.prompt_tokens = len(input_ids)
+        if len(input_ids) - self.prompt_tokens < self.min_tokens:
+            scores[self.token_eos] = -np.inf
+        return scores
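As a usage sketch, the new `MinTokensLogitsProcessor` can be passed to a completion call through a `LogitsProcessorList` to suppress EOS until a minimum number of new tokens has been generated; the model path below is a placeholder.

```python
from llama_cpp.llama import Llama, LogitsProcessorList, MinTokensLogitsProcessor

llm = Llama(model_path="./models/your-model.gguf")  # placeholder path

# Mask out EOS until at least 16 new tokens have been produced.
min_tokens = LogitsProcessorList(
    [MinTokensLogitsProcessor(min_tokens=16, token_eos=llm.token_eos())]
)
output = llm(
    "Q: Name the planets in the solar system. A:",
    max_tokens=64,
    logits_processor=min_tokens,
)
print(output["choices"][0]["text"])
```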
