From 9bede1043e0a68361089cb8ff60e3806c10d984c Mon Sep 17 00:00:00 2001
From: Xu Song
Date: Mon, 5 Aug 2024 18:23:51 +0800
Subject: [PATCH 1/5] return completion_tokens and completion_text

---
 llama_cpp/llama.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 0cb5ca2fc..9de48780b 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -778,8 +778,10 @@ def generate(
                     break
             if longest_prefix > 0:
                 if self.verbose:
-                    print("Llama.generate: prefix-match hit", file=sys.stderr)
+                    print(f"Llama.generate: {longest_prefix} prefix-match hit, remains {len(tokens)-longest_prefix} to forward", file=sys.stderr)
                 reset = False
+                if len(tokens)-longest_prefix > 10:
+                    print("Maybe a new session", json.dumps({"tokens": tokens, "input_ids": self._input_ids.tolist()}))
                 tokens = tokens[longest_prefix:]
                 self.n_tokens = longest_prefix
@@ -1344,6 +1346,8 @@ def logit_bias_processor(
                                 prev_tokens=prompt_tokens
                                 + completion_tokens[:returned_tokens],
                             ).decode("utf-8", errors="ignore"),
+                            "completion_text": all_text.decode("utf-8", errors="ignore"),
+                            "completion_tokens": completion_tokens,
                             "index": 0,
                             "logprobs": logprobs_or_none,
                             "finish_reason": None,
@@ -1386,6 +1390,8 @@ def logit_bias_processor(
                     "choices": [
                         {
                             "text": ts,
+                            "completion_text": all_text.decode("utf-8", errors="ignore"),
+                            "completion_tokens": completion_tokens,
                             "index": 0,
                             "logprobs": None,
                             "finish_reason": None,
@@ -1480,6 +1486,8 @@ def logit_bias_processor(
                             "text": last_text[
                                 : len(last_text) - (token_end_position - end)
                             ].decode("utf-8", errors="ignore"),
+                            "completion_text": all_text.decode("utf-8", errors="ignore"),
+                            "completion_tokens": completion_tokens,
                             "index": 0,
                             "logprobs": logprobs_or_none,
                             "finish_reason": None,
@@ -1498,6 +1506,8 @@ def logit_bias_processor(
                             "text": self.detokenize([token]).decode(
                                 "utf-8", errors="ignore"
                             ),
+                            "completion_text": all_text.decode("utf-8", errors="ignore"),
+                            "completion_tokens": completion_tokens,
                             "index": 0,
                             "logprobs": logprobs_or_none,
                             "finish_reason": None,
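
A rough sketch of how a streaming caller could pick up the two per-chunk fields added in PATCH 1. The model path and prompt are placeholders, and the extra keys ("completion_text", "completion_tokens") assume the patched build above rather than upstream llama-cpp-python:

    from llama_cpp import Llama

    # Placeholder model path; any local GGUF model would do for this sketch.
    llm = Llama(model_path="./models/model.gguf", n_ctx=2048, verbose=False)

    full_text, full_tokens = "", []
    for chunk in llm.create_completion("Q: Name the planets. A:", max_tokens=32, stream=True):
        choice = chunk["choices"][0]
        print(choice["text"], end="", flush=True)                   # incremental delta, unchanged behaviour
        full_text = choice.get("completion_text", full_text)        # running text so far (field added by this patch)
        full_tokens = choice.get("completion_tokens", full_tokens)  # running token ids so far (field added by this patch)

    print()
    print("tokens streamed:", len(full_tokens))
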
From 7908bd52d605511f9d6ccca6eb85526ad57e4ccf Mon Sep 17 00:00:00 2001
From: Xu Song
Date: Tue, 6 Aug 2024 13:37:47 +0800
Subject: [PATCH 2/5] Update llama.py

---
 llama_cpp/llama.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 9de48780b..c025ca2d7 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -1415,17 +1415,17 @@ def logit_bias_processor(
 
         if stream:
             remaining_tokens = completion_tokens[returned_tokens:]
-            all_text = self.detokenize(
+            remaining_text = self.detokenize(
                 remaining_tokens,
                 prev_tokens=prompt_tokens + completion_tokens[:returned_tokens],
             )
-            any_stop = [s for s in stop_sequences if s in all_text]
+            any_stop = [s for s in stop_sequences if s in remaining_text]
             if len(any_stop) > 0:
-                end = min(all_text.index(stop) for stop in any_stop)
+                end = min(remaining_text.index(stop) for stop in any_stop)
             else:
-                end = len(all_text)
+                end = len(remaining_text)
 
-            token_end_position = 0
+            token_end_position = 0  # there is a problem here:
             for token in remaining_tokens:
                 token_end_position += len(
                     self.detokenize(
@@ -1486,7 +1486,7 @@ def logit_bias_processor(
                             "text": last_text[
                                 : len(last_text) - (token_end_position - end)
                             ].decode("utf-8", errors="ignore"),
-                            "completion_text": all_text.decode("utf-8", errors="ignore"),
+                            "completion_text": text.decode("utf-8", errors="ignore"),
                             "completion_tokens": completion_tokens,
                             "index": 0,
                             "logprobs": logprobs_or_none,
@@ -1506,7 +1506,7 @@ def logit_bias_processor(
                             "text": self.detokenize([token]).decode(
                                 "utf-8", errors="ignore"
                             ),
-                            "completion_text": all_text.decode("utf-8", errors="ignore"),
+                            "completion_text": text.decode("utf-8", errors="ignore"),
                             "completion_tokens": completion_tokens,
                             "index": 0,
                             "logprobs": logprobs_or_none,

From 88ccdef1809f1339014eaa799c380caffc6d4904 Mon Sep 17 00:00:00 2001
From: Xu Song
Date: Tue, 6 Aug 2024 18:01:08 +0800
Subject: [PATCH 3/5] Update llama.py

---
 llama_cpp/llama.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index c025ca2d7..dad077ee8 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -778,11 +778,15 @@ def generate(
                     break
             if longest_prefix > 0:
                 if self.verbose:
-                    print(f"Llama.generate: {longest_prefix} prefix-match hit, remains {len(tokens)-longest_prefix} to forward", file=sys.stderr)
+                    print(f"Llama.generate: {longest_prefix} prefix-match hit, "
+                          f"remains {len(tokens)-longest_prefix} prompt tokens to eval", file=sys.stderr)
                 reset = False
-                if len(tokens)-longest_prefix > 10:
-                    print("Maybe a new session", json.dumps({"tokens": tokens, "input_ids": self._input_ids.tolist()}))
                 tokens = tokens[longest_prefix:]
+                if len(tokens) > 0:
+                    print("Maybe a new session", json.dumps(
+                        {"content": self.detokenize(tokens).decode('utf-8'),
+                         "tokens": tokens,
+                         "input_ids": self._input_ids.tolist()}))
                 self.n_tokens = longest_prefix
 
         # Reset the model state
@@ -1225,15 +1229,17 @@ def logit_bias_processor(
             grammar=grammar,
         ):
            assert self._model.model is not None
+
+            # The returned text should not include the EOG token, but the returned tokens
+            # need to include EOG and stop tokens: (1) they are kept for the kv-cache, (2) they are easier to concatenate after being returned.
+            completion_tokens.append(token)
+
             if llama_cpp.llama_token_is_eog(self._model.model, token):
-                text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
+                text = self.detokenize(completion_tokens[:-1], prev_tokens=prompt_tokens)
                 finish_reason = "stop"
                 break
 
-            completion_tokens.append(token)
-
             all_text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
-
             # Contains multi-byte UTF8
             for k, char in enumerate(all_text[-3:]):
                 k = 3 - k

From ba3395c6489ad2945a5d37da5be83417a291566a Mon Sep 17 00:00:00 2001
From: Xu Song
Date: Sun, 25 Aug 2024 08:44:22 +0800
Subject: [PATCH 4/5] Fix memory allocation

---
 llama_cpp/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index dad077ee8..5bb29ec5d 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -2055,7 +2055,7 @@ def load_state(self, state: LlamaState) -> None:
         assert self._ctx.ctx is not None
         # Only filling in up to `n_tokens` and then zero-ing out the rest
         self.scores[: state.n_tokens, :] = state.scores.copy()
-        self.scores[state.n_tokens :, :] = 0.0
+        # self.scores[state.n_tokens :, :] = 0.0
         self.input_ids = state.input_ids.copy()
         self.n_tokens = state.n_tokens
         state_size = state.llama_state_size
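
For reference, a minimal sketch of the save_state()/load_state() round trip that PATCH 4 touches; the model path is a placeholder. The change above only skips re-zeroing the logits rows past n_tokens when a saved state is restored:

    from llama_cpp import Llama

    # Placeholder model path.
    llm = Llama(model_path="./models/model.gguf", n_ctx=2048, verbose=False)

    llm.create_completion("The quick brown fox", max_tokens=8)
    state = llm.save_state()      # snapshot of input_ids, n_tokens, scores and the llama state

    llm.create_completion(" jumps over", max_tokens=8)
    llm.load_state(state)         # roll back; only scores[: state.n_tokens] are copied back in
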
From 63bd763e02cfd86240331f2e7dccfdf314364296 Mon Sep 17 00:00:00 2001
From: Xu Song
Date: Sun, 25 Aug 2024 10:31:18 +0800
Subject: [PATCH 5/5] Update llama.py

---
 llama_cpp/llama.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 5bb29ec5d..a792a6482 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -786,7 +786,7 @@ def generate(
                     print("Maybe a new session", json.dumps(
                         {"content": self.detokenize(tokens).decode('utf-8'),
                          "tokens": tokens,
-                         "input_ids": self._input_ids.tolist()}))
+                         "input_ids": self._input_ids.tolist()}, ensure_ascii=False))
                 self.n_tokens = longest_prefix
 
         # Reset the model state
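
The last patch only changes how the debug payload is serialized. A small standalone illustration of what ensure_ascii=False changes in json.dumps output, since the detokenized content may contain non-ASCII text:

    import json

    payload = {"content": "你好, world"}
    print(json.dumps(payload))                      # {"content": "\u4f60\u597d, world"}
    print(json.dumps(payload, ensure_ascii=False))  # {"content": "你好, world"}
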