diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 0cb5ca2fc..a792a6482 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -778,9 +778,15 @@ def generate(
                     break
             if longest_prefix > 0:
                 if self.verbose:
-                    print("Llama.generate: prefix-match hit", file=sys.stderr)
+                    print(f"Llama.generate: {longest_prefix} prefix-match hit, "
+                          f"{len(tokens) - longest_prefix} prompt tokens remaining to eval", file=sys.stderr)
                 reset = False
                 tokens = tokens[longest_prefix:]
+                if len(tokens) > 0:
+                    print("Maybe a new session", json.dumps(
+                        {"content": self.detokenize(tokens).decode('utf-8'),
+                         "tokens": tokens,
+                         "input_ids": self._input_ids.tolist()}, ensure_ascii=False))
                 self.n_tokens = longest_prefix
 
         # Reset the model state
@@ -1223,15 +1229,17 @@ def logit_bias_processor(
             grammar=grammar,
         ):
             assert self._model.model is not None
+
+            # The returned text should not include the EOG token. The returned tokens,
+            # however, must include EOG and stop_tokens: (1) they are needed for the kv-cache, (2) they make concatenation easier after the result is returned.
+            completion_tokens.append(token)
+
             if llama_cpp.llama_token_is_eog(self._model.model, token):
-                text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
+                text = self.detokenize(completion_tokens[:-1], prev_tokens=prompt_tokens)
                 finish_reason = "stop"
                 break
 
-            completion_tokens.append(token)
-
             all_text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
-
             # Contains multi-byte UTF8
             for k, char in enumerate(all_text[-3:]):
                 k = 3 - k
@@ -1344,6 +1352,8 @@ def logit_bias_processor(
                                         prev_tokens=prompt_tokens
                                         + completion_tokens[:returned_tokens],
                                     ).decode("utf-8", errors="ignore"),
+                                    "completion_text": all_text.decode("utf-8", errors="ignore"),
+                                    "completion_tokens": completion_tokens,
                                     "index": 0,
                                     "logprobs": logprobs_or_none,
                                     "finish_reason": None,
@@ -1386,6 +1396,8 @@ def logit_bias_processor(
                             "choices": [
                                 {
                                     "text": ts,
+                                    "completion_text": all_text.decode("utf-8", errors="ignore"),
+                                    "completion_tokens": completion_tokens,
                                     "index": 0,
                                     "logprobs": None,
                                     "finish_reason": None,
@@ -1409,17 +1421,17 @@ def logit_bias_processor(
         if stream:
             remaining_tokens = completion_tokens[returned_tokens:]
-            all_text = self.detokenize(
+            remaining_text = self.detokenize(
                 remaining_tokens,
                 prev_tokens=prompt_tokens
                 + completion_tokens[:returned_tokens],
             )
-            any_stop = [s for s in stop_sequences if s in all_text]
+            any_stop = [s for s in stop_sequences if s in remaining_text]
             if len(any_stop) > 0:
-                end = min(all_text.index(stop) for stop in any_stop)
+                end = min(remaining_text.index(stop) for stop in any_stop)
             else:
-                end = len(all_text)
+                end = len(remaining_text)
 
-            token_end_position = 0
+            token_end_position = 0  # there is a problem here:
             for token in remaining_tokens:
                 token_end_position += len(
                     self.detokenize(
@@ -1480,6 +1492,8 @@ def logit_bias_processor(
                                     "text": last_text[
                                         : len(last_text) - (token_end_position - end)
                                     ].decode("utf-8", errors="ignore"),
+                                    "completion_text": text.decode("utf-8", errors="ignore"),
+                                    "completion_tokens": completion_tokens,
                                     "index": 0,
                                     "logprobs": logprobs_or_none,
                                     "finish_reason": None,
@@ -1498,6 +1512,8 @@ def logit_bias_processor(
                                     "text": self.detokenize([token]).decode(
                                         "utf-8", errors="ignore"
                                     ),
+                                    "completion_text": text.decode("utf-8", errors="ignore"),
+                                    "completion_tokens": completion_tokens,
                                     "index": 0,
                                     "logprobs": logprobs_or_none,
                                     "finish_reason": None,
@@ -2039,7 +2055,7 @@ def load_state(self, state: LlamaState) -> None:
         assert self._ctx.ctx is not None
         # Only filling in up to `n_tokens` and then zero-ing out the rest
         self.scores[: state.n_tokens, :] = state.scores.copy()
-        self.scores[state.n_tokens :, :] = 0.0
+        # self.scores[state.n_tokens :, :] = 0.0
         self.input_ids = state.input_ids.copy()
         self.n_tokens = state.n_tokens
         state_size = state.llama_state_size
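
For context, a minimal consumer-side sketch of the extra streaming fields this patch adds ("completion_text" and "completion_tokens"). The model path and prompt below are placeholders, and the extra fields are only present with this patch applied:

from llama_cpp import Llama

# Placeholder model path; any GGUF model works for this sketch.
llm = Llama(model_path="./model.gguf", verbose=True)

completion_tokens = []
for chunk in llm.create_completion("Q: Name the planets. A:", max_tokens=64, stream=True):
    choice = chunk["choices"][0]
    print(choice["text"], end="", flush=True)
    # With this patch each chunk also carries the full completion so far,
    # including EOG / stop tokens, so the caller can append it to the prompt
    # tokens and reuse the prefix-matched kv-cache on the next request.
    completion_tokens = choice.get("completion_tokens", completion_tokens)

print("\ntotal completion tokens:", len(completion_tokens))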