diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py
index 0cb5ca2fc..a792a6482 100644
--- a/llama_cpp/llama.py
+++ b/llama_cpp/llama.py
@@ -778,9 +778,15 @@ def generate(
                     break
             if longest_prefix > 0:
                 if self.verbose:
-                    print("Llama.generate: prefix-match hit", file=sys.stderr)
+                    print(f"Llama.generate: {longest_prefix} prefix-match hit, "
+                          f"{len(tokens) - longest_prefix} prompt tokens remaining to eval", file=sys.stderr)
                 reset = False
                 tokens = tokens[longest_prefix:]
+                if len(tokens) > 0:
+                    print("Maybe a new session", json.dumps(
+                        {"content": self.detokenize(tokens).decode('utf-8'),
+                         "tokens": tokens,
+                         "input_ids": self._input_ids.tolist()}, ensure_ascii=False))
                 self.n_tokens = longest_prefix
 
         # Reset the model state
@@ -1223,15 +1229,17 @@ def logit_bias_processor(
             grammar=grammar,
         ):
             assert self._model.model is not None
+
+            # The returned text should not include the EOG token. The returned tokens,
+            # however, must include EOG and stop_tokens: (1) they are needed for the kv-cache, (2) they make concatenation easier after the result is returned.
+            completion_tokens.append(token)
+
             if llama_cpp.llama_token_is_eog(self._model.model, token):
-                text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
+                text = self.detokenize(completion_tokens[:-1], prev_tokens=prompt_tokens)
                 finish_reason = "stop"
                 break
 
-            completion_tokens.append(token)
-
             all_text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens)
-
             # Contains multi-byte UTF8
             for k, char in enumerate(all_text[-3:]):
                 k = 3 - k
@@ -1344,6 +1352,8 @@ def logit_bias_processor(
                                         prev_tokens=prompt_tokens
                                         + completion_tokens[:returned_tokens],
                                     ).decode("utf-8", errors="ignore"),
+                                    "completion_text": all_text.decode("utf-8", errors="ignore"),
+                                    "completion_tokens": completion_tokens,
                                     "index": 0,
                                     "logprobs": logprobs_or_none,
                                     "finish_reason": None,
@@ -1386,6 +1396,8 @@ def logit_bias_processor(
                             "choices": [
                                 {
                                     "text": ts,
+                                    "completion_text": all_text.decode("utf-8", errors="ignore"),
+                                    "completion_tokens": completion_tokens,
                                     "index": 0,
                                     "logprobs": None,
                                     "finish_reason": None,
@@ -1409,17 +1421,17 @@ def logit_bias_processor(
         if stream:
             remaining_tokens = completion_tokens[returned_tokens:]
-            all_text = self.detokenize(
+            remaining_text = self.detokenize(
                 remaining_tokens,
                 prev_tokens=prompt_tokens
                 + completion_tokens[:returned_tokens],
             )
-            any_stop = [s for s in stop_sequences if s in all_text]
+            any_stop = [s for s in stop_sequences if s in remaining_text]
             if len(any_stop) > 0:
-                end = min(all_text.index(stop) for stop in any_stop)
+                end = min(remaining_text.index(stop) for stop in any_stop)
             else:
-                end = len(all_text)
+                end = len(remaining_text)
 
-            token_end_position = 0
+            token_end_position = 0  # there is a problem here:
             for token in remaining_tokens:
                 token_end_position += len(
                     self.detokenize(
@@ -1480,6 +1492,8 @@ def logit_bias_processor(
                                     "text": last_text[
                                         : len(last_text) - (token_end_position - end)
                                     ].decode("utf-8", errors="ignore"),
+                                    "completion_text": text.decode("utf-8", errors="ignore"),
+                                    "completion_tokens": completion_tokens,
                                     "index": 0,
                                     "logprobs": logprobs_or_none,
                                     "finish_reason": None,
@@ -1498,6 +1512,8 @@ def logit_bias_processor(
                                     "text": self.detokenize([token]).decode(
                                         "utf-8", errors="ignore"
                                     ),
+                                    "completion_text": text.decode("utf-8", errors="ignore"),
+                                    "completion_tokens": completion_tokens,
                                     "index": 0,
                                     "logprobs": logprobs_or_none,
                                     "finish_reason": None,
@@ -2039,7 +2055,7 @@ def load_state(self, state: LlamaState) -> None:
         assert self._ctx.ctx is not None
         # Only filling in up to `n_tokens` and then zero-ing out the rest
         self.scores[: state.n_tokens, :] = state.scores.copy()
-        self.scores[state.n_tokens :, :] = 0.0
+        # self.scores[state.n_tokens :, :] = 0.0
         self.input_ids = state.input_ids.copy()
         self.n_tokens = state.n_tokens
         state_size = state.llama_state_size
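
For context, a minimal consumer-side sketch of the extra streaming fields this patch adds ("completion_text" and "completion_tokens"). The model path and prompt below are placeholders, and the extra fields are only present with this patch applied:

from llama_cpp import Llama

# Placeholder model path; any GGUF model works for this sketch.
llm = Llama(model_path="./model.gguf", verbose=True)

completion_tokens = []
for chunk in llm.create_completion("Q: Name the planets. A:", max_tokens=64, stream=True):
    choice = chunk["choices"][0]
    print(choice["text"], end="", flush=True)
    # With this patch each chunk also carries the full completion so far,
    # including EOG / stop tokens, so the caller can append it to the prompt
    # tokens and reuse the prefix-matched kv-cache on the next request.
    completion_tokens = choice.get("completion_tokens", completion_tokens)

print("\ntotal completion tokens:", len(completion_tokens))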