You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
# Using string instead of tokens to check for antiprompt,
548
+
# It is more reliable than tokens for interactive mode.
549
+
generated_str=""
535
550
whileself.params.interactive:
536
551
self.set_color(util.CONSOLE_COLOR_USER_INPUT)
537
552
if (self.params.instruct):
@@ -546,6 +561,10 @@ def interact(self):
546
561
try:
547
562
foriinself.output():
548
563
print(i,end="",flush=True)
564
+
generated_str+=i
565
+
forapinself.params.antiprompt:
566
+
ifgenerated_str.endswith(ap):
567
+
raiseKeyboardInterrupt
549
568
exceptKeyboardInterrupt:
550
569
self.set_color(util.CONSOLE_COLOR_DEFAULT)
551
570
ifnotself.params.instruct:
@@ -561,7 +580,7 @@ def interact(self):
561
580
time_now=datetime.now()
562
581
prompt=f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}.
563
582
{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}’s requests immediately and with details and precision.
564
-
There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
583
+
Transcript below contains only the recorded dialog between two, without any annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other.
565
584
The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
566
585
The transcript only includes text, it does not include markup like HTML and Markdown.
567
586
@@ -575,8 +594,11 @@ def interact(self):
575
594
{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
123
-
main_gpu: The GPU that is used for scratch and small tensors.
125
+
split_mode: How to split the model across GPUs. See llama_cpp.LLAMA_SPLIT_* for options.
126
+
main_gpu: Interpretation depends on split_mode. LLAMA_SPLIT_NONE: the GPU that is used for the entire model. LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results. LLAMA_SPLIT_LAYER: ignored.
124
127
tensor_split: How split tensors should be distributed across GPUs. If None, the model is not split.
125
128
vocab_only: Only load the vocabulary, not the weights.
126
129
use_mmap: Use mmap if possible.
127
130
use_mlock: Force the system to keep the model in RAM.
131
+
kv_overrides: Key-value overrides for the model.
128
132
seed: RNG seed, -1 for random
129
133
n_ctx: Text context, 0 = from model
130
134
n_batch: Prompt processing maximum batch size
@@ -170,6 +174,7 @@ def __init__(
170
174
self.model_params.n_gpu_layers= (
171
175
0x7FFFFFFFifn_gpu_layers==-1elsen_gpu_layers
172
176
) # 0x7FFFFFFF is INT32 max, will be auto set to all layers
0 commit comments