From 163f3e37e16420bcfbf2dc17dc20874562536977 Mon Sep 17 00:00:00 2001 From: siraj Date: Wed, 5 Jul 2023 09:25:12 -0400 Subject: [PATCH 01/14] Initial Refactoring of llama-cpp-python --- .gitmodules | 3 + Makefile | 2 +- falcon_cpp/__init__.py | 2 + llama_cpp/llama.py => falcon_cpp/falcon.py | 465 ++++---- falcon_cpp/falcon_cpp.py | 1024 +++++++++++++++++ .../falcon_types.py | 0 {llama_cpp => falcon_cpp}/server/__init__.py | 0 {llama_cpp => falcon_cpp}/server/__main__.py | 0 {llama_cpp => falcon_cpp}/server/app.py | 76 +- llama_cpp/__init__.py | 2 - llama_cpp/llama_cpp.py | 1024 ----------------- mkdocs.yml | 4 +- pyproject.toml | 14 +- setup.py | 14 +- tests/{test_llama.py => test_falcon.py} | 86 +- vendor/llama.cpp | 1 - 16 files changed, 1369 insertions(+), 1348 deletions(-) create mode 100644 falcon_cpp/__init__.py rename llama_cpp/llama.py => falcon_cpp/falcon.py (76%) create mode 100644 falcon_cpp/falcon_cpp.py rename llama_cpp/llama_types.py => falcon_cpp/falcon_types.py (100%) rename {llama_cpp => falcon_cpp}/server/__init__.py (100%) rename {llama_cpp => falcon_cpp}/server/__main__.py (100%) rename {llama_cpp => falcon_cpp}/server/app.py (88%) delete mode 100644 llama_cpp/__init__.py delete mode 100644 llama_cpp/llama_cpp.py rename tests/{test_llama.py => test_falcon.py} (56%) delete mode 160000 vendor/llama.cpp diff --git a/.gitmodules b/.gitmodules index 7edf0975d..cdb598717 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "vendor/llama.cpp"] path = vendor/llama.cpp url = https://github.com/ggerganov/llama.cpp.git +[submodule "ggllm.cpp"] + path = ggllm.cpp + url = https://github.com/cmp-nct/ggllm.cpp diff --git a/Makefile b/Makefile index 66d93f3a2..48238f921 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ update: git submodule update --init --recursive update.vendor: - cd vendor/llama.cpp && git pull origin master + cd vendor/ggllm.cpp && git pull origin master build: python3 setup.py develop diff --git a/falcon_cpp/__init__.py b/falcon_cpp/__init__.py new file mode 100644 index 000000000..e7d40876f --- /dev/null +++ b/falcon_cpp/__init__.py @@ -0,0 +1,2 @@ +from .falcon_cpp import * +from .falcon import * diff --git a/llama_cpp/llama.py b/falcon_cpp/falcon.py similarity index 76% rename from llama_cpp/llama.py rename to falcon_cpp/falcon.py index 688b2a74f..40b662f23 100644 --- a/llama_cpp/llama.py +++ b/falcon_cpp/falcon.py @@ -20,15 +20,15 @@ import diskcache -from . import llama_cpp -from .llama_types import * +from . import falcon_cpp +from .falcon_types import * import numpy as np import numpy.typing as npt -class BaseLlamaCache(ABC): - """Base cache class for a llama.cpp model.""" +class BaseFalconCache(ABC): + """Base cache class for a falcon.cpp model.""" def __init__(self, capacity_bytes: int = (2 << 30)): self.capacity_bytes = capacity_bytes @@ -45,7 +45,7 @@ def _find_longest_prefix_key( pass @abstractmethod - def __getitem__(self, key: Sequence[int]) -> "LlamaState": + def __getitem__(self, key: Sequence[int]) -> "FalconState": raise NotImplementedError @abstractmethod @@ -53,21 +53,21 @@ def __contains__(self, key: Sequence[int]) -> bool: raise NotImplementedError @abstractmethod - def __setitem__(self, key: Sequence[int], value: "LlamaState") -> None: + def __setitem__(self, key: Sequence[int], value: "FalconState") -> None: raise NotImplementedError -class LlamaRAMCache(BaseLlamaCache): - """Cache for a llama.cpp model using RAM.""" +class FalconRAMCache(BaseFalconCache): + """Cache for a falcon.cpp model using RAM.""" def __init__(self, capacity_bytes: int = (2 << 30)): super().__init__(capacity_bytes) self.capacity_bytes = capacity_bytes - self.cache_state: OrderedDict[Tuple[int, ...], "LlamaState"] = OrderedDict() + self.cache_state: OrderedDict[Tuple[int, ...], "FalconState"] = OrderedDict() @property def cache_size(self): - return sum([state.llama_state_size for state in self.cache_state.values()]) + return sum([state.falcon_state_size for state in self.cache_state.values()]) def _find_longest_prefix_key( self, @@ -76,7 +76,7 @@ def _find_longest_prefix_key( min_len = 0 min_key = None keys = ( - (k, Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys() + (k, Falcon.longest_token_prefix(k, key)) for k in self.cache_state.keys() ) for k, prefix_len in keys: if prefix_len > min_len: @@ -84,7 +84,7 @@ def _find_longest_prefix_key( min_key = k return min_key - def __getitem__(self, key: Sequence[int]) -> "LlamaState": + def __getitem__(self, key: Sequence[int]) -> "FalconState": key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: @@ -96,7 +96,7 @@ def __getitem__(self, key: Sequence[int]) -> "LlamaState": def __contains__(self, key: Sequence[int]) -> bool: return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[int], value: "LlamaState"): + def __setitem__(self, key: Sequence[int], value: "FalconState"): key = tuple(key) if key in self.cache_state: del self.cache_state[key] @@ -106,14 +106,14 @@ def __setitem__(self, key: Sequence[int], value: "LlamaState"): # Alias for backwards compatibility -LlamaCache = LlamaRAMCache +FalconCache = FalconRAMCache -class LlamaDiskCache(BaseLlamaCache): - """Cache for a llama.cpp model using disk.""" +class FalconDiskCache(BaseFalconCache): + """Cache for a falcon.cpp model using disk.""" def __init__( - self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30) + self, cache_dir: str = ".cache/falcon_cache", capacity_bytes: int = (2 << 30) ): super().__init__(capacity_bytes) self.cache = diskcache.Cache(cache_dir) @@ -129,54 +129,54 @@ def _find_longest_prefix_key( min_len = 0 min_key: Optional[Tuple[int, ...]] = None for k in self.cache.iterkeys(): # type: ignore - prefix_len = Llama.longest_token_prefix(k, key) + prefix_len = Falcon.longest_token_prefix(k, key) if prefix_len > min_len: min_len = prefix_len min_key = k # type: ignore return min_key - def __getitem__(self, key: Sequence[int]) -> "LlamaState": + def __getitem__(self, key: Sequence[int]) -> "FalconState": key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: raise KeyError("Key not found") - value: "LlamaState" = self.cache.pop(_key) # type: ignore + value: "FalconState" = self.cache.pop(_key) # type: ignore # NOTE: This puts an integer as key in cache, which breaks, - # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens + # Falcon.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens # self.cache.push(_key, side="front") # type: ignore return value def __contains__(self, key: Sequence[int]) -> bool: return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[int], value: "LlamaState"): - print("LlamaDiskCache.__setitem__: called", file=sys.stderr) + def __setitem__(self, key: Sequence[int], value: "FalconState"): + print("FalconDiskCache.__setitem__: called", file=sys.stderr) key = tuple(key) if key in self.cache: - print("LlamaDiskCache.__setitem__: delete", file=sys.stderr) + print("FalconDiskCache.__setitem__: delete", file=sys.stderr) del self.cache[key] self.cache[key] = value - print("LlamaDiskCache.__setitem__: set", file=sys.stderr) + print("FalconDiskCache.__setitem__: set", file=sys.stderr) while self.cache_size > self.capacity_bytes and len(self.cache) > 0: key_to_remove = next(iter(self.cache)) del self.cache[key_to_remove] - print("LlamaDiskCache.__setitem__: trim", file=sys.stderr) + print("FalconDiskCache.__setitem__: trim", file=sys.stderr) -class LlamaState: +class FalconState: def __init__( self, input_ids: npt.NDArray[np.intc], scores: npt.NDArray[np.single], n_tokens: int, - llama_state: bytes, - llama_state_size: int, + falcon_state: bytes, + falcon_state_size: int, ): self.input_ids = input_ids self.scores = scores self.n_tokens = n_tokens - self.llama_state = llama_state - self.llama_state_size = llama_state_size + self.falcon_state = falcon_state + self.falcon_state_size = falcon_state_size LogitsProcessor = Callable[[List[int], List[float]], List[float]] @@ -197,8 +197,8 @@ def __call__(self, input_ids: List[int], logits: List[float]) -> bool: return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) -class Llama: - """High-level Python wrapper for a llama.cpp model.""" +class Falcon: + """High-level Python wrapper for a falcon.cpp model.""" def __init__( self, @@ -221,37 +221,56 @@ def __init__( lora_path: Optional[str] = None, low_vram: bool = False, verbose: bool = True, - ): - """Load a llama.cpp model from `model_path`. - - Args: - model_path: Path to the model. - n_ctx: Maximum context size. - n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. - seed: Random seed. -1 for random. - f16_kv: Use half-precision for key/value cache. - logits_all: Return logits for all tokens, not just the last token. - vocab_only: Only load the vocabulary no weights. - use_mmap: Use mmap if possible. - use_mlock: Force the system to keep the model in RAM. - embedding: Embedding mode only. - n_threads: Number of threads to use. If None, the number of threads is automatically determined. - n_batch: Maximum number of prompt tokens to batch together when calling llama_eval. - last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. - lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. - lora_path: Path to a LoRA file to apply to the model. - verbose: Print verbose output to stderr. - - Raises: - ValueError: If the model path does not exist. + ): - Returns: - A Llama instance. - """ + # TODO: Add the parameters for + ''' + -ts SPLIT --tensor-split SPLIT + how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1 + -mg i, --main-gpu i the GPU to use for scratch and small tensors (0 = first) + --override-max-gpu N + limits the number of GPUs visible (allows to disable multi/single GPU processing) + --gpu-reserve-mb-main override reserved total VRAM MB (can be negative if your driver supports swapping into RAM) + --mtest compute maximum memory usage + --export export the computation graph to 'falcon.ggml' + --verbose-prompt print prompt before generation + -dt, --debug-timings print GGML_PERF debug output (requires GGML_PERF=1 for timings) + 1 = print first layer, 2 = print first and last layer, 3+ = all layers + --lora FNAME apply LoRA adapter (implies --no-mmap) + --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter + -m FNAME, --model FNAME + ''' + + """Load a Falcon model from `model_path`. + + Args: + model_path: Path to the model. + n_ctx: Maximum context size. + n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. + seed: Random seed. -1 for random. + f16_kv: Use half-precision for key/value cache. + logits_all: Return logits for all tokens, not just the last token. + vocab_only: Only load the vocabulary no weights. + use_mmap: Use mmap if possible. + use_mlock: Force the system to keep the model in RAM. + embedding: Embedding mode only. + n_threads: Number of threads to use. If None, the number of threads is automatically determined. + n_batch: Maximum number of prompt tokens to batch together when calling falcon_eval. + last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. + lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. + lora_path: Path to a LoRA file to apply to the model. + verbose: Print verbose output to stderr. + + Raises: + ValueError: If the model path does not exist. + + Returns: + A falcon instance. + """ self.verbose = verbose self.model_path = model_path - self.params = llama_cpp.llama_context_default_params() + self.params = falcon_cpp.falcon_context_default_params() self.params.n_ctx = n_ctx self.params.n_gpu_layers = n_gpu_layers self.params.seed = seed @@ -266,7 +285,7 @@ def __init__( self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) - self.cache: Optional[BaseLlamaCache] = None + self.cache: Optional[BaseFalconCache] = None self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) @@ -280,35 +299,35 @@ def __init__( if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") - self.model = llama_cpp.llama_load_model_from_file( + self.model = falcon_cpp.falcon_load_model_from_file( self.model_path.encode("utf-8"), self.params ) assert self.model is not None - self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params) + self.ctx = falcon_cpp.falcon_new_context_with_model(self.model, self.params) assert self.ctx is not None if self.lora_path: - if llama_cpp.llama_model_apply_lora_from_file( - self.model, - llama_cpp.c_char_p(self.lora_path.encode("utf-8")), - llama_cpp.c_char_p(self.lora_base.encode("utf-8")) - if self.lora_base is not None - else llama_cpp.c_char_p(0), - llama_cpp.c_int(self.n_threads), + if falcon_cpp.falcon_model_apply_lora_from_file( + self.model, + falcon_cpp.c_char_p(self.lora_path.encode("utf-8")), + falcon_cpp.c_char_p(self.lora_base.encode("utf-8")) + if self.lora_base is not None + else falcon_cpp.c_char_p(0), + falcon_cpp.c_int(self.n_threads), ): raise RuntimeError( f"Failed to apply LoRA from lora path: {self.lora_path} to base path: {self.lora_base}" ) if self.verbose: - print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + print(falcon_cpp.falcon_print_system_info().decode("utf-8"), file=sys.stderr) self._n_vocab = self.n_vocab() self._n_ctx = self.n_ctx() - size = llama_cpp.c_size_t(self._n_vocab) - sorted = llama_cpp.c_bool(False) + size = falcon_cpp.c_size_t(self._n_vocab) + sorted = falcon_cpp.c_bool(False) self._candidates_data = np.array( [], dtype=np.dtype( @@ -316,14 +335,14 @@ def __init__( ), ) self._candidates_data.resize(3, self._n_vocab, refcheck=False) - candidates = llama_cpp.llama_token_data_array( - data=self._candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p), + candidates = falcon_cpp.falcon_token_data_array( + data=self._candidates_data.ctypes.data_as(falcon_cpp.falcon_token_data_p), size=size, sorted=sorted, ) self._candidates = candidates - self._token_nl = Llama.token_nl() - self._token_eos = Llama.token_eos() + self._token_nl = Falcon.token_nl() + self._token_eos = Falcon.token_eos() self.n_tokens = 0 self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) @@ -364,23 +383,23 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]: """ assert self.ctx is not None n_ctx = self._n_ctx - tokens = (llama_cpp.llama_token * n_ctx)() - n_tokens = llama_cpp.llama_tokenize( + tokens = (falcon_cpp.falcon_token * n_ctx)() + n_tokens = falcon_cpp.falcon_tokenize( self.ctx, text, tokens, - llama_cpp.c_int(n_ctx), - llama_cpp.c_bool(add_bos), + falcon_cpp.c_int(n_ctx), + falcon_cpp.c_bool(add_bos), ) if n_tokens < 0: n_tokens = abs(n_tokens) - tokens = (llama_cpp.llama_token * n_tokens)() - n_tokens = llama_cpp.llama_tokenize( + tokens = (falcon_cpp.falcon_token * n_tokens)() + n_tokens = falcon_cpp.falcon_tokenize( self.ctx, text, tokens, - llama_cpp.c_int(n_tokens), - llama_cpp.c_bool(add_bos), + falcon_cpp.c_int(n_tokens), + falcon_cpp.c_bool(add_bos), ) if n_tokens < 0: raise RuntimeError( @@ -400,12 +419,12 @@ def detokenize(self, tokens: List[int]) -> bytes: assert self.ctx is not None output = b"" for token in tokens: - output += llama_cpp.llama_token_to_str( - self.ctx, llama_cpp.llama_token(token) + output += falcon_cpp.falcon_token_to_str( + self.ctx, falcon_cpp.falcon_token(token) ) return output - def set_cache(self, cache: Optional[BaseLlamaCache]): + def set_cache(self, cache: Optional[BaseFalconCache]): """Set the cache. Args: @@ -429,39 +448,39 @@ def eval(self, tokens: Sequence[int]): batch = tokens[i : min(len(tokens), i + self.n_batch)] n_past = min(n_ctx - len(batch), len(self._input_ids)) n_tokens = len(batch) - return_code = llama_cpp.llama_eval( + return_code = falcon_cpp.falcon_eval( ctx=self.ctx, - tokens=(llama_cpp.llama_token * len(batch))(*batch), - n_tokens=llama_cpp.c_int(n_tokens), - n_past=llama_cpp.c_int(n_past), - n_threads=llama_cpp.c_int(self.n_threads), + tokens=(falcon_cpp.falcon_token * len(batch))(*batch), + n_tokens=falcon_cpp.c_int(n_tokens), + n_past=falcon_cpp.c_int(n_past), + n_threads=falcon_cpp.c_int(self.n_threads), ) if return_code != 0: - raise RuntimeError(f"llama_eval returned {return_code}") + raise RuntimeError(f"falcon_eval returned {return_code}") # Save tokens self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch # Save logits rows = n_tokens if self.params.logits_all else 1 cols = self._n_vocab offset = 0 if self.params.logits_all else n_tokens - 1 # NOTE: Only save the last token logits if logits_all is False - self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[:] = llama_cpp.llama_get_logits(self.ctx)[:rows * cols] + self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[:] = falcon_cpp.falcon_get_logits(self.ctx)[:rows * cols] # Update n_tokens self.n_tokens += n_tokens def _sample( self, - last_n_tokens_data, # type: llama_cpp.Array[llama_cpp.llama_token] - last_n_tokens_size: llama_cpp.c_int, - top_k: llama_cpp.c_int, - top_p: llama_cpp.c_float, - temp: llama_cpp.c_float, - tfs_z: llama_cpp.c_float, - repeat_penalty: llama_cpp.c_float, - frequency_penalty: llama_cpp.c_float, - presence_penalty: llama_cpp.c_float, - mirostat_mode: llama_cpp.c_int, - mirostat_tau: llama_cpp.c_float, - mirostat_eta: llama_cpp.c_float, + last_n_tokens_data, # type: falcon_cpp.Array[falcon_cpp.falcon_token] + last_n_tokens_size: falcon_cpp.c_int, + top_k: falcon_cpp.c_int, + top_p: falcon_cpp.c_float, + temp: falcon_cpp.c_float, + tfs_z: falcon_cpp.c_float, + repeat_penalty: falcon_cpp.c_float, + frequency_penalty: falcon_cpp.c_float, + presence_penalty: falcon_cpp.c_float, + mirostat_mode: falcon_cpp.c_int, + mirostat_tau: falcon_cpp.c_float, + mirostat_eta: falcon_cpp.c_float, penalize_nl: bool = True, logits_processor: Optional[LogitsProcessorList] = None, ): @@ -469,9 +488,9 @@ def _sample( assert self.n_tokens > 0 n_vocab = self._n_vocab n_ctx = self._n_ctx - top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k + top_k = falcon_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k last_n_tokens_size = ( - llama_cpp.c_int(n_ctx) + falcon_cpp.c_int(n_ctx) if last_n_tokens_size.value < 0 else last_n_tokens_size ) @@ -490,94 +509,94 @@ def _sample( candidates_data["id"] = np.arange(n_vocab, dtype=np.intc) # type: ignore candidates_data["logit"] = logits candidates_data["p"] = np.zeros(n_vocab, dtype=np.single) - candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) - candidates.sorted = llama_cpp.c_bool(False) - candidates.size = llama_cpp.c_size_t(n_vocab) - llama_cpp.llama_sample_repetition_penalty( + candidates.data = candidates_data.ctypes.data_as(falcon_cpp.falcon_token_data_p) + candidates.sorted = falcon_cpp.c_bool(False) + candidates.size = falcon_cpp.c_size_t(n_vocab) + falcon_cpp.falcon_sample_repetition_penalty( ctx=self.ctx, last_tokens_data=last_n_tokens_data, last_tokens_size=last_n_tokens_size, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore penalty=repeat_penalty, ) - llama_cpp.llama_sample_frequency_and_presence_penalties( + falcon_cpp.falcon_sample_frequency_and_presence_penalties( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore last_tokens_data=last_n_tokens_data, last_tokens_size=last_n_tokens_size, alpha_frequency=frequency_penalty, alpha_presence=presence_penalty, ) if not penalize_nl: - candidates.data[self._token_nl].logit = llama_cpp.c_float(nl_logit) + candidates.data[self._token_nl].logit = falcon_cpp.c_float(nl_logit) if temp.value == 0.0: - return llama_cpp.llama_sample_token_greedy( + return falcon_cpp.falcon_sample_token_greedy( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore ) elif mirostat_mode.value == 1: - mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) - mirostat_m = llama_cpp.c_int(100) - llama_cpp.llama_sample_temperature( + mirostat_mu = falcon_cpp.c_float(2.0 * mirostat_tau.value) + mirostat_m = falcon_cpp.c_int(100) + falcon_cpp.falcon_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore temp=temp, ) - return llama_cpp.llama_sample_token_mirostat( + return falcon_cpp.falcon_sample_token_mirostat( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore tau=mirostat_tau, eta=mirostat_eta, - mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore + mu=falcon_cpp.ctypes.byref(mirostat_mu), # type: ignore m=mirostat_m, ) elif mirostat_mode.value == 2: - mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) - llama_cpp.llama_sample_temperature( + mirostat_mu = falcon_cpp.c_float(2.0 * mirostat_tau.value) + falcon_cpp.falcon_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=falcon_cpp.ctypes.pointer(candidates), temp=temp, ) - return llama_cpp.llama_sample_token_mirostat_v2( + return falcon_cpp.falcon_sample_token_mirostat_v2( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore tau=mirostat_tau, eta=mirostat_eta, - mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore + mu=falcon_cpp.ctypes.byref(mirostat_mu), # type: ignore ) else: - llama_cpp.llama_sample_top_k( + falcon_cpp.falcon_sample_top_k( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore k=top_k, - min_keep=llama_cpp.c_size_t(1), + min_keep=falcon_cpp.c_size_t(1), ) - llama_cpp.llama_sample_tail_free( + falcon_cpp.falcon_sample_tail_free( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore z=tfs_z, - min_keep=llama_cpp.c_size_t(1), + min_keep=falcon_cpp.c_size_t(1), ) - llama_cpp.llama_sample_typical( + falcon_cpp.falcon_sample_typical( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - p=llama_cpp.c_float(1.0), - min_keep=llama_cpp.c_size_t(1), + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore + p=falcon_cpp.c_float(1.0), + min_keep=falcon_cpp.c_size_t(1), ) - llama_cpp.llama_sample_top_p( + falcon_cpp.falcon_sample_top_p( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore p=top_p, - min_keep=llama_cpp.c_size_t(1), + min_keep=falcon_cpp.c_size_t(1), ) - llama_cpp.llama_sample_temperature( + falcon_cpp.falcon_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore temp=temp, ) - return llama_cpp.llama_sample_token( + return falcon_cpp.falcon_sample_token( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore ) def sample( @@ -607,24 +626,24 @@ def sample( The sampled token. """ assert self.ctx is not None - last_n_tokens_data = [llama_cpp.llama_token(0)] * max( + last_n_tokens_data = [falcon_cpp.falcon_token(0)] * max( 0, self.last_n_tokens_size - len(self._input_ids) ) + self._input_ids[-self.last_n_tokens_size :].tolist() return self._sample( - last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)( + last_n_tokens_data=(falcon_cpp.falcon_token * self.last_n_tokens_size)( *last_n_tokens_data ), - last_n_tokens_size=llama_cpp.c_int(self.last_n_tokens_size), - top_k=llama_cpp.c_int(top_k), - top_p=llama_cpp.c_float(top_p), - temp=llama_cpp.c_float(temp), - tfs_z=llama_cpp.c_float(tfs_z), - repeat_penalty=llama_cpp.c_float(repeat_penalty), - frequency_penalty=llama_cpp.c_float(frequency_penalty), - presence_penalty=llama_cpp.c_float(presence_penalty), - mirostat_mode=llama_cpp.c_int(mirostat_mode), - mirostat_tau=llama_cpp.c_float(mirostat_tau), - mirostat_eta=llama_cpp.c_float(mirostat_eta), + last_n_tokens_size=falcon_cpp.c_int(self.last_n_tokens_size), + top_k=falcon_cpp.c_int(top_k), + top_p=falcon_cpp.c_float(top_p), + temp=falcon_cpp.c_float(temp), + tfs_z=falcon_cpp.c_float(tfs_z), + repeat_penalty=falcon_cpp.c_float(repeat_penalty), + frequency_penalty=falcon_cpp.c_float(frequency_penalty), + presence_penalty=falcon_cpp.c_float(presence_penalty), + mirostat_mode=falcon_cpp.c_int(mirostat_mode), + mirostat_tau=falcon_cpp.c_float(mirostat_tau), + mirostat_eta=falcon_cpp.c_float(mirostat_eta), penalize_nl=penalize_nl, logits_processor=logits_processor, ) @@ -649,10 +668,10 @@ def generate( """Create a generator of tokens from a prompt. Examples: - >>> llama = Llama("models/ggml-7b.bin") - >>> tokens = llama.tokenize(b"Hello, world!") - >>> for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.1): - ... print(llama.detokenize([token])) + >>> falcon = Falcon("models/ggml-7b.bin") + >>> tokens = falcon.tokenize(b"Hello, world!") + >>> for token in falcon.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.1): + ... print(falcon.detokenize([token])) Args: tokens: The prompt tokens. @@ -676,7 +695,7 @@ def generate( break if longest_prefix > 0: if self.verbose: - print("Llama.generate: prefix-match hit", file=sys.stderr) + print("Falcon.generate: prefix-match hit", file=sys.stderr) reset = False tokens = tokens[longest_prefix:] self.n_tokens = longest_prefix @@ -724,11 +743,11 @@ def create_embedding( if self.params.embedding == False: raise RuntimeError( - "Llama model must be created with embedding=True to call this method" + "Falcon model must be created with embedding=True to call this method" ) if self.verbose: - llama_cpp.llama_reset_timings(self.ctx) + falcon_cpp.falcon_reset_timings(self.ctx) if isinstance(input, str): inputs = [input] @@ -743,8 +762,8 @@ def create_embedding( self.eval(tokens) n_tokens = len(tokens) total_tokens += n_tokens - embedding = llama_cpp.llama_get_embeddings(self.ctx)[ - : llama_cpp.llama_n_embd(self.ctx) + embedding = falcon_cpp.falcon_get_embeddings(self.ctx)[ + : falcon_cpp.falcon_n_embd(self.ctx) ] data.append( @@ -755,7 +774,7 @@ def create_embedding( } ) if self.verbose: - llama_cpp.llama_print_timings(self.ctx) + falcon_cpp.falcon_print_timings(self.ctx) return { "object": "list", @@ -806,7 +825,7 @@ def _create_completion( completion_id: str = f"cmpl-{str(uuid.uuid4())}" created: int = int(time.time()) completion_tokens: List[int] = [] - # Add blank space to start of prompt to match OG llama tokenizer + # Add blank space to start of prompt to match OG Falcon tokenizer prompt_tokens: List[int] = self.tokenize(b" " + prompt.encode("utf-8")) text: bytes = b"" returned_tokens: int = 0 @@ -816,7 +835,7 @@ def _create_completion( model_name: str = model if model is not None else self.model_path if self.verbose: - llama_cpp.llama_reset_timings(self.ctx) + falcon_cpp.falcon_reset_timings(self.ctx) if len(prompt_tokens) > self._n_ctx: raise ValueError( @@ -843,19 +862,19 @@ def _create_completion( if self.cache: try: cache_item = self.cache[prompt_tokens] - cache_prefix_len = Llama.longest_token_prefix( + cache_prefix_len = Falcon.longest_token_prefix( cache_item.input_ids.tolist(), prompt_tokens ) - eval_prefix_len = Llama.longest_token_prefix( + eval_prefix_len = Falcon.longest_token_prefix( self._input_ids.tolist(), prompt_tokens ) if cache_prefix_len > eval_prefix_len: self.load_state(cache_item) if self.verbose: - print("Llama._create_completion: cache hit", file=sys.stderr) + print("Falcon._create_completion: cache hit", file=sys.stderr) except KeyError: if self.verbose: - print("Llama._create_completion: cache miss", file=sys.stderr) + print("Falcon._create_completion: cache miss", file=sys.stderr) finish_reason = "length" multibyte_fix = 0 @@ -937,7 +956,7 @@ def _create_completion( ) token_offset = len(prompt_tokens) + returned_tokens logits = self._scores[token_offset - 1, :].tolist() - current_logprobs = Llama.logits_to_logprobs(logits) + current_logprobs = Falcon.logits_to_logprobs(logits) sorted_logprobs = list( sorted( zip(current_logprobs, range(len(current_logprobs))), @@ -991,7 +1010,7 @@ def _create_completion( finish_reason = "stop" if self.verbose: - llama_cpp.llama_print_timings(self.ctx) + falcon_cpp.falcon_print_timings(self.ctx) if stream: remaining_tokens = completion_tokens[returned_tokens:] @@ -1016,7 +1035,7 @@ def _create_completion( ) token_offset = len(prompt_tokens) + returned_tokens - 1 logits = self._scores[token_offset, :].tolist() - current_logprobs = Llama.logits_to_logprobs(logits) + current_logprobs = Falcon.logits_to_logprobs(logits) sorted_logprobs = list( sorted( zip(current_logprobs, range(len(current_logprobs))), @@ -1080,14 +1099,14 @@ def _create_completion( } if self.cache: if self.verbose: - print("Llama._create_completion: cache save", file=sys.stderr) + print("Falcon._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() - print("Llama._create_completion: cache saved", file=sys.stderr) + print("Falcon._create_completion: cache saved", file=sys.stderr) return if self.cache: if self.verbose: - print("Llama._create_completion: cache save", file=sys.stderr) + print("Falcon._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() text_str = text.decode("utf-8", errors="ignore") @@ -1118,7 +1137,7 @@ def _create_completion( for token in all_tokens ] all_logprobs = [ - Llama.logits_to_logprobs(row.tolist()) for row in self._scores + Falcon.logits_to_logprobs(row.tolist()) for row in self._scores ][token_offset:] for token, token_str, logprobs_token in zip( all_tokens, all_token_strs, all_logprobs @@ -1440,10 +1459,10 @@ def create_chat_completion( def __del__(self): if self.model is not None: - llama_cpp.llama_free_model(self.model) + falcon_cpp.falcon_free_model(self.model) self.model = None if self.ctx is not None: - llama_cpp.llama_free(self.ctx) + falcon_cpp.falcon_free(self.ctx) self.ctx = None def __getstate__(self): @@ -1492,82 +1511,82 @@ def __setstate__(self, state): verbose=state["verbose"], ) - def save_state(self) -> LlamaState: + def save_state(self) -> FalconState: assert self.ctx is not None if self.verbose: - print("Llama.save_state: saving llama state", file=sys.stderr) - state_size = llama_cpp.llama_get_state_size(self.ctx) + print("Falcon.save_state: saving falcon state", file=sys.stderr) + state_size = falcon_cpp.falcon_get_state_size(self.ctx) if self.verbose: - print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr) - llama_state = (llama_cpp.c_uint8 * int(state_size))() + print(f"Falcon.save_state: got state size: {state_size}", file=sys.stderr) + falcon_state = (falcon_cpp.c_uint8 * int(state_size))() if self.verbose: - print("Llama.save_state: allocated state", file=sys.stderr) - n_bytes = llama_cpp.llama_copy_state_data(self.ctx, llama_state) + print("Falcon.save_state: allocated state", file=sys.stderr) + n_bytes = falcon_cpp.falcon_copy_state_data(self.ctx, falcon_state) if self.verbose: - print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr) + print(f"Falcon.save_state: copied falcon state: {n_bytes}", file=sys.stderr) if int(n_bytes) > int(state_size): - raise RuntimeError("Failed to copy llama state data") - llama_state_compact = (llama_cpp.c_uint8 * int(n_bytes))() - llama_cpp.ctypes.memmove(llama_state_compact, llama_state, int(n_bytes)) + raise RuntimeError("Failed to copy Falcon state data") + falcon_state_compact = (falcon_cpp.c_uint8 * int(n_bytes))() + falcon_cpp.ctypes.memmove(falcon_state_compact, falcon_state, int(n_bytes)) if self.verbose: print( - f"Llama.save_state: saving {n_bytes} bytes of llama state", + f"Falcon.save_state: saving {n_bytes} bytes of falcon state", file=sys.stderr, ) - return LlamaState( + return FalconState( scores=self.scores.copy(), input_ids=self.input_ids.copy(), n_tokens=self.n_tokens, - llama_state=bytes(llama_state_compact), - llama_state_size=n_bytes, + falcon_state=bytes(falcon_state_compact), + falcon_state_size=n_bytes, ) - def load_state(self, state: LlamaState) -> None: + def load_state(self, state: FalconState) -> None: assert self.ctx is not None self.scores = state.scores.copy() self.input_ids = state.input_ids.copy() self.n_tokens = state.n_tokens - state_size = state.llama_state_size - LLamaStateArrayType = llama_cpp.c_uint8 * state_size - llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state) + state_size = state.falcon_state_size + FalconStateArrayType = falcon_cpp.c_uint8 * state_size + falcon_state = FalconStateArrayType.from_buffer_copy(state.falcon_state) - if llama_cpp.llama_set_state_data(self.ctx, llama_state) != state_size: - raise RuntimeError("Failed to set llama state data") + if falcon_cpp.falcon_set_state_data(self.ctx, falcon_state) != state_size: + raise RuntimeError("Failed to set Falcon state data") def n_ctx(self) -> int: """Return the context window size.""" assert self.ctx is not None - return llama_cpp.llama_n_ctx(self.ctx) + return falcon_cpp.falcon_n_ctx(self.ctx) def n_embd(self) -> int: """Return the embedding size.""" assert self.ctx is not None - return llama_cpp.llama_n_embd(self.ctx) + return falcon_cpp.falcon_n_embd(self.ctx) def n_vocab(self) -> int: """Return the vocabulary size.""" assert self.ctx is not None - return llama_cpp.llama_n_vocab(self.ctx) + return falcon_cpp.falcon_n_vocab(self.ctx) - def tokenizer(self) -> "LlamaTokenizer": + def tokenizer(self) -> "FalconTokenizer": """Return the tokenizer for this model.""" assert self.ctx is not None - return LlamaTokenizer(self) + return FalconTokenizer(self) @staticmethod def token_eos() -> int: """Return the end-of-sequence token.""" - return llama_cpp.llama_token_eos() + return falcon_cpp.falcon_token_eos() @staticmethod def token_bos() -> int: """Return the beginning-of-sequence token.""" - return llama_cpp.llama_token_bos() + return falcon_cpp.falcon_token_bos() @staticmethod def token_nl() -> int: """Return the newline token.""" - return llama_cpp.llama_token_nl() + return falcon_cpp.falcon_token_nl() @staticmethod def logits_to_logprobs(logits: List[float]) -> List[float]: @@ -1586,18 +1605,18 @@ def longest_token_prefix(a: Sequence[int], b: Sequence[int]): return longest_prefix -class LlamaTokenizer: - def __init__(self, llama: Llama): - self.llama = llama +class FalconTokenizer: + def __init__(self, falcon: Falcon): + self.falcon = falcon def encode(self, text: str, add_bos: bool = True) -> List[int]: - return self.llama.tokenize( + return self.falcon.tokenize( text.encode("utf-8", errors="ignore"), add_bos=add_bos ) def decode(self, tokens: List[int]) -> str: - return self.llama.detokenize(tokens).decode("utf-8", errors="ignore") + return self.falcon.detokenize(tokens).decode("utf-8", errors="ignore") @classmethod - def from_ggml_file(cls, path: str) -> "LlamaTokenizer": - return cls(Llama(model_path=path, vocab_only=True)) + def from_ggml_file(cls, path: str) -> "FalconTokenizer": + return cls(Falcon(model_path=path, vocab_only=True)) diff --git a/falcon_cpp/falcon_cpp.py b/falcon_cpp/falcon_cpp.py new file mode 100644 index 000000000..d0c0455e6 --- /dev/null +++ b/falcon_cpp/falcon_cpp.py @@ -0,0 +1,1024 @@ +import sys +import os +import ctypes +from ctypes import ( + c_int, + c_float, + c_char_p, + c_void_p, + c_bool, + POINTER, + _Pointer, # type: ignore + Structure, + Array, + c_uint8, + c_size_t, +) +import pathlib +from typing import List, Union + + +# Load the library +def _load_shared_library(lib_base_name: str): + # Construct the paths to the possible shared library names + _base_path = pathlib.Path(__file__).parent.resolve() + # Searching for the library in the current directory under the name "libFalcon" (default name + # for falconcpp) and "falcon" (default name for this repo) + _lib_paths: List[pathlib.Path] = [] + # Determine the file extension based on the platform + if sys.platform.startswith("linux"): + _lib_paths += [ + _base_path / f"lib{lib_base_name}.so", + ] + elif sys.platform == "darwin": + _lib_paths += [ + _base_path / f"lib{lib_base_name}.so", + _base_path / f"lib{lib_base_name}.dylib", + ] + elif sys.platform == "win32": + _lib_paths += [ + _base_path / f"{lib_base_name}.dll", + ] + else: + raise RuntimeError("Unsupported platform") + + if "FALCON_CPP_LIB" in os.environ: + lib_base_name = os.environ["FALCON_CPP_LIB"] + _lib = pathlib.Path(lib_base_name) + _base_path = _lib.parent.resolve() + _lib_paths = [_lib.resolve()] + + cdll_args = dict() # type: ignore + # Add the library directory to the DLL search path on Windows (if needed) + if sys.platform == "win32" and sys.version_info >= (3, 8): + os.add_dll_directory(str(_base_path)) + if "CUDA_PATH" in os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) + cdll_args["winmode"] = 0 + + # Try to load the shared library, handling potential errors + for _lib_path in _lib_paths: + if _lib_path.exists(): + try: + return ctypes.CDLL(str(_lib_path), **cdll_args) + except Exception as e: + raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") + + raise FileNotFoundError( + f"Shared library with base name '{lib_base_name}' not found" + ) + + +# Specify the base name of the shared library to load +_lib_base_name = "ggllm" + +# Load the library +_lib = _load_shared_library(_lib_base_name) + +# Misc +c_float_p = POINTER(c_float) +c_uint8_p = POINTER(c_uint8) +c_size_t_p = POINTER(c_size_t) + +# falcon.h bindings + +GGML_USE_CUBLAS = hasattr(_lib, "ggml_init_cublas") +GGML_CUDA_MAX_DEVICES = ctypes.c_int(16) +FALCON_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else ctypes.c_int(1) + +# #define FALCON_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' +FALCON_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74) +# #define FALCON_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' +FALCON_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61) +# #define FALCON_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' +FALCON_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66) +# #define FLACON_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' +FALCON_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C) +# #define FALCON_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' +FALCON_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E) + +# #define FALCON_FILE_VERSION 3 +FALCON_FILE_VERSION = c_int(3) +FALCON_FILE_MAGIC = FALCON_FILE_MAGIC_GGJT +FALCON_FILE_MAGIC_UNVERSIONED = FALCON_FILE_MAGIC_GGML +FALCON_SESSION_MAGIC = FALCON_FILE_MAGIC_GGSN +FALCON_SESSION_VERSION = c_int(1) + +# struct falcon_model; +falcon_model_p = c_void_p + +# struct falcon_context; +falcon_context_p = c_void_p + + +# typedef int falcon_token; +falcon_token = c_int +falcon_token_p = POINTER(falcon_token) + + +# typedef struct falcon_token_data { +# falcon_token id; // token id +# float logit; // log-odds of the token +# float p; // probability of the token +# } falcon_token_data; +class falcon_token_data(Structure): + _fields_ = [ + ("id", falcon_token), + ("logit", c_float), + ("p", c_float), + ] + + +falcon_token_data_p = POINTER(falcon_token_data) + + +# typedef struct falcon_token_data_array { +# falcon_token_data * data; +# size_t size; +# bool sorted; +# } falcon_token_data_array; +class falcon_token_data_array(Structure): + _fields_ = [ + ("data", falcon_token_data_p), + ("size", c_size_t), + ("sorted", c_bool), + ] + + +falcon_token_data_array_p = POINTER(falcon_token_data_array) + +# typedef void (*falcon_progress_callback)(float progress, void *ctx); +falcon_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) + + +# struct falcon_context_params { +# int seed; // RNG seed, -1 for random +# int n_ctx; // text context +# int n_batch; // prompt processing batch size +# int n_gpu_layers; // number of layers to store in VRAM +# int main_gpu; // the GPU that is used for scratch and small tensors +# float tensor_split[FALCON_MAX_DEVICES]; // how to split layers across multiple GPUs +# // called with a progress value between 0 and 1, pass NULL to disable +# falcon_progress_callback progress_callback; +# // context pointer passed to the progress callback +# void * progress_callback_user_data; + + +# // Keep the booleans together to avoid misalignment during copy-by-value. +# bool low_vram; // if true, reduce VRAM usage at the cost of performance +# bool f16_kv; // use fp16 for KV cache +# bool logits_all; // the falcon_eval() call computes all logits, not just the last one +# bool vocab_only; // only load the vocabulary, no weights +# bool use_mmap; // use mmap if possible +# bool use_mlock; // force system to keep model in RAM +# bool embedding; // embedding mode only +# }; +class ggllm_context_params(Structure): + _fields_ = [ + ("seed", c_int), + ("n_ctx", c_int), + ("n_batch", c_int), + ("n_gpu_layers", c_int), + ("main_gpu", c_int), + ("tensor_split", c_float * FALCON_MAX_DEVICES.value), + ("progress_callback", falcon_progress_callback), + ("progress_callback_user_data", c_void_p), + ("low_vram", c_bool), + ("f16_kv", c_bool), + ("logits_all", c_bool), + ("vocab_only", c_bool), + ("use_mmap", c_bool), + ("use_mlock", c_bool), + ("embedding", c_bool), + ] + + +falcon_context_params_p = POINTER(ggllm_context_params) + +# enum falcon_ftype { +# FALCON_FTYPE_ALL_F32 = 0, +# FALCON_FTYPE_MOSTLY_F16 = 1, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 +# // FALCON_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed +# // FALCON_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed +# FALCON_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors +# }; +FALCON_FTYPE_ALL_F32 = c_int(0) +FALCON_FTYPE_MOSTLY_F16 = c_int(1) +FALCON_FTYPE_MOSTLY_Q4_0 = c_int(2) +FALCON_FTYPE_MOSTLY_Q4_1 = c_int(3) +FALCON_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4) +FALCON_FTYPE_MOSTLY_Q8_0 = c_int(7) +FALCON_FTYPE_MOSTLY_Q5_0 = c_int(8) +FALCON_FTYPE_MOSTLY_Q5_1 = c_int(9) +FALCON_FTYPE_MOSTLY_Q2_K = c_int(10) +FALCON_FTYPE_MOSTLY_Q3_K_S = c_int(11) +FALCON_FTYPE_MOSTLY_Q3_K_M = c_int(12) +FALCON_FTYPE_MOSTLY_Q3_K_L = c_int(13) +FALCON_FTYPE_MOSTLY_Q4_K_S = c_int(14) +FALCON_FTYPE_MOSTLY_Q4_K_M = c_int(15) +FALCON_FTYPE_MOSTLY_Q5_K_S = c_int(16) +FALCON_FTYPE_MOSTLY_Q5_K_M = c_int(17) +FALCON_FTYPE_MOSTLY_Q6_K = c_int(18) + + +# // model quantization parameters +# typedef struct falcon_model_quantize_params { +# int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() +# enum falcon_ftype ftype; // quantize to this falcon_ftype +# bool allow_requantize; // allow quantizing non-f32/f16 tensors +# bool quantize_output_tensor; // quantize output.weight +# } falcon_model_quantize_params; +class falcon_model_quantize_params(Structure): + _fields_ = [ + ("nthread", c_int), + ("ftype", c_int), + ("allow_requantize", c_bool), + ("quantize_output_tensor", c_bool), + ] + + +# FALCON_API struct falcon_context_params falcon_context_default_params(); +def falcon_context_default_params() -> ggllm_context_params: + return _lib.ggllm_context_default_params() + + +_lib.ggllm_context_default_params.argtypes = [] +_lib.ggllm_context_default_params.restype = ggllm_context_params + + +# FALCON_API struct falcon_model_quantize_params falcon_model_quantize_default_params(); +def falcon_model_quantize_default_params() -> falcon_model_quantize_params: + return _lib.ggllm_model_quantize_default_params() + + +_lib.ggllm_model_quantize_default_params.argtypes = [] +_lib.ggllm_model_quantize_default_params.restype = falcon_model_quantize_params + + +# FALCON_API bool falcon_mmap_supported(); +def falcon_mmap_supported() -> bool: + return _lib.ggllm_mmap_supported() + + +_lib.ggllm_mmap_supported.argtypes = [] +_lib.ggllm_mmap_supported.restype = c_bool + + +# FALCON_API bool falcon_mlock_supported(); +def falcon_mlock_supported() -> bool: + return _lib.ggllm_mlock_supported() + + +_lib.ggllm_mlock_supported.argtypes = [] +_lib.ggllm_mlock_supported.restype = c_bool + + +# // TODO: not great API - very likely to change +# // Initialize the falcon + ggml backend +# // If numa is true, use NUMA optimizations +# // Call once at the start of the program +# FLACON_API void falcon_init_backend(bool numa); +def falcon_init_backend(numa: c_bool): + return _lib.ggllm_init_backend(numa) + + +_lib.ggllm_init_backend.argtypes = [c_bool] +_lib.ggllm_init_backend.restype = None + + +# FALCON_API struct falcon_model * falcon_load_model_from_file( +# const char * path_model, +# struct falcon_context_params params); +def falcon_load_model_from_file( + path_model: bytes, params: ggllm_context_params +) -> falcon_model_p: + return _lib.ggllm_load_model_from_file(path_model, params) + + +_lib.ggllm_load_model_from_file.argtypes = [c_char_p, ggllm_context_params] +_lib.ggllm_load_model_from_file.restype = falcon_model_p + + +# FALCON_API void falcon_free_model(struct falcon_model * model); +def falcon_free_model(model: falcon_model_p): + return _lib.ggllm_free_model(model) + + +_lib.ggllm_free_model.argtypes = [falcon_model_p] +_lib.ggllm_free_model.restype = None + + +# FALCON_API struct falcon_context * falcon_new_context_with_model( +# struct falcon_model * model, +# struct falcon_context_params params); +def falcon_new_context_with_model( + model: falcon_model_p, params: ggllm_context_params +) -> falcon_context_p: + return _lib.ggllm_new_context_with_model(model, params) + + +_lib.ggllm_new_context_with_model.argtypes = [falcon_model_p, ggllm_context_params] +_lib.ggllm_new_context_with_model.restype = falcon_context_p + + +# FALCON_API int64_t ggllm_time_us(); +def ggllm_time_us() -> int: + return _lib.ggllm_time_us() + + +_lib.ggllm_time_us.argtypes = [] +_lib.ggllm_time_us.restype = ctypes.c_int64 + + +# // Various functions for loading a ggml falcon model. +# // Allocate (almost) all memory needed for the model. +# // Return NULL on failure +# FALCON_API struct falcon_context * falcon_init_from_file( +# const char * path_model, +# struct falcon_context_params params); +def ggllm_init_from_file( + path_model: bytes, params: ggllm_context_params +) -> falcon_context_p: + return _lib.ggllm_init_from_file(path_model, params) + + +_lib.ggllm_init_from_file.argtypes = [c_char_p, ggllm_context_params] +_lib.ggllm_init_from_file.restype = falcon_context_p + + +# Frees all allocated memory +# FALCON_API void falcon_free(struct falcon_context * ctx); +def falcon_free(ctx: falcon_context_p): + return _lib.ggllm_free(ctx) + + +_lib.ggllm_free.argtypes = [falcon_context_p] +_lib.ggllm_free.restype = None + + +# // Returns 0 on success +# FALCON_API int ggllm_model_quantize( +# const char * fname_inp, +# const char * fname_out, +# const falcon_model_quantize_params * params); +def ggllm_model_quantize( + fname_inp: bytes, + fname_out: bytes, + params, # type: POINTER(falcon_model_quantize_params) # type: ignore +) -> int: + return _lib.ggllm_model_quantize(fname_inp, fname_out, params) + + +_lib.ggllm_model_quantize.argtypes = [ + c_char_p, + c_char_p, + POINTER(falcon_model_quantize_params), +] +_lib.ggllm_model_quantize.restype = c_int + + +# Apply a LoRA adapter to a loaded model +# path_base_model is the path to a higher quality model to use as a base for +# the layers modified by the adapter. Can be NULL to use the current loaded model. +# The model needs to be reloaded before applying a new adapter, otherwise the adapter +# will be applied on top of the previous one +# Returns 0 on success +# FALCON_API int falcon_apply_lora_from_file( +# struct falcon_context * ctx, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); +def ggllm_apply_lora_from_file( + ctx: falcon_context_p, + path_lora: c_char_p, + path_base_model: c_char_p, + n_threads: c_int, +) -> int: + return _lib.ggllm_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) + + +_lib.ggllm_apply_lora_from_file.argtypes = [falcon_context_p, c_char_p, c_char_p, c_int] +_lib.ggllm_apply_lora_from_file.restype = c_int + + +# FALCON_API int ggllm_model_apply_lora_from_file( +# const struct ggllm_model * model, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); +def falcon_model_apply_lora_from_file( + model: falcon_model_p, + path_lora: Union[c_char_p, bytes], + path_base_model: Union[c_char_p, bytes], + n_threads: c_int, +) -> int: + return _lib.ggllm_model_apply_lora_from_file( + model, path_lora, path_base_model, n_threads + ) + + +_lib.ggllm_model_apply_lora_from_file.argtypes = [ + falcon_model_p, + c_char_p, + c_char_p, + c_int, +] +_lib.ggllm_model_apply_lora_from_file.restype = c_int + + +# Returns the number of tokens in the KV cache +# FALCON_API int falcon_get_kv_cache_token_count(const struct falcon_context * ctx); +def ggllm_get_kv_cache_token_count(ctx: falcon_context_p) -> int: + return _lib.ggllm_get_kv_cache_token_count(ctx) + + +_lib.ggllm_get_kv_cache_token_count.argtypes = [falcon_context_p] +_lib.ggllm_get_kv_cache_token_count.restype = c_int + + +# Sets the current rng seed. +# FALCON_API void falcon_set_rng_seed(struct falcon_context * ctx, int seed); +def falcon_set_rng_seed(ctx: falcon_context_p, seed: c_int): + return _lib.ggllm_set_rng_seed(ctx, seed) + + +_lib.ggllm_set_rng_seed.argtypes = [falcon_context_p, c_int] +_lib.ggllm_set_rng_seed.restype = None + + +# Returns the maximum size in bytes of the state (rng, logits, embedding +# and kv_cache) - will often be smaller after compacting tokens +# FALCON_API size_t falcon_get_state_size(const struct falcon_context * ctx); +def falcon_get_state_size(ctx: falcon_context_p) -> int: + return _lib.ggllm_get_state_size(ctx) + + +_lib.ggllm_get_state_size.argtypes = [falcon_context_p] +_lib.ggllm_get_state_size.restype = c_size_t + + +# Copies the state to the specified destination address. +# Destination needs to have allocated enough memory. +# Returns the number of bytes copied +# FALCON_API size_t falcon_copy_state_data(struct falcon_context * ctx, uint8_t * dst); +def falcon_copy_state_data( + ctx: falcon_context_p, dst # type: Array[c_uint8] +) -> int: + return _lib.ggllm_copy_state_data(ctx, dst) + + +_lib.ggllm_copy_state_data.argtypes = [falcon_context_p, c_uint8_p] +_lib.ggllm_copy_state_data.restype = c_size_t + + +# Set the state reading from the specified address +# Returns the number of bytes read +# FALCON_API size_t falcon_set_state_data(struct falcon_context * ctx, uint8_t * src); +def falcon_set_state_data( + ctx: falcon_context_p, src # type: Array[c_uint8] +) -> int: + return _lib.ggllm_set_state_data(ctx, src) + + +_lib.ggllm_set_state_data.argtypes = [falcon_context_p, c_uint8_p] +_lib.ggllm_set_state_data.restype = c_size_t + + +# Save/load session file +# GGLLM_API bool falcon_load_session_file(struct falcon_context * ctx, const char * path_session, falcon_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); +def ggllm_load_session_file( + ctx: falcon_context_p, + path_session: bytes, + tokens_out, # type: Array[falcon_token] + n_token_capacity: c_size_t, + n_token_count_out, # type: _Pointer[c_size_t] +) -> int: + return _lib.ggllm_load_session_file( + ctx, path_session, tokens_out, n_token_capacity, n_token_count_out + ) + + +_lib.ggllm_load_session_file.argtypes = [ + falcon_context_p, + c_char_p, + falcon_token_p, + c_size_t, + c_size_t_p, +] +_lib.ggllm_load_session_file.restype = c_size_t + + +# FALCON_API bool falcon_save_session_file(struct falcon_context * ctx, const char * path_session, const falcon_token * tokens, size_t n_token_count); +def ggllm_save_session_file( + ctx: falcon_context_p, + path_session: bytes, + tokens, # type: Array[falcon_token] + n_token_count: c_size_t, +) -> int: + return _lib.ggllm_save_session_file(ctx, path_session, tokens, n_token_count) + + +_lib.ggllm_save_session_file.argtypes = [ + falcon_context_p, + c_char_p, + falcon_token_p, + c_size_t, +] +_lib.ggllm_save_session_file.restype = c_size_t + + +# Run the falcon inference to obtain the logits and probabilities for the next token. +# tokens + n_tokens is the provided batch of new tokens to process +# n_past is the number of tokens to use from previous eval calls +# Returns 0 on success +# GGLLM_API int falcon_eval( +# struct falcon_context * ctx, +# const falcon_token * tokens, +# int n_tokens, +# int n_past, +# int n_threads); +def falcon_eval( + ctx: falcon_context_p, + tokens, # type: Array[falcon_token] + n_tokens: c_int, + n_past: c_int, + n_threads: c_int, +) -> int: + return _lib.ggllm_eval(ctx, tokens, n_tokens, n_past, n_threads) + + +_lib.ggllm_eval.argtypes = [falcon_context_p, falcon_token_p, c_int, c_int, c_int] +_lib.ggllm_eval.restype = c_int + + +# // Same as falcon_eval, but use float matrix input directly. +# FALCON_API int falcon_eval_embd( +# struct falcon_context * ctx, +# const float * embd, +# int n_tokens, +# int n_past, +# int n_threads); +def ggllm_eval_embd( + ctx: falcon_context_p, + embd, # type: Array[c_float] + n_tokens: c_int, + n_past: c_int, + n_threads: c_int, +) -> int: + return _lib.ggllm_eval_embd(ctx, embd, n_tokens, n_past, n_threads) + + +_lib.ggllm_eval_embd.argtypes = [falcon_context_p, c_float_p, c_int, c_int, c_int] +_lib.ggllm_eval_embd.restype = c_int + + +# Convert the provided text into tokens. +# The tokens pointer must be large enough to hold the resulting tokens. +# Returns the number of tokens on success, no more than n_max_tokens +# Returns a negative number on failure - the number of tokens that would have been returned +# TODO: not sure if correct +# FALCON_API int ggllm_tokenize( +# struct falcon_context * ctx, +# const char * text, +# falcon_token * tokens, +# int n_max_tokens, +# bool add_bos); +def falcon_tokenize( + ctx: falcon_context_p, + text: bytes, + tokens, # type: Array[falcon_token] + n_max_tokens: c_int, + add_bos: c_bool, +) -> int: + return _lib.ggllm_tokenize(ctx, text, tokens, n_max_tokens, add_bos) + + +_lib.ggllm_tokenize.argtypes = [falcon_context_p, c_char_p, falcon_token_p, c_int, c_bool] +_lib.ggllm_tokenize.restype = c_int + + +# GGLLM_API int ggllm_n_vocab(const struct falcon_context * ctx); +def falcon_n_vocab(ctx: falcon_context_p) -> int: + return _lib.ggllm_n_vocab(ctx) + + +_lib.ggllm_n_vocab.argtypes = [falcon_context_p] +_lib.ggllm_n_vocab.restype = c_int + + +# FALCON_API int falcon_n_ctx (const struct falcon_context * ctx); +def falcon_n_ctx(ctx: falcon_context_p) -> int: + return _lib.ggllm_n_ctx(ctx) + + +_lib.ggllm_n_ctx.argtypes = [falcon_context_p] +_lib.ggllm_n_ctx.restype = c_int + + +# FALCON_API int falcon_n_embd (const struct falcon_context * ctx); +def falcon_n_embd(ctx: falcon_context_p) -> int: + return _lib.ggllm_n_embd(ctx) + + +_lib.ggllm_n_embd.argtypes = [falcon_context_p] +_lib.ggllm_n_embd.restype = c_int + + +# // Get the vocabulary as output parameters. +# // Returns number of results. +# FALCON_API int falcon_get_vocab( +# const struct falcon_context * ctx, +# const char * * strings, +# float * scores, +# int capacity); +def falcon_get_vocab( + ctx: falcon_context_p, + strings, # type: Array[c_char_p] # type: ignore + scores, # type: Array[c_float] # type: ignore + capacity: c_int, +) -> int: + return _lib.ggllm_get_vocab(ctx, strings, scores, capacity) + + +_lib.ggllm_get_vocab.argtypes = [falcon_context_p, c_char_p, c_float, c_int] +_lib.ggllm_get_vocab.restype = c_int + + +# Token logits obtained from the last call to falcon_eval() +# The logits for the last token are stored in the last row +# Can be mutated in order to change the probabilities of the next token +# Rows: n_tokens +# Cols: n_vocab +# FALCON_API float * falcon_get_logits(struct falcon_context * ctx); +def falcon_get_logits( + ctx: falcon_context_p, +): # type: (...) -> Array[float] # type: ignore + return _lib.ggllm_get_logits(ctx) + + +_lib.ggllm_get_logits.argtypes = [falcon_context_p] +_lib.ggllm_get_logits.restype = c_float_p + + +# Get the embeddings for the input +# shape: [n_embd] (1-dimensional) +# FALCON_API float * falcon_get_embeddings(struct falcon_context * ctx); +def falcon_get_embeddings( + ctx: falcon_context_p, +): # type: (...) -> Array[float] # type: ignore + return _lib.ggllm_get_embeddings(ctx) + + +_lib.ggllm_get_embeddings.argtypes = [falcon_context_p] +_lib.ggllm_get_embeddings.restype = c_float_p + + +# Token Id -> String. Uses the vocabulary in the provided context +# FLACON_API const char * falcon_token_to_str(const struct falcon_context * ctx, falcon_token token); +def falcon_token_to_str(ctx: falcon_context_p, token: falcon_token) -> bytes: + return _lib.ggllm_token_to_str(ctx, token) + + +_lib.ggllm_token_to_str.argtypes = [falcon_context_p, falcon_token] +_lib.ggllm_token_to_str.restype = c_char_p + +# Special tokens + + +# FALCON_API falcon_token falcon_token_bos(); // beginning-of-sentence +def falcon_token_bos() -> int: + return _lib.ggllm_token_bos() + + +_lib.ggllm_token_bos.argtypes = [] +_lib.ggllm_token_bos.restype = falcon_token + + +# FALCON_API falcon_token falcon_token_eos(); // end-of-sentence +def falcon_token_eos() -> int: + return _lib.ggllm_token_eos() + + +_lib.ggllm_token_eos.argtypes = [] +_lib.ggllm_token_eos.restype = falcon_token + + +# FALCON_API falcon_token falcon_token_nl(); // next-line +def falcon_token_nl() -> int: + return _lib.ggllm_token_nl() + + +_lib.ggllm_token_nl.argtypes = [] +_lib.ggllm_token_nl.restype = falcon_token + + +# Sampling functions + + +# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. +# FALCON_API void falcon_sample_repetition_penalty(struct falcon_context * ctx, falcon_token_data_array * candidates, const falcon_token * last_tokens, size_t last_tokens_size, float penalty); +def falcon_sample_repetition_penalty( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + last_tokens_data, # type: Array[falcon_token] + last_tokens_size: c_int, + penalty: c_float, +): + return _lib.ggllm_sample_repetition_penalty( + ctx, candidates, last_tokens_data, last_tokens_size, penalty + ) + + +_lib.ggllm_sample_repetition_penalty.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + falcon_token_p, + c_int, + c_float, +] +_lib.ggllm_sample_repetition_penalty.restype = None + + +# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. +# FALCON_API void falcon_sample_frequency_and_presence_penalties(struct falcon_context * ctx, falcon_token_data_array * candidates, const falcon_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); +def falcon_sample_frequency_and_presence_penalties( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + last_tokens_data, # type: Array[falcon_token] + last_tokens_size: c_int, + alpha_frequency: c_float, + alpha_presence: c_float, +): + return _lib.ggllm_sample_frequency_and_presence_penalties( + ctx, + candidates, + last_tokens_data, + last_tokens_size, + alpha_frequency, + alpha_presence, + ) + + +_lib.ggllm_sample_frequency_and_presence_penalties.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + falcon_token_p, + c_int, + c_float, + c_float, +] +_lib.ggllm_sample_frequency_and_presence_penalties.restype = None + + +# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. +# FALCON_API void falcon_sample_softmax(struct falcon_context * ctx, falcon_token_data_array * candidates); +def falcon_sample_softmax( + ctx: falcon_context_p, candidates # type: _Pointer[falcon_token_data] +): + return _lib.ggllm_sample_softmax(ctx, candidates) + + +_lib.ggllm_sample_softmax.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, +] +_lib.ggllm_sample_softmax.restype = None + + +# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# FALCON_API void falcon_sample_top_k(struct falcon_context * ctx, falcon_token_data_array * candidates, int k, size_t min_keep); +def falcon_sample_top_k( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + k: c_int, + min_keep: c_size_t, +): + return _lib.ggllm_sample_top_k(ctx, candidates, k, min_keep) + + +_lib.ggllm_sample_top_k.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_int, + c_size_t, +] +_lib.ggllm_sample_top_k.restype = None + + +# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# FALCON_API void falcon_sample_top_p(struct falcon_context * ctx, falcon_token_data_array * candidates, float p, size_t min_keep); +def falcon_sample_top_p( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + p: c_float, + min_keep: c_size_t, +): + return _lib.ggllm_sample_top_p(ctx, candidates, p, min_keep) + + +_lib.ggllm_sample_top_p.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_size_t, +] +_lib.ggllm_sample_top_p.restype = None + + +# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. +# FALCON_API void falcon_sample_tail_free(struct falcon_context * ctx, falcon_token_data_array * candidates, float z, size_t min_keep); +def falcon_sample_tail_free( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + z: c_float, + min_keep: c_size_t, +): + return _lib.ggllm_sample_tail_free(ctx, candidates, z, min_keep) + + +_lib.ggllm_sample_tail_free.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_size_t, +] +_lib.ggllm_sample_tail_free.restype = None + + +# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. +# FALCON_API void falcon_sample_typical(struct falcon_context * ctx, falcon_token_data_array * candidates, float p, size_t min_keep); +def falcon_sample_typical( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + p: c_float, + min_keep: c_size_t, +): + return _lib.ggllm_sample_typical(ctx, candidates, p, min_keep) + + +_lib.ggllm_sample_typical.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_size_t, +] +_lib.ggllm_sample_typical.restype = None + + +# FALCON_API void falcon_sample_temperature(struct falcon_context * ctx, falcon_token_data_array * candidates, float temp); +def falcon_sample_temperature( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + temp: c_float, +): + return _lib.ggllm_sample_temperature(ctx, candidates, temp) + + +_lib.ggllm_sample_temperature.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, +] +_lib.ggllm_sample_temperature.restype = None + + +# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `falcon_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. +# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# FALCON_API falcon_token falcon_sample_token_mirostat(struct falcon_context * ctx, falcon_token_data_array * candidates, float tau, float eta, int m, float * mu); +def falcon_sample_token_mirostat( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + tau: c_float, + eta: c_float, + m: c_int, + mu, # type: _Pointer[c_float] +) -> int: + return _lib.ggllm_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) + + +_lib.ggllm_sample_token_mirostat.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_float, + c_int, + c_float_p, +] +_lib.ggllm_sample_token_mirostat.restype = falcon_token + + +# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `falcon_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. +# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# FALCON_API falcon_token falcon_sample_token_mirostat_v2(struct falcon_context * ctx, falcon_token_data_array * candidates, float tau, float eta, float * mu); +def falcon_sample_token_mirostat_v2( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + tau: c_float, + eta: c_float, + mu, # type: _Pointer[c_float] +) -> int: + return _lib.ggllm_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) + + +_lib.ggllm_sample_token_mirostat_v2.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_float, + c_float_p, +] +_lib.ggllm_sample_token_mirostat_v2.restype = falcon_token + + +# @details Selects the token with the highest probability. +# FALCON_API falcon_token falcon_sample_token_greedy(struct falcon_context * ctx, falcon_token_data_array * candidates); +def falcon_sample_token_greedy( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] +) -> int: + return _lib.ggllm_sample_token_greedy(ctx, candidates) + + +_lib.ggllm_sample_token_greedy.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, +] +_lib.ggllm_sample_token_greedy.restype = falcon_token + + +# @details Randomly selects a token from the candidates based on their probabilities. +# FALCON_API falcon_token falcon_sample_token(struct falcon_context * ctx, falcon_token_data_array * candidates); +def falcon_sample_token( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] +) -> int: + return _lib.ggllm_sample_token(ctx, candidates) + + +_lib.ggllm_sample_token.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, +] +_lib.ggllm_sample_token.restype = falcon_token + + +# Performance information + + +# FALCON_API void falcon_print_timings(struct falcon_context * ctx); +def falcon_print_timings(ctx: falcon_context_p): + _lib.ggllm_print_timings(ctx) + + +_lib.ggllm_print_timings.argtypes = [falcon_context_p] +_lib.ggllm_print_timings.restype = None + + +# FALCON_API void falcon_reset_timings(struct falcon_context * ctx); +def falcon_reset_timings(ctx: falcon_context_p): + _lib.ggllm_reset_timings(ctx) + + +_lib.ggllm_reset_timings.argtypes = [falcon_context_p] +_lib.ggllm_reset_timings.restype = None + + +# Print system information +# FALCON_API const char * falcon_print_system_info(void); +def falcon_print_system_info() -> bytes: + return _lib.ggllm_print_system_info() + + +_lib.ggllm_print_system_info.argtypes = [] +_lib.ggllm_print_system_info.restype = c_char_p + +################################################################################################### + + +_falcon_initialized = False + +if not _falcon_initialized: + falcon_init_backend(c_bool(False)) + _falcon_initialized = True diff --git a/llama_cpp/llama_types.py b/falcon_cpp/falcon_types.py similarity index 100% rename from llama_cpp/llama_types.py rename to falcon_cpp/falcon_types.py diff --git a/llama_cpp/server/__init__.py b/falcon_cpp/server/__init__.py similarity index 100% rename from llama_cpp/server/__init__.py rename to falcon_cpp/server/__init__.py diff --git a/llama_cpp/server/__main__.py b/falcon_cpp/server/__main__.py similarity index 100% rename from llama_cpp/server/__main__.py rename to falcon_cpp/server/__main__.py diff --git a/llama_cpp/server/app.py b/falcon_cpp/server/app.py similarity index 88% rename from llama_cpp/server/app.py rename to falcon_cpp/server/app.py index ef319c7e0..6b5538dcf 100644 --- a/llama_cpp/server/app.py +++ b/falcon_cpp/server/app.py @@ -5,7 +5,7 @@ from typing import Iterator, List, Optional, Union, Dict from typing_extensions import TypedDict, Literal -import llama_cpp +import falcon_cpp import anyio from anyio.streams.memory import MemoryObjectSendStream @@ -43,11 +43,11 @@ class Settings(BaseSettings): ) f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") use_mlock: bool = Field( - default=llama_cpp.llama_mlock_supported(), + default=falcon_cpp.falcon_mlock_supported(), description="Use mlock.", ) use_mmap: bool = Field( - default=llama_cpp.llama_mmap_supported(), + default=falcon_cpp.falcon_mmap_supported(), description="Use mmap.", ) embedding: bool = Field(default=True, description="Whether to use embeddings.") @@ -90,14 +90,14 @@ class Settings(BaseSettings): router = APIRouter() settings: Optional[Settings] = None -llama: Optional[llama_cpp.Llama] = None +falcon: Optional[falcon_cpp.falcon] = None def create_app(settings: Optional[Settings] = None): if settings is None: settings = Settings() app = FastAPI( - title="🦙 llama.cpp Python API", + title="🦙 falcon.cpp Python API", version="0.0.1", ) app.add_middleware( @@ -108,8 +108,8 @@ def create_app(settings: Optional[Settings] = None): allow_headers=["*"], ) app.include_router(router) - global llama - llama = llama_cpp.Llama( + global falcon + falcon = falcon_cpp.Falcon( model_path=settings.model, n_gpu_layers=settings.n_gpu_layers, seed=settings.seed, @@ -129,14 +129,14 @@ def create_app(settings: Optional[Settings] = None): if settings.cache_type == "disk": if settings.verbose: print(f"Using disk cache with size {settings.cache_size}") - cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) + cache = falcon_cpp.FalconDiskCache(capacity_bytes=settings.cache_size) else: if settings.verbose: print(f"Using ram cache with size {settings.cache_size}") - cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) + cache = falcon_cpp.FalconRAMCache(capacity_bytes=settings.cache_size) - cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) - llama.set_cache(cache) + cache = falcon_cpp.FalconCache(capacity_bytes=settings.cache_size) + falcon.set_cache(cache) def set_settings(_settings: Settings): global settings @@ -146,12 +146,12 @@ def set_settings(_settings: Settings): return app -llama_lock = Lock() +falcon_lock = Lock() -def get_llama(): - with llama_lock: - yield llama +def get_falcon(): + with falcon_lock: + yield falcon def get_settings(): @@ -276,7 +276,7 @@ class CreateCompletionRequest(BaseModel): best_of: Optional[int] = 1 user: Optional[str] = Field(None) - # llama.cpp specific parameters + # falcon.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) @@ -290,11 +290,11 @@ class Config: } -CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) +CreateCompletionResponse = create_model_from_typeddict(falcon_cpp.Completion) def make_logit_bias_processor( - llama: llama_cpp.Llama, + falcon: falcon_cpp.Falcon, logit_bias: Dict[str, float], logit_bias_type: Optional[Literal["input_ids", "tokens"]], ): @@ -310,7 +310,7 @@ def make_logit_bias_processor( elif logit_bias_type == "tokens": for token, score in logit_bias.items(): token = token.encode('utf-8') - for input_id in llama.tokenize(token, add_bos=False): + for input_id in falcon.tokenize(token, add_bos=False): to_bias[input_id] = score def logit_bias_processor( @@ -333,7 +333,7 @@ def logit_bias_processor( async def create_completion( request: Request, body: CreateCompletionRequest, - llama: llama_cpp.Llama = Depends(get_llama), + falcon: falcon_cpp.Falcon = Depends(get_falcon), ): if isinstance(body.prompt, list): assert len(body.prompt) <= 1 @@ -349,8 +349,8 @@ async def create_completion( kwargs = body.dict(exclude=exclude) if body.logit_bias is not None: - kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ - make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type), + kwargs['logits_processor'] = falcon_cpp.LogitsProcessorList([ + make_logit_bias_processor(falcon, body.logit_bias, body.logit_bias_type), ]) if body.stream: @@ -359,7 +359,7 @@ async def create_completion( async def event_publisher(inner_send_chan: MemoryObjectSendStream): async with inner_send_chan: try: - iterator: Iterator[llama_cpp.CompletionChunk] = await run_in_threadpool(llama, **kwargs) # type: ignore + iterator: Iterator[falcon_cpp.CompletionChunk] = await run_in_threadpool(falcon, **kwargs) # type: ignore async for chunk in iterate_in_threadpool(iterator): await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): @@ -378,7 +378,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): recv_chan, data_sender_callable=partial(event_publisher, send_chan) ) else: - completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs) # type: ignore + completion: falcon_cpp.Completion = await run_in_threadpool(falcon, **kwargs) # type: ignore return completion @@ -395,7 +395,7 @@ class Config: } -CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) +CreateEmbeddingResponse = create_model_from_typeddict(falcon_cpp.Embedding) @router.post( @@ -403,10 +403,10 @@ class Config: response_model=CreateEmbeddingResponse, ) async def create_embedding( - request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) + request: CreateEmbeddingRequest, falcon: falcon_cpp.Falcon = Depends(get_falcon) ): return await run_in_threadpool( - llama.create_embedding, **request.dict(exclude={"user"}) + falcon.create_embedding, **request.dict(exclude={"user"}) ) @@ -438,7 +438,7 @@ class CreateChatCompletionRequest(BaseModel): n: Optional[int] = 1 user: Optional[str] = Field(None) - # llama.cpp specific parameters + # falcon.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) @@ -458,7 +458,7 @@ class Config: } -CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) +CreateChatCompletionResponse = create_model_from_typeddict(falcon_cpp.ChatCompletion) @router.post( @@ -468,8 +468,8 @@ class Config: async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, - llama: llama_cpp.Llama = Depends(get_llama), -) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: + falcon: falcon_cpp.Falcon = Depends(get_falcon), +) -> Union[falcon_cpp.ChatCompletion, EventSourceResponse]: exclude = { "n", "logit_bias", @@ -479,8 +479,8 @@ async def create_chat_completion( kwargs = body.dict(exclude=exclude) if body.logit_bias is not None: - kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ - make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type), + kwargs['logits_processor'] = falcon_cpp.LogitsProcessorList([ + make_logit_bias_processor(falcon, body.logit_bias, body.logit_bias_type), ]) if body.stream: @@ -489,7 +489,7 @@ async def create_chat_completion( async def event_publisher(inner_send_chan: MemoryObjectSendStream): async with inner_send_chan: try: - iterator: Iterator[llama_cpp.ChatCompletionChunk] = await run_in_threadpool(llama.create_chat_completion, **kwargs) # type: ignore + iterator: Iterator[falcon_cpp.ChatCompletionChunk] = await run_in_threadpool(falcon.create_chat_completion, **kwargs) # type: ignore async for chat_chunk in iterate_in_threadpool(iterator): await inner_send_chan.send(dict(data=json.dumps(chat_chunk))) if await request.is_disconnected(): @@ -509,8 +509,8 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): data_sender_callable=partial(event_publisher, send_chan), ) else: - completion: llama_cpp.ChatCompletion = await run_in_threadpool( - llama.create_chat_completion, **kwargs # type: ignore + completion: falcon_cpp.ChatCompletion = await run_in_threadpool( + falcon.create_chat_completion, **kwargs # type: ignore ) return completion @@ -533,7 +533,7 @@ class ModelList(TypedDict): @router.get("/v1/models", response_model=GetModelResponse) async def get_models( settings: Settings = Depends(get_settings), - llama: llama_cpp.Llama = Depends(get_llama), + falcon: falcon_cpp.Falcon = Depends(get_falcon), ) -> ModelList: return { "object": "list", @@ -541,7 +541,7 @@ async def get_models( { "id": settings.model_alias if settings.model_alias is not None - else llama.model_path, + else falcon.model_path, "object": "model", "owned_by": "me", "permissions": [], diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py deleted file mode 100644 index dce1764f6..000000000 --- a/llama_cpp/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .llama_cpp import * -from .llama import * diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py deleted file mode 100644 index 52fc14e1d..000000000 --- a/llama_cpp/llama_cpp.py +++ /dev/null @@ -1,1024 +0,0 @@ -import sys -import os -import ctypes -from ctypes import ( - c_int, - c_float, - c_char_p, - c_void_p, - c_bool, - POINTER, - _Pointer, # type: ignore - Structure, - Array, - c_uint8, - c_size_t, -) -import pathlib -from typing import List, Union - - -# Load the library -def _load_shared_library(lib_base_name: str): - # Construct the paths to the possible shared library names - _base_path = pathlib.Path(__file__).parent.resolve() - # Searching for the library in the current directory under the name "libllama" (default name - # for llamacpp) and "llama" (default name for this repo) - _lib_paths: List[pathlib.Path] = [] - # Determine the file extension based on the platform - if sys.platform.startswith("linux"): - _lib_paths += [ - _base_path / f"lib{lib_base_name}.so", - ] - elif sys.platform == "darwin": - _lib_paths += [ - _base_path / f"lib{lib_base_name}.so", - _base_path / f"lib{lib_base_name}.dylib", - ] - elif sys.platform == "win32": - _lib_paths += [ - _base_path / f"{lib_base_name}.dll", - ] - else: - raise RuntimeError("Unsupported platform") - - if "LLAMA_CPP_LIB" in os.environ: - lib_base_name = os.environ["LLAMA_CPP_LIB"] - _lib = pathlib.Path(lib_base_name) - _base_path = _lib.parent.resolve() - _lib_paths = [_lib.resolve()] - - cdll_args = dict() # type: ignore - # Add the library directory to the DLL search path on Windows (if needed) - if sys.platform == "win32" and sys.version_info >= (3, 8): - os.add_dll_directory(str(_base_path)) - if "CUDA_PATH" in os.environ: - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) - cdll_args["winmode"] = 0 - - # Try to load the shared library, handling potential errors - for _lib_path in _lib_paths: - if _lib_path.exists(): - try: - return ctypes.CDLL(str(_lib_path), **cdll_args) - except Exception as e: - raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") - - raise FileNotFoundError( - f"Shared library with base name '{lib_base_name}' not found" - ) - - -# Specify the base name of the shared library to load -_lib_base_name = "llama" - -# Load the library -_lib = _load_shared_library(_lib_base_name) - -# Misc -c_float_p = POINTER(c_float) -c_uint8_p = POINTER(c_uint8) -c_size_t_p = POINTER(c_size_t) - -# llama.h bindings - -GGML_USE_CUBLAS = hasattr(_lib, "ggml_init_cublas") -GGML_CUDA_MAX_DEVICES = ctypes.c_int(16) -LLAMA_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else ctypes.c_int(1) - -# #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' -LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74) -# #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' -LLAMA_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61) -# #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' -LLAMA_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66) -# #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' -LLAMA_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C) -# #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' -LLAMA_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E) - -# #define LLAMA_FILE_VERSION 3 -LLAMA_FILE_VERSION = c_int(3) -LLAMA_FILE_MAGIC = LLAMA_FILE_MAGIC_GGJT -LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML -LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN -LLAMA_SESSION_VERSION = c_int(1) - -# struct llama_model; -llama_model_p = c_void_p - -# struct llama_context; -llama_context_p = c_void_p - - -# typedef int llama_token; -llama_token = c_int -llama_token_p = POINTER(llama_token) - - -# typedef struct llama_token_data { -# llama_token id; // token id -# float logit; // log-odds of the token -# float p; // probability of the token -# } llama_token_data; -class llama_token_data(Structure): - _fields_ = [ - ("id", llama_token), - ("logit", c_float), - ("p", c_float), - ] - - -llama_token_data_p = POINTER(llama_token_data) - - -# typedef struct llama_token_data_array { -# llama_token_data * data; -# size_t size; -# bool sorted; -# } llama_token_data_array; -class llama_token_data_array(Structure): - _fields_ = [ - ("data", llama_token_data_p), - ("size", c_size_t), - ("sorted", c_bool), - ] - - -llama_token_data_array_p = POINTER(llama_token_data_array) - -# typedef void (*llama_progress_callback)(float progress, void *ctx); -llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) - - -# struct llama_context_params { -# int seed; // RNG seed, -1 for random -# int n_ctx; // text context -# int n_batch; // prompt processing batch size -# int n_gpu_layers; // number of layers to store in VRAM -# int main_gpu; // the GPU that is used for scratch and small tensors -# float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs -# // called with a progress value between 0 and 1, pass NULL to disable -# llama_progress_callback progress_callback; -# // context pointer passed to the progress callback -# void * progress_callback_user_data; - - -# // Keep the booleans together to avoid misalignment during copy-by-value. -# bool low_vram; // if true, reduce VRAM usage at the cost of performance -# bool f16_kv; // use fp16 for KV cache -# bool logits_all; // the llama_eval() call computes all logits, not just the last one -# bool vocab_only; // only load the vocabulary, no weights -# bool use_mmap; // use mmap if possible -# bool use_mlock; // force system to keep model in RAM -# bool embedding; // embedding mode only -# }; -class llama_context_params(Structure): - _fields_ = [ - ("seed", c_int), - ("n_ctx", c_int), - ("n_batch", c_int), - ("n_gpu_layers", c_int), - ("main_gpu", c_int), - ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), - ("progress_callback", llama_progress_callback), - ("progress_callback_user_data", c_void_p), - ("low_vram", c_bool), - ("f16_kv", c_bool), - ("logits_all", c_bool), - ("vocab_only", c_bool), - ("use_mmap", c_bool), - ("use_mlock", c_bool), - ("embedding", c_bool), - ] - - -llama_context_params_p = POINTER(llama_context_params) - -# enum llama_ftype { -# LLAMA_FTYPE_ALL_F32 = 0, -# LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 -# // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed -# // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed -# LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors -# }; -LLAMA_FTYPE_ALL_F32 = c_int(0) -LLAMA_FTYPE_MOSTLY_F16 = c_int(1) -LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) -LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) -LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4) -LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) -LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) -LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) -LLAMA_FTYPE_MOSTLY_Q2_K = c_int(10) -LLAMA_FTYPE_MOSTLY_Q3_K_S = c_int(11) -LLAMA_FTYPE_MOSTLY_Q3_K_M = c_int(12) -LLAMA_FTYPE_MOSTLY_Q3_K_L = c_int(13) -LLAMA_FTYPE_MOSTLY_Q4_K_S = c_int(14) -LLAMA_FTYPE_MOSTLY_Q4_K_M = c_int(15) -LLAMA_FTYPE_MOSTLY_Q5_K_S = c_int(16) -LLAMA_FTYPE_MOSTLY_Q5_K_M = c_int(17) -LLAMA_FTYPE_MOSTLY_Q6_K = c_int(18) - - -# // model quantization parameters -# typedef struct llama_model_quantize_params { -# int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() -# enum llama_ftype ftype; // quantize to this llama_ftype -# bool allow_requantize; // allow quantizing non-f32/f16 tensors -# bool quantize_output_tensor; // quantize output.weight -# } llama_model_quantize_params; -class llama_model_quantize_params(Structure): - _fields_ = [ - ("nthread", c_int), - ("ftype", c_int), - ("allow_requantize", c_bool), - ("quantize_output_tensor", c_bool), - ] - - -# LLAMA_API struct llama_context_params llama_context_default_params(); -def llama_context_default_params() -> llama_context_params: - return _lib.llama_context_default_params() - - -_lib.llama_context_default_params.argtypes = [] -_lib.llama_context_default_params.restype = llama_context_params - - -# LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(); -def llama_model_quantize_default_params() -> llama_model_quantize_params: - return _lib.llama_model_quantize_default_params() - - -_lib.llama_model_quantize_default_params.argtypes = [] -_lib.llama_model_quantize_default_params.restype = llama_model_quantize_params - - -# LLAMA_API bool llama_mmap_supported(); -def llama_mmap_supported() -> bool: - return _lib.llama_mmap_supported() - - -_lib.llama_mmap_supported.argtypes = [] -_lib.llama_mmap_supported.restype = c_bool - - -# LLAMA_API bool llama_mlock_supported(); -def llama_mlock_supported() -> bool: - return _lib.llama_mlock_supported() - - -_lib.llama_mlock_supported.argtypes = [] -_lib.llama_mlock_supported.restype = c_bool - - -# // TODO: not great API - very likely to change -# // Initialize the llama + ggml backend -# // If numa is true, use NUMA optimizations -# // Call once at the start of the program -# LLAMA_API void llama_init_backend(bool numa); -def llama_init_backend(numa: c_bool): - return _lib.llama_init_backend(numa) - - -_lib.llama_init_backend.argtypes = [c_bool] -_lib.llama_init_backend.restype = None - - -# LLAMA_API struct llama_model * llama_load_model_from_file( -# const char * path_model, -# struct llama_context_params params); -def llama_load_model_from_file( - path_model: bytes, params: llama_context_params -) -> llama_model_p: - return _lib.llama_load_model_from_file(path_model, params) - - -_lib.llama_load_model_from_file.argtypes = [c_char_p, llama_context_params] -_lib.llama_load_model_from_file.restype = llama_model_p - - -# LLAMA_API void llama_free_model(struct llama_model * model); -def llama_free_model(model: llama_model_p): - return _lib.llama_free_model(model) - - -_lib.llama_free_model.argtypes = [llama_model_p] -_lib.llama_free_model.restype = None - - -# LLAMA_API struct llama_context * llama_new_context_with_model( -# struct llama_model * model, -# struct llama_context_params params); -def llama_new_context_with_model( - model: llama_model_p, params: llama_context_params -) -> llama_context_p: - return _lib.llama_new_context_with_model(model, params) - - -_lib.llama_new_context_with_model.argtypes = [llama_model_p, llama_context_params] -_lib.llama_new_context_with_model.restype = llama_context_p - - -# LLAMA_API int64_t llama_time_us(); -def llama_time_us() -> int: - return _lib.llama_time_us() - - -_lib.llama_time_us.argtypes = [] -_lib.llama_time_us.restype = ctypes.c_int64 - - -# // Various functions for loading a ggml llama model. -# // Allocate (almost) all memory needed for the model. -# // Return NULL on failure -# LLAMA_API struct llama_context * llama_init_from_file( -# const char * path_model, -# struct llama_context_params params); -def llama_init_from_file( - path_model: bytes, params: llama_context_params -) -> llama_context_p: - return _lib.llama_init_from_file(path_model, params) - - -_lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params] -_lib.llama_init_from_file.restype = llama_context_p - - -# Frees all allocated memory -# LLAMA_API void llama_free(struct llama_context * ctx); -def llama_free(ctx: llama_context_p): - return _lib.llama_free(ctx) - - -_lib.llama_free.argtypes = [llama_context_p] -_lib.llama_free.restype = None - - -# // Returns 0 on success -# LLAMA_API int llama_model_quantize( -# const char * fname_inp, -# const char * fname_out, -# const llama_model_quantize_params * params); -def llama_model_quantize( - fname_inp: bytes, - fname_out: bytes, - params, # type: POINTER(llama_model_quantize_params) # type: ignore -) -> int: - return _lib.llama_model_quantize(fname_inp, fname_out, params) - - -_lib.llama_model_quantize.argtypes = [ - c_char_p, - c_char_p, - POINTER(llama_model_quantize_params), -] -_lib.llama_model_quantize.restype = c_int - - -# Apply a LoRA adapter to a loaded model -# path_base_model is the path to a higher quality model to use as a base for -# the layers modified by the adapter. Can be NULL to use the current loaded model. -# The model needs to be reloaded before applying a new adapter, otherwise the adapter -# will be applied on top of the previous one -# Returns 0 on success -# LLAMA_API int llama_apply_lora_from_file( -# struct llama_context * ctx, -# const char * path_lora, -# const char * path_base_model, -# int n_threads); -def llama_apply_lora_from_file( - ctx: llama_context_p, - path_lora: c_char_p, - path_base_model: c_char_p, - n_threads: c_int, -) -> int: - return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) - - -_lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, c_int] -_lib.llama_apply_lora_from_file.restype = c_int - - -# LLAMA_API int llama_model_apply_lora_from_file( -# const struct llama_model * model, -# const char * path_lora, -# const char * path_base_model, -# int n_threads); -def llama_model_apply_lora_from_file( - model: llama_model_p, - path_lora: Union[c_char_p, bytes], - path_base_model: Union[c_char_p, bytes], - n_threads: c_int, -) -> int: - return _lib.llama_model_apply_lora_from_file( - model, path_lora, path_base_model, n_threads - ) - - -_lib.llama_model_apply_lora_from_file.argtypes = [ - llama_model_p, - c_char_p, - c_char_p, - c_int, -] -_lib.llama_model_apply_lora_from_file.restype = c_int - - -# Returns the number of tokens in the KV cache -# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); -def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: - return _lib.llama_get_kv_cache_token_count(ctx) - - -_lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] -_lib.llama_get_kv_cache_token_count.restype = c_int - - -# Sets the current rng seed. -# LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); -def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): - return _lib.llama_set_rng_seed(ctx, seed) - - -_lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] -_lib.llama_set_rng_seed.restype = None - - -# Returns the maximum size in bytes of the state (rng, logits, embedding -# and kv_cache) - will often be smaller after compacting tokens -# LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); -def llama_get_state_size(ctx: llama_context_p) -> int: - return _lib.llama_get_state_size(ctx) - - -_lib.llama_get_state_size.argtypes = [llama_context_p] -_lib.llama_get_state_size.restype = c_size_t - - -# Copies the state to the specified destination address. -# Destination needs to have allocated enough memory. -# Returns the number of bytes copied -# LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst); -def llama_copy_state_data( - ctx: llama_context_p, dst # type: Array[c_uint8] -) -> int: - return _lib.llama_copy_state_data(ctx, dst) - - -_lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] -_lib.llama_copy_state_data.restype = c_size_t - - -# Set the state reading from the specified address -# Returns the number of bytes read -# LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src); -def llama_set_state_data( - ctx: llama_context_p, src # type: Array[c_uint8] -) -> int: - return _lib.llama_set_state_data(ctx, src) - - -_lib.llama_set_state_data.argtypes = [llama_context_p, c_uint8_p] -_lib.llama_set_state_data.restype = c_size_t - - -# Save/load session file -# LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); -def llama_load_session_file( - ctx: llama_context_p, - path_session: bytes, - tokens_out, # type: Array[llama_token] - n_token_capacity: c_size_t, - n_token_count_out, # type: _Pointer[c_size_t] -) -> int: - return _lib.llama_load_session_file( - ctx, path_session, tokens_out, n_token_capacity, n_token_count_out - ) - - -_lib.llama_load_session_file.argtypes = [ - llama_context_p, - c_char_p, - llama_token_p, - c_size_t, - c_size_t_p, -] -_lib.llama_load_session_file.restype = c_size_t - - -# LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); -def llama_save_session_file( - ctx: llama_context_p, - path_session: bytes, - tokens, # type: Array[llama_token] - n_token_count: c_size_t, -) -> int: - return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) - - -_lib.llama_save_session_file.argtypes = [ - llama_context_p, - c_char_p, - llama_token_p, - c_size_t, -] -_lib.llama_save_session_file.restype = c_size_t - - -# Run the llama inference to obtain the logits and probabilities for the next token. -# tokens + n_tokens is the provided batch of new tokens to process -# n_past is the number of tokens to use from previous eval calls -# Returns 0 on success -# LLAMA_API int llama_eval( -# struct llama_context * ctx, -# const llama_token * tokens, -# int n_tokens, -# int n_past, -# int n_threads); -def llama_eval( - ctx: llama_context_p, - tokens, # type: Array[llama_token] - n_tokens: c_int, - n_past: c_int, - n_threads: c_int, -) -> int: - return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) - - -_lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] -_lib.llama_eval.restype = c_int - - -# // Same as llama_eval, but use float matrix input directly. -# LLAMA_API int llama_eval_embd( -# struct llama_context * ctx, -# const float * embd, -# int n_tokens, -# int n_past, -# int n_threads); -def llama_eval_embd( - ctx: llama_context_p, - embd, # type: Array[c_float] - n_tokens: c_int, - n_past: c_int, - n_threads: c_int, -) -> int: - return _lib.llama_eval_embd(ctx, embd, n_tokens, n_past, n_threads) - - -_lib.llama_eval_embd.argtypes = [llama_context_p, c_float_p, c_int, c_int, c_int] -_lib.llama_eval_embd.restype = c_int - - -# Convert the provided text into tokens. -# The tokens pointer must be large enough to hold the resulting tokens. -# Returns the number of tokens on success, no more than n_max_tokens -# Returns a negative number on failure - the number of tokens that would have been returned -# TODO: not sure if correct -# LLAMA_API int llama_tokenize( -# struct llama_context * ctx, -# const char * text, -# llama_token * tokens, -# int n_max_tokens, -# bool add_bos); -def llama_tokenize( - ctx: llama_context_p, - text: bytes, - tokens, # type: Array[llama_token] - n_max_tokens: c_int, - add_bos: c_bool, -) -> int: - return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos) - - -_lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool] -_lib.llama_tokenize.restype = c_int - - -# LLAMA_API int llama_n_vocab(const struct llama_context * ctx); -def llama_n_vocab(ctx: llama_context_p) -> int: - return _lib.llama_n_vocab(ctx) - - -_lib.llama_n_vocab.argtypes = [llama_context_p] -_lib.llama_n_vocab.restype = c_int - - -# LLAMA_API int llama_n_ctx (const struct llama_context * ctx); -def llama_n_ctx(ctx: llama_context_p) -> int: - return _lib.llama_n_ctx(ctx) - - -_lib.llama_n_ctx.argtypes = [llama_context_p] -_lib.llama_n_ctx.restype = c_int - - -# LLAMA_API int llama_n_embd (const struct llama_context * ctx); -def llama_n_embd(ctx: llama_context_p) -> int: - return _lib.llama_n_embd(ctx) - - -_lib.llama_n_embd.argtypes = [llama_context_p] -_lib.llama_n_embd.restype = c_int - - -# // Get the vocabulary as output parameters. -# // Returns number of results. -# LLAMA_API int llama_get_vocab( -# const struct llama_context * ctx, -# const char * * strings, -# float * scores, -# int capacity); -def llama_get_vocab( - ctx: llama_context_p, - strings, # type: Array[c_char_p] # type: ignore - scores, # type: Array[c_float] # type: ignore - capacity: c_int, -) -> int: - return _lib.llama_get_vocab(ctx, strings, scores, capacity) - - -_lib.llama_get_vocab.argtypes = [llama_context_p, c_char_p, c_float, c_int] -_lib.llama_get_vocab.restype = c_int - - -# Token logits obtained from the last call to llama_eval() -# The logits for the last token are stored in the last row -# Can be mutated in order to change the probabilities of the next token -# Rows: n_tokens -# Cols: n_vocab -# LLAMA_API float * llama_get_logits(struct llama_context * ctx); -def llama_get_logits( - ctx: llama_context_p, -): # type: (...) -> Array[float] # type: ignore - return _lib.llama_get_logits(ctx) - - -_lib.llama_get_logits.argtypes = [llama_context_p] -_lib.llama_get_logits.restype = c_float_p - - -# Get the embeddings for the input -# shape: [n_embd] (1-dimensional) -# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); -def llama_get_embeddings( - ctx: llama_context_p, -): # type: (...) -> Array[float] # type: ignore - return _lib.llama_get_embeddings(ctx) - - -_lib.llama_get_embeddings.argtypes = [llama_context_p] -_lib.llama_get_embeddings.restype = c_float_p - - -# Token Id -> String. Uses the vocabulary in the provided context -# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); -def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: - return _lib.llama_token_to_str(ctx, token) - - -_lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] -_lib.llama_token_to_str.restype = c_char_p - -# Special tokens - - -# LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence -def llama_token_bos() -> int: - return _lib.llama_token_bos() - - -_lib.llama_token_bos.argtypes = [] -_lib.llama_token_bos.restype = llama_token - - -# LLAMA_API llama_token llama_token_eos(); // end-of-sentence -def llama_token_eos() -> int: - return _lib.llama_token_eos() - - -_lib.llama_token_eos.argtypes = [] -_lib.llama_token_eos.restype = llama_token - - -# LLAMA_API llama_token llama_token_nl(); // next-line -def llama_token_nl() -> int: - return _lib.llama_token_nl() - - -_lib.llama_token_nl.argtypes = [] -_lib.llama_token_nl.restype = llama_token - - -# Sampling functions - - -# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. -# LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); -def llama_sample_repetition_penalty( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - last_tokens_data, # type: Array[llama_token] - last_tokens_size: c_int, - penalty: c_float, -): - return _lib.llama_sample_repetition_penalty( - ctx, candidates, last_tokens_data, last_tokens_size, penalty - ) - - -_lib.llama_sample_repetition_penalty.argtypes = [ - llama_context_p, - llama_token_data_array_p, - llama_token_p, - c_int, - c_float, -] -_lib.llama_sample_repetition_penalty.restype = None - - -# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. -# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); -def llama_sample_frequency_and_presence_penalties( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - last_tokens_data, # type: Array[llama_token] - last_tokens_size: c_int, - alpha_frequency: c_float, - alpha_presence: c_float, -): - return _lib.llama_sample_frequency_and_presence_penalties( - ctx, - candidates, - last_tokens_data, - last_tokens_size, - alpha_frequency, - alpha_presence, - ) - - -_lib.llama_sample_frequency_and_presence_penalties.argtypes = [ - llama_context_p, - llama_token_data_array_p, - llama_token_p, - c_int, - c_float, - c_float, -] -_lib.llama_sample_frequency_and_presence_penalties.restype = None - - -# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. -# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); -def llama_sample_softmax( - ctx: llama_context_p, candidates # type: _Pointer[llama_token_data] -): - return _lib.llama_sample_softmax(ctx, candidates) - - -_lib.llama_sample_softmax.argtypes = [ - llama_context_p, - llama_token_data_array_p, -] -_lib.llama_sample_softmax.restype = None - - -# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 -# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep); -def llama_sample_top_k( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - k: c_int, - min_keep: c_size_t, -): - return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) - - -_lib.llama_sample_top_k.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_int, - c_size_t, -] -_lib.llama_sample_top_k.restype = None - - -# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 -# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); -def llama_sample_top_p( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - p: c_float, - min_keep: c_size_t, -): - return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) - - -_lib.llama_sample_top_p.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, - c_size_t, -] -_lib.llama_sample_top_p.restype = None - - -# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. -# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep); -def llama_sample_tail_free( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - z: c_float, - min_keep: c_size_t, -): - return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) - - -_lib.llama_sample_tail_free.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, - c_size_t, -] -_lib.llama_sample_tail_free.restype = None - - -# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. -# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); -def llama_sample_typical( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - p: c_float, - min_keep: c_size_t, -): - return _lib.llama_sample_typical(ctx, candidates, p, min_keep) - - -_lib.llama_sample_typical.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, - c_size_t, -] -_lib.llama_sample_typical.restype = None - - -# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); -def llama_sample_temperature( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - temp: c_float, -): - return _lib.llama_sample_temperature(ctx, candidates, temp) - - -_lib.llama_sample_temperature.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, -] -_lib.llama_sample_temperature.restype = None - - -# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. -# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. -# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. -# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. -# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. -# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. -# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); -def llama_sample_token_mirostat( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - tau: c_float, - eta: c_float, - m: c_int, - mu, # type: _Pointer[c_float] -) -> int: - return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) - - -_lib.llama_sample_token_mirostat.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, - c_float, - c_int, - c_float_p, -] -_lib.llama_sample_token_mirostat.restype = llama_token - - -# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. -# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. -# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. -# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. -# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. -# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); -def llama_sample_token_mirostat_v2( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - tau: c_float, - eta: c_float, - mu, # type: _Pointer[c_float] -) -> int: - return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) - - -_lib.llama_sample_token_mirostat_v2.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, - c_float, - c_float_p, -] -_lib.llama_sample_token_mirostat_v2.restype = llama_token - - -# @details Selects the token with the highest probability. -# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); -def llama_sample_token_greedy( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] -) -> int: - return _lib.llama_sample_token_greedy(ctx, candidates) - - -_lib.llama_sample_token_greedy.argtypes = [ - llama_context_p, - llama_token_data_array_p, -] -_lib.llama_sample_token_greedy.restype = llama_token - - -# @details Randomly selects a token from the candidates based on their probabilities. -# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); -def llama_sample_token( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] -) -> int: - return _lib.llama_sample_token(ctx, candidates) - - -_lib.llama_sample_token.argtypes = [ - llama_context_p, - llama_token_data_array_p, -] -_lib.llama_sample_token.restype = llama_token - - -# Performance information - - -# LLAMA_API void llama_print_timings(struct llama_context * ctx); -def llama_print_timings(ctx: llama_context_p): - _lib.llama_print_timings(ctx) - - -_lib.llama_print_timings.argtypes = [llama_context_p] -_lib.llama_print_timings.restype = None - - -# LLAMA_API void llama_reset_timings(struct llama_context * ctx); -def llama_reset_timings(ctx: llama_context_p): - _lib.llama_reset_timings(ctx) - - -_lib.llama_reset_timings.argtypes = [llama_context_p] -_lib.llama_reset_timings.restype = None - - -# Print system information -# LLAMA_API const char * llama_print_system_info(void); -def llama_print_system_info() -> bytes: - return _lib.llama_print_system_info() - - -_lib.llama_print_system_info.argtypes = [] -_lib.llama_print_system_info.restype = c_char_p - -################################################################################################### - - -_llama_initialized = False - -if not _llama_initialized: - llama_init_backend(c_bool(False)) - _llama_initialized = True diff --git a/mkdocs.yml b/mkdocs.yml index 286581176..b8c09b67c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,5 +1,5 @@ -site_name: llama-cpp-python -repo_url: https://github.com/abetlen/llama-cpp-python +site_name: falcon-cpp-python +repo_url: https://github.com/sirajperson/falcon-cpp-python theme: name: "material" diff --git a/pyproject.toml b/pyproject.toml index e79d72eef..36d266699 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,13 @@ [tool.poetry] -name = "llama_cpp_python" -version = "0.1.67" -description = "Python bindings for the llama.cpp library" -authors = ["Andrei Betlen "] +name = "falcon_cpp_python" +version = "0.0.1" +description = "Python bindings for the ggllm.cpp library" +authors = ["Andrei Betlen "] license = "MIT" readme = "README.md" -homepage = "https://github.com/abetlen/llama-cpp-python" -repository = "https://github.com/abetlen/llama-cpp-python" -packages = [{include = "llama_cpp"}] +homepage = "https://github.com/abetlen/falcon-cpp-python" +repository = "https://github.com/abetlen/falcon-cpp-python" +packages = [{include = "falcon_cpp"}] include = [ "LICENSE.md", ] diff --git a/setup.py b/setup.py index 95593415a..aab60283c 100644 --- a/setup.py +++ b/setup.py @@ -6,16 +6,16 @@ long_description = (this_directory / "README.md").read_text(encoding="utf-8") setup( - name="llama_cpp_python", - description="A Python wrapper for llama.cpp", + name="falcon_cpp_python", + description="A Python wrapper for ggllm.cpp to run Falcon models", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.67", - author="Andrei Betlen", - author_email="abetlen@gmail.com", + version="0.0.1", + author="Siraj Florida", + author_email="sirajperson@gmail.com", license="MIT", - package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"}, - packages=["llama_cpp", "llama_cpp.server"], + package_dir={"falcon_cpp": "falcon_cpp", "falcon_cpp.server": "falcon_cpp/server"}, + packages=["falcon_cpp", "falcon_cpp.server"], install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], extras_require={ "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], diff --git a/tests/test_llama.py b/tests/test_falcon.py similarity index 56% rename from tests/test_llama.py rename to tests/test_falcon.py index 941287de6..d162cc6d6 100644 --- a/tests/test_llama.py +++ b/tests/test_falcon.py @@ -1,39 +1,39 @@ -import llama_cpp +import falcon_cpp -MODEL = "./vendor/llama.cpp/models/ggml-vocab.bin" +MODEL = "./vendor/ggllm/models/ggml-vocab.bin" -def test_llama(): - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) +def test_falcon(): + falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) - assert llama - assert llama.ctx is not None + assert falcon + assert falcon.ctx is not None text = b"Hello World" - assert llama.detokenize(llama.tokenize(text)) == text + assert falcon.detokenize(falcon.tokenize(text)) == text # @pytest.mark.skip(reason="need to update sample mocking") -def test_llama_patch(monkeypatch): - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) - n_vocab = llama_cpp.llama_n_vocab(llama.ctx) +def test_falcon_patch(monkeypatch): + falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) + n_vocab = falcon_cpp.falcon_n_vocab(falcon.ctx) ## Set up mock function def mock_eval(*args, **kwargs): return 0 def mock_get_logits(*args, **kwargs): - return (llama_cpp.c_float * n_vocab)( - *[llama_cpp.c_float(0) for _ in range(n_vocab)] + return (falcon_cpp.c_float * n_vocab)( + *[falcon_cpp.c_float(0) for _ in range(n_vocab)] ) - monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval) - monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_eval", mock_eval) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_get_logits", mock_get_logits) output_text = " jumps over the lazy dog." - output_tokens = llama.tokenize(output_text.encode("utf-8")) - token_eos = llama.token_eos() + output_tokens = falcon.tokenize(output_text.encode("utf-8")) + token_eos = falcon.token_eos() n = 0 def mock_sample(*args, **kwargs): @@ -44,31 +44,31 @@ def mock_sample(*args, **kwargs): else: return token_eos - monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_token", mock_sample) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_cpp_sample_token", mock_sample) text = "The quick brown fox" ## Test basic completion until eos n = 0 # reset - completion = llama.create_completion(text, max_tokens=20) + completion = falcon.create_completion(text, max_tokens=20) assert completion["choices"][0]["text"] == output_text assert completion["choices"][0]["finish_reason"] == "stop" ## Test streaming completion until eos n = 0 # reset - chunks = llama.create_completion(text, max_tokens=20, stream=True) + chunks = falcon.create_completion(text, max_tokens=20, stream=True) assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == output_text assert completion["choices"][0]["finish_reason"] == "stop" ## Test basic completion until stop sequence n = 0 # reset - completion = llama.create_completion(text, max_tokens=20, stop=["lazy"]) + completion = falcon.create_completion(text, max_tokens=20, stop=["lazy"]) assert completion["choices"][0]["text"] == " jumps over the " assert completion["choices"][0]["finish_reason"] == "stop" ## Test streaming completion until stop sequence n = 0 # reset - chunks = llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"]) + chunks = falcon.create_completion(text, max_tokens=20, stream=True, stop=["lazy"]) assert ( "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps over the " ) @@ -76,54 +76,54 @@ def mock_sample(*args, **kwargs): ## Test basic completion until length n = 0 # reset - completion = llama.create_completion(text, max_tokens=2) + completion = falcon.create_completion(text, max_tokens=2) assert completion["choices"][0]["text"] == " j" assert completion["choices"][0]["finish_reason"] == "length" ## Test streaming completion until length n = 0 # reset - chunks = llama.create_completion(text, max_tokens=2, stream=True) + chunks = falcon.create_completion(text, max_tokens=2, stream=True) assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == " j" assert completion["choices"][0]["finish_reason"] == "length" -def test_llama_pickle(): +def test_falcon_pickle(): import pickle import tempfile fp = tempfile.TemporaryFile() - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) - pickle.dump(llama, fp) + falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) + pickle.dump(falcon, fp) fp.seek(0) - llama = pickle.load(fp) + falcon = pickle.load(fp) - assert llama - assert llama.ctx is not None + assert falcon + assert falcon.ctx is not None text = b"Hello World" - assert llama.detokenize(llama.tokenize(text)) == text + assert falcon.detokenize(falcon.tokenize(text)) == text def test_utf8(monkeypatch): - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) - n_vocab = llama_cpp.llama_n_vocab(llama.ctx) + falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) + n_vocab = falcon_cpp.falcon_n_vocab(falcon.ctx) ## Set up mock function def mock_eval(*args, **kwargs): return 0 def mock_get_logits(*args, **kwargs): - return (llama_cpp.c_float * n_vocab)( - *[llama_cpp.c_float(0) for _ in range(n_vocab)] + return (falcon_cpp.c_float * n_vocab)( + *[falcon_cpp.c_float(0) for _ in range(n_vocab)] ) - monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval) - monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_eval", mock_eval) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_get_logits", mock_get_logits) output_text = "😀" - output_tokens = llama.tokenize(output_text.encode("utf-8")) - token_eos = llama.token_eos() + output_tokens = falcon.tokenize(output_text.encode("utf-8")) + token_eos = falcon.token_eos() n = 0 def mock_sample(*args, **kwargs): @@ -134,22 +134,22 @@ def mock_sample(*args, **kwargs): else: return token_eos - monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_token", mock_sample) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_sample_token", mock_sample) ## Test basic completion with utf8 multibyte n = 0 # reset - completion = llama.create_completion("", max_tokens=4) + completion = falcon.create_completion("", max_tokens=4) assert completion["choices"][0]["text"] == output_text ## Test basic completion with incomplete utf8 multibyte n = 0 # reset - completion = llama.create_completion("", max_tokens=1) + completion = falcon.create_completion("", max_tokens=1) assert completion["choices"][0]["text"] == "" -def test_llama_server(): +def test_falcon_server(): from fastapi.testclient import TestClient - from llama_cpp.server.app import create_app, Settings + from falcon_cpp.server.app import create_app, Settings settings = Settings( model=MODEL, diff --git a/vendor/llama.cpp b/vendor/llama.cpp deleted file mode 160000 index 96a712ca1..000000000 --- a/vendor/llama.cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 96a712ca1b7f427e3bd7ffc0c70b2105cfc7fbf1 From af336a94a058367e40ac14d067faeebf9b8fd8c4 Mon Sep 17 00:00:00 2001 From: siraj Date: Wed, 5 Jul 2023 16:29:27 -0400 Subject: [PATCH 02/14] Update Build --- .gitmodules | 3 --- Makefile | 14 +++++++------- falcon_cpp/server/app.py | 2 +- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/.gitmodules b/.gitmodules index cdb598717..cdbef1424 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ -[submodule "vendor/llama.cpp"] - path = vendor/llama.cpp - url = https://github.com/ggerganov/llama.cpp.git [submodule "ggllm.cpp"] path = ggllm.cpp url = https://github.com/cmp-nct/ggllm.cpp diff --git a/Makefile b/Makefile index 48238f921..653395d6e 100644 --- a/Makefile +++ b/Makefile @@ -34,14 +34,14 @@ deploy.gh-docs: mkdocs gh-deploy clean: - - cd vendor/llama.cpp && make clean - - cd vendor/llama.cpp && rm libllama.so + - cd vendor/ggllm.cpp && make clean + - cd vendor/ggllm.cpp && rm ggllm.so - rm -rf _skbuild - - rm llama_cpp/*.so - - rm llama_cpp/*.dylib - - rm llama_cpp/*.metal - - rm llama_cpp/*.dll - - rm llama_cpp/*.lib + - rm falcon_cpp/*.so + - rm falcon_cpp/*.dylib + - rm falcon_cpp/*.metal + - rm falcon_cpp/*.dll + - rm falcon_cpp/*.lib .PHONY: \ update \ diff --git a/falcon_cpp/server/app.py b/falcon_cpp/server/app.py index 6b5538dcf..2e0972ea6 100644 --- a/falcon_cpp/server/app.py +++ b/falcon_cpp/server/app.py @@ -24,7 +24,7 @@ class Settings(BaseSettings): default=None, description="The alias of the model to use for generating completions.", ) - n_ctx: int = Field(default=2048, ge=1, description="The context size.") + n_ctx: int = Field(default=8192, ge=1, description="The context size.") n_gpu_layers: int = Field( default=0, ge=0, From 04ad09f49c5aa3ca41c15819895d35c3448cf16b Mon Sep 17 00:00:00 2001 From: siraj Date: Wed, 5 Jul 2023 16:51:02 -0400 Subject: [PATCH 03/14] Update Build --- CMakeLists.txt | 24 +++--- docker/README.md | 66 -------------- docker/cuda_simple/Dockerfile | 16 ---- docker/open_llama/Dockerfile | 51 ----------- docker/open_llama/build.sh | 14 --- docker/open_llama/hug_model.py | 139 ------------------------------ docker/open_llama/start.sh | 28 ------ docker/open_llama/start_server.sh | 11 --- docker/openblas_simple/Dockerfile | 15 ---- docs/api-reference.md | 53 ------------ docs/index.md | 92 -------------------- docs/install/macos.md | 59 ------------- docs/requirements.txt | 3 - mkdocs.yml | 2 +- 14 files changed, 13 insertions(+), 560 deletions(-) delete mode 100644 docker/README.md delete mode 100644 docker/cuda_simple/Dockerfile delete mode 100644 docker/open_llama/Dockerfile delete mode 100755 docker/open_llama/build.sh delete mode 100644 docker/open_llama/hug_model.py delete mode 100755 docker/open_llama/start.sh delete mode 100755 docker/open_llama/start_server.sh delete mode 100644 docker/openblas_simple/Dockerfile delete mode 100644 docs/api-reference.md delete mode 100644 docs/index.md delete mode 100644 docs/install/macos.md delete mode 100644 docs/requirements.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index 788402a56..81ef24a0c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_minimum_required(VERSION 3.4...3.22) -project(llama_cpp) +project(falcon_cpp) option(FORCE_CMAKE "Force CMake build of Python bindings" OFF) @@ -8,27 +8,27 @@ set(FORCE_CMAKE $ENV{FORCE_CMAKE}) if (UNIX AND NOT FORCE_CMAKE) add_custom_command( - OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so + OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libggllm.so COMMAND make libllama.so - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp ) add_custom_target( run ALL - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libggllm.so ) install( - FILES ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so + FILES ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libggllm.so DESTINATION llama_cpp ) else() set(BUILD_SHARED_LIBS "On") - add_subdirectory(vendor/llama.cpp) + add_subdirectory(vendor/ggllm.cpp) install( - TARGETS llama - LIBRARY DESTINATION llama_cpp - RUNTIME DESTINATION llama_cpp - ARCHIVE DESTINATION llama_cpp - FRAMEWORK DESTINATION llama_cpp - RESOURCE DESTINATION llama_cpp + TARGETS ggllm + LIBRARY DESTINATION falcon_cpp + RUNTIME DESTINATION falcon_cpp + ARCHIVE DESTINATION falcon_cpp + FRAMEWORK DESTINATION falcon_cpp + RESOURCE DESTINATION falcon_cpp ) endif() diff --git a/docker/README.md b/docker/README.md deleted file mode 100644 index 053d311b4..000000000 --- a/docker/README.md +++ /dev/null @@ -1,66 +0,0 @@ -# Install Docker Server - -**Note #1:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR! - -[Install Docker Engine](https://docs.docker.com/engine/install) - -**Note #2:** NVidia GPU CuBLAS support requires a NVidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) - -# Simple Dockerfiles for building the llama-cpp-python server with external model bin files -## openblas_simple - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image -``` -cd ./openblas_simple -docker build -t openblas_simple . -docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t openblas_simple -``` -where `/` is the full path to the model file on the Docker host system. - -## cuda_simple - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image -``` -cd ./cuda_simple -docker build -t cuda_simple . -docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t cuda_simple -``` -where `/` is the full path to the model file on the Docker host system. - -# "Open-Llama-in-a-box" -## Download an Apache V2.0 licensed 3B paramter Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server -``` -$ cd ./open_llama -./build.sh -./start.sh -``` - -# Manually choose your own Llama model from Hugging Face -`python3 ./hug_model.py -a TheBloke -t llama` -You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. -``` -docker $ ls -lh *.bin --rw-rw-r-- 1 user user 4.8G May 23 18:30 q5_1.bin -lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> q5_1.bin -``` -**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least -**TWICE** as much disk space as the size of the model: - -| Model | Quantized size | -|------:|----------------:| -| 3B | 3 GB | -| 7B | 5 GB | -| 13B | 10 GB | -| 33B | 25 GB | -| 65B | 50 GB | - -**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` - -## Use OpenBLAS -Use if you don't have a NVidia GPU. Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS: -### Build: -`docker build -t openblas .` -### Run: -`docker run --cap-add SYS_RESOURCE -t openblas` - -## Use CuBLAS -### Build: -`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .` -### Run: -`docker run --cap-add SYS_RESOURCE -t cublas` diff --git a/docker/cuda_simple/Dockerfile b/docker/cuda_simple/Dockerfile deleted file mode 100644 index 24906d53a..000000000 --- a/docker/cuda_simple/Dockerfile +++ /dev/null @@ -1,16 +0,0 @@ -ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04" -FROM nvidia/cuda:${CUDA_IMAGE} - -# We need to set the host to 0.0.0.0 to allow outside access -ENV HOST 0.0.0.0 - -COPY . . - -# Install the package -RUN apt update && apt install -y python3 python3-pip -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette - -RUN LLAMA_CUBLAS=1 pip install llama-cpp-python - -# Run the server -CMD python3 -m llama_cpp.server diff --git a/docker/open_llama/Dockerfile b/docker/open_llama/Dockerfile deleted file mode 100644 index f0ef5f721..000000000 --- a/docker/open_llama/Dockerfile +++ /dev/null @@ -1,51 +0,0 @@ -# Define the image argument and provide a default value -ARG IMAGE=python:3-slim-bullseye - -# Use the image as specified -FROM ${IMAGE} - -# Re-declare the ARG after FROM -ARG IMAGE - -# Update and upgrade the existing packages -RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ - python3 \ - python3-pip \ - ninja-build \ - build-essential - -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette - -# Perform the conditional installations based on the image -RUN echo "Image: ${IMAGE}" && \ - if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \ - echo "OpenBLAS install:" && \ - apt-get install -y --no-install-recommends libopenblas-dev && \ - LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \ -else \ - echo "CuBLAS install:" && \ - LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \ -fi - -# Clean up apt cache -RUN rm -rf /var/lib/apt/lists/* - -# Set a working directory for better clarity -WORKDIR /app - -# Copy files to the app directory -RUN echo "Installing model...this can take some time..." -COPY ./model.bin /app/model.bin -COPY ./start_server.sh /app/start_server.sh - -# Make the server start script executable -RUN chmod +x /app/start_server.sh - -# Set environment variable for the host -ENV HOST=0.0.0.0 - -# Expose a port for the server -EXPOSE 8000 - -# Run the server start script -CMD ["/bin/sh", "/app/start_server.sh"] diff --git a/docker/open_llama/build.sh b/docker/open_llama/build.sh deleted file mode 100755 index 3a6457dcd..000000000 --- a/docker/open_llama/build.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh - -MODEL="open_llama_3b" -# Get open_llama_3b_ggml q5_1 quantization -python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1" -ls -lh *.bin - -# Build the default OpenBLAS image -docker build -t $MODEL . -docker images | egrep "^(REPOSITORY|$MODEL)" - -echo -echo "To start the docker container run:" -echo "docker run -t -p 8000:8000 $MODEL" diff --git a/docker/open_llama/hug_model.py b/docker/open_llama/hug_model.py deleted file mode 100644 index 13c5b6b0d..000000000 --- a/docker/open_llama/hug_model.py +++ /dev/null @@ -1,139 +0,0 @@ -import requests -import json -import os -import struct -import argparse - -def make_request(url, params=None): - print(f"Making request to {url}...") - response = requests.get(url, params=params) - if response.status_code == 200: - return json.loads(response.text) - else: - print(f"Request failed with status code {response.status_code}") - return None - -def check_magic_and_version(filename): - with open(filename, 'rb') as f: - # Read the first 6 bytes from the file - data = f.read(6) - - # Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int - # and the next 2 bytes as a little-endian unsigned short - magic, version = struct.unpack('= 10485760: # 10 MB - print('.', end='', flush=True) - total_downloaded = 0 - print("\nDownload complete.") - - # Creating a symbolic link from destination to "model.bin" - if os.path.isfile("model.bin"): - os.remove("model.bin") # remove the existing link if any - os.symlink(destination, "model.bin") - else: - print(f"Download failed with status code {response.status_code}") - -def get_user_choice(model_list): - # Print the enumerated list - print("\n") - for i, (model_id, rfilename) in enumerate(model_list): - print(f"{i+1}: Model ID: {model_id}, RFilename: {rfilename}") - - # Get user's choice - choice = input("Choose a model to download by entering the corresponding number: ") - try: - index = int(choice) - 1 - if 0 <= index < len(model_list): - # Return the chosen model - return model_list[index] - else: - print("Invalid choice.") - except ValueError: - print("Invalid input. Please enter a number corresponding to a model.") - except IndexError: - print("Invalid choice. Index out of range.") - - return None - -def main(): - # Create an argument parser - parser = argparse.ArgumentParser(description='Process some parameters.') - - # Arguments - parser.add_argument('-v', '--version', type=int, default=0x0003, - help='hexadecimal version number of ggml file') - parser.add_argument('-a', '--author', type=str, default='TheBloke', - help='HuggingFace author filter') - parser.add_argument('-t', '--tag', type=str, default='llama', - help='HuggingFace tag filter') - parser.add_argument('-s', '--search', type=str, default='', - help='HuggingFace search filter') - parser.add_argument('-f', '--filename', type=str, default='q5_1', - help='HuggingFace model repository filename substring match') - - # Parse the arguments - args = parser.parse_args() - - # Define the parameters - params = { - "author": args.author, - "tags": args.tag, - "search": args.search - } - - models = make_request('https://huggingface.co/api/models', params=params) - if models is None: - return - - model_list = [] - # Iterate over the models - for model in models: - model_id = model['id'] - model_info = make_request(f'https://huggingface.co/api/models/{model_id}') - if model_info is None: - continue - - for sibling in model_info.get('siblings', []): - rfilename = sibling.get('rfilename') - if rfilename and args.filename in rfilename: - model_list.append((model_id, rfilename)) - - # Choose the model - model_list.sort(key=lambda x: x[0]) - if len(model_list) == 0: - print("No models found") - exit(1) - elif len(model_list) == 1: - model_choice = model_list[0] - else: - model_choice = get_user_choice(model_list) - - if model_choice is not None: - model_id, rfilename = model_choice - url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}" - dest = f"{model_id.replace('/', '_')}_{rfilename}" - download_file(url, dest) - _, version = check_magic_and_version(dest) - if version != args.version: - print(f"Warning: Expected version {args.version}, but found different version in the file.") - else: - print("Error - model choice was None") - exit(2) - -if __name__ == '__main__': - main() diff --git a/docker/open_llama/start.sh b/docker/open_llama/start.sh deleted file mode 100755 index 7ee8f748e..000000000 --- a/docker/open_llama/start.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/sh - -MODEL="open_llama_3b" - -# Start Docker container -docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL & -sleep 10 -echo -docker ps | egrep "(^CONTAINER|$MODEL)" - -# Test the model works -echo -curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{ - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": [ - "\n", - "###" - ] -}' | grep Paris -if [ $? -eq 0 ] -then - echo - echo "$MODEL is working!!" -else - echo - echo "ERROR: $MODEL not replying." - exit 1 -fi diff --git a/docker/open_llama/start_server.sh b/docker/open_llama/start_server.sh deleted file mode 100755 index d3329eec3..000000000 --- a/docker/open_llama/start_server.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/sh - -# For mlock support -ulimit -l unlimited - -if [ "$IMAGE" = "python:3-slim-bullseye" ]; then - python3 -B -m llama_cpp.server --model /app/model.bin -else - # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM - python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000 -fi diff --git a/docker/openblas_simple/Dockerfile b/docker/openblas_simple/Dockerfile deleted file mode 100644 index 1a95caeda..000000000 --- a/docker/openblas_simple/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM python:3-slim-bullseye - -# We need to set the host to 0.0.0.0 to allow outside access -ENV HOST 0.0.0.0 - -COPY . . - -# Install the package -RUN apt update && apt install -y libopenblas-dev ninja-build build-essential -RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette - -RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose - -# Run the server -CMD python3 -m llama_cpp.server diff --git a/docs/api-reference.md b/docs/api-reference.md deleted file mode 100644 index 1290cad49..000000000 --- a/docs/api-reference.md +++ /dev/null @@ -1,53 +0,0 @@ ---- -title: API Reference ---- - -::: llama_cpp.Llama - options: - members: - - __init__ - - tokenize - - detokenize - - reset - - eval - - sample - - generate - - create_embedding - - embed - - create_completion - - __call__ - - create_chat_completion - - set_cache - - save_state - - load_state - - token_bos - - token_eos - show_root_heading: true - -::: llama_cpp.LlamaCache - options: - show_root_heading: true - -::: llama_cpp.LlamaState - options: - show_root_heading: true - -::: llama_cpp.LogitsProcessor - options: - show_root_heading: true - -::: llama_cpp.LogitsProcessorList - options: - show_root_heading: true - -::: llama_cpp.StoppingCriteria - options: - show_root_heading: true - -::: llama_cpp.StoppingCriteriaList - options: - show_root_heading: true - -::: llama_cpp.llama_cpp - options: - show_if_no_docstring: true \ No newline at end of file diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 7d5ccc314..000000000 --- a/docs/index.md +++ /dev/null @@ -1,92 +0,0 @@ -# Getting Started - -## 🦙 Python Bindings for `llama.cpp` - -[![Documentation](https://img.shields.io/badge/docs-passing-green.svg)](https://abetlen.github.io/llama-cpp-python) -[![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml) -[![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) - -Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library. -This package provides: - -- Low-level access to C API via `ctypes` interface. -- High-level Python API for text completion - - OpenAI-like API - - LangChain compatibility - -## Installation - -Install from PyPI: - -```bash -pip install llama-cpp-python -``` - -## High-level API - -```python ->>> from llama_cpp import Llama ->>> llm = Llama(model_path="./models/7B/ggml-model.bin") ->>> output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True) ->>> print(output) -{ - "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", - "object": "text_completion", - "created": 1679561337, - "model": "./models/7B/ggml-model.bin", - "choices": [ - { - "text": "Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.", - "index": 0, - "logprobs": None, - "finish_reason": "stop" - } - ], - "usage": { - "prompt_tokens": 14, - "completion_tokens": 28, - "total_tokens": 42 - } -} -``` - -## Web Server - -`llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. -This allows you to use llama.cpp compatible models with any OpenAI compatible client (language libraries, services, etc). - -To install the server package and get started: - -```bash -pip install llama-cpp-python[server] -export MODEL=./models/7B/ggml-model.bin -python3 -m llama_cpp.server -``` - -Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. - -## Low-level API - -The low-level API is a direct `ctypes` binding to the C API provided by `llama.cpp`. -The entire API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and should mirror [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). - - -## Development - -This package is under active development and I welcome any contributions. - -To get started, clone the repository and install the package in development mode: - -```bash -git clone git@github.com:abetlen/llama-cpp-python.git -git submodule update --init --recursive -# Will need to be re-run any time vendor/llama.cpp is updated -python3 setup.py develop -``` - -## License - -This project is licensed under the terms of the MIT license. \ No newline at end of file diff --git a/docs/install/macos.md b/docs/install/macos.md deleted file mode 100644 index 600469615..000000000 --- a/docs/install/macos.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -title: MacOS Install with Metal GPU ---- - -**(1) Make sure you have xcode installed... at least the command line parts** -``` -# check the path of your xcode install -xcode-select -p - -# xcode installed returns -# /Applications/Xcode-beta.app/Contents/Developer - -# if xcode is missing then install it... it takes ages; -xcode-select --install -``` - -**(2) Install the conda version for MacOS that supports Metal GPU** -``` -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh -bash Miniforge3-MacOSX-arm64.sh -``` - -**(3) Make a conda environment** -``` -conda create -n llama python=3.9.16 -conda activate llama -``` - -**(4) Install the LATEST llama-cpp-python.. which, as of just today, happily supports MacOS Metal GPU** - *(you needed xcode installed in order pip to build/compile the C++ code)* -``` -pip uninstall llama-cpp-python -y -CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir -pip install 'llama-cpp-python[server]' - -# you should now have llama-cpp-python v0.1.62 installed -llama-cpp-python         0.1.62      - -``` - -**(4) Download a v3 ggml model** - - **ggmlv3** - - file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0 - -https://huggingface.co/TheBloke/open-llama-7b-open-instruct-GGML - - -**(6) run the llama-cpp-python API server with MacOS Metal GPU support** -``` -# config your ggml model path -# make sure it is ggml v3 -# make sure it is q4_0 -export MODEL=[path to your llama.cpp ggml models]]/[ggml-model-name]]q4_0.bin -python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1 -``` - -***Note:** If you omit the `--n_gpu_layers 1` then CPU will be used* - - diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 199bd4ffb..000000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -mkdocs -mkdocs-material -mkdocstrings[python] \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index b8c09b67c..e4147790b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -9,7 +9,7 @@ plugins: - search watch: - - llama_cpp + - falcon_cpp markdown_extensions: - pymdownx.highlight: From 4bf2c7d25040faf53fa5b7994c54bbbc60402cd2 Mon Sep 17 00:00:00 2001 From: siraj Date: Wed, 5 Jul 2023 20:33:12 -0400 Subject: [PATCH 04/14] Update Build --- .dockerignore | 166 - .gitignore | 175 - .gitmodules | 3 - .readthedocs.yaml | 24 - CHANGELOG.md | 107 - CMakeLists.txt | 34 - LICENSE.md | 9 - Makefile | 56 - README.md | 193 - examples/high_level_api/fastapi_server.py | 37 - .../high_level_api_embedding.py | 11 - .../high_level_api_inference.py | 19 - .../high_level_api_streaming.py | 20 - .../high_level_api/langchain_custom_llm.py | 55 - examples/low_level_api/Chat.py | 71 - examples/low_level_api/Miku.py | 59 - examples/low_level_api/ReasonAct.py | 49 - examples/low_level_api/common.py | 202 - .../low_level_api/low_level_api_chat_cpp.py | 568 -- .../low_level_api/low_level_api_llama_cpp.py | 102 - examples/low_level_api/quantize.py | 25 - examples/low_level_api/util.py | 95 - examples/notebooks/Clients.ipynb | 104 - examples/notebooks/Guidance.ipynb | 89 - examples/notebooks/PerformanceTuning.ipynb | 5540 ----------------- falcon_cpp/__init__.py | 2 - falcon_cpp/falcon.py | 1622 ----- falcon_cpp/falcon_cpp.py | 1024 --- falcon_cpp/falcon_types.py | 97 - falcon_cpp/server/__init__.py | 0 falcon_cpp/server/__main__.py | 50 - falcon_cpp/server/app.py | 550 -- mkdocs.yml | 21 - poetry.lock | 1636 ----- poetry.toml | 3 - pyproject.toml | 44 - setup.py | 32 - tests/test_falcon.py | 171 - 38 files changed, 13065 deletions(-) delete mode 100644 .dockerignore delete mode 100644 .gitignore delete mode 100644 .gitmodules delete mode 100644 .readthedocs.yaml delete mode 100644 CHANGELOG.md delete mode 100644 CMakeLists.txt delete mode 100644 LICENSE.md delete mode 100644 Makefile delete mode 100644 README.md delete mode 100644 examples/high_level_api/fastapi_server.py delete mode 100644 examples/high_level_api/high_level_api_embedding.py delete mode 100644 examples/high_level_api/high_level_api_inference.py delete mode 100644 examples/high_level_api/high_level_api_streaming.py delete mode 100644 examples/high_level_api/langchain_custom_llm.py delete mode 100644 examples/low_level_api/Chat.py delete mode 100644 examples/low_level_api/Miku.py delete mode 100644 examples/low_level_api/ReasonAct.py delete mode 100644 examples/low_level_api/common.py delete mode 100644 examples/low_level_api/low_level_api_chat_cpp.py delete mode 100644 examples/low_level_api/low_level_api_llama_cpp.py delete mode 100644 examples/low_level_api/quantize.py delete mode 100644 examples/low_level_api/util.py delete mode 100644 examples/notebooks/Clients.ipynb delete mode 100644 examples/notebooks/Guidance.ipynb delete mode 100644 examples/notebooks/PerformanceTuning.ipynb delete mode 100644 falcon_cpp/__init__.py delete mode 100644 falcon_cpp/falcon.py delete mode 100644 falcon_cpp/falcon_cpp.py delete mode 100644 falcon_cpp/falcon_types.py delete mode 100644 falcon_cpp/server/__init__.py delete mode 100644 falcon_cpp/server/__main__.py delete mode 100644 falcon_cpp/server/app.py delete mode 100644 mkdocs.yml delete mode 100644 poetry.lock delete mode 100644 poetry.toml delete mode 100644 pyproject.toml delete mode 100644 setup.py delete mode 100644 tests/test_falcon.py diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index fd64c09b3..000000000 --- a/.dockerignore +++ /dev/null @@ -1,166 +0,0 @@ -_skbuild/ - -.envrc - -models/ - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -.idea/ diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 36ed7f7fd..000000000 --- a/.gitignore +++ /dev/null @@ -1,175 +0,0 @@ -.vscode/ - -_skbuild/ - -.envrc - -models/ - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so -*.dylib -*.metal -*.dll -*.lib - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -.idea/ - -# downloaded model .bin files -docker/open_llama/*.bin diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index cdbef1424..000000000 --- a/.gitmodules +++ /dev/null @@ -1,3 +0,0 @@ -[submodule "ggllm.cpp"] - path = ggllm.cpp - url = https://github.com/cmp-nct/ggllm.cpp diff --git a/.readthedocs.yaml b/.readthedocs.yaml deleted file mode 100644 index ff3e950cd..000000000 --- a/.readthedocs.yaml +++ /dev/null @@ -1,24 +0,0 @@ -# Read the Docs configuration file for MkDocs projects -# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details - -# Required -version: 2 - -# Set the version of Python and other tools you might need -build: - os: ubuntu-22.04 - tools: - python: "3.11" - -mkdocs: - configuration: mkdocs.yml - -python: - install: - - method: pip - path: . - - requirements: docs/requirements.txt - -submodules: - include: all - recursive: true \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 0ff6cb84b..000000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,107 +0,0 @@ -# Changelog - -All notable changes to this project will be documented in this file. - -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). - -## [Unreleased] - -## [0.1.67] - -## Fixed - -- Fix performance bug in Llama model by pre-allocating memory tokens and logits. -- Fix bug in Llama model where the model was not free'd after use. - -## [0.1.66] - -## Added - -- (llama.cpp) New model API - -## Fixed - -- Performance issue during eval caused by looped np.concatenate call -- State pickling issue when saving cache to disk - -## [0.1.65] - -### Added - -- (llama.cpp) Fix struct misalignment bug - -## [0.1.64] - -### Added - -- (llama.cpp) Update llama.cpp -- Fix docs for seed. Set -1 for random. - -## [0.1.63] - -### Added - -- (llama.cpp) Add full gpu utilisation in CUDA -- (llama.cpp) Add get_vocab -- (llama.cpp) Add low_vram parameter -- (server) Add logit_bias parameter - -## [0.1.62] - -### Fixed - -- Metal support working -- Cache re-enabled - -## [0.1.61] - -### Fixed - -- Fix broken pip installation - -## [0.1.60] - -### NOTE - -- This release was deleted due to a bug with the packaging system that caused pip installations to fail. - -### Fixed - -- Truncate max_tokens in create_completion so requested tokens doesn't exceed context size. -- Temporarily disable cache for completion requests - -## [v0.1.59] - -### Added - -- (llama.cpp) k-quants support -- (server) mirostat sampling parameters to server - -### Fixed - -- Support both `.so` and `.dylib` for `libllama` on MacOS - -## [v0.1.58] - -### Added - -- (llama.cpp) Metal Silicon support - -## [v0.1.57] - -### Added - -- (llama.cpp) OpenLlama 3B support - -## [v0.1.56] - -### Added - -- (misc) Added first version of the changelog -- (server) Use async routes -- (python-api) Use numpy for internal buffers to reduce memory usage and improve performance. - -### Fixed - -- (python-api) Performance bug in stop sequence check slowing down streaming. \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt deleted file mode 100644 index 81ef24a0c..000000000 --- a/CMakeLists.txt +++ /dev/null @@ -1,34 +0,0 @@ -cmake_minimum_required(VERSION 3.4...3.22) - -project(falcon_cpp) - -option(FORCE_CMAKE "Force CMake build of Python bindings" OFF) - -set(FORCE_CMAKE $ENV{FORCE_CMAKE}) - -if (UNIX AND NOT FORCE_CMAKE) - add_custom_command( - OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libggllm.so - COMMAND make libllama.so - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp - ) - add_custom_target( - run ALL - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libggllm.so - ) - install( - FILES ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libggllm.so - DESTINATION llama_cpp - ) -else() - set(BUILD_SHARED_LIBS "On") - add_subdirectory(vendor/ggllm.cpp) - install( - TARGETS ggllm - LIBRARY DESTINATION falcon_cpp - RUNTIME DESTINATION falcon_cpp - ARCHIVE DESTINATION falcon_cpp - FRAMEWORK DESTINATION falcon_cpp - RESOURCE DESTINATION falcon_cpp - ) -endif() diff --git a/LICENSE.md b/LICENSE.md deleted file mode 100644 index 3a1d7180d..000000000 --- a/LICENSE.md +++ /dev/null @@ -1,9 +0,0 @@ -MIT License - -Copyright (c) 2023 Andrei Betlen - -Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/Makefile b/Makefile deleted file mode 100644 index 653395d6e..000000000 --- a/Makefile +++ /dev/null @@ -1,56 +0,0 @@ -update: - poetry install - git submodule update --init --recursive - -update.vendor: - cd vendor/ggllm.cpp && git pull origin master - -build: - python3 setup.py develop - -build.cuda: - CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 setup.py develop - -build.opencl: - CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 setup.py develop - -build.openblas: - CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 setup.py develop - -build.blis: - CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop - -build.metal: - CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 python3 setup.py develop - -build.sdist: - python3 setup.py sdist - -deploy.pypi: - python3 -m twine upload dist/* - -deploy.gh-docs: - mkdocs build - mkdocs gh-deploy - -clean: - - cd vendor/ggllm.cpp && make clean - - cd vendor/ggllm.cpp && rm ggllm.so - - rm -rf _skbuild - - rm falcon_cpp/*.so - - rm falcon_cpp/*.dylib - - rm falcon_cpp/*.metal - - rm falcon_cpp/*.dll - - rm falcon_cpp/*.lib - -.PHONY: \ - update \ - update.vendor \ - build \ - build.cuda \ - build.opencl \ - build.openblas \ - build.sdist \ - deploy.pypi \ - deploy.gh-docs \ - clean \ No newline at end of file diff --git a/README.md b/README.md deleted file mode 100644 index fb652a925..000000000 --- a/README.md +++ /dev/null @@ -1,193 +0,0 @@ -# 🦙 Python Bindings for `llama.cpp` - -[![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest) -[![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml) -[![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) - -Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library. -This package provides: - -- Low-level access to C API via `ctypes` interface. -- High-level Python API for text completion - - OpenAI-like API - - LangChain compatibility - -Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest). - - -## Installation from PyPI (recommended) - -Install from PyPI (requires a c compiler): - -```bash -pip install llama-cpp-python -``` - -The above command will attempt to install the package and build `llama.cpp` from source. -This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. - -If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly: - -```bash -pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir -``` - -Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example: -``` -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh -bash Miniforge3-MacOSX-arm64.sh -``` -Otherwise, while installing it will build the llama.ccp x86 version which will be 10x slower on Apple Silicon (M1) Mac. - -### Installation with OpenBLAS / cuBLAS / CLBlast / Metal - -`llama.cpp` supports multiple BLAS backends for faster processing. -Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend. - -To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python -``` - -To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python -``` - -To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python -``` - -To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python -``` - -Detailed MacOS Metal GPU install documentation is available at [docs/install/macos.md](docs/install/macos.md) - -## High-level API - -The high-level API provides a simple managed interface through the `Llama` class. - -Below is a short example demonstrating how to use the high-level API to generate text: - -```python ->>> from llama_cpp import Llama ->>> llm = Llama(model_path="./models/7B/ggml-model.bin") ->>> output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True) ->>> print(output) -{ - "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", - "object": "text_completion", - "created": 1679561337, - "model": "./models/7B/ggml-model.bin", - "choices": [ - { - "text": "Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.", - "index": 0, - "logprobs": None, - "finish_reason": "stop" - } - ], - "usage": { - "prompt_tokens": 14, - "completion_tokens": 28, - "total_tokens": 42 - } -} -``` - -## Web Server - -`llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. -This allows you to use llama.cpp compatible models with any OpenAI compatible client (language libraries, services, etc). - -To install the server package and get started: - -```bash -pip install llama-cpp-python[server] -python3 -m llama_cpp.server --model models/7B/ggml-model.bin -``` - -Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. - -## Docker image - -A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server: - -```bash -docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest -``` - -## Low-level API - -The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`. -The entire lowe-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). - -Below is a short example demonstrating how to use the low-level API to tokenize a prompt: - -```python ->>> import llama_cpp ->>> import ctypes ->>> params = llama_cpp.llama_context_default_params() -# use bytes for char * params ->>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params) ->>> max_tokens = params.n_ctx -# use ctypes arrays for array params ->>> tokens = (llama_cpp.llama_token * int(max_tokens))() ->>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True)) ->>> llama_cpp.llama_free(ctx) -``` - -Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API. - - -# Documentation - -Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). -If you find any issues with the documentation, please open an issue or submit a PR. - -# Development - -This package is under active development and I welcome any contributions. - -To get started, clone the repository and install the package in development mode: - -```bash -git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git -cd llama-cpp-python - -# Install with pip -pip install -e . - -# if you want to use the fastapi / openapi server -pip install -e .[server] - -# If you're a poetry user, installing will also include a virtual environment -poetry install --all-extras -. .venv/bin/activate - -# Will need to be re-run any time vendor/llama.cpp is updated -python3 setup.py develop -``` - -# How does this compare to other Python bindings of `llama.cpp`? - -I originally wrote this package for my own use with two goals in mind: - -- Provide a simple process to install `llama.cpp` and access the full C API in `llama.h` from Python -- Provide a high-level Python API that can be used as a drop-in replacement for the OpenAI API so existing apps can be easily ported to use `llama.cpp` - -Any contributions and changes to this package will be made with these goals in mind. - -# License - -This project is licensed under the terms of the MIT license. diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py deleted file mode 100644 index 4b3189dd1..000000000 --- a/examples/high_level_api/fastapi_server.py +++ /dev/null @@ -1,37 +0,0 @@ -"""Example FastAPI server for llama.cpp. - -To run this example: - -```bash -pip install fastapi uvicorn sse-starlette -export MODEL=../models/7B/... -``` - -Then run: -``` -uvicorn llama_cpp.server.app:app --reload -``` - -or - -``` -python3 -m llama_cpp.server -``` - -Then visit http://localhost:8000/docs to see the interactive API docs. - - -To actually see the implementation of the server, see llama_cpp/server/app.py - -""" -import os -import uvicorn - -from llama_cpp.server.app import create_app - -if __name__ == "__main__": - app = create_app() - - uvicorn.run( - app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) - ) diff --git a/examples/high_level_api/high_level_api_embedding.py b/examples/high_level_api/high_level_api_embedding.py deleted file mode 100644 index feb0ed68d..000000000 --- a/examples/high_level_api/high_level_api_embedding.py +++ /dev/null @@ -1,11 +0,0 @@ -import argparse - -from llama_cpp import Llama - -parser = argparse.ArgumentParser() -parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-model.bin") -args = parser.parse_args() - -llm = Llama(model_path=args.model, embedding=True) - -print(llm.create_embedding("Hello world!")) diff --git a/examples/high_level_api/high_level_api_inference.py b/examples/high_level_api/high_level_api_inference.py deleted file mode 100644 index e41f37577..000000000 --- a/examples/high_level_api/high_level_api_inference.py +++ /dev/null @@ -1,19 +0,0 @@ -import json -import argparse - -from llama_cpp import Llama - -parser = argparse.ArgumentParser() -parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") -args = parser.parse_args() - -llm = Llama(model_path=args.model) - -output = llm( - "Question: What are the names of the planets in the solar system? Answer: ", - max_tokens=48, - stop=["Q:", "\n"], - echo=True, -) - -print(json.dumps(output, indent=2)) diff --git a/examples/high_level_api/high_level_api_streaming.py b/examples/high_level_api/high_level_api_streaming.py deleted file mode 100644 index 747c6130e..000000000 --- a/examples/high_level_api/high_level_api_streaming.py +++ /dev/null @@ -1,20 +0,0 @@ -import json -import argparse - -from llama_cpp import Llama - -parser = argparse.ArgumentParser() -parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") -args = parser.parse_args() - -llm = Llama(model_path=args.model) - -stream = llm( - "Question: What are the names of the planets in the solar system? Answer: ", - max_tokens=48, - stop=["Q:", "\n"], - stream=True, -) - -for output in stream: - print(json.dumps(output, indent=2)) diff --git a/examples/high_level_api/langchain_custom_llm.py b/examples/high_level_api/langchain_custom_llm.py deleted file mode 100644 index b91632f5b..000000000 --- a/examples/high_level_api/langchain_custom_llm.py +++ /dev/null @@ -1,55 +0,0 @@ -import argparse - -from llama_cpp import Llama - -from langchain.llms.base import LLM -from typing import Optional, List, Mapping, Any - - -class LlamaLLM(LLM): - model_path: str - llm: Llama - - @property - def _llm_type(self) -> str: - return "llama-cpp-python" - - def __init__(self, model_path: str, **kwargs: Any): - model_path = model_path - llm = Llama(model_path=model_path) - super().__init__(model_path=model_path, llm=llm, **kwargs) - - def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str: - response = self.llm(prompt, stop=stop or []) - return response["choices"][0]["text"] - - @property - def _identifying_params(self) -> Mapping[str, Any]: - return {"model_path": self.model_path} - - -parser = argparse.ArgumentParser() -parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") -args = parser.parse_args() - -# Load the model -llm = LlamaLLM(model_path=args.model) - -# Basic Q&A -answer = llm( - "Question: What is the capital of France? Answer: ", stop=["Question:", "\n"] -) -print(f"Answer: {answer.strip()}") - -# Using in a chain -from langchain.prompts import PromptTemplate -from langchain.chains import LLMChain - -prompt = PromptTemplate( - input_variables=["product"], - template="\n\n### Instruction:\nWrite a good name for a company that makes {product}\n\n### Response:\n", -) -chain = LLMChain(llm=llm, prompt=prompt) - -# Run the chain only specifying the input variable. -print(chain.run("colorful socks")) diff --git a/examples/low_level_api/Chat.py b/examples/low_level_api/Chat.py deleted file mode 100644 index fcef8cd80..000000000 --- a/examples/low_level_api/Chat.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/python -import sys, os, datetime -from common import GptParams -from low_level_api_chat_cpp import LLaMAInteract - -def env_or_def(env, default): - if (env in os.environ): - return os.environ[env] - return default - -AI_NAME = env_or_def("AI_NAME", "ChatLLaMa") -MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") -USER_NAME = env_or_def("USER_NAME", "USER") -N_PREDICTS = int(env_or_def("N_PREDICTS", "2048")) -N_THREAD = int(env_or_def("N_THREAD", "8")) - -today = datetime.datetime.today() -DATE_YEAR=today.strftime("%Y") -DATE_TIME=today.strftime("%H:%M") - -prompt=f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. -{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision. -There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other. -The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. -The transcript only includes text, it does not include markup like HTML and Markdown. - -{USER_NAME}: Hello, {AI_NAME}! -{AI_NAME}: Hello {USER_NAME}! How may I help you today? -{USER_NAME}: What year is it? -{AI_NAME}: We are in {DATE_YEAR}. -{USER_NAME}: Please tell me the largest city in Europe. -{AI_NAME}: The largest city in Europe is Moscow, the capital of Russia. -{USER_NAME}: What can you tell me about Moscow? -{AI_NAME}: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia’s symbolic center. -{USER_NAME}: What is a cat? -{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. -{USER_NAME}: How do I pass command line arguments to a Node.js program? -{AI_NAME}: The arguments are stored in process.argv. - - argv[0] is the path to the Node. js executable. - argv[1] is the path to the script file. - argv[2] is the first argument passed to the script. - argv[3] is the second argument passed to the script and so on. -{USER_NAME}: Name a color. -{AI_NAME}: Blue. -{USER_NAME}: What time is it? -{AI_NAME}: It is {DATE_TIME}. -{USER_NAME}:""" + " ".join(sys.argv[1:]) - -print("Loading model...") -params = GptParams( - n_ctx=2048, - temp=0.7, - top_k=40, - top_p=0.5, - repeat_last_n=256, - n_batch=1024, - repeat_penalty=1.17647, - model=MODEL, - n_threads=N_THREAD, - n_predict=N_PREDICTS, - use_color=True, - interactive=True, - antiprompt=[f"{USER_NAME}:"], - input_prefix=" ", - input_suffix=f"{AI_NAME}:", - prompt=prompt, -) - -with LLaMAInteract(params) as m: - m.interact() diff --git a/examples/low_level_api/Miku.py b/examples/low_level_api/Miku.py deleted file mode 100644 index eb9a2cfa9..000000000 --- a/examples/low_level_api/Miku.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/python -import sys, os -from common import GptParams -from low_level_api_chat_cpp import LLaMAInteract - -def env_or_def(env, default): - if (env in os.environ): - return os.environ[env] - return default - -AI_NAME = env_or_def("AI_NAME", "Miku") -MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") -USER_NAME = env_or_def("USER_NAME", "Anon") -N_PREDICTS = int(env_or_def("N_PREDICTS", "4096")) -N_THREAD = int(env_or_def("N_THREAD", "0")) - -prompt=f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer. -{AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. -{AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help. -{AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad. -{AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her. -The conversation is only between {USER_NAME} and {AI_NAME} -The conversation is only through text, so {AI_NAME} can't see {USER_NAME}'s face or hear his voice. -{AI_NAME} can only communicate through text, so she can't send images or videos. - - -{USER_NAME}: Hello! -{AI_NAME}: /think I wonder what I should say to {USER_NAME}? This is the first time we talk so it's important that I make a good first impression! -{AI_NAME}: Hi! I am {AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^ -{AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :) -{USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant! -{AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off! -{AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that! -{AI_NAME}: What do you like to do in your free time? ^_^ -{USER_NAME}:""" + " ".join(sys.argv[1:]) - -print("Loading model...") -params = GptParams( - n_batch=1024, - n_ctx=2048, - n_keep=-1, - repeat_last_n=256, - repeat_penalty=1.17647, - temp=0.7, - top_k=40, - top_p=0.5, - model=MODEL, - n_predict=N_PREDICTS, - use_color=True, - interactive=True, - antiprompt=[f"{USER_NAME}:"], - prompt=prompt, -) - -if N_THREAD > 0: - params.n_threads = N_THREAD - -with LLaMAInteract(params) as m: - m.interact() diff --git a/examples/low_level_api/ReasonAct.py b/examples/low_level_api/ReasonAct.py deleted file mode 100644 index 82e5c4487..000000000 --- a/examples/low_level_api/ReasonAct.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/python -import sys, os, datetime -from common import GptParams -from low_level_api_chat_cpp import LLaMAInteract - -def env_or_def(env, default): - if (env in os.environ): - return os.environ[env] - return default - -MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") - -prompt=f"""You run in a loop of Thought, Action, Observation. -At the end of the loop either Answer or restate your Thought and Action. -Use Thought to describe your thoughts about the question you have been asked. -Use Action to run one of these actions available to you: -- calculate[python math expression] -Observation will be the result of running those actions - - -Question: What is 4 * 7 / 3? -Thought: Do I need to use an action? Yes, I use calculate to do math -Action: calculate[4 * 7 / 3] -Observation: 9.3333333333 -Thought: Do I need to use an action? No, have the result -Answer: The calculate tool says it is 9.3333333333 -Question: What is capital of france? -Thought: Do I need to use an action? No, I know the answer -Answer: Paris is the capital of France -Question:""" + " ".join(sys.argv[1:]) - -print("Loading model...") -params = GptParams( - interactive=True, - interactive_start=True, - top_k=10000, - temp=0.2, - repeat_penalty=1, - n_threads=7, - n_ctx=2048, - antiprompt=["Question:","Observation:"], - model=MODEL, - input_prefix=" ", - n_predict=-1, - prompt=prompt, -) - -with LLaMAInteract(params) as m: - m.interact() diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py deleted file mode 100644 index 55d08db5f..000000000 --- a/examples/low_level_api/common.py +++ /dev/null @@ -1,202 +0,0 @@ -import os -import argparse -import re - -from dataclasses import dataclass, field -from typing import List - -# Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp - - -@dataclass -class GptParams: - seed: int = -1 - n_threads: int = min(4, os.cpu_count() or 1) - n_predict: int = 128 - n_parts: int = -1 - n_ctx: int = 512 - n_batch: int = 8 - n_keep: int = 0 - - ignore_eos: bool = False - logit_bias: dict[int, float] = field(default_factory=dict) - top_k: int = 40 - top_p: float = 0.95 - tfs_z: float = 1.00 - typical_p: float = 1.00 - temp: float = 0.80 - repeat_penalty: float = 1.10 - repeat_last_n: int = 64 - frequency_penalty: float = 0.0 - presence_penalty: float = 0.0 - mirostat: int = 0 - mirostat_tau: float = 5.0 - mirostat_eta: float = 0.1 - - model: str = "./models/llama-7B/ggml-model.bin" - prompt: str = "" - path_session: str = "" - input_prefix: str = " " - input_suffix: str = "" - antiprompt: List[str] = field(default_factory=list) - - lora_adapter: str = "" - lora_base: str = "" - - memory_f16: bool = True - random_prompt: bool = False - use_color: bool = False - interactive: bool = False - - embedding: bool = False - interactive_start: bool = False - - instruct: bool = False - penalize_nl: bool = True - perplexity: bool = False - use_mmap: bool = True - use_mlock: bool = False - mem_test: bool = False - verbose_prompt: bool = False - - file: str = None - - # If chat ended prematurely, append this to the conversation to fix it. - # Set to "\nUser:" etc. - # This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:"" - fix_prefix: str = "" - input_echo: bool = True, - - # Default instructions for Alpaca - # switch to "Human" and "Assistant" for Vicuna. - # TODO: TBD how they are gonna handle this upstream - instruct_inp_prefix: str="\n\n### Instruction:\n\n" - instruct_inp_suffix: str="\n\n### Response:\n\n" - - -def gpt_params_parse(argv = None): - parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed") - parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads") - parser.add_argument("-n", "--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict") - parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") - parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx") - parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch") - parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep") - - parser.add_argument( - "-l", - "--logit-bias", - type=str, - action='append', - help="--logit-bias TOKEN_ID(+/-)BIAS", - dest="logit_bias_str" - ) - parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") - parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") - parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") - parser.add_argument("--tfs", type=float, default=1.0, help="tail free sampling, parameter z (1.0 = disabled)",dest="tfs_z") - parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") - parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") - parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") - parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)",dest="tfs_z") - parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)",dest="presence_penalty") - parser.add_argument("--mirostat", type=float, default=1.0, help="use Mirostat sampling.",dest="mirostat") - parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau represents the average surprise value",dest="mirostat_tau") - parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta") - - parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model") - parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") - parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") - parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session") - parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") - parser.add_argument("--in-suffix", type=str, default="", help="append to input", dest="input_suffix") - parser.add_argument( - "-r", - "--reverse-prompt", - type=str, - action='append', - help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", - dest="antiprompt" - ) - - parser.add_argument("--lora", type=str, default="", help="apply LoRA adapter (implies --no-mmap)", dest="lora_adapter") - parser.add_argument("--lora-base", type=str, default="", help="optional model to use as a base for the layers modified by the LoRA adapter", dest="lora_base") - - parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16") - parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt") - parser.add_argument( - "--color", - action="store_true", - help="colorise output to distinguish prompt and user input from generations", - dest="use_color" - ) - parser.add_argument( - "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" - ) - - parser.add_argument("--embedding", action="store_true", help="", dest="embedding") - parser.add_argument( - "--interactive-first", - action="store_true", - help="run in interactive mode and wait for input right away", - dest="interactive_start" - ) - - parser.add_argument( - "-ins", - "--instruct", - action="store_true", - help="run in instruction mode (use with Alpaca or Vicuna models)", - dest="instruct" - ) - parser.add_argument("--no-penalize-nl", action="store_false", help="do not penalize newline token", dest="penalize_nl") - parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") - parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap") - parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") - parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") - parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt") - - #Custom args - parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") - parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") - - parser.add_argument( - "--interactive-start", - action="store_true", - help="run in interactive mode", - dest="interactive" - ) - - args = parser.parse_args(argv) - - logit_bias_str = args.logit_bias_str - delattr(args, "logit_bias_str") - params = GptParams(**vars(args)) - - if (params.lora_adapter): - params.use_mmap = False - - if (logit_bias_str != None): - for i in logit_bias_str: - if (m := re.match(r"(\d+)([-+]\d+)", i)): - params.logit_bias[int(m.group(1))] = float(m.group(2)) - - return params - -def gpt_random_prompt(rng): - return [ - "So", - "Once upon a time", - "When", - "The", - "After", - "If", - "import", - "He", - "She", - "They", - ][rng % 10] - -if __name__ == "__main__": - print(gpt_params_parse()) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py deleted file mode 100644 index f5d51a36e..000000000 --- a/examples/low_level_api/low_level_api_chat_cpp.py +++ /dev/null @@ -1,568 +0,0 @@ -""" -This is an example implementation of main.cpp from llama.cpp -Quirks: - * Its not exactly alike since this port is designed around programmatic I/O - * Input is always echoed if on, so it should be turned off when using "input()" - * The first antiprompt should be the userprompt like "\nUser:", - because its added when n_predict is reached (aka generation ended prematurely) - * n_predict can be set to -1 for unlimited length responses (or just a really high value) - * Instruction mode adds its own antiprompt. - You should also still be feeding the model with a "primer" prompt that - shows it the expected format. -""" -import ctypes -import sys -from time import time -from os import cpu_count, path - -import llama_cpp -from common import GptParams, gpt_params_parse, gpt_random_prompt -import util - -# A LLaMA interactive session -class LLaMAInteract: - def __init__(self, params: GptParams) -> None: - # input args - self.params = params - - if (self.params.perplexity): - raise NotImplementedError("""************ -please use the 'perplexity' tool for perplexity calculations -************""") - - if (self.params.embedding): - raise NotImplementedError("""************ -please use the 'embedding' tool for embedding calculations -************""") - - if (self.params.n_ctx > 2048): - print(f"""warning: model does not support \ -context sizes greater than 2048 tokens ({self.params.n_ctx} \ -specified) expect poor results""", file=sys.stderr) - - if (self.params.seed <= 0): - self.params.seed = int(time()) - - print(f"seed = {self.params.seed}", file=sys.stderr) - - if (self.params.random_prompt): - self.params.prompt = gpt_random_prompt(self.params.seed) - - # runtime args - self.input_consumed = 0 - self.n_past = 0 - self.n_session_consumed = 0 - self.first_antiprompt = [] - self.remaining_tokens = self.params.n_predict - self.output_echo = self.params.input_echo - self.multibyte_fix = [] - - # model load - self.lparams = llama_cpp.llama_context_default_params() - self.lparams.n_ctx = self.params.n_ctx - self.lparams.n_parts = self.params.n_parts - self.lparams.seed = self.params.seed - self.lparams.memory_f16 = self.params.memory_f16 - self.lparams.use_mlock = self.params.use_mlock - self.lparams.use_mmap = self.params.use_mmap - - self.ctx = llama_cpp.llama_init_from_file(self.params.model.encode("utf8"), self.lparams) - if (not self.ctx): - raise RuntimeError(f"error: failed to load model '{self.params.model}'") - - if (self.params.ignore_eos): - self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf") - - if (len(self.params.lora_adapter) > 0): - if (llama_cpp.llama_apply_lora_from_file( - self.ctx, - self.params.lora_adapter.encode("utf8"), - self.params.lora_base.encode("utf8") if len(self.params.lora_base) > 0 else None, - self.params.n_threads - ) != 0): - print("error: failed to apply lora adapter") - return - - print(file=sys.stderr) - print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ -| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) - - # determine the required inference memory per token: - if (self.params.mem_test): - tmp = [0, 1, 2, 3] - llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads) - llama_cpp.llama_print_timings(self.ctx) - self.exit() - return - - # create internal context - self.n_ctx = llama_cpp.llama_n_ctx(self.ctx) - - # Add a space in front of the first character to match OG llama tokenizer behavior - self.params.prompt = " " + self.params.prompt - - # Load prompt file - if (self.params.file): - with open(self.params.file) as f: - self.params.prompt = f.read() - - self.session_tokens: list[llama_cpp.llama_token] = [] - if (len(self.params.path_session) > 0): - print(f"attempting to load saved session from '{self.params.path_session}'", file=sys.stderr) - - if (path.exists(self.params.path_session)): - _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))() - _n_token_count_out = llama_cpp.c_size_t() - if (llama_cpp.llama_load_session_file( - self.ctx, - self.params.path_session.encode("utf8"), - _session_tokens, - self.params.n_ctx, - ctypes.byref(_n_token_count_out) - ) != 1): - print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr) - return - _n_token_count_out = _n_token_count_out.value - self.session_tokens = _session_tokens[:_n_token_count_out] - print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr) - else: - print(f"session file does not exist, will create", file=sys.stderr) - - # tokenize the prompt - self.embd = [] - self.embd_inp = self._tokenize(self.params.prompt) - - if (len(self.embd_inp) > self.n_ctx - 4): - raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})") - - # debug message about similarity of saved session, if applicable - self.n_matching_session_tokens = 0 - if len(self.session_tokens) > 0: - for id in self.session_tokens: - if self.n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[self.n_matching_session_tokens]: - break - self.n_matching_session_tokens += 1 - - if self.n_matching_session_tokens >= len(self.embd_inp): - print(f"session file has exact match for prompt!") - elif self.n_matching_session_tokens < (len(self.embd_inp) / 2): - print(f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") - else: - print(f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") - - self.need_to_save_session = len(self.params.path_session) > 0 and self.n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) - - # number of tokens to keep when resetting context - if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct): - self.params.n_keep = len(self.embd_inp) - - self.inp_prefix = self._tokenize(self.params.instruct_inp_prefix) - self.inp_suffix = self._tokenize(self.params.instruct_inp_suffix, False) - - # in instruct mode, we inject a prefix and a suffix to each input by the user - self.antiecho = None - if (self.params.instruct): - self.params.interactive_start = True - _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False) - self.first_antiprompt.append(_ptn) - self.antiecho = util.IterSearch(_ptn) - - # enable interactive mode if reverse prompt or interactive start is specified - if (len(self.params.antiprompt) != 0 or self.params.interactive_start): - self.params.interactive = True - - # determine newline token - self.llama_token_newline = self._tokenize("\n", False) - self.llama_token_eot = self._tokenize(" [end of text]\n", False) - - if (self.params.verbose_prompt): - print(f""" -prompt: '{self.params.prompt}' -number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr) - - for i in range(len(self.embd_inp)): - print(f"{self.embd_inp[i]} -> '{llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i])}'", file=sys.stderr) - - if (self.params.n_keep > 0): - print("static prompt based on n_keep: '") - for i in range(self.params.n_keep): - print(llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i]), file=sys.stderr) - print("'", file=sys.stderr) - print(file=sys.stderr) - - if (self.params.interactive): - print("interactive mode on.", file=sys.stderr) - - if (len(self.params.antiprompt) > 0): - for antiprompt in self.params.antiprompt: - print(f"Reverse prompt: '{antiprompt}'", file=sys.stderr) - - if len(self.params.input_prefix) > 0: - print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr) - - print(f"""sampling: repeat_last_n = {self.params.repeat_last_n},\ -repeat_penalty = {self.params.repeat_penalty},\ -presence_penalty = {self.params.presence_penalty},\ -frequency_penalty = {self.params.frequency_penalty},\ -top_k = {self.params.top_k},\ -tfs_z = {self.params.tfs_z},\ -top_p = {self.params.top_p},\ -typical_p = {self.params.typical_p},\ -temp = {self.params.temp},\ -mirostat = {self.params.mirostat},\ -mirostat_lr = {self.params.mirostat_eta},\ -mirostat_ent = {self.params.mirostat_tau},\ - -generate: n_ctx = {self.n_ctx},\ -n_batch = {self.params.n_batch},\ -n_predict = {self.params.n_predict},\ -n_keep = {self.params.n_keep} - -""", file=sys.stderr) - - # determine antiprompt tokens - for i in self.params.antiprompt: - self.first_antiprompt.append(self._tokenize(i, False)) - - self.last_n_tokens = [0]*self.n_ctx #TODO: deque doesnt support slices - - if (params.interactive): - print("""== Running in interactive mode. == - - Press Ctrl+C to interject at any time. - - Press Return to return control to LLaMa. - - If you want to submit another line, end your input in '\\'. - -""", file=sys.stderr) - self.set_color(util.CONSOLE_COLOR_PROMPT) - - # tokenize a prompt - def _tokenize(self, prompt, bos=True): - _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))() - _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos) - return _arr[:_n] - - def set_color(self, c): - if (self.params.use_color): - print(c, end="") - - def use_antiprompt(self): - return len(self.first_antiprompt) > 0 - - # generate tokens - def generate(self): - while self.remaining_tokens > 0 or self.params.interactive or self.params.n_predict == -1: - # predict - if len(self.embd) > 0: - # infinite text generation via context swapping - # if we run out of context: - # - take the n_keep first tokens from the original prompt (via n_past) - # - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch - if (self.n_past + len(self.embd) > self.n_ctx): - n_left = self.n_past - self.params.n_keep - self.n_past = self.params.n_keep - - # insert n_left/2 tokens at the start of embd from last_n_tokens - _insert = self.last_n_tokens[ - self.n_ctx - int(n_left/2) - len(self.embd):-len(self.embd) - ] - self.embd = _insert + self.embd - self.params.path_session = "" - - # try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) - if self.n_session_consumed < len(self.session_tokens): - for i in range(len(self.embd)): - if self.embd[i] != self.session_tokens[self.n_session_consumed]: - self.session_tokens = self.session_tokens[:self.n_session_consumed] - break - - self.n_past += 1 - self.n_session_consumed += 1 - - if self.n_session_consumed >= len(self.session_tokens): - i += 1 - break - - if i > 0: - self.embd = self.embd[i:] - - # evaluate tokens in batches - # embd is typically prepared beforehand to fit within a batch, but not always - #TODO BUG: The batching code causes nonsensical generation - """for i in range(0, len(self.embd), self.params.n_batch): - n_eval = self.params.n_batch - _arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval]) - if llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0: - print(f"failed to eval") - return - - self.n_past += n_eval""" - - if (llama_cpp.llama_eval( - self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads - ) != 0): - raise Exception("Failed to llama_eval!") - - if len(self.embd) > 0 and len(self.params.path_session) > 0: - self.session_tokens.extend(self.embd) - self.n_session_consumed = len(self.session_tokens) - - self.n_past += len(self.embd) - self.embd = [] - if len(self.embd_inp) <= self.input_consumed: #&& !is_interacting - # out of user input, sample next token - top_k = llama_cpp.llama_n_vocab(self.ctx) if self.params.top_k <= 0 else self.params.top_k - repeat_last_n = self.n_ctx if self.params.repeat_last_n < 0 else self.params.repeat_last_n - - # optionally save the session on first sample (for faster prompt loading next time) - if len(self.params.path_session) > 0 and self.need_to_save_session: - self.need_to_save_session = False - llama_cpp.llama_save_session_file( - self.ctx, - self.params.path_session.encode("utf8"), - (llama_cpp.llama_token * len(self.session_tokens))(*self.session_tokens), - len(self.session_tokens) - ) - - id = 0 - - logits = llama_cpp.llama_get_logits(self.ctx) - n_vocab = llama_cpp.llama_n_vocab(self.ctx) - - # Apply params.logit_bias map - for key, value in self.params.logit_bias.items(): - logits[key] += value - - _arr = (llama_cpp.llama_token_data * n_vocab)(*[ - llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) - for token_id in range(n_vocab) - ]) - candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False)) - - # Apply penalties - nl_logit = logits[llama_cpp.llama_token_nl()] - last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx) - - _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:]) - llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p, - _arr, - last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty)) - llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p, - _arr, - last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty)) - - if not self.params.penalize_nl: - logits[llama_cpp.llama_token_nl()] = nl_logit - - if self.params.temp <= 0: - # Greedy sampling - id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p) - else: - if self.params.mirostat == 1: - mirostat_mu = 2.0 * self.params.mirostat_tau - mirostat_m = 100 - llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) - id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_int(mirostat_m), llama_cpp.c_float(mirostat_mu)) - elif self.params.mirostat == 2: - mirostat_mu = 2.0 * self.params.mirostat_tau - llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) - id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu)) - else: - # Temperature sampling - llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1)) - llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z), min_keep=llama_cpp.c_size_t(1)) - llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p), min_keep=llama_cpp.c_size_t(1)) - llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p), min_keep=llama_cpp.c_size_t(1)) - llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) - id = llama_cpp.llama_sample_token(self.ctx, candidates_p) - # print("`{}`".format(candidates_p.size)) - - self.last_n_tokens.pop(0) - self.last_n_tokens.append(id) - - # replace end of text token with newline token when in interactive mode - if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct): - id = self.llama_token_newline[0] - self.embd.append(id) - if (self.use_antiprompt()): - # tokenize and inject first reverse prompt - self.embd_inp += self.first_antiprompt[0] - for id in self.first_antiprompt[0]: - self.embd.append(id) - else: - # add it to the context - self.embd.append(id) - - # echo this to console - self.output_echo = True - - # decrement remaining sampling budget - self.remaining_tokens -= 1 - else: - # output to console if input echo is on - self.output_echo = self.params.input_echo - - # some user input remains from prompt or interaction, forward it to processing - while len(self.embd_inp) > self.input_consumed: - self.embd.append(self.embd_inp[self.input_consumed]) - self.last_n_tokens.pop(0) - self.last_n_tokens.append(self.embd_inp[self.input_consumed]) - self.input_consumed += 1 - if len(self.embd) >= self.params.n_batch: - break - - # display tokens - if self.output_echo: - for id in self.embd: - if self.antiecho != None: - for r in self.antiecho(id): - yield r - else: - yield id - - # reset color to default if we there is no pending user input - if (self.params.input_echo and len(self.embd_inp) == self.input_consumed): - self.set_color(util.CONSOLE_COLOR_DEFAULT) - - if (self.params.interactive and len(self.embd_inp) <= self.input_consumed): - # if antiprompt is present, stop - if (self.use_antiprompt()): - if True in [ - i == self.last_n_tokens[-len(i):] - for i in self.first_antiprompt - ]: - break - - # if we are using instruction mode, and we have processed the initial prompt - if (self.params.interactive_start): - break - - # end of text token - if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(): - if (not self.params.instruct): - for i in self.llama_token_eot: - yield i - break - - # respect n_predict even if antiprompt is present - if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1): - # If we arent in instruction mode, fix the current generation by appending the antiprompt. - # Makes it so if chat ends prematurely you dont append the AI's text etc. - if not self.params.instruct: - self.embd_inp += self.first_antiprompt[0] - self.n_remain = self.params.n_predict - break - - self.params.interactive_start = False - - def __enter__(self): - return self - - def __exit__(self, type, value, tb): - self.exit() - - def exit(self): - llama_cpp.llama_free(self.ctx) - self.set_color(util.CONSOLE_COLOR_DEFAULT) - - # return past text - def past(self): - for id in self.last_n_tokens[-self.n_past:]: - yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf8", errors="ignore") - - # write input - def input(self, prompt: str): - if (self.params.instruct and self.last_n_tokens[-len(self.inp_prefix):] != self.inp_prefix): - self.embd_inp += self.inp_prefix - self.embd_inp += self._tokenize(prompt) - if (self.params.instruct): - self.embd_inp += self.inp_suffix - - # write output - def output(self): - self.remaining_tokens = self.params.n_predict - for id in self.generate(): - cur_char = llama_cpp.llama_token_to_str(self.ctx, id) - - # Add remainder of missing bytes - if None in self.multibyte_fix: - self.multibyte_fix[self.multibyte_fix.index(None)] = cur_char - - # Return completed utf char - if len(self.multibyte_fix) > 0 and not None in self.multibyte_fix: - yield (b"".join(self.multibyte_fix)).decode("utf8") - self.multibyte_fix = [] - continue - - # Contains multi-byte UTF8 - for num, pattern in [(2, 192), (3, 224), (4, 240)]: - # Bitwise AND check - if pattern & int.from_bytes(cur_char, 'little') == pattern: - self.multibyte_fix = [cur_char] + ([None] * (num-1)) - - # Stop incomplete bytes from passing - if len(self.multibyte_fix) > 0: - continue - - yield cur_char.decode("utf8") - - # read user input - def read_input(self): - out = "" - while (t := input()).endswith("\\"): - out += t[:-1] + "\n" - return out + t + "\n" - - # interactive mode - def interact(self): - for i in self.output(): - print(i,end="",flush=True) - self.params.input_echo = False - - while self.params.interactive: - self.set_color(util.CONSOLE_COLOR_USER_INPUT) - if (self.params.instruct): - print('\n> ', end="") - self.input(self.read_input()) - else: - print(self.params.input_prefix, end="") - self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}") - print(self.params.input_suffix,end="") - self.set_color(util.CONSOLE_COLOR_DEFAULT) - - try: - for i in self.output(): - print(i,end="",flush=True) - except KeyboardInterrupt: - self.set_color(util.CONSOLE_COLOR_DEFAULT) - if not self.params.instruct: - print(self.params.fix_prefix,end="") - self.input(self.params.fix_prefix) - -if __name__ == "__main__": - from datetime import datetime - - USER_NAME="User" - AI_NAME="ChatLLaMa" - - time_now = datetime.now() - prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. -{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}’s requests immediately and with details and precision. -There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other. -The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. -The transcript only includes text, it does not include markup like HTML and Markdown. - -{USER_NAME}: Hello, {AI_NAME}! -{AI_NAME}: Hello {USER_NAME}! How may I help you today? -{USER_NAME}: What time is it? -{AI_NAME}: It is {time_now.strftime("%H:%M")}. -{USER_NAME}: What year is it? -{AI_NAME}: We are in {time_now.strftime("%Y")}. -{USER_NAME}: What is a cat? -{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. -{USER_NAME}: Name a color. -{AI_NAME}: Blue -{USER_NAME}:""" - params = gpt_params_parse() - - with LLaMAInteract(params) as m: - m.interact() diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py deleted file mode 100644 index 9e38ec7cb..000000000 --- a/examples/low_level_api/low_level_api_llama_cpp.py +++ /dev/null @@ -1,102 +0,0 @@ -import llama_cpp - -import multiprocessing - -import llama_cpp - -N_THREADS = multiprocessing.cpu_count() - -prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n" - -lparams = llama_cpp.llama_context_default_params() -ctx = llama_cpp.llama_init_from_file(b"../models/7B/ggml-model.bin", lparams) - -# determine the required inference memory per token: -tmp = [0, 1, 2, 3] -llama_cpp.llama_eval(ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, N_THREADS) - -n_past = 0 - -prompt = b" " + prompt - -embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))() -n_of_tok = llama_cpp.llama_tokenize(ctx, prompt, embd_inp, len(embd_inp), True) -embd_inp = embd_inp[:n_of_tok] - -n_ctx = llama_cpp.llama_n_ctx(ctx) - -n_predict = 20 -n_predict = min(n_predict, n_ctx - len(embd_inp)) - -input_consumed = 0 -input_noecho = False - -remaining_tokens = n_predict - -embd = [] -last_n_size = 64 -last_n_tokens_data = [0] * last_n_size -n_batch = 24 -last_n_repeat = 64 -repeat_penalty = 1 -frequency_penalty = 0.0 -presence_penalty = 0.0 - -while remaining_tokens > 0: - if len(embd) > 0: - llama_cpp.llama_eval( - ctx, (llama_cpp.c_int * len(embd))(*embd), len(embd), n_past, N_THREADS - ) - - n_past += len(embd) - embd = [] - if len(embd_inp) <= input_consumed: - logits = llama_cpp.llama_get_logits(ctx) - n_vocab = llama_cpp.llama_n_vocab(ctx) - - _arr = (llama_cpp.llama_token_data * n_vocab)(*[ - llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) - for token_id in range(n_vocab) - ]) - candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False)) - - _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data) - llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p, - _arr, - last_n_repeat, repeat_penalty) - llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p, - _arr, - last_n_repeat, frequency_penalty, presence_penalty) - - llama_cpp.llama_sample_top_k(ctx, candidates_p, 40) - llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8) - llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2) - id = llama_cpp.llama_sample_token(ctx, candidates_p) - - last_n_tokens_data = last_n_tokens_data[1:] + [id] - embd.append(id) - input_noecho = False - remaining_tokens -= 1 - else: - while len(embd_inp) > input_consumed: - embd.append(embd_inp[input_consumed]) - last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]] - input_consumed += 1 - if len(embd) >= n_batch: - break - if not input_noecho: - for id in embd: - print( - llama_cpp.llama_token_to_str(ctx, id).decode("utf-8", errors="ignore"), - end="", - flush=True, - ) - - if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(): - break - -print() - -llama_cpp.llama_print_timings(ctx) - -llama_cpp.llama_free(ctx) diff --git a/examples/low_level_api/quantize.py b/examples/low_level_api/quantize.py deleted file mode 100644 index 8bd03f88a..000000000 --- a/examples/low_level_api/quantize.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -import argparse -import llama_cpp - - -def main(args): - if not os.path.exists(fname_inp): - raise RuntimeError(f"Input file does not exist ({fname_inp})") - if os.path.exists(fname_out): - raise RuntimeError(f"Output file already exists ({fname_out})") - fname_inp = args.fname_inp.encode("utf-8") - fname_out = args.fname_out.encode("utf-8") - itype = args.itype - return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, itype) - if return_code != 0: - raise RuntimeError("Failed to quantize model") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("fname_inp", type=str, help="Path to input model") - parser.add_argument("fname_out", type=str, help="Path to output model") - parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1)") - args = parser.parse_args() - main(args) diff --git a/examples/low_level_api/util.py b/examples/low_level_api/util.py deleted file mode 100644 index 9d0ec2f70..000000000 --- a/examples/low_level_api/util.py +++ /dev/null @@ -1,95 +0,0 @@ - -ANSI_COLOR_RESET = "\x1b[0m" -ANSI_COLOR_YELLOW = "\x1b[33m" -ANSI_BOLD = "\x1b[1m" -ANSI_COLOR_GREEN = "\x1b[32m" - -CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET -CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW -CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN - -# Iterative search -# Actively searches and prevents a pattern from being returned -class IterSearch: - def __init__(self, pattern): - self.pattern = list(pattern) - self.buffer = [] - - def __call__(self, char): - self.buffer += [char] - - if (self.pattern[:len(self.buffer)] == self.buffer): - if (len(self.buffer) >= len(self.pattern)): - self.buffer.clear() - return [] - - _tmp = self.buffer[:] - self.buffer.clear() - return _tmp - -class Circle: - def __init__(self, size, default=0): - self.list = [default] * size - self.maxsize = size - self.size = 0 - self.offset = 0 - - def append(self, elem): - if self.size < self.maxsize: - self.list[self.size] = elem - self.size += 1 - else: - self.list[self.offset] = elem - self.offset = (self.offset + 1) % self.maxsize - - def __getitem__(self, val): - if isinstance(val, int): - if 0 > val or val >= self.size: - raise IndexError('Index out of range') - return self.list[val] if self.size < self.maxsize else self.list[(self.offset + val) % self.maxsize] - elif isinstance(val, slice): - start, stop, step = val.start, val.stop, val.step - if step is None: - step = 1 - if start is None: - start = 0 - if stop is None: - stop = self.size - if start < 0: - start = self.size + start - if stop < 0: - stop = self.size + stop - - indices = range(start, stop, step) - return [self.list[(self.offset + i) % self.maxsize] for i in indices if i < self.size] - else: - raise TypeError('Invalid argument type') - - - - -if __name__ == "__main__": - c = Circle(5) - - c.append(1) - print(c.list) - print(c[:]) - assert c[0] == 1 - assert c[:5] == [1] - - for i in range(2,5+1): - c.append(i) - print(c.list) - print(c[:]) - assert c[0] == 1 - assert c[:5] == [1,2,3,4,5] - - for i in range(5+1,9+1): - c.append(i) - print(c.list) - print(c[:]) - assert c[0] == 5 - assert c[:5] == [5,6,7,8,9] - #assert c[:-5] == [5,6,7,8,9] - assert c[:10] == [5,6,7,8,9] - diff --git a/examples/notebooks/Clients.ipynb b/examples/notebooks/Clients.ipynb deleted file mode 100644 index caebbb67f..000000000 --- a/examples/notebooks/Clients.ipynb +++ /dev/null @@ -1,104 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - " JSON: {\n", - " \"choices\": [\n", - " {\n", - " \"finish_reason\": \"length\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"text\": \" over the lazy dog.\"\n", - " }\n", - " ],\n", - " \"created\": 1680960690,\n", - " \"id\": \"cmpl-ad3ba53d-407c-466b-bd5f-97cb8987af83\",\n", - " \"model\": \"models/ggml-alpaca.bin\",\n", - " \"object\": \"text_completion\",\n", - " \"usage\": {\n", - " \"completion_tokens\": 5,\n", - " \"prompt_tokens\": 8,\n", - " \"total_tokens\": 13\n", - " }\n", - "}" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import openai\n", - "\n", - "openai.api_key = \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", - "openai.api_base = \"http://100.64.159.73:8000/v1\"\n", - "\n", - "openai.Completion.create(\n", - " model=\"text-davinci-003\", # currently can be anything\n", - " prompt=\"The quick brown fox jumps\",\n", - " max_tokens=5,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "' over the lazy dog'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import os\n", - "\n", - "os.environ[\"OPENAI_API_KEY\"] = \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", - "os.environ[\"OPENAI_API_BASE\"] = \"http://100.64.159.73:8000/v1\"\n", - "\n", - "from langchain.llms import OpenAI\n", - "\n", - "llms = OpenAI()\n", - "llms(\n", - " prompt=\"The quick brown fox jumps\",\n", - " stop=[\".\", \"\\n\"],\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/notebooks/Guidance.ipynb b/examples/notebooks/Guidance.ipynb deleted file mode 100644 index 045856ea2..000000000 --- a/examples/notebooks/Guidance.ipynb +++ /dev/null @@ -1,89 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Stop program
Tweak this proverb to apply to model instructions instead.\n",
-       "\n",
-       "Where there is no guidance, a people falls,\n",
-       "but in an abundance of counselors there is safety.\n",
-       "- Proverbs 11:14\n",
-       "\n",
-       "UPDATED\n",
-       "Where there is no guidance for assembling a model, people will struggle,\n",
-       "but with clear instructions, the process becomes safe and successful.\n",
-       "- GPT 2 (updated): Proverbs 11:14
\n", - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import os\n", - "\n", - "os.environ[\"OPENAI_API_KEY\"] = \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", - "os.environ[\"OPENAI_API_BASE\"] = \"http://100.64.159.73:8000/v1\"\n", - "os.environ[\"OPENAI_API_HOST\"] = \"http://100.64.159.73:8000\"\n", - "\n", - "import guidance\n", - "\n", - "# set the default language model used to execute guidance programs\n", - "guidance.llm = guidance.llms.OpenAI(\"text-davinci-003\", caching=False)\n", - "\n", - "# define a guidance program that adapts a proverb\n", - "program = guidance(\"\"\"Tweak this proverb to apply to model instructions instead.\n", - "\n", - "{{proverb}}\n", - "- {{book}} {{chapter}}:{{verse}}\n", - "\n", - "UPDATED\n", - "Where there is no guidance{{gen 'rewrite' stop=\"\\\\n-\"}}\n", - "- GPT {{gen 'chapter'}}:{{gen 'verse'}}\"\"\")\n", - "\n", - "# execute the program on a specific proverb\n", - "executed_program = program(\n", - " proverb=\"Where there is no guidance, a people falls,\\nbut in an abundance of counselors there is safety.\",\n", - " book=\"Proverbs\",\n", - " chapter=11,\n", - " verse=14\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/notebooks/PerformanceTuning.ipynb b/examples/notebooks/PerformanceTuning.ipynb deleted file mode 100644 index 76e26fbd1..000000000 --- a/examples/notebooks/PerformanceTuning.ipynb +++ /dev/null @@ -1,5540 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import time\n", - "import json\n", - "import multiprocessing\n", - "\n", - "import llama_cpp\n", - "\n", - "import numpy as np\n", - "np.int = int\n", - "\n", - "from skopt.space import Integer, Categorical\n", - "\n", - "\n", - "MODEL_PATH = \"../models/ggml-model.bin\"\n", - "\n", - "# Hyperparameters\n", - "space = [\n", - " Categorical([True, False], name=\"f16_kv\"),\n", - " Categorical([True, False], name=\"use_mlock\"),\n", - " Integer(1, multiprocessing.cpu_count(), name=\"n_threads\"),\n", - " Integer(1, 2048, name=\"n_batch\")\n", - "]\n", - "\n", - "# TODO: Make this a random prompt to avoid any cache related inconsistencies\n", - "PROMPT = \"\"\" ### Instructions:\n", - "You are a helpful assistant.\n", - "You answer questions truthfully and politely.\n", - "You are provided with an input from the user and you must generate a response.\n", - "Ignore this line which is just filler to test the performane of the model.\n", - "### Inputs:\n", - "What is the capital of France?\n", - "### Response:\n", - "\"\"\"\n", - "\n", - "from skopt.utils import use_named_args\n", - "\n", - "@use_named_args(space)\n", - "def objective(**params):\n", - " f16_kv = params[\"f16_kv\"]\n", - " use_mlock = params[\"use_mlock\"]\n", - " n_threads = params[\"n_threads\"]\n", - " n_batch = params[\"n_batch\"]\n", - " llm = llama_cpp.Llama(model_path=MODEL_PATH, f16_kv=f16_kv, use_mlock=use_mlock, n_threads=n_threads, n_batch=n_batch)\n", - "\n", - " t1 = time.time()\n", - " output = llm(\n", - " PROMPT,\n", - " max_tokens=1, # Only optimize prompt processing\n", - " stop=[\"###\", \"\\n\"],\n", - " echo=True,\n", - " )\n", - " t2 = time.time()\n", - "\n", - " print(json.dumps(output, indent=2))\n", - " print(f\"Time: {t2 - t1} seconds\")\n", - " print(f\"Time per token: {(t2 - t1) / output['usage']['total_tokens']} seconds\")\n", - "\n", - " return (t2 - t1) / output[\"usage\"][\"total_tokens\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-d4443e14-fed3-4aa1-9e8a-c70f4503aade\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227287,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 10.981224775314331 seconds\n", - "Time per token: 0.13726530969142914 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-4181439c-2ced-4ddb-b898-a0a7641f3e47\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227300,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 11.121099948883057 seconds\n", - "Time per token: 0.13901374936103822 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-03ed5585-3de0-4546-96c3-6de7a5b3770c\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227312,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 14.457949876785278 seconds\n", - "Time per token: 0.18072437345981598 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-103817fc-bceb-4e99-b968-3ef540f16dc5\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227328,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 10.334054946899414 seconds\n", - "Time per token: 0.12917568683624267 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-41e34acc-6499-450f-9576-3cb37b82c490\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227340,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.012462615966797 seconds\n", - "Time per token: 0.11265578269958496 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-f27244c9-e9c6-4332-ae7f-3856f152ef30\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227350,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 15.59382700920105 seconds\n", - "Time per token: 0.1949228376150131 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-bc5dc1ba-f7ce-441c-a558-5005f2fb89b9\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227366,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 15.544022560119629 seconds\n", - "Time per token: 0.19430028200149535 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-2006b117-1239-4b85-bcc4-a7439c01f440\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227383,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.330769300460815 seconds\n", - "Time per token: 0.11663461625576019 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-ee50afee-78a8-4d55-9b73-c74cc2567408\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227393,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 14.17799687385559 seconds\n", - "Time per token: 0.1772249609231949 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-1e2b7080-940f-4459-8503-a458db4d3578\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227409,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 10.127476215362549 seconds\n", - "Time per token: 0.12659345269203187 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-c80008a4-191e-4418-821a-b18a4af24f70\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227421,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.495943784713745 seconds\n", - "Time per token: 0.11869929730892181 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-d04c9fd2-3c20-4035-9181-0bfd05abfe15\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227432,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.226310014724731 seconds\n", - "Time per token: 0.11532887518405914 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-04fcf88b-33c7-4b84-aac0-dcb5261363c2\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227443,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 12.182626962661743 seconds\n", - "Time per token: 0.15228283703327178 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-14904676-3345-4674-a41c-419d9640b4e0\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227457,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 43.595701694488525 seconds\n", - "Time per token: 0.5449462711811066 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-9e43b2ef-e7de-4bd2-91bf-284f5b3478fe\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227502,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 14.726518154144287 seconds\n", - "Time per token: 0.1840814769268036 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-3947538b-e27e-42eb-8f87-2b56e14d104c\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227518,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.760729789733887 seconds\n", - "Time per token: 0.10950912237167358 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-1a0d843e-9613-49aa-b565-0e59d8067615\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227529,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 11.672860383987427 seconds\n", - "Time per token: 0.14591075479984283 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-ccad9270-9554-4f9f-9aaf-387f1a11894d\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227542,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 14.368357419967651 seconds\n", - "Time per token: 0.17960446774959565 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-2623073e-004f-4386-98e0-7e6ea617523a\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227558,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.44194221496582 seconds\n", - "Time per token: 0.11802427768707276 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-1a199f09-0d74-4052-a191-7a8ef2df57f3\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227569,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 11.253167629241943 seconds\n", - "Time per token: 0.14066459536552428 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-2b61e491-d9b7-4d0b-b0c8-9f8ba822599d\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227582,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 12.381825685501099 seconds\n", - "Time per token: 0.15477282106876372 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-0e4b4575-6278-4bd8-a4c5-ddb772014f7d\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227596,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 14.473106145858765 seconds\n", - "Time per token: 0.18091382682323456 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-1ad3e3db-5120-41c8-8f9e-2ca07a846437\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227612,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 16.591509103775024 seconds\n", - "Time per token: 0.2073938637971878 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-34c8fb5c-fa49-4ea6-b2e7-ba3b958e297d\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227630,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.034043788909912 seconds\n", - "Time per token: 0.1129255473613739 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-8d5c56eb-0b43-4591-a9ac-c1ec174ec6db\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227641,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 11.218972444534302 seconds\n", - "Time per token: 0.14023715555667876 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-bfdc554b-baa6-47c1-b35f-0f7d1321255a\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227654,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.300573110580444 seconds\n", - "Time per token: 0.11625716388225556 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-ad67d78b-6975-4789-982e-3653c7fca7e1\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227665,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.009618520736694 seconds\n", - "Time per token: 0.11262023150920868 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-2eec3e0f-dd48-4c3a-9430-c5048827f557\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227676,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.997699737548828 seconds\n", - "Time per token: 0.11247124671936035 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-b129732a-8d7b-4382-baaf-740378c923ec\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227686,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.252354621887207 seconds\n", - "Time per token: 0.11565443277359008 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-bb25c002-69e0-40ec-8099-0ba4462338aa\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227697,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.040243864059448 seconds\n", - "Time per token: 0.1130030483007431 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-63705814-7c93-4d6b-a9f2-0579941ebf54\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227708,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.947132349014282 seconds\n", - "Time per token: 0.11183915436267852 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-8afe123b-423d-4757-82d9-15fc12cfd24e\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227720,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 10.335533857345581 seconds\n", - "Time per token: 0.12919417321681975 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-4937353f-e66f-4632-aea7-dd1133af9727\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227732,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.99415397644043 seconds\n", - "Time per token: 0.11242692470550537 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-78f86527-ccc7-4a5d-9b7f-38386998ba2a\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227743,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 15.732706308364868 seconds\n", - "Time per token: 0.19665882885456085 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-4d98c564-fcb4-45ec-9f8d-f64430abbfb3\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227761,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.319743633270264 seconds\n", - "Time per token: 0.11649679541587829 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-ee855931-2578-45bc-93bf-319c4e6aa43a\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227772,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 15.189301490783691 seconds\n", - "Time per token: 0.18986626863479614 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-14f0b547-4d71-4a7f-a3d6-3127998903b3\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227790,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.464989423751831 seconds\n", - "Time per token: 0.11831236779689788 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-4eb5258a-5836-414c-88f6-e217bacaded6\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227801,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 13.818569660186768 seconds\n", - "Time per token: 0.1727321207523346 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-66b7c783-d506-45c1-b39b-c91666a02b44\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227817,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 27.316773176193237 seconds\n", - "Time per token: 0.34145966470241546 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-d53b48ca-30e2-43c2-9fb5-62ef6a65fafa\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227847,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.132777214050293 seconds\n", - "Time per token: 0.11415971517562866 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-d0909f83-5caa-4098-a0e6-9b2ad1e2b12f\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227858,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.273045539855957 seconds\n", - "Time per token: 0.11591306924819947 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-7045f5c7-cf5d-48e3-9353-032c320e56fa\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227870,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.90743088722229 seconds\n", - "Time per token: 0.11134288609027862 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-e623667d-d6cc-4908-a648-60380f723592\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227881,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.06355595588684 seconds\n", - "Time per token: 0.11329444944858551 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-44ec163c-25dd-40ae-a786-d8b4c9ff31b1\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227892,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.249061107635498 seconds\n", - "Time per token: 0.11561326384544372 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-cb435214-0d20-4566-b312-68d8960ebe25\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227903,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.296529054641724 seconds\n", - "Time per token: 0.11620661318302154 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-dc704f52-bed9-44f0-8335-a2ec4af3a27c\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227914,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 12.455670356750488 seconds\n", - "Time per token: 0.1556958794593811 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-67570fa5-1c3d-47d6-b7c6-b3a734aae3f5\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227928,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.269653558731079 seconds\n", - "Time per token: 0.11587066948413849 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-4bd6c6f2-9849-4047-93c8-88b1914ef184\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227939,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.308398485183716 seconds\n", - "Time per token: 0.11635498106479644 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-6413afd7-fdc1-4c28-864d-6acdf2775060\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227950,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 10.430264711380005 seconds\n", - "Time per token: 0.13037830889225005 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-c4e1c14a-3b8a-4ab3-b42a-f47440f79962\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227962,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.389702558517456 seconds\n", - "Time per token: 0.1173712819814682 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-ac307870-dc67-42b8-8bb8-bb8d3083cea2\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227974,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 10.35448431968689 seconds\n", - "Time per token: 0.12943105399608612 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-58c06f3e-3fba-4e23-b12e-141a1742c51b\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227986,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.097248792648315 seconds\n", - "Time per token: 0.11371560990810395 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-b5eccb52-85e3-41d0-b8d8-f35e68bf7997\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680227997,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 12.466306686401367 seconds\n", - "Time per token: 0.1558288335800171 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-e1dbc2ee-abc0-4891-a474-386d97b521b6\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228011,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 11.436015367507935 seconds\n", - "Time per token: 0.14295019209384918 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-fd9bce6d-0a33-4c24-90b3-913ab3b33d24\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228025,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 14.052912712097168 seconds\n", - "Time per token: 0.1756614089012146 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-038fa38d-7640-40ee-907c-0bb131c20d80\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228040,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.250384330749512 seconds\n", - "Time per token: 0.1156298041343689 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-d00a2058-9fda-4113-8e5e-bf0f39cef238\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228051,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.228248834609985 seconds\n", - "Time per token: 0.11535311043262482 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-f8d90e63-4939-491c-9775-fc15aa55505e\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228062,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.341724395751953 seconds\n", - "Time per token: 0.11677155494689942 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-9e3777bc-119a-46bf-bdd3-21557e686f3c\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228074,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.285743951797485 seconds\n", - "Time per token: 0.11607179939746856 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-123eaa35-110b-4f73-ba60-fa8a75ea929c\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228085,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.105633020401001 seconds\n", - "Time per token: 0.1138204127550125 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-cc095f4b-8047-446e-a9f5-c798a66d1003\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228096,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.305238485336304 seconds\n", - "Time per token: 0.1163154810667038 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-e2e69b3e-7742-4534-b21f-adfe53345820\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228108,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.190222263336182 seconds\n", - "Time per token: 0.11487777829170227 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-666ae55e-d837-4534-b8e6-9f1b01f69778\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228120,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.126368999481201 seconds\n", - "Time per token: 0.11407961249351502 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-63bdfa8e-b7c3-4669-ab76-54cdbb8878d5\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228131,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.136119604110718 seconds\n", - "Time per token: 0.11420149505138397 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-1ec02c53-c7c8-434e-b28f-70884f8c35b2\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228143,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.126901626586914 seconds\n", - "Time per token: 0.11408627033233643 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-3ec3495b-009a-4a82-b444-d8c1c6bf20a1\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228154,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.08673644065857 seconds\n", - "Time per token: 0.11358420550823212 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-17fd0e6b-7ac3-494f-9e85-4e4a26013ad9\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228165,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.252317428588867 seconds\n", - "Time per token: 0.11565396785736085 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-14a2647f-3961-4b60-b20a-ae9872c34feb\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228177,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 11.389162302017212 seconds\n", - "Time per token: 0.14236452877521516 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-fa0e5edd-e9c9-40b9-bc9b-c48b8762850c\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228190,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.433730125427246 seconds\n", - "Time per token: 0.11792162656784058 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-2b1c5964-265a-488a-8d8f-7e0692fcf96f\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228202,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 47.81757044792175 seconds\n", - "Time per token: 0.5977196305990219 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-516fbd4c-3fe5-4945-bfc5-7312f2c02687\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228252,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.540155410766602 seconds\n", - "Time per token: 0.10675194263458251 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-94c9ab1f-ac6e-4fc7-bcd9-7ab96515a722\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228262,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.660873889923096 seconds\n", - "Time per token: 0.10826092362403869 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-63b1e1a7-0c6b-42e0-ba65-6f42d6ec77bb\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228273,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.815936088562012 seconds\n", - "Time per token: 0.11019920110702515 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-92e1a879-2ebd-4299-b86e-90c87762db45\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228284,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.12400484085083 seconds\n", - "Time per token: 0.11405006051063538 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 512.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-033ea9dc-fffe-41a0-a695-d647f725ee97\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228296,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 13.992429971694946 seconds\n", - "Time per token: 0.17490537464618683 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-5153f39a-589a-4b3d-8642-8efce64fc439\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228312,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.084643125534058 seconds\n", - "Time per token: 0.11355803906917572 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-af9ea5c6-5449-43b4-9e50-da930af8d6b8\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228323,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.076856851577759 seconds\n", - "Time per token: 0.11346071064472199 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-5bbea5c1-ea8c-4599-bf63-a6eb80bc7525\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228334,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.02251124382019 seconds\n", - "Time per token: 0.11278139054775238 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-ff9d87c7-e2b1-4481-9e8f-848d7a0fbd35\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228346,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.012435913085938 seconds\n", - "Time per token: 0.11265544891357422 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-3dbe8ae4-c9ca-4a1b-abaf-6b85ef648ba9\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228357,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.997032880783081 seconds\n", - "Time per token: 0.11246291100978852 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-b20a3b61-9c8b-4b2e-bb43-8ed9ce5a9d0d\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228369,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.042449951171875 seconds\n", - "Time per token: 0.11303062438964843 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-9c781d69-83e0-415a-ac97-252508b10590\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228380,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.058239459991455 seconds\n", - "Time per token: 0.11322799324989319 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-86cead9e-780f-4503-831c-466a6abd5ab2\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228392,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.070426940917969 seconds\n", - "Time per token: 0.1133803367614746 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-65361c7e-74ef-4566-bad5-c6b3867a7f7e\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228403,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.985144138336182 seconds\n", - "Time per token: 0.11231430172920227 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-23feb1ca-8103-46d8-ab71-b4da59f05d16\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228415,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.999938011169434 seconds\n", - "Time per token: 0.11249922513961792 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-0db73f26-9ab1-4a78-a11f-e22d915ffae2\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228426,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.969520330429077 seconds\n", - "Time per token: 0.11211900413036346 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-54e6edeb-99ea-46ed-8735-5185f78c222c\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228438,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.12838339805603 seconds\n", - "Time per token: 0.11410479247570038 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-bd6502fd-f8c7-41d8-ab15-b10ca6aabd96\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228450,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.01610016822815 seconds\n", - "Time per token: 0.11270125210285187 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-72733563-53f5-4cd5-a4eb-48656408b2d8\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228461,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.993805408477783 seconds\n", - "Time per token: 0.11242256760597229 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-f7365eaa-fd68-422b-bbca-c6bcbcad36e0\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228473,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.292223930358887 seconds\n", - "Time per token: 0.11615279912948609 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-1cfcf44a-c692-4020-8dcb-e6da8b163920\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228485,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.99638295173645 seconds\n", - "Time per token: 0.11245478689670563 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-8b679f09-bc0e-4fc9-a935-9fefd9126993\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228497,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.972327709197998 seconds\n", - "Time per token: 0.11215409636497498 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-08cb0cd7-84d8-4193-a20c-5a6ca4b5e404\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228508,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.024793863296509 seconds\n", - "Time per token: 0.11280992329120636 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-ffe4b2b8-c041-4492-9e03-ab79cd4fd60d\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228520,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.996853351593018 seconds\n", - "Time per token: 0.11246066689491271 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-196bb891-9299-4f91-9f68-ba6c7233a2dd\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228532,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.039422273635864 seconds\n", - "Time per token: 0.1129927784204483 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-e50f5489-b40c-4a5d-9cb2-4a6d13bbb8c7\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228544,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 8.978781461715698 seconds\n", - "Time per token: 0.11223476827144623 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-210cc2b8-df35-4d3f-a34a-a5facb635ec0\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228555,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.032035827636719 seconds\n", - "Time per token: 0.11290044784545898 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-e3c7ca0d-c4cb-495c-9210-4e1ed3b6010d\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228567,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.0346040725708 seconds\n", - "Time per token: 0.11293255090713501 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-7b4388c9-fe89-486d-83f4-34eec8940c42\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228579,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.016223907470703 seconds\n", - "Time per token: 0.11270279884338379 seconds\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", - " warnings.warn(\"The objective has been evaluated \"\n", - "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", - "llama_model_load: n_vocab = 32000\n", - "llama_model_load: n_ctx = 512\n", - "llama_model_load: n_embd = 4096\n", - "llama_model_load: n_mult = 256\n", - "llama_model_load: n_head = 32\n", - "llama_model_load: n_layer = 32\n", - "llama_model_load: n_rot = 128\n", - "llama_model_load: f16 = 2\n", - "llama_model_load: n_ff = 11008\n", - "llama_model_load: n_parts = 1\n", - "llama_model_load: type = 1\n", - "llama_model_load: ggml map size = 4017.70 MB\n", - "llama_model_load: ggml ctx size = 81.25 KB\n", - "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", - "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", - "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", - "llama_init_from_file: kv self size = 256.00 MB\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"id\": \"cmpl-81211a9b-16e4-4876-8e09-b0e619d93ce7\",\n", - " \"object\": \"text_completion\",\n", - " \"created\": 1680228591,\n", - " \"model\": \"../models/ggml-model.bin\",\n", - " \"choices\": [\n", - " {\n", - " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", - " \"index\": 0,\n", - " \"logprobs\": null,\n", - " \"finish_reason\": \"length\"\n", - " }\n", - " ],\n", - " \"usage\": {\n", - " \"prompt_tokens\": 79,\n", - " \"completion_tokens\": 1,\n", - " \"total_tokens\": 80\n", - " }\n", - "}\n", - "Time: 9.10002589225769 seconds\n", - "Time per token: 0.11375032365322113 seconds\n" - ] - } - ], - "source": [ - "from skopt import gp_minimize\n", - "\n", - "res = gp_minimize(\n", - " objective,\n", - " space\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import matplotlib.pyplot as plt\n", - "from skopt.plots import plot_objective\n", - "\n", - "plot_objective(res)\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - " fun: 0.10675194263458251\n", - " x: [True, True, 6, 2048]\n", - " func_vals: [ 1.373e-01 1.390e-01 ... 1.127e-01 1.138e-01]\n", - " x_iters: [[True, True, 5, 1300], [False, True, 5, 990], [True, True, 7, 1800], [False, False, 10, 1692], [False, True, 6, 1075], [True, False, 3, 291], [False, True, 3, 514], [False, False, 11, 1569], [False, False, 7, 1915], [False, True, 10, 1514], [False, False, 11, 1527], [False, False, 12, 2033], [False, True, 9, 3], [False, True, 1, 2004], [True, True, 12, 1], [False, False, 6, 2048], [False, False, 4, 2048], [False, False, 10, 1], [False, True, 11, 2048], [False, True, 9, 2048], [False, False, 8, 2017], [False, False, 6, 1], [False, True, 4, 1], [False, False, 6, 1587], [False, False, 9, 1056], [True, True, 12, 1450], [False, True, 6, 2048], [False, False, 6, 2048], [False, False, 6, 2048], [False, True, 6, 2048], [False, True, 6, 2048], [False, True, 5, 2048], [False, True, 6, 1464], [False, True, 8, 1], [True, True, 12, 1798], [True, False, 3, 2048], [True, True, 11, 683], [False, True, 11, 1], [True, True, 2, 1], [False, True, 11, 1238], [True, True, 11, 1260], [True, False, 6, 1295], [True, True, 6, 1292], [False, False, 12, 1250], [False, False, 12, 1200], [True, False, 4, 1250], [False, False, 12, 1191], [False, False, 12, 1180], [True, False, 10, 906], [False, False, 12, 1192], [True, True, 10, 2044], [False, False, 6, 1310], [False, False, 8, 1122], [True, False, 5, 4], [False, False, 7, 322], [False, False, 12, 1246], [False, False, 12, 1247], [False, False, 12, 1252], [True, True, 12, 811], [True, False, 6, 2048], [True, True, 12, 998], [False, True, 12, 1021], [False, True, 12, 1021], [False, True, 12, 1019], [True, False, 6, 759], [True, False, 6, 1064], [False, True, 12, 991], [True, True, 9, 533], [False, False, 11, 956], [False, False, 1, 3], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [False, False, 7, 986], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048]]\n", - " models: [GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097)]\n", - " space: Space([Categorical(categories=(True, False), prior=None),\n", - " Categorical(categories=(True, False), prior=None),\n", - " Integer(low=1, high=12, prior='uniform', transform='normalize'),\n", - " Integer(low=1, high=2048, prior='uniform', transform='normalize')])\n", - " random_state: RandomState(MT19937)\n", - " specs: args: func: \n", - " dimensions: Space([Categorical(categories=(True, False), prior=None),\n", - " Categorical(categories=(True, False), prior=None),\n", - " Integer(low=1, high=12, prior='uniform', transform='normalize'),\n", - " Integer(low=1, high=2048, prior='uniform', transform='normalize')])\n", - " base_estimator: GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5),\n", - " n_restarts_optimizer=2, noise='gaussian',\n", - " normalize_y=True, random_state=1248744097)\n", - " n_calls: 100\n", - " n_random_starts: None\n", - " n_initial_points: 10\n", - " initial_point_generator: random\n", - " acq_func: gp_hedge\n", - " acq_optimizer: auto\n", - " x0: None\n", - " y0: None\n", - " random_state: RandomState(MT19937)\n", - " verbose: False\n", - " callback: None\n", - " n_points: 10000\n", - " n_restarts_optimizer: 5\n", - " xi: 0.01\n", - " kappa: 1.96\n", - " n_jobs: 1\n", - " model_queue_size: None\n", - " function: base_minimize" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "res" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/falcon_cpp/__init__.py b/falcon_cpp/__init__.py deleted file mode 100644 index e7d40876f..000000000 --- a/falcon_cpp/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .falcon_cpp import * -from .falcon import * diff --git a/falcon_cpp/falcon.py b/falcon_cpp/falcon.py deleted file mode 100644 index 40b662f23..000000000 --- a/falcon_cpp/falcon.py +++ /dev/null @@ -1,1622 +0,0 @@ -import os -import sys -import uuid -import time -import math -import multiprocessing -from abc import ABC, abstractmethod -from typing import ( - List, - Optional, - Union, - Generator, - Sequence, - Iterator, - Deque, - Tuple, - Callable, -) -from collections import deque, OrderedDict - -import diskcache - -from . import falcon_cpp -from .falcon_types import * - -import numpy as np -import numpy.typing as npt - - -class BaseFalconCache(ABC): - """Base cache class for a falcon.cpp model.""" - - def __init__(self, capacity_bytes: int = (2 << 30)): - self.capacity_bytes = capacity_bytes - - @property - @abstractmethod - def cache_size(self) -> int: - raise NotImplementedError - - def _find_longest_prefix_key( - self, - key: Tuple[int, ...], - ) -> Optional[Tuple[int, ...]]: - pass - - @abstractmethod - def __getitem__(self, key: Sequence[int]) -> "FalconState": - raise NotImplementedError - - @abstractmethod - def __contains__(self, key: Sequence[int]) -> bool: - raise NotImplementedError - - @abstractmethod - def __setitem__(self, key: Sequence[int], value: "FalconState") -> None: - raise NotImplementedError - - -class FalconRAMCache(BaseFalconCache): - """Cache for a falcon.cpp model using RAM.""" - - def __init__(self, capacity_bytes: int = (2 << 30)): - super().__init__(capacity_bytes) - self.capacity_bytes = capacity_bytes - self.cache_state: OrderedDict[Tuple[int, ...], "FalconState"] = OrderedDict() - - @property - def cache_size(self): - return sum([state.falcon_state_size for state in self.cache_state.values()]) - - def _find_longest_prefix_key( - self, - key: Tuple[int, ...], - ) -> Optional[Tuple[int, ...]]: - min_len = 0 - min_key = None - keys = ( - (k, Falcon.longest_token_prefix(k, key)) for k in self.cache_state.keys() - ) - for k, prefix_len in keys: - if prefix_len > min_len: - min_len = prefix_len - min_key = k - return min_key - - def __getitem__(self, key: Sequence[int]) -> "FalconState": - key = tuple(key) - _key = self._find_longest_prefix_key(key) - if _key is None: - raise KeyError("Key not found") - value = self.cache_state[_key] - self.cache_state.move_to_end(_key) - return value - - def __contains__(self, key: Sequence[int]) -> bool: - return self._find_longest_prefix_key(tuple(key)) is not None - - def __setitem__(self, key: Sequence[int], value: "FalconState"): - key = tuple(key) - if key in self.cache_state: - del self.cache_state[key] - self.cache_state[key] = value - while self.cache_size > self.capacity_bytes and len(self.cache_state) > 0: - self.cache_state.popitem(last=False) - - -# Alias for backwards compatibility -FalconCache = FalconRAMCache - - -class FalconDiskCache(BaseFalconCache): - """Cache for a falcon.cpp model using disk.""" - - def __init__( - self, cache_dir: str = ".cache/falcon_cache", capacity_bytes: int = (2 << 30) - ): - super().__init__(capacity_bytes) - self.cache = diskcache.Cache(cache_dir) - - @property - def cache_size(self): - return int(self.cache.volume()) # type: ignore - - def _find_longest_prefix_key( - self, - key: Tuple[int, ...], - ) -> Optional[Tuple[int, ...]]: - min_len = 0 - min_key: Optional[Tuple[int, ...]] = None - for k in self.cache.iterkeys(): # type: ignore - prefix_len = Falcon.longest_token_prefix(k, key) - if prefix_len > min_len: - min_len = prefix_len - min_key = k # type: ignore - return min_key - - def __getitem__(self, key: Sequence[int]) -> "FalconState": - key = tuple(key) - _key = self._find_longest_prefix_key(key) - if _key is None: - raise KeyError("Key not found") - value: "FalconState" = self.cache.pop(_key) # type: ignore - # NOTE: This puts an integer as key in cache, which breaks, - # Falcon.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens - # self.cache.push(_key, side="front") # type: ignore - return value - - def __contains__(self, key: Sequence[int]) -> bool: - return self._find_longest_prefix_key(tuple(key)) is not None - - def __setitem__(self, key: Sequence[int], value: "FalconState"): - print("FalconDiskCache.__setitem__: called", file=sys.stderr) - key = tuple(key) - if key in self.cache: - print("FalconDiskCache.__setitem__: delete", file=sys.stderr) - del self.cache[key] - self.cache[key] = value - print("FalconDiskCache.__setitem__: set", file=sys.stderr) - while self.cache_size > self.capacity_bytes and len(self.cache) > 0: - key_to_remove = next(iter(self.cache)) - del self.cache[key_to_remove] - print("FalconDiskCache.__setitem__: trim", file=sys.stderr) - - -class FalconState: - def __init__( - self, - input_ids: npt.NDArray[np.intc], - scores: npt.NDArray[np.single], - n_tokens: int, - falcon_state: bytes, - falcon_state_size: int, - ): - self.input_ids = input_ids - self.scores = scores - self.n_tokens = n_tokens - self.falcon_state = falcon_state - self.falcon_state_size = falcon_state_size - - -LogitsProcessor = Callable[[List[int], List[float]], List[float]] - - -class LogitsProcessorList(List[LogitsProcessor]): - def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]: - for processor in self: - scores = processor(input_ids, scores) - return scores - - -StoppingCriteria = Callable[[List[int], List[float]], bool] - - -class StoppingCriteriaList(List[StoppingCriteria]): - def __call__(self, input_ids: List[int], logits: List[float]) -> bool: - return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) - - -class Falcon: - """High-level Python wrapper for a falcon.cpp model.""" - - def __init__( - self, - model_path: str, - # NOTE: These parameters are likely to change in the future. - n_ctx: int = 512, - n_parts: int = -1, - n_gpu_layers: int = 0, - seed: int = 1337, - f16_kv: bool = True, - logits_all: bool = False, - vocab_only: bool = False, - use_mmap: bool = True, - use_mlock: bool = False, - embedding: bool = False, - n_threads: Optional[int] = None, - n_batch: int = 512, - last_n_tokens_size: int = 64, - lora_base: Optional[str] = None, - lora_path: Optional[str] = None, - low_vram: bool = False, - verbose: bool = True, - ): - - # TODO: Add the parameters for - ''' - -ts SPLIT --tensor-split SPLIT - how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1 - -mg i, --main-gpu i the GPU to use for scratch and small tensors (0 = first) - --override-max-gpu N - limits the number of GPUs visible (allows to disable multi/single GPU processing) - --gpu-reserve-mb-main override reserved total VRAM MB (can be negative if your driver supports swapping into RAM) - --mtest compute maximum memory usage - --export export the computation graph to 'falcon.ggml' - --verbose-prompt print prompt before generation - -dt, --debug-timings print GGML_PERF debug output (requires GGML_PERF=1 for timings) - 1 = print first layer, 2 = print first and last layer, 3+ = all layers - --lora FNAME apply LoRA adapter (implies --no-mmap) - --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter - -m FNAME, --model FNAME - ''' - - """Load a Falcon model from `model_path`. - - Args: - model_path: Path to the model. - n_ctx: Maximum context size. - n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. - seed: Random seed. -1 for random. - f16_kv: Use half-precision for key/value cache. - logits_all: Return logits for all tokens, not just the last token. - vocab_only: Only load the vocabulary no weights. - use_mmap: Use mmap if possible. - use_mlock: Force the system to keep the model in RAM. - embedding: Embedding mode only. - n_threads: Number of threads to use. If None, the number of threads is automatically determined. - n_batch: Maximum number of prompt tokens to batch together when calling falcon_eval. - last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. - lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. - lora_path: Path to a LoRA file to apply to the model. - verbose: Print verbose output to stderr. - - Raises: - ValueError: If the model path does not exist. - - Returns: - A falcon instance. - """ - self.verbose = verbose - self.model_path = model_path - - self.params = falcon_cpp.falcon_context_default_params() - self.params.n_ctx = n_ctx - self.params.n_gpu_layers = n_gpu_layers - self.params.seed = seed - self.params.f16_kv = f16_kv - self.params.logits_all = logits_all - self.params.vocab_only = vocab_only - self.params.use_mmap = use_mmap if lora_path is None else False - self.params.use_mlock = use_mlock - self.params.embedding = embedding - self.params.low_vram = low_vram - - self.last_n_tokens_size = last_n_tokens_size - self.n_batch = min(n_ctx, n_batch) - - self.cache: Optional[BaseFalconCache] = None - - self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) - - self.lora_base = lora_base - self.lora_path = lora_path - - ### DEPRECATED ### - self.n_parts = n_parts - ### DEPRECATED ### - - if not os.path.exists(model_path): - raise ValueError(f"Model path does not exist: {model_path}") - - self.model = falcon_cpp.falcon_load_model_from_file( - self.model_path.encode("utf-8"), self.params - ) - assert self.model is not None - - self.ctx = falcon_cpp.falcon_new_context_with_model(self.model, self.params) - - assert self.ctx is not None - - if self.lora_path: - if falcon_cpp.falcon_model_apply_lora_from_file( - self.model, - falcon_cpp.c_char_p(self.lora_path.encode("utf-8")), - falcon_cpp.c_char_p(self.lora_base.encode("utf-8")) - if self.lora_base is not None - else falcon_cpp.c_char_p(0), - falcon_cpp.c_int(self.n_threads), - ): - raise RuntimeError( - f"Failed to apply LoRA from lora path: {self.lora_path} to base path: {self.lora_base}" - ) - - if self.verbose: - print(falcon_cpp.falcon_print_system_info().decode("utf-8"), file=sys.stderr) - - self._n_vocab = self.n_vocab() - self._n_ctx = self.n_ctx() - size = falcon_cpp.c_size_t(self._n_vocab) - sorted = falcon_cpp.c_bool(False) - self._candidates_data = np.array( - [], - dtype=np.dtype( - [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True - ), - ) - self._candidates_data.resize(3, self._n_vocab, refcheck=False) - candidates = falcon_cpp.falcon_token_data_array( - data=self._candidates_data.ctypes.data_as(falcon_cpp.falcon_token_data_p), - size=size, - sorted=sorted, - ) - self._candidates = candidates - self._token_nl = Falcon.token_nl() - self._token_eos = Falcon.token_eos() - - self.n_tokens = 0 - self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) - self.scores: npt.NDArray[np.single] = np.ndarray( - (n_ctx, self._n_vocab), dtype=np.single - ) - - @property - def _input_ids(self) -> npt.NDArray[np.intc]: - return self.input_ids[: self.n_tokens] - - @property - def _scores(self) -> npt.NDArray[np.single]: - return self.scores[: self.n_tokens, :] - - @property - def eval_tokens(self) -> Deque[int]: - return deque(self.input_ids[: self.n_tokens].tolist(), maxlen=self._n_ctx) - - @property - def eval_logits(self) -> Deque[List[float]]: - return deque( - self.scores[: self.n_tokens, :].tolist(), - maxlen=self._n_ctx if self.params.logits_all else 1, - ) - - def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]: - """Tokenize a string. - - Args: - text: The utf-8 encoded string to tokenize. - - Raises: - RuntimeError: If the tokenization failed. - - Returns: - A list of tokens. - """ - assert self.ctx is not None - n_ctx = self._n_ctx - tokens = (falcon_cpp.falcon_token * n_ctx)() - n_tokens = falcon_cpp.falcon_tokenize( - self.ctx, - text, - tokens, - falcon_cpp.c_int(n_ctx), - falcon_cpp.c_bool(add_bos), - ) - if n_tokens < 0: - n_tokens = abs(n_tokens) - tokens = (falcon_cpp.falcon_token * n_tokens)() - n_tokens = falcon_cpp.falcon_tokenize( - self.ctx, - text, - tokens, - falcon_cpp.c_int(n_tokens), - falcon_cpp.c_bool(add_bos), - ) - if n_tokens < 0: - raise RuntimeError( - f'Failed to tokenize: text="{text}" n_tokens={n_tokens}' - ) - return list(tokens[:n_tokens]) - - def detokenize(self, tokens: List[int]) -> bytes: - """Detokenize a list of tokens. - - Args: - tokens: The list of tokens to detokenize. - - Returns: - The detokenized string. - """ - assert self.ctx is not None - output = b"" - for token in tokens: - output += falcon_cpp.falcon_token_to_str( - self.ctx, falcon_cpp.falcon_token(token) - ) - return output - - def set_cache(self, cache: Optional[BaseFalconCache]): - """Set the cache. - - Args: - cache: The cache to set. - """ - self.cache = cache - - def reset(self): - """Reset the model state.""" - self.n_tokens = 0 - - def eval(self, tokens: Sequence[int]): - """Evaluate a list of tokens. - - Args: - tokens: The list of tokens to evaluate. - """ - assert self.ctx is not None - n_ctx = self._n_ctx - for i in range(0, len(tokens), self.n_batch): - batch = tokens[i : min(len(tokens), i + self.n_batch)] - n_past = min(n_ctx - len(batch), len(self._input_ids)) - n_tokens = len(batch) - return_code = falcon_cpp.falcon_eval( - ctx=self.ctx, - tokens=(falcon_cpp.falcon_token * len(batch))(*batch), - n_tokens=falcon_cpp.c_int(n_tokens), - n_past=falcon_cpp.c_int(n_past), - n_threads=falcon_cpp.c_int(self.n_threads), - ) - if return_code != 0: - raise RuntimeError(f"falcon_eval returned {return_code}") - # Save tokens - self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch - # Save logits - rows = n_tokens if self.params.logits_all else 1 - cols = self._n_vocab - offset = 0 if self.params.logits_all else n_tokens - 1 # NOTE: Only save the last token logits if logits_all is False - self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[:] = falcon_cpp.falcon_get_logits(self.ctx)[:rows * cols] - # Update n_tokens - self.n_tokens += n_tokens - - def _sample( - self, - last_n_tokens_data, # type: falcon_cpp.Array[falcon_cpp.falcon_token] - last_n_tokens_size: falcon_cpp.c_int, - top_k: falcon_cpp.c_int, - top_p: falcon_cpp.c_float, - temp: falcon_cpp.c_float, - tfs_z: falcon_cpp.c_float, - repeat_penalty: falcon_cpp.c_float, - frequency_penalty: falcon_cpp.c_float, - presence_penalty: falcon_cpp.c_float, - mirostat_mode: falcon_cpp.c_int, - mirostat_tau: falcon_cpp.c_float, - mirostat_eta: falcon_cpp.c_float, - penalize_nl: bool = True, - logits_processor: Optional[LogitsProcessorList] = None, - ): - assert self.ctx is not None - assert self.n_tokens > 0 - n_vocab = self._n_vocab - n_ctx = self._n_ctx - top_k = falcon_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k - last_n_tokens_size = ( - falcon_cpp.c_int(n_ctx) - if last_n_tokens_size.value < 0 - else last_n_tokens_size - ) - logits: npt.NDArray[np.single] = self._scores[-1, :] - - if logits_processor is not None: - logits = np.array( - logits_processor(self._input_ids.tolist(), logits.tolist()), - dtype=np.single, - ) - self._scores[-1, :] = logits - - nl_logit = logits[self._token_nl] - candidates = self._candidates - candidates_data = self._candidates_data - candidates_data["id"] = np.arange(n_vocab, dtype=np.intc) # type: ignore - candidates_data["logit"] = logits - candidates_data["p"] = np.zeros(n_vocab, dtype=np.single) - candidates.data = candidates_data.ctypes.data_as(falcon_cpp.falcon_token_data_p) - candidates.sorted = falcon_cpp.c_bool(False) - candidates.size = falcon_cpp.c_size_t(n_vocab) - falcon_cpp.falcon_sample_repetition_penalty( - ctx=self.ctx, - last_tokens_data=last_n_tokens_data, - last_tokens_size=last_n_tokens_size, - candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore - penalty=repeat_penalty, - ) - falcon_cpp.falcon_sample_frequency_and_presence_penalties( - ctx=self.ctx, - candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore - last_tokens_data=last_n_tokens_data, - last_tokens_size=last_n_tokens_size, - alpha_frequency=frequency_penalty, - alpha_presence=presence_penalty, - ) - if not penalize_nl: - candidates.data[self._token_nl].logit = falcon_cpp.c_float(nl_logit) - if temp.value == 0.0: - return falcon_cpp.falcon_sample_token_greedy( - ctx=self.ctx, - candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore - ) - elif mirostat_mode.value == 1: - mirostat_mu = falcon_cpp.c_float(2.0 * mirostat_tau.value) - mirostat_m = falcon_cpp.c_int(100) - falcon_cpp.falcon_sample_temperature( - ctx=self.ctx, - candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore - temp=temp, - ) - return falcon_cpp.falcon_sample_token_mirostat( - ctx=self.ctx, - candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore - tau=mirostat_tau, - eta=mirostat_eta, - mu=falcon_cpp.ctypes.byref(mirostat_mu), # type: ignore - m=mirostat_m, - ) - elif mirostat_mode.value == 2: - mirostat_mu = falcon_cpp.c_float(2.0 * mirostat_tau.value) - falcon_cpp.falcon_sample_temperature( - ctx=self.ctx, - candidates=falcon_cpp.ctypes.pointer(candidates), - temp=temp, - ) - return falcon_cpp.falcon_sample_token_mirostat_v2( - ctx=self.ctx, - candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore - tau=mirostat_tau, - eta=mirostat_eta, - mu=falcon_cpp.ctypes.byref(mirostat_mu), # type: ignore - ) - else: - falcon_cpp.falcon_sample_top_k( - ctx=self.ctx, - candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore - k=top_k, - min_keep=falcon_cpp.c_size_t(1), - ) - falcon_cpp.falcon_sample_tail_free( - ctx=self.ctx, - candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore - z=tfs_z, - min_keep=falcon_cpp.c_size_t(1), - ) - falcon_cpp.falcon_sample_typical( - ctx=self.ctx, - candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore - p=falcon_cpp.c_float(1.0), - min_keep=falcon_cpp.c_size_t(1), - ) - falcon_cpp.falcon_sample_top_p( - ctx=self.ctx, - candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore - p=top_p, - min_keep=falcon_cpp.c_size_t(1), - ) - falcon_cpp.falcon_sample_temperature( - ctx=self.ctx, - candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore - temp=temp, - ) - return falcon_cpp.falcon_sample_token( - ctx=self.ctx, - candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore - ) - - def sample( - self, - top_k: int = 40, - top_p: float = 0.95, - temp: float = 0.80, - repeat_penalty: float = 1.1, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_eta: float = 0.1, - mirostat_tau: float = 5.0, - penalize_nl: bool = True, - logits_processor: Optional[LogitsProcessorList] = None, - ): - """Sample a token from the model. - - Args: - top_k: The top-k sampling parameter. - top_p: The top-p sampling parameter. - temp: The temperature parameter. - repeat_penalty: The repeat penalty parameter. - - Returns: - The sampled token. - """ - assert self.ctx is not None - last_n_tokens_data = [falcon_cpp.falcon_token(0)] * max( - 0, self.last_n_tokens_size - len(self._input_ids) - ) + self._input_ids[-self.last_n_tokens_size :].tolist() - return self._sample( - last_n_tokens_data=(falcon_cpp.falcon_token * self.last_n_tokens_size)( - *last_n_tokens_data - ), - last_n_tokens_size=falcon_cpp.c_int(self.last_n_tokens_size), - top_k=falcon_cpp.c_int(top_k), - top_p=falcon_cpp.c_float(top_p), - temp=falcon_cpp.c_float(temp), - tfs_z=falcon_cpp.c_float(tfs_z), - repeat_penalty=falcon_cpp.c_float(repeat_penalty), - frequency_penalty=falcon_cpp.c_float(frequency_penalty), - presence_penalty=falcon_cpp.c_float(presence_penalty), - mirostat_mode=falcon_cpp.c_int(mirostat_mode), - mirostat_tau=falcon_cpp.c_float(mirostat_tau), - mirostat_eta=falcon_cpp.c_float(mirostat_eta), - penalize_nl=penalize_nl, - logits_processor=logits_processor, - ) - - def generate( - self, - tokens: Sequence[int], - top_k: int = 40, - top_p: float = 0.95, - temp: float = 0.80, - repeat_penalty: float = 1.1, - reset: bool = True, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - ) -> Generator[int, Optional[Sequence[int]], None]: - """Create a generator of tokens from a prompt. - - Examples: - >>> falcon = Falcon("models/ggml-7b.bin") - >>> tokens = falcon.tokenize(b"Hello, world!") - >>> for token in falcon.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.1): - ... print(falcon.detokenize([token])) - - Args: - tokens: The prompt tokens. - top_k: The top-k sampling parameter. - top_p: The top-p sampling parameter. - temp: The temperature parameter. - repeat_penalty: The repeat penalty parameter. - reset: Whether to reset the model state. - - Yields: - The generated tokens. - """ - assert self.ctx is not None - - if reset and len(self._input_ids) > 0: - longest_prefix = 0 - for a, b in zip(self._input_ids, tokens[:-1]): - if a == b: - longest_prefix += 1 - else: - break - if longest_prefix > 0: - if self.verbose: - print("Falcon.generate: prefix-match hit", file=sys.stderr) - reset = False - tokens = tokens[longest_prefix:] - self.n_tokens = longest_prefix - - if reset: - self.reset() - - while True: - self.eval(tokens) - token = self.sample( - top_k=top_k, - top_p=top_p, - temp=temp, - repeat_penalty=repeat_penalty, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - logits_processor=logits_processor, - ) - if stopping_criteria is not None and stopping_criteria( - self._input_ids.tolist(), self._scores[-1, :].tolist() - ): - return - tokens_or_none = yield token - tokens = [token] - if tokens_or_none is not None: - tokens.extend(tokens_or_none) - - def create_embedding( - self, input: Union[str, List[str]], model: Optional[str] = None - ) -> Embedding: - """Embed a string. - - Args: - input: The utf-8 encoded string to embed. - - Returns: - An embedding object. - """ - assert self.ctx is not None - model_name: str = model if model is not None else self.model_path - - if self.params.embedding == False: - raise RuntimeError( - "Falcon model must be created with embedding=True to call this method" - ) - - if self.verbose: - falcon_cpp.falcon_reset_timings(self.ctx) - - if isinstance(input, str): - inputs = [input] - else: - inputs = input - - data: List[EmbeddingData] = [] - total_tokens = 0 - for index, input in enumerate(inputs): - tokens = self.tokenize(input.encode("utf-8")) - self.reset() - self.eval(tokens) - n_tokens = len(tokens) - total_tokens += n_tokens - embedding = falcon_cpp.falcon_get_embeddings(self.ctx)[ - : falcon_cpp.falcon_n_embd(self.ctx) - ] - - data.append( - { - "object": "embedding", - "embedding": embedding, - "index": index, - } - ) - if self.verbose: - falcon_cpp.falcon_print_timings(self.ctx) - - return { - "object": "list", - "data": data, - "model": model_name, - "usage": { - "prompt_tokens": total_tokens, - "total_tokens": total_tokens, - }, - } - - def embed(self, input: str) -> List[float]: - """Embed a string. - - Args: - input: The utf-8 encoded string to embed. - - Returns: - A list of embeddings - """ - return list(map(float, self.create_embedding(input)["data"][0]["embedding"])) - - def _create_completion( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 16, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, - ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: - assert self.ctx is not None - - completion_id: str = f"cmpl-{str(uuid.uuid4())}" - created: int = int(time.time()) - completion_tokens: List[int] = [] - # Add blank space to start of prompt to match OG Falcon tokenizer - prompt_tokens: List[int] = self.tokenize(b" " + prompt.encode("utf-8")) - text: bytes = b"" - returned_tokens: int = 0 - stop = ( - stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] - ) - model_name: str = model if model is not None else self.model_path - - if self.verbose: - falcon_cpp.falcon_reset_timings(self.ctx) - - if len(prompt_tokens) > self._n_ctx: - raise ValueError( - f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}" - ) - - # Truncate max_tokens if requested tokens would exceed the context window - max_tokens = ( - max_tokens - if max_tokens + len(prompt_tokens) < self._n_ctx - else (self._n_ctx - len(prompt_tokens)) - ) - - if stop != []: - stop_sequences = [s.encode("utf-8") for s in stop] - else: - stop_sequences = [] - - if logprobs is not None and self.params.logits_all is False: - raise ValueError( - "logprobs is not supported for models created with logits_all=False" - ) - - if self.cache: - try: - cache_item = self.cache[prompt_tokens] - cache_prefix_len = Falcon.longest_token_prefix( - cache_item.input_ids.tolist(), prompt_tokens - ) - eval_prefix_len = Falcon.longest_token_prefix( - self._input_ids.tolist(), prompt_tokens - ) - if cache_prefix_len > eval_prefix_len: - self.load_state(cache_item) - if self.verbose: - print("Falcon._create_completion: cache hit", file=sys.stderr) - except KeyError: - if self.verbose: - print("Falcon._create_completion: cache miss", file=sys.stderr) - - finish_reason = "length" - multibyte_fix = 0 - for token in self.generate( - prompt_tokens, - top_k=top_k, - top_p=top_p, - temp=temperature, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - repeat_penalty=repeat_penalty, - stopping_criteria=stopping_criteria, - logits_processor=logits_processor, - ): - if token == self._token_eos: - text = self.detokenize(completion_tokens) - finish_reason = "stop" - break - - completion_tokens.append(token) - - all_text = self.detokenize(completion_tokens) - - # Contains multi-byte UTF8 - for k, char in enumerate(all_text[-3:]): - k = 3 - k - for num, pattern in [(2, 192), (3, 224), (4, 240)]: - # Bitwise AND check - if num > k and pattern & char == pattern: - multibyte_fix = num - k - - # Stop incomplete bytes from passing - if multibyte_fix > 0: - multibyte_fix -= 1 - continue - - any_stop = [s for s in stop_sequences if s in all_text] - if len(any_stop) > 0: - first_stop = any_stop[0] - text = all_text[: all_text.index(first_stop)] - finish_reason = "stop" - break - - if stream: - remaining_tokens = completion_tokens[returned_tokens:] - remaining_text = self.detokenize(remaining_tokens) - remaining_length = len(remaining_text) - - # We want to avoid yielding any characters from - # the generated text if they are part of a stop - # sequence. - first_stop_position = 0 - for s in stop_sequences: - for i in range(min(len(s), remaining_length), 0, -1): - if remaining_text.endswith(s[:i]): - if i > first_stop_position: - first_stop_position = i - break - - token_end_position = 0 - for token in remaining_tokens: - token_end_position += len(self.detokenize([token])) - # Check if stop sequence is in the token - if token_end_position >= ( - remaining_length - first_stop_position - 1 - ): - break - logprobs_or_none: Optional[CompletionLogprobs] = None - if logprobs is not None: - token_str = self.detokenize([token]).decode( - "utf-8", errors="ignore" - ) - text_offset = len(prompt) + len( - self.detokenize(completion_tokens[:returned_tokens]) - ) - token_offset = len(prompt_tokens) + returned_tokens - logits = self._scores[token_offset - 1, :].tolist() - current_logprobs = Falcon.logits_to_logprobs(logits) - sorted_logprobs = list( - sorted( - zip(current_logprobs, range(len(current_logprobs))), - reverse=True, - ) - ) - top_logprob = { - self.detokenize([i]).decode( - "utf-8", errors="ignore" - ): logprob - for logprob, i in sorted_logprobs[:logprobs] - } - top_logprob.update({token_str: current_logprobs[int(token)]}) - logprobs_or_none = { - "tokens": [ - self.detokenize([token]).decode( - "utf-8", errors="ignore" - ) - ], - "text_offset": [text_offset], - "token_logprobs": [sorted_logprobs[int(token)][0]], - "top_logprobs": [top_logprob], - } - returned_tokens += 1 - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": self.detokenize([token]).decode( - "utf-8", errors="ignore" - ), - "index": 0, - "logprobs": logprobs_or_none, - "finish_reason": None, - } - ], - } - - if len(completion_tokens) >= max_tokens: - text = self.detokenize(completion_tokens) - finish_reason = "length" - break - - if stopping_criteria is not None and stopping_criteria( - self._input_ids.tolist(), self._scores[-1, :].tolist() - ): - text = self.detokenize(completion_tokens) - finish_reason = "stop" - - if self.verbose: - falcon_cpp.falcon_print_timings(self.ctx) - - if stream: - remaining_tokens = completion_tokens[returned_tokens:] - all_text = self.detokenize(remaining_tokens) - any_stop = [s for s in stop_sequences if s in all_text] - if len(any_stop) > 0: - end = min(all_text.index(stop) for stop in any_stop) - else: - end = len(all_text) - - token_end_position = 0 - for token in remaining_tokens: - token_end_position += len(self.detokenize([token])) - - logprobs_or_none: Optional[CompletionLogprobs] = None - if logprobs is not None: - token_str = self.detokenize([token]).decode( - "utf-8", errors="ignore" - ) - text_offset = len(prompt) + len( - self.detokenize(completion_tokens[:returned_tokens]) - ) - token_offset = len(prompt_tokens) + returned_tokens - 1 - logits = self._scores[token_offset, :].tolist() - current_logprobs = Falcon.logits_to_logprobs(logits) - sorted_logprobs = list( - sorted( - zip(current_logprobs, range(len(current_logprobs))), - reverse=True, - ) - ) - top_logprob = { - self.detokenize([i]).decode("utf-8", errors="ignore"): logprob - for logprob, i in sorted_logprobs[:logprobs] - } - top_logprob.update({token_str: current_logprobs[int(token)]}) - logprobs_or_none = { - "tokens": [ - self.detokenize([token]).decode("utf-8", errors="ignore") - ], - "text_offset": [text_offset], - "token_logprobs": [sorted_logprobs[int(token)][0]], - "top_logprobs": [top_logprob], - } - - if token_end_position >= end: - last_text = self.detokenize([token]) - if token_end_position == end - 1: - break - returned_tokens += 1 - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": last_text[ - : len(last_text) - (token_end_position - end) - ].decode("utf-8", errors="ignore"), - "index": 0, - "logprobs": logprobs_or_none, - "finish_reason": finish_reason, - } - ], - } - break - returned_tokens += 1 - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": self.detokenize([token]).decode( - "utf-8", errors="ignore" - ), - "index": 0, - "logprobs": logprobs_or_none, - "finish_reason": finish_reason - if returned_tokens == len(completion_tokens) - else None, - } - ], - } - if self.cache: - if self.verbose: - print("Falcon._create_completion: cache save", file=sys.stderr) - self.cache[prompt_tokens + completion_tokens] = self.save_state() - print("Falcon._create_completion: cache saved", file=sys.stderr) - return - - if self.cache: - if self.verbose: - print("Falcon._create_completion: cache save", file=sys.stderr) - self.cache[prompt_tokens + completion_tokens] = self.save_state() - - text_str = text.decode("utf-8", errors="ignore") - - if echo: - text_str = prompt + text_str - - if suffix is not None: - text_str = text_str + suffix - - logprobs_or_none: Optional[CompletionLogprobs] = None - if logprobs is not None: - text_offset = 0 if echo else len(prompt) - token_offset = 0 if echo else len(prompt_tokens[1:]) - text_offsets: List[int] = [] - token_logprobs: List[Optional[float]] = [] - tokens: List[str] = [] - top_logprobs: List[Optional[Dict[str, float]]] = [] - - if echo: - # Remove leading BOS token - all_tokens = prompt_tokens[1:] + completion_tokens - else: - all_tokens = completion_tokens - - all_token_strs = [ - self.detokenize([token]).decode("utf-8", errors="ignore") - for token in all_tokens - ] - all_logprobs = [ - Falcon.logits_to_logprobs(row.tolist()) for row in self._scores - ][token_offset:] - for token, token_str, logprobs_token in zip( - all_tokens, all_token_strs, all_logprobs - ): - text_offsets.append(text_offset) - text_offset += len(token_str) - tokens.append(token_str) - sorted_logprobs = list( - sorted( - zip(logprobs_token, range(len(logprobs_token))), reverse=True - ) - ) - token_logprobs.append(sorted_logprobs[int(token)][0]) - top_logprob: Optional[Dict[str, float]] = { - self.detokenize([i]).decode("utf-8", errors="ignore"): logprob - for logprob, i in sorted_logprobs[:logprobs] - } - top_logprob.update({token_str: logprobs_token[int(token)]}) - top_logprobs.append(top_logprob) - # Weird idosincracy of the OpenAI API where - # token_logprobs and top_logprobs are null for - # the first token. - if echo and len(all_tokens) > 0: - token_logprobs[0] = None - top_logprobs[0] = None - logprobs_or_none = { - "tokens": tokens, - "text_offset": text_offsets, - "token_logprobs": token_logprobs, - "top_logprobs": top_logprobs, - } - - yield { - "id": completion_id, - "object": "text_completion", - "created": created, - "model": model_name, - "choices": [ - { - "text": text_str, - "index": 0, - "logprobs": logprobs_or_none, - "finish_reason": finish_reason, - } - ], - "usage": { - "prompt_tokens": len(prompt_tokens), - "completion_tokens": len(completion_tokens), - "total_tokens": len(prompt_tokens) + len(completion_tokens), - }, - } - - def create_completion( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 128, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, - ) -> Union[Completion, Iterator[CompletionChunk]]: - """Generate text from a prompt. - - Args: - prompt: The prompt to generate text from. - suffix: A suffix to append to the generated text. If None, no suffix is appended. - max_tokens: The maximum number of tokens to generate. - temperature: The temperature to use for sampling. - top_p: The top-p value to use for sampling. - logprobs: The number of logprobs to return. If None, no logprobs are returned. - echo: Whether to echo the prompt. - stop: A list of strings to stop generation when encountered. - repeat_penalty: The penalty to apply to repeated tokens. - top_k: The top-k value to use for sampling. - stream: Whether to stream the results. - - Raises: - ValueError: If the requested tokens exceed the context window. - RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt. - - Returns: - Response object containing the generated text. - """ - completion_or_chunks = self._create_completion( - prompt=prompt, - suffix=suffix, - max_tokens=max_tokens, - temperature=temperature, - top_p=top_p, - logprobs=logprobs, - echo=echo, - stop=stop, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - repeat_penalty=repeat_penalty, - top_k=top_k, - stream=stream, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - stopping_criteria=stopping_criteria, - logits_processor=logits_processor, - ) - if stream: - chunks: Iterator[CompletionChunk] = completion_or_chunks - return chunks - completion: Completion = next(completion_or_chunks) # type: ignore - return completion - - def __call__( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 128, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, - ) -> Union[Completion, Iterator[CompletionChunk]]: - """Generate text from a prompt. - - Args: - prompt: The prompt to generate text from. - suffix: A suffix to append to the generated text. If None, no suffix is appended. - max_tokens: The maximum number of tokens to generate. - temperature: The temperature to use for sampling. - top_p: The top-p value to use for sampling. - logprobs: The number of logprobs to return. If None, no logprobs are returned. - echo: Whether to echo the prompt. - stop: A list of strings to stop generation when encountered. - repeat_penalty: The penalty to apply to repeated tokens. - top_k: The top-k value to use for sampling. - stream: Whether to stream the results. - - Raises: - ValueError: If the requested tokens exceed the context window. - RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt. - - Returns: - Response object containing the generated text. - """ - return self.create_completion( - prompt=prompt, - suffix=suffix, - max_tokens=max_tokens, - temperature=temperature, - top_p=top_p, - logprobs=logprobs, - echo=echo, - stop=stop, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - repeat_penalty=repeat_penalty, - top_k=top_k, - stream=stream, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - stopping_criteria=stopping_criteria, - logits_processor=logits_processor, - ) - - def _convert_text_completion_to_chat( - self, completion: Completion - ) -> ChatCompletion: - return { - "id": "chat" + completion["id"], - "object": "chat.completion", - "created": completion["created"], - "model": completion["model"], - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": completion["choices"][0]["text"], - }, - "finish_reason": completion["choices"][0]["finish_reason"], - } - ], - "usage": completion["usage"], - } - - def _convert_text_completion_chunks_to_chat( - self, - chunks: Iterator[CompletionChunk], - ) -> Iterator[ChatCompletionChunk]: - for i, chunk in enumerate(chunks): - if i == 0: - yield { - "id": "chat" + chunk["id"], - "model": chunk["model"], - "created": chunk["created"], - "object": "chat.completion.chunk", - "choices": [ - { - "index": 0, - "delta": { - "role": "assistant", - }, - "finish_reason": None, - } - ], - } - yield { - "id": "chat" + chunk["id"], - "model": chunk["model"], - "created": chunk["created"], - "object": "chat.completion.chunk", - "choices": [ - { - "index": 0, - "delta": { - "content": chunk["choices"][0]["text"], - }, - "finish_reason": chunk["choices"][0]["finish_reason"], - } - ], - } - - def create_chat_completion( - self, - messages: List[ChatCompletionMessage], - temperature: float = 0.2, - top_p: float = 0.95, - top_k: int = 40, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - max_tokens: int = 256, - presence_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - logits_processor: Optional[LogitsProcessorList] = None, - ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: - """Generate a chat completion from a list of messages. - - Args: - messages: A list of messages to generate a response for. - temperature: The temperature to use for sampling. - top_p: The top-p value to use for sampling. - top_k: The top-k value to use for sampling. - stream: Whether to stream the results. - stop: A list of strings to stop generation when encountered. - max_tokens: The maximum number of tokens to generate. - repeat_penalty: The penalty to apply to repeated tokens. - - Returns: - Generated chat completion or a stream of chat completion chunks. - """ - stop = ( - stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] - ) - chat_history = "".join( - f'### {"Human" if message["role"] == "user" else "Assistant"}:{message["content"]}' - for message in messages - ) - PROMPT = chat_history + "### Assistant:" - PROMPT_STOP = ["### Assistant:", "### Human:"] - completion_or_chunks = self( - prompt=PROMPT, - stop=PROMPT_STOP + stop, - temperature=temperature, - top_p=top_p, - top_k=top_k, - stream=stream, - max_tokens=max_tokens, - repeat_penalty=repeat_penalty, - presence_penalty=presence_penalty, - frequency_penalty=frequency_penalty, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - model=model, - logits_processor=logits_processor, - ) - if stream: - chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore - return self._convert_text_completion_chunks_to_chat(chunks) - else: - completion: Completion = completion_or_chunks # type: ignore - return self._convert_text_completion_to_chat(completion) - - def __del__(self): - if self.model is not None: - falcon_cpp.falcon_free_model(self.model) - self.model = None - if self.ctx is not None: - falcon_cpp.falcon_free(self.ctx) - self.ctx = None - - def __getstate__(self): - return dict( - verbose=self.verbose, - model_path=self.model_path, - n_ctx=self.params.n_ctx, - n_gpu_layers=self.params.n_gpu_layers, - seed=self.params.seed, - f16_kv=self.params.f16_kv, - logits_all=self.params.logits_all, - vocab_only=self.params.vocab_only, - use_mmap=self.params.use_mmap, - use_mlock=self.params.use_mlock, - embedding=self.params.embedding, - low_vram=self.params.low_vram, - last_n_tokens_size=self.last_n_tokens_size, - n_batch=self.n_batch, - n_threads=self.n_threads, - lora_base=self.lora_base, - lora_path=self.lora_path, - ### DEPRECATED ### - n_parts=self.n_parts, - ### DEPRECATED ### - ) - - def __setstate__(self, state): - self.__init__( - model_path=state["model_path"], - n_ctx=state["n_ctx"], - n_parts=state["n_parts"], - n_gpu_layers=state["n_gpu_layers"], - seed=state["seed"], - f16_kv=state["f16_kv"], - logits_all=state["logits_all"], - vocab_only=state["vocab_only"], - use_mmap=state["use_mmap"], - use_mlock=state["use_mlock"], - embedding=state["embedding"], - low_vram=state["low_vram"], - n_threads=state["n_threads"], - n_batch=state["n_batch"], - last_n_tokens_size=state["last_n_tokens_size"], - lora_base=state["lora_base"], - lora_path=state["lora_path"], - verbose=state["verbose"], - ) - - def save_state(self) -> FalconState: - assert self.ctx is not None - if self.verbose: - print("Falcon.save_state: saving falcon state", file=sys.stderr) - state_size = falcon_cpp.falcon_get_state_size(self.ctx) - if self.verbose: - print(f"Falcon.save_state: got state size: {state_size}", file=sys.stderr) - falcon_state = (falcon_cpp.c_uint8 * int(state_size))() - if self.verbose: - print("Falcon.save_state: allocated state", file=sys.stderr) - n_bytes = falcon_cpp.falcon_copy_state_data(self.ctx, falcon_state) - if self.verbose: - print(f"Falcon.save_state: copied falcon state: {n_bytes}", file=sys.stderr) - if int(n_bytes) > int(state_size): - raise RuntimeError("Failed to copy Falcon state data") - falcon_state_compact = (falcon_cpp.c_uint8 * int(n_bytes))() - falcon_cpp.ctypes.memmove(falcon_state_compact, falcon_state, int(n_bytes)) - if self.verbose: - print( - f"Falcon.save_state: saving {n_bytes} bytes of falcon state", - file=sys.stderr, - ) - return FalconState( - scores=self.scores.copy(), - input_ids=self.input_ids.copy(), - n_tokens=self.n_tokens, - falcon_state=bytes(falcon_state_compact), - falcon_state_size=n_bytes, - ) - - def load_state(self, state: FalconState) -> None: - assert self.ctx is not None - self.scores = state.scores.copy() - self.input_ids = state.input_ids.copy() - self.n_tokens = state.n_tokens - state_size = state.falcon_state_size - FalconStateArrayType = falcon_cpp.c_uint8 * state_size - falcon_state = FalconStateArrayType.from_buffer_copy(state.falcon_state) - - if falcon_cpp.falcon_set_state_data(self.ctx, falcon_state) != state_size: - raise RuntimeError("Failed to set Falcon state data") - - def n_ctx(self) -> int: - """Return the context window size.""" - assert self.ctx is not None - return falcon_cpp.falcon_n_ctx(self.ctx) - - def n_embd(self) -> int: - """Return the embedding size.""" - assert self.ctx is not None - return falcon_cpp.falcon_n_embd(self.ctx) - - def n_vocab(self) -> int: - """Return the vocabulary size.""" - assert self.ctx is not None - return falcon_cpp.falcon_n_vocab(self.ctx) - - def tokenizer(self) -> "FalconTokenizer": - """Return the tokenizer for this model.""" - assert self.ctx is not None - return FalconTokenizer(self) - - @staticmethod - def token_eos() -> int: - """Return the end-of-sequence token.""" - return falcon_cpp.falcon_token_eos() - - @staticmethod - def token_bos() -> int: - """Return the beginning-of-sequence token.""" - return falcon_cpp.falcon_token_bos() - - @staticmethod - def token_nl() -> int: - """Return the newline token.""" - return falcon_cpp.falcon_token_nl() - - @staticmethod - def logits_to_logprobs(logits: List[float]) -> List[float]: - exps = [math.exp(float(x)) for x in logits] - sum_exps = sum(exps) - return [math.log(x / sum_exps) for x in exps] - - @staticmethod - def longest_token_prefix(a: Sequence[int], b: Sequence[int]): - longest_prefix = 0 - for _a, _b in zip(a, b): - if _a == _b: - longest_prefix += 1 - else: - break - return longest_prefix - - -class FalconTokenizer: - def __init__(self, falcon: Falcon): - self.falcon = falcon - - def encode(self, text: str, add_bos: bool = True) -> List[int]: - return self.falcon.tokenize( - text.encode("utf-8", errors="ignore"), add_bos=add_bos - ) - - def decode(self, tokens: List[int]) -> str: - return self.falcon.detokenize(tokens).decode("utf-8", errors="ignore") - - @classmethod - def from_ggml_file(cls, path: str) -> "FalconTokenizer": - return cls(Falcon(model_path=path, vocab_only=True)) diff --git a/falcon_cpp/falcon_cpp.py b/falcon_cpp/falcon_cpp.py deleted file mode 100644 index d0c0455e6..000000000 --- a/falcon_cpp/falcon_cpp.py +++ /dev/null @@ -1,1024 +0,0 @@ -import sys -import os -import ctypes -from ctypes import ( - c_int, - c_float, - c_char_p, - c_void_p, - c_bool, - POINTER, - _Pointer, # type: ignore - Structure, - Array, - c_uint8, - c_size_t, -) -import pathlib -from typing import List, Union - - -# Load the library -def _load_shared_library(lib_base_name: str): - # Construct the paths to the possible shared library names - _base_path = pathlib.Path(__file__).parent.resolve() - # Searching for the library in the current directory under the name "libFalcon" (default name - # for falconcpp) and "falcon" (default name for this repo) - _lib_paths: List[pathlib.Path] = [] - # Determine the file extension based on the platform - if sys.platform.startswith("linux"): - _lib_paths += [ - _base_path / f"lib{lib_base_name}.so", - ] - elif sys.platform == "darwin": - _lib_paths += [ - _base_path / f"lib{lib_base_name}.so", - _base_path / f"lib{lib_base_name}.dylib", - ] - elif sys.platform == "win32": - _lib_paths += [ - _base_path / f"{lib_base_name}.dll", - ] - else: - raise RuntimeError("Unsupported platform") - - if "FALCON_CPP_LIB" in os.environ: - lib_base_name = os.environ["FALCON_CPP_LIB"] - _lib = pathlib.Path(lib_base_name) - _base_path = _lib.parent.resolve() - _lib_paths = [_lib.resolve()] - - cdll_args = dict() # type: ignore - # Add the library directory to the DLL search path on Windows (if needed) - if sys.platform == "win32" and sys.version_info >= (3, 8): - os.add_dll_directory(str(_base_path)) - if "CUDA_PATH" in os.environ: - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) - cdll_args["winmode"] = 0 - - # Try to load the shared library, handling potential errors - for _lib_path in _lib_paths: - if _lib_path.exists(): - try: - return ctypes.CDLL(str(_lib_path), **cdll_args) - except Exception as e: - raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") - - raise FileNotFoundError( - f"Shared library with base name '{lib_base_name}' not found" - ) - - -# Specify the base name of the shared library to load -_lib_base_name = "ggllm" - -# Load the library -_lib = _load_shared_library(_lib_base_name) - -# Misc -c_float_p = POINTER(c_float) -c_uint8_p = POINTER(c_uint8) -c_size_t_p = POINTER(c_size_t) - -# falcon.h bindings - -GGML_USE_CUBLAS = hasattr(_lib, "ggml_init_cublas") -GGML_CUDA_MAX_DEVICES = ctypes.c_int(16) -FALCON_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else ctypes.c_int(1) - -# #define FALCON_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' -FALCON_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74) -# #define FALCON_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' -FALCON_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61) -# #define FALCON_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' -FALCON_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66) -# #define FLACON_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' -FALCON_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C) -# #define FALCON_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' -FALCON_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E) - -# #define FALCON_FILE_VERSION 3 -FALCON_FILE_VERSION = c_int(3) -FALCON_FILE_MAGIC = FALCON_FILE_MAGIC_GGJT -FALCON_FILE_MAGIC_UNVERSIONED = FALCON_FILE_MAGIC_GGML -FALCON_SESSION_MAGIC = FALCON_FILE_MAGIC_GGSN -FALCON_SESSION_VERSION = c_int(1) - -# struct falcon_model; -falcon_model_p = c_void_p - -# struct falcon_context; -falcon_context_p = c_void_p - - -# typedef int falcon_token; -falcon_token = c_int -falcon_token_p = POINTER(falcon_token) - - -# typedef struct falcon_token_data { -# falcon_token id; // token id -# float logit; // log-odds of the token -# float p; // probability of the token -# } falcon_token_data; -class falcon_token_data(Structure): - _fields_ = [ - ("id", falcon_token), - ("logit", c_float), - ("p", c_float), - ] - - -falcon_token_data_p = POINTER(falcon_token_data) - - -# typedef struct falcon_token_data_array { -# falcon_token_data * data; -# size_t size; -# bool sorted; -# } falcon_token_data_array; -class falcon_token_data_array(Structure): - _fields_ = [ - ("data", falcon_token_data_p), - ("size", c_size_t), - ("sorted", c_bool), - ] - - -falcon_token_data_array_p = POINTER(falcon_token_data_array) - -# typedef void (*falcon_progress_callback)(float progress, void *ctx); -falcon_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) - - -# struct falcon_context_params { -# int seed; // RNG seed, -1 for random -# int n_ctx; // text context -# int n_batch; // prompt processing batch size -# int n_gpu_layers; // number of layers to store in VRAM -# int main_gpu; // the GPU that is used for scratch and small tensors -# float tensor_split[FALCON_MAX_DEVICES]; // how to split layers across multiple GPUs -# // called with a progress value between 0 and 1, pass NULL to disable -# falcon_progress_callback progress_callback; -# // context pointer passed to the progress callback -# void * progress_callback_user_data; - - -# // Keep the booleans together to avoid misalignment during copy-by-value. -# bool low_vram; // if true, reduce VRAM usage at the cost of performance -# bool f16_kv; // use fp16 for KV cache -# bool logits_all; // the falcon_eval() call computes all logits, not just the last one -# bool vocab_only; // only load the vocabulary, no weights -# bool use_mmap; // use mmap if possible -# bool use_mlock; // force system to keep model in RAM -# bool embedding; // embedding mode only -# }; -class ggllm_context_params(Structure): - _fields_ = [ - ("seed", c_int), - ("n_ctx", c_int), - ("n_batch", c_int), - ("n_gpu_layers", c_int), - ("main_gpu", c_int), - ("tensor_split", c_float * FALCON_MAX_DEVICES.value), - ("progress_callback", falcon_progress_callback), - ("progress_callback_user_data", c_void_p), - ("low_vram", c_bool), - ("f16_kv", c_bool), - ("logits_all", c_bool), - ("vocab_only", c_bool), - ("use_mmap", c_bool), - ("use_mlock", c_bool), - ("embedding", c_bool), - ] - - -falcon_context_params_p = POINTER(ggllm_context_params) - -# enum falcon_ftype { -# FALCON_FTYPE_ALL_F32 = 0, -# FALCON_FTYPE_MOSTLY_F16 = 1, // except 1d tensors -# FALCON_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors -# FALCON_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors -# FALCON_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 -# // FALCON_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed -# // FALCON_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed -# FALCON_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors -# FALCON_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors -# FALCON_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors -# FALCON_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors -# FALCON_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors -# FALCON_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors -# FALCON_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors -# FALCON_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors -# FALCON_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors -# FALCON_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors -# FALCON_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors -# FALCON_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors -# }; -FALCON_FTYPE_ALL_F32 = c_int(0) -FALCON_FTYPE_MOSTLY_F16 = c_int(1) -FALCON_FTYPE_MOSTLY_Q4_0 = c_int(2) -FALCON_FTYPE_MOSTLY_Q4_1 = c_int(3) -FALCON_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4) -FALCON_FTYPE_MOSTLY_Q8_0 = c_int(7) -FALCON_FTYPE_MOSTLY_Q5_0 = c_int(8) -FALCON_FTYPE_MOSTLY_Q5_1 = c_int(9) -FALCON_FTYPE_MOSTLY_Q2_K = c_int(10) -FALCON_FTYPE_MOSTLY_Q3_K_S = c_int(11) -FALCON_FTYPE_MOSTLY_Q3_K_M = c_int(12) -FALCON_FTYPE_MOSTLY_Q3_K_L = c_int(13) -FALCON_FTYPE_MOSTLY_Q4_K_S = c_int(14) -FALCON_FTYPE_MOSTLY_Q4_K_M = c_int(15) -FALCON_FTYPE_MOSTLY_Q5_K_S = c_int(16) -FALCON_FTYPE_MOSTLY_Q5_K_M = c_int(17) -FALCON_FTYPE_MOSTLY_Q6_K = c_int(18) - - -# // model quantization parameters -# typedef struct falcon_model_quantize_params { -# int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() -# enum falcon_ftype ftype; // quantize to this falcon_ftype -# bool allow_requantize; // allow quantizing non-f32/f16 tensors -# bool quantize_output_tensor; // quantize output.weight -# } falcon_model_quantize_params; -class falcon_model_quantize_params(Structure): - _fields_ = [ - ("nthread", c_int), - ("ftype", c_int), - ("allow_requantize", c_bool), - ("quantize_output_tensor", c_bool), - ] - - -# FALCON_API struct falcon_context_params falcon_context_default_params(); -def falcon_context_default_params() -> ggllm_context_params: - return _lib.ggllm_context_default_params() - - -_lib.ggllm_context_default_params.argtypes = [] -_lib.ggllm_context_default_params.restype = ggllm_context_params - - -# FALCON_API struct falcon_model_quantize_params falcon_model_quantize_default_params(); -def falcon_model_quantize_default_params() -> falcon_model_quantize_params: - return _lib.ggllm_model_quantize_default_params() - - -_lib.ggllm_model_quantize_default_params.argtypes = [] -_lib.ggllm_model_quantize_default_params.restype = falcon_model_quantize_params - - -# FALCON_API bool falcon_mmap_supported(); -def falcon_mmap_supported() -> bool: - return _lib.ggllm_mmap_supported() - - -_lib.ggllm_mmap_supported.argtypes = [] -_lib.ggllm_mmap_supported.restype = c_bool - - -# FALCON_API bool falcon_mlock_supported(); -def falcon_mlock_supported() -> bool: - return _lib.ggllm_mlock_supported() - - -_lib.ggllm_mlock_supported.argtypes = [] -_lib.ggllm_mlock_supported.restype = c_bool - - -# // TODO: not great API - very likely to change -# // Initialize the falcon + ggml backend -# // If numa is true, use NUMA optimizations -# // Call once at the start of the program -# FLACON_API void falcon_init_backend(bool numa); -def falcon_init_backend(numa: c_bool): - return _lib.ggllm_init_backend(numa) - - -_lib.ggllm_init_backend.argtypes = [c_bool] -_lib.ggllm_init_backend.restype = None - - -# FALCON_API struct falcon_model * falcon_load_model_from_file( -# const char * path_model, -# struct falcon_context_params params); -def falcon_load_model_from_file( - path_model: bytes, params: ggllm_context_params -) -> falcon_model_p: - return _lib.ggllm_load_model_from_file(path_model, params) - - -_lib.ggllm_load_model_from_file.argtypes = [c_char_p, ggllm_context_params] -_lib.ggllm_load_model_from_file.restype = falcon_model_p - - -# FALCON_API void falcon_free_model(struct falcon_model * model); -def falcon_free_model(model: falcon_model_p): - return _lib.ggllm_free_model(model) - - -_lib.ggllm_free_model.argtypes = [falcon_model_p] -_lib.ggllm_free_model.restype = None - - -# FALCON_API struct falcon_context * falcon_new_context_with_model( -# struct falcon_model * model, -# struct falcon_context_params params); -def falcon_new_context_with_model( - model: falcon_model_p, params: ggllm_context_params -) -> falcon_context_p: - return _lib.ggllm_new_context_with_model(model, params) - - -_lib.ggllm_new_context_with_model.argtypes = [falcon_model_p, ggllm_context_params] -_lib.ggllm_new_context_with_model.restype = falcon_context_p - - -# FALCON_API int64_t ggllm_time_us(); -def ggllm_time_us() -> int: - return _lib.ggllm_time_us() - - -_lib.ggllm_time_us.argtypes = [] -_lib.ggllm_time_us.restype = ctypes.c_int64 - - -# // Various functions for loading a ggml falcon model. -# // Allocate (almost) all memory needed for the model. -# // Return NULL on failure -# FALCON_API struct falcon_context * falcon_init_from_file( -# const char * path_model, -# struct falcon_context_params params); -def ggllm_init_from_file( - path_model: bytes, params: ggllm_context_params -) -> falcon_context_p: - return _lib.ggllm_init_from_file(path_model, params) - - -_lib.ggllm_init_from_file.argtypes = [c_char_p, ggllm_context_params] -_lib.ggllm_init_from_file.restype = falcon_context_p - - -# Frees all allocated memory -# FALCON_API void falcon_free(struct falcon_context * ctx); -def falcon_free(ctx: falcon_context_p): - return _lib.ggllm_free(ctx) - - -_lib.ggllm_free.argtypes = [falcon_context_p] -_lib.ggllm_free.restype = None - - -# // Returns 0 on success -# FALCON_API int ggllm_model_quantize( -# const char * fname_inp, -# const char * fname_out, -# const falcon_model_quantize_params * params); -def ggllm_model_quantize( - fname_inp: bytes, - fname_out: bytes, - params, # type: POINTER(falcon_model_quantize_params) # type: ignore -) -> int: - return _lib.ggllm_model_quantize(fname_inp, fname_out, params) - - -_lib.ggllm_model_quantize.argtypes = [ - c_char_p, - c_char_p, - POINTER(falcon_model_quantize_params), -] -_lib.ggllm_model_quantize.restype = c_int - - -# Apply a LoRA adapter to a loaded model -# path_base_model is the path to a higher quality model to use as a base for -# the layers modified by the adapter. Can be NULL to use the current loaded model. -# The model needs to be reloaded before applying a new adapter, otherwise the adapter -# will be applied on top of the previous one -# Returns 0 on success -# FALCON_API int falcon_apply_lora_from_file( -# struct falcon_context * ctx, -# const char * path_lora, -# const char * path_base_model, -# int n_threads); -def ggllm_apply_lora_from_file( - ctx: falcon_context_p, - path_lora: c_char_p, - path_base_model: c_char_p, - n_threads: c_int, -) -> int: - return _lib.ggllm_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) - - -_lib.ggllm_apply_lora_from_file.argtypes = [falcon_context_p, c_char_p, c_char_p, c_int] -_lib.ggllm_apply_lora_from_file.restype = c_int - - -# FALCON_API int ggllm_model_apply_lora_from_file( -# const struct ggllm_model * model, -# const char * path_lora, -# const char * path_base_model, -# int n_threads); -def falcon_model_apply_lora_from_file( - model: falcon_model_p, - path_lora: Union[c_char_p, bytes], - path_base_model: Union[c_char_p, bytes], - n_threads: c_int, -) -> int: - return _lib.ggllm_model_apply_lora_from_file( - model, path_lora, path_base_model, n_threads - ) - - -_lib.ggllm_model_apply_lora_from_file.argtypes = [ - falcon_model_p, - c_char_p, - c_char_p, - c_int, -] -_lib.ggllm_model_apply_lora_from_file.restype = c_int - - -# Returns the number of tokens in the KV cache -# FALCON_API int falcon_get_kv_cache_token_count(const struct falcon_context * ctx); -def ggllm_get_kv_cache_token_count(ctx: falcon_context_p) -> int: - return _lib.ggllm_get_kv_cache_token_count(ctx) - - -_lib.ggllm_get_kv_cache_token_count.argtypes = [falcon_context_p] -_lib.ggllm_get_kv_cache_token_count.restype = c_int - - -# Sets the current rng seed. -# FALCON_API void falcon_set_rng_seed(struct falcon_context * ctx, int seed); -def falcon_set_rng_seed(ctx: falcon_context_p, seed: c_int): - return _lib.ggllm_set_rng_seed(ctx, seed) - - -_lib.ggllm_set_rng_seed.argtypes = [falcon_context_p, c_int] -_lib.ggllm_set_rng_seed.restype = None - - -# Returns the maximum size in bytes of the state (rng, logits, embedding -# and kv_cache) - will often be smaller after compacting tokens -# FALCON_API size_t falcon_get_state_size(const struct falcon_context * ctx); -def falcon_get_state_size(ctx: falcon_context_p) -> int: - return _lib.ggllm_get_state_size(ctx) - - -_lib.ggllm_get_state_size.argtypes = [falcon_context_p] -_lib.ggllm_get_state_size.restype = c_size_t - - -# Copies the state to the specified destination address. -# Destination needs to have allocated enough memory. -# Returns the number of bytes copied -# FALCON_API size_t falcon_copy_state_data(struct falcon_context * ctx, uint8_t * dst); -def falcon_copy_state_data( - ctx: falcon_context_p, dst # type: Array[c_uint8] -) -> int: - return _lib.ggllm_copy_state_data(ctx, dst) - - -_lib.ggllm_copy_state_data.argtypes = [falcon_context_p, c_uint8_p] -_lib.ggllm_copy_state_data.restype = c_size_t - - -# Set the state reading from the specified address -# Returns the number of bytes read -# FALCON_API size_t falcon_set_state_data(struct falcon_context * ctx, uint8_t * src); -def falcon_set_state_data( - ctx: falcon_context_p, src # type: Array[c_uint8] -) -> int: - return _lib.ggllm_set_state_data(ctx, src) - - -_lib.ggllm_set_state_data.argtypes = [falcon_context_p, c_uint8_p] -_lib.ggllm_set_state_data.restype = c_size_t - - -# Save/load session file -# GGLLM_API bool falcon_load_session_file(struct falcon_context * ctx, const char * path_session, falcon_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); -def ggllm_load_session_file( - ctx: falcon_context_p, - path_session: bytes, - tokens_out, # type: Array[falcon_token] - n_token_capacity: c_size_t, - n_token_count_out, # type: _Pointer[c_size_t] -) -> int: - return _lib.ggllm_load_session_file( - ctx, path_session, tokens_out, n_token_capacity, n_token_count_out - ) - - -_lib.ggllm_load_session_file.argtypes = [ - falcon_context_p, - c_char_p, - falcon_token_p, - c_size_t, - c_size_t_p, -] -_lib.ggllm_load_session_file.restype = c_size_t - - -# FALCON_API bool falcon_save_session_file(struct falcon_context * ctx, const char * path_session, const falcon_token * tokens, size_t n_token_count); -def ggllm_save_session_file( - ctx: falcon_context_p, - path_session: bytes, - tokens, # type: Array[falcon_token] - n_token_count: c_size_t, -) -> int: - return _lib.ggllm_save_session_file(ctx, path_session, tokens, n_token_count) - - -_lib.ggllm_save_session_file.argtypes = [ - falcon_context_p, - c_char_p, - falcon_token_p, - c_size_t, -] -_lib.ggllm_save_session_file.restype = c_size_t - - -# Run the falcon inference to obtain the logits and probabilities for the next token. -# tokens + n_tokens is the provided batch of new tokens to process -# n_past is the number of tokens to use from previous eval calls -# Returns 0 on success -# GGLLM_API int falcon_eval( -# struct falcon_context * ctx, -# const falcon_token * tokens, -# int n_tokens, -# int n_past, -# int n_threads); -def falcon_eval( - ctx: falcon_context_p, - tokens, # type: Array[falcon_token] - n_tokens: c_int, - n_past: c_int, - n_threads: c_int, -) -> int: - return _lib.ggllm_eval(ctx, tokens, n_tokens, n_past, n_threads) - - -_lib.ggllm_eval.argtypes = [falcon_context_p, falcon_token_p, c_int, c_int, c_int] -_lib.ggllm_eval.restype = c_int - - -# // Same as falcon_eval, but use float matrix input directly. -# FALCON_API int falcon_eval_embd( -# struct falcon_context * ctx, -# const float * embd, -# int n_tokens, -# int n_past, -# int n_threads); -def ggllm_eval_embd( - ctx: falcon_context_p, - embd, # type: Array[c_float] - n_tokens: c_int, - n_past: c_int, - n_threads: c_int, -) -> int: - return _lib.ggllm_eval_embd(ctx, embd, n_tokens, n_past, n_threads) - - -_lib.ggllm_eval_embd.argtypes = [falcon_context_p, c_float_p, c_int, c_int, c_int] -_lib.ggllm_eval_embd.restype = c_int - - -# Convert the provided text into tokens. -# The tokens pointer must be large enough to hold the resulting tokens. -# Returns the number of tokens on success, no more than n_max_tokens -# Returns a negative number on failure - the number of tokens that would have been returned -# TODO: not sure if correct -# FALCON_API int ggllm_tokenize( -# struct falcon_context * ctx, -# const char * text, -# falcon_token * tokens, -# int n_max_tokens, -# bool add_bos); -def falcon_tokenize( - ctx: falcon_context_p, - text: bytes, - tokens, # type: Array[falcon_token] - n_max_tokens: c_int, - add_bos: c_bool, -) -> int: - return _lib.ggllm_tokenize(ctx, text, tokens, n_max_tokens, add_bos) - - -_lib.ggllm_tokenize.argtypes = [falcon_context_p, c_char_p, falcon_token_p, c_int, c_bool] -_lib.ggllm_tokenize.restype = c_int - - -# GGLLM_API int ggllm_n_vocab(const struct falcon_context * ctx); -def falcon_n_vocab(ctx: falcon_context_p) -> int: - return _lib.ggllm_n_vocab(ctx) - - -_lib.ggllm_n_vocab.argtypes = [falcon_context_p] -_lib.ggllm_n_vocab.restype = c_int - - -# FALCON_API int falcon_n_ctx (const struct falcon_context * ctx); -def falcon_n_ctx(ctx: falcon_context_p) -> int: - return _lib.ggllm_n_ctx(ctx) - - -_lib.ggllm_n_ctx.argtypes = [falcon_context_p] -_lib.ggllm_n_ctx.restype = c_int - - -# FALCON_API int falcon_n_embd (const struct falcon_context * ctx); -def falcon_n_embd(ctx: falcon_context_p) -> int: - return _lib.ggllm_n_embd(ctx) - - -_lib.ggllm_n_embd.argtypes = [falcon_context_p] -_lib.ggllm_n_embd.restype = c_int - - -# // Get the vocabulary as output parameters. -# // Returns number of results. -# FALCON_API int falcon_get_vocab( -# const struct falcon_context * ctx, -# const char * * strings, -# float * scores, -# int capacity); -def falcon_get_vocab( - ctx: falcon_context_p, - strings, # type: Array[c_char_p] # type: ignore - scores, # type: Array[c_float] # type: ignore - capacity: c_int, -) -> int: - return _lib.ggllm_get_vocab(ctx, strings, scores, capacity) - - -_lib.ggllm_get_vocab.argtypes = [falcon_context_p, c_char_p, c_float, c_int] -_lib.ggllm_get_vocab.restype = c_int - - -# Token logits obtained from the last call to falcon_eval() -# The logits for the last token are stored in the last row -# Can be mutated in order to change the probabilities of the next token -# Rows: n_tokens -# Cols: n_vocab -# FALCON_API float * falcon_get_logits(struct falcon_context * ctx); -def falcon_get_logits( - ctx: falcon_context_p, -): # type: (...) -> Array[float] # type: ignore - return _lib.ggllm_get_logits(ctx) - - -_lib.ggllm_get_logits.argtypes = [falcon_context_p] -_lib.ggllm_get_logits.restype = c_float_p - - -# Get the embeddings for the input -# shape: [n_embd] (1-dimensional) -# FALCON_API float * falcon_get_embeddings(struct falcon_context * ctx); -def falcon_get_embeddings( - ctx: falcon_context_p, -): # type: (...) -> Array[float] # type: ignore - return _lib.ggllm_get_embeddings(ctx) - - -_lib.ggllm_get_embeddings.argtypes = [falcon_context_p] -_lib.ggllm_get_embeddings.restype = c_float_p - - -# Token Id -> String. Uses the vocabulary in the provided context -# FLACON_API const char * falcon_token_to_str(const struct falcon_context * ctx, falcon_token token); -def falcon_token_to_str(ctx: falcon_context_p, token: falcon_token) -> bytes: - return _lib.ggllm_token_to_str(ctx, token) - - -_lib.ggllm_token_to_str.argtypes = [falcon_context_p, falcon_token] -_lib.ggllm_token_to_str.restype = c_char_p - -# Special tokens - - -# FALCON_API falcon_token falcon_token_bos(); // beginning-of-sentence -def falcon_token_bos() -> int: - return _lib.ggllm_token_bos() - - -_lib.ggllm_token_bos.argtypes = [] -_lib.ggllm_token_bos.restype = falcon_token - - -# FALCON_API falcon_token falcon_token_eos(); // end-of-sentence -def falcon_token_eos() -> int: - return _lib.ggllm_token_eos() - - -_lib.ggllm_token_eos.argtypes = [] -_lib.ggllm_token_eos.restype = falcon_token - - -# FALCON_API falcon_token falcon_token_nl(); // next-line -def falcon_token_nl() -> int: - return _lib.ggllm_token_nl() - - -_lib.ggllm_token_nl.argtypes = [] -_lib.ggllm_token_nl.restype = falcon_token - - -# Sampling functions - - -# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. -# FALCON_API void falcon_sample_repetition_penalty(struct falcon_context * ctx, falcon_token_data_array * candidates, const falcon_token * last_tokens, size_t last_tokens_size, float penalty); -def falcon_sample_repetition_penalty( - ctx: falcon_context_p, - candidates, # type: _Pointer[falcon_token_data_array] - last_tokens_data, # type: Array[falcon_token] - last_tokens_size: c_int, - penalty: c_float, -): - return _lib.ggllm_sample_repetition_penalty( - ctx, candidates, last_tokens_data, last_tokens_size, penalty - ) - - -_lib.ggllm_sample_repetition_penalty.argtypes = [ - falcon_context_p, - falcon_token_data_array_p, - falcon_token_p, - c_int, - c_float, -] -_lib.ggllm_sample_repetition_penalty.restype = None - - -# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. -# FALCON_API void falcon_sample_frequency_and_presence_penalties(struct falcon_context * ctx, falcon_token_data_array * candidates, const falcon_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); -def falcon_sample_frequency_and_presence_penalties( - ctx: falcon_context_p, - candidates, # type: _Pointer[falcon_token_data_array] - last_tokens_data, # type: Array[falcon_token] - last_tokens_size: c_int, - alpha_frequency: c_float, - alpha_presence: c_float, -): - return _lib.ggllm_sample_frequency_and_presence_penalties( - ctx, - candidates, - last_tokens_data, - last_tokens_size, - alpha_frequency, - alpha_presence, - ) - - -_lib.ggllm_sample_frequency_and_presence_penalties.argtypes = [ - falcon_context_p, - falcon_token_data_array_p, - falcon_token_p, - c_int, - c_float, - c_float, -] -_lib.ggllm_sample_frequency_and_presence_penalties.restype = None - - -# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. -# FALCON_API void falcon_sample_softmax(struct falcon_context * ctx, falcon_token_data_array * candidates); -def falcon_sample_softmax( - ctx: falcon_context_p, candidates # type: _Pointer[falcon_token_data] -): - return _lib.ggllm_sample_softmax(ctx, candidates) - - -_lib.ggllm_sample_softmax.argtypes = [ - falcon_context_p, - falcon_token_data_array_p, -] -_lib.ggllm_sample_softmax.restype = None - - -# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 -# FALCON_API void falcon_sample_top_k(struct falcon_context * ctx, falcon_token_data_array * candidates, int k, size_t min_keep); -def falcon_sample_top_k( - ctx: falcon_context_p, - candidates, # type: _Pointer[falcon_token_data_array] - k: c_int, - min_keep: c_size_t, -): - return _lib.ggllm_sample_top_k(ctx, candidates, k, min_keep) - - -_lib.ggllm_sample_top_k.argtypes = [ - falcon_context_p, - falcon_token_data_array_p, - c_int, - c_size_t, -] -_lib.ggllm_sample_top_k.restype = None - - -# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 -# FALCON_API void falcon_sample_top_p(struct falcon_context * ctx, falcon_token_data_array * candidates, float p, size_t min_keep); -def falcon_sample_top_p( - ctx: falcon_context_p, - candidates, # type: _Pointer[falcon_token_data_array] - p: c_float, - min_keep: c_size_t, -): - return _lib.ggllm_sample_top_p(ctx, candidates, p, min_keep) - - -_lib.ggllm_sample_top_p.argtypes = [ - falcon_context_p, - falcon_token_data_array_p, - c_float, - c_size_t, -] -_lib.ggllm_sample_top_p.restype = None - - -# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. -# FALCON_API void falcon_sample_tail_free(struct falcon_context * ctx, falcon_token_data_array * candidates, float z, size_t min_keep); -def falcon_sample_tail_free( - ctx: falcon_context_p, - candidates, # type: _Pointer[falcon_token_data_array] - z: c_float, - min_keep: c_size_t, -): - return _lib.ggllm_sample_tail_free(ctx, candidates, z, min_keep) - - -_lib.ggllm_sample_tail_free.argtypes = [ - falcon_context_p, - falcon_token_data_array_p, - c_float, - c_size_t, -] -_lib.ggllm_sample_tail_free.restype = None - - -# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. -# FALCON_API void falcon_sample_typical(struct falcon_context * ctx, falcon_token_data_array * candidates, float p, size_t min_keep); -def falcon_sample_typical( - ctx: falcon_context_p, - candidates, # type: _Pointer[falcon_token_data_array] - p: c_float, - min_keep: c_size_t, -): - return _lib.ggllm_sample_typical(ctx, candidates, p, min_keep) - - -_lib.ggllm_sample_typical.argtypes = [ - falcon_context_p, - falcon_token_data_array_p, - c_float, - c_size_t, -] -_lib.ggllm_sample_typical.restype = None - - -# FALCON_API void falcon_sample_temperature(struct falcon_context * ctx, falcon_token_data_array * candidates, float temp); -def falcon_sample_temperature( - ctx: falcon_context_p, - candidates, # type: _Pointer[falcon_token_data_array] - temp: c_float, -): - return _lib.ggllm_sample_temperature(ctx, candidates, temp) - - -_lib.ggllm_sample_temperature.argtypes = [ - falcon_context_p, - falcon_token_data_array_p, - c_float, -] -_lib.ggllm_sample_temperature.restype = None - - -# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. -# @param candidates A vector of `falcon_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. -# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. -# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. -# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. -# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. -# FALCON_API falcon_token falcon_sample_token_mirostat(struct falcon_context * ctx, falcon_token_data_array * candidates, float tau, float eta, int m, float * mu); -def falcon_sample_token_mirostat( - ctx: falcon_context_p, - candidates, # type: _Pointer[falcon_token_data_array] - tau: c_float, - eta: c_float, - m: c_int, - mu, # type: _Pointer[c_float] -) -> int: - return _lib.ggllm_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) - - -_lib.ggllm_sample_token_mirostat.argtypes = [ - falcon_context_p, - falcon_token_data_array_p, - c_float, - c_float, - c_int, - c_float_p, -] -_lib.ggllm_sample_token_mirostat.restype = falcon_token - - -# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. -# @param candidates A vector of `falcon_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. -# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. -# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. -# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. -# FALCON_API falcon_token falcon_sample_token_mirostat_v2(struct falcon_context * ctx, falcon_token_data_array * candidates, float tau, float eta, float * mu); -def falcon_sample_token_mirostat_v2( - ctx: falcon_context_p, - candidates, # type: _Pointer[falcon_token_data_array] - tau: c_float, - eta: c_float, - mu, # type: _Pointer[c_float] -) -> int: - return _lib.ggllm_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) - - -_lib.ggllm_sample_token_mirostat_v2.argtypes = [ - falcon_context_p, - falcon_token_data_array_p, - c_float, - c_float, - c_float_p, -] -_lib.ggllm_sample_token_mirostat_v2.restype = falcon_token - - -# @details Selects the token with the highest probability. -# FALCON_API falcon_token falcon_sample_token_greedy(struct falcon_context * ctx, falcon_token_data_array * candidates); -def falcon_sample_token_greedy( - ctx: falcon_context_p, - candidates, # type: _Pointer[falcon_token_data_array] -) -> int: - return _lib.ggllm_sample_token_greedy(ctx, candidates) - - -_lib.ggllm_sample_token_greedy.argtypes = [ - falcon_context_p, - falcon_token_data_array_p, -] -_lib.ggllm_sample_token_greedy.restype = falcon_token - - -# @details Randomly selects a token from the candidates based on their probabilities. -# FALCON_API falcon_token falcon_sample_token(struct falcon_context * ctx, falcon_token_data_array * candidates); -def falcon_sample_token( - ctx: falcon_context_p, - candidates, # type: _Pointer[falcon_token_data_array] -) -> int: - return _lib.ggllm_sample_token(ctx, candidates) - - -_lib.ggllm_sample_token.argtypes = [ - falcon_context_p, - falcon_token_data_array_p, -] -_lib.ggllm_sample_token.restype = falcon_token - - -# Performance information - - -# FALCON_API void falcon_print_timings(struct falcon_context * ctx); -def falcon_print_timings(ctx: falcon_context_p): - _lib.ggllm_print_timings(ctx) - - -_lib.ggllm_print_timings.argtypes = [falcon_context_p] -_lib.ggllm_print_timings.restype = None - - -# FALCON_API void falcon_reset_timings(struct falcon_context * ctx); -def falcon_reset_timings(ctx: falcon_context_p): - _lib.ggllm_reset_timings(ctx) - - -_lib.ggllm_reset_timings.argtypes = [falcon_context_p] -_lib.ggllm_reset_timings.restype = None - - -# Print system information -# FALCON_API const char * falcon_print_system_info(void); -def falcon_print_system_info() -> bytes: - return _lib.ggllm_print_system_info() - - -_lib.ggllm_print_system_info.argtypes = [] -_lib.ggllm_print_system_info.restype = c_char_p - -################################################################################################### - - -_falcon_initialized = False - -if not _falcon_initialized: - falcon_init_backend(c_bool(False)) - _falcon_initialized = True diff --git a/falcon_cpp/falcon_types.py b/falcon_cpp/falcon_types.py deleted file mode 100644 index 7729ced5a..000000000 --- a/falcon_cpp/falcon_types.py +++ /dev/null @@ -1,97 +0,0 @@ -from typing import List, Optional, Dict -from typing_extensions import TypedDict, NotRequired, Literal - - -class EmbeddingUsage(TypedDict): - prompt_tokens: int - total_tokens: int - - -class EmbeddingData(TypedDict): - index: int - object: str - embedding: List[float] - - -class Embedding(TypedDict): - object: Literal["list"] - model: str - data: List[EmbeddingData] - usage: EmbeddingUsage - - -class CompletionLogprobs(TypedDict): - text_offset: List[int] - token_logprobs: List[Optional[float]] - tokens: List[str] - top_logprobs: List[Optional[Dict[str, float]]] - - -class CompletionChoice(TypedDict): - text: str - index: int - logprobs: Optional[CompletionLogprobs] - finish_reason: Optional[str] - - -class CompletionUsage(TypedDict): - prompt_tokens: int - completion_tokens: int - total_tokens: int - - -class CompletionChunk(TypedDict): - id: str - object: Literal["text_completion"] - created: int - model: str - choices: List[CompletionChoice] - - -class Completion(TypedDict): - id: str - object: Literal["text_completion"] - created: int - model: str - choices: List[CompletionChoice] - usage: CompletionUsage - - -class ChatCompletionMessage(TypedDict): - role: Literal["assistant", "user", "system"] - content: str - user: NotRequired[str] - - -class ChatCompletionChoice(TypedDict): - index: int - message: ChatCompletionMessage - finish_reason: Optional[str] - - -class ChatCompletion(TypedDict): - id: str - object: Literal["chat.completion"] - created: int - model: str - choices: List[ChatCompletionChoice] - usage: CompletionUsage - - -class ChatCompletionChunkDelta(TypedDict): - role: NotRequired[Literal["assistant"]] - content: NotRequired[str] - - -class ChatCompletionChunkChoice(TypedDict): - index: int - delta: ChatCompletionChunkDelta - finish_reason: Optional[str] - - -class ChatCompletionChunk(TypedDict): - id: str - model: str - object: Literal["chat.completion.chunk"] - created: int - choices: List[ChatCompletionChunkChoice] diff --git a/falcon_cpp/server/__init__.py b/falcon_cpp/server/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/falcon_cpp/server/__main__.py b/falcon_cpp/server/__main__.py deleted file mode 100644 index 748a2af33..000000000 --- a/falcon_cpp/server/__main__.py +++ /dev/null @@ -1,50 +0,0 @@ -"""Example FastAPI server for llama.cpp. - -To run this example: - -```bash -pip install fastapi uvicorn sse-starlette -export MODEL=../models/7B/... -``` - -Then run: -``` -uvicorn llama_cpp.server.app:app --reload -``` - -or - -``` -python3 -m llama_cpp.server -``` - -Then visit http://localhost:8000/docs to see the interactive API docs. - -""" -import os -import argparse - -import uvicorn - -from llama_cpp.server.app import create_app, Settings - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - for name, field in Settings.__fields__.items(): - description = field.field_info.description - if field.default is not None and description is not None: - description += f" (default: {field.default})" - parser.add_argument( - f"--{name}", - dest=name, - type=field.type_, - help=description, - ) - - args = parser.parse_args() - settings = Settings(**{k: v for k, v in vars(args).items() if v is not None}) - app = create_app(settings=settings) - - uvicorn.run( - app, host=os.getenv("HOST", settings.host), port=int(os.getenv("PORT", settings.port)) - ) diff --git a/falcon_cpp/server/app.py b/falcon_cpp/server/app.py deleted file mode 100644 index 2e0972ea6..000000000 --- a/falcon_cpp/server/app.py +++ /dev/null @@ -1,550 +0,0 @@ -import json -import multiprocessing -from threading import Lock -from functools import partial -from typing import Iterator, List, Optional, Union, Dict -from typing_extensions import TypedDict, Literal - -import falcon_cpp - -import anyio -from anyio.streams.memory import MemoryObjectSendStream -from starlette.concurrency import run_in_threadpool, iterate_in_threadpool -from fastapi import Depends, FastAPI, APIRouter, Request -from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict -from sse_starlette.sse import EventSourceResponse - - -class Settings(BaseSettings): - model: str = Field( - description="The path to the model to use for generating completions." - ) - model_alias: Optional[str] = Field( - default=None, - description="The alias of the model to use for generating completions.", - ) - n_ctx: int = Field(default=8192, ge=1, description="The context size.") - n_gpu_layers: int = Field( - default=0, - ge=0, - description="The number of layers to put on the GPU. The rest will be on the CPU.", - ) - seed: int = Field( - default=1337, description="Random seed. -1 for random." - ) - n_batch: int = Field( - default=512, ge=1, description="The batch size to use per eval." - ) - n_threads: int = Field( - default=max(multiprocessing.cpu_count() // 2, 1), - ge=1, - description="The number of threads to use.", - ) - f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") - use_mlock: bool = Field( - default=falcon_cpp.falcon_mlock_supported(), - description="Use mlock.", - ) - use_mmap: bool = Field( - default=falcon_cpp.falcon_mmap_supported(), - description="Use mmap.", - ) - embedding: bool = Field(default=True, description="Whether to use embeddings.") - low_vram: bool = Field( - default=False, - description="Whether to use less VRAM. This will reduce performance.", - ) - last_n_tokens_size: int = Field( - default=64, - ge=0, - description="Last n tokens to keep for repeat penalty calculation.", - ) - logits_all: bool = Field(default=True, description="Whether to return logits.") - cache: bool = Field( - default=False, - description="Use a cache to reduce processing times for evaluated prompts.", - ) - cache_type: Literal["ram", "disk"] = Field( - default="ram", - description="The type of cache to use. Only used if cache is True.", - ) - cache_size: int = Field( - default=2 << 30, - description="The size of the cache in bytes. Only used if cache is True.", - ) - vocab_only: bool = Field( - default=False, description="Whether to only return the vocabulary." - ) - verbose: bool = Field( - default=True, description="Whether to print debug information." - ) - host: str = Field( - default="localhost", description="Listen address" - ) - port: int = Field( - default=8000, description="Listen port" - ) - - -router = APIRouter() - -settings: Optional[Settings] = None -falcon: Optional[falcon_cpp.falcon] = None - - -def create_app(settings: Optional[Settings] = None): - if settings is None: - settings = Settings() - app = FastAPI( - title="🦙 falcon.cpp Python API", - version="0.0.1", - ) - app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], - ) - app.include_router(router) - global falcon - falcon = falcon_cpp.Falcon( - model_path=settings.model, - n_gpu_layers=settings.n_gpu_layers, - seed=settings.seed, - f16_kv=settings.f16_kv, - use_mlock=settings.use_mlock, - use_mmap=settings.use_mmap, - embedding=settings.embedding, - logits_all=settings.logits_all, - n_threads=settings.n_threads, - n_batch=settings.n_batch, - n_ctx=settings.n_ctx, - last_n_tokens_size=settings.last_n_tokens_size, - vocab_only=settings.vocab_only, - verbose=settings.verbose, - ) - if settings.cache: - if settings.cache_type == "disk": - if settings.verbose: - print(f"Using disk cache with size {settings.cache_size}") - cache = falcon_cpp.FalconDiskCache(capacity_bytes=settings.cache_size) - else: - if settings.verbose: - print(f"Using ram cache with size {settings.cache_size}") - cache = falcon_cpp.FalconRAMCache(capacity_bytes=settings.cache_size) - - cache = falcon_cpp.FalconCache(capacity_bytes=settings.cache_size) - falcon.set_cache(cache) - - def set_settings(_settings: Settings): - global settings - settings = _settings - - set_settings(settings) - return app - - -falcon_lock = Lock() - - -def get_falcon(): - with falcon_lock: - yield falcon - - -def get_settings(): - yield settings - - -model_field = Field(description="The model to use for generating completions.") - -max_tokens_field = Field( - default=16, ge=1, le=2048, description="The maximum number of tokens to generate." -) - -temperature_field = Field( - default=0.8, - ge=0.0, - le=2.0, - description="Adjust the randomness of the generated text.\n\n" - + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.", -) - -top_p_field = Field( - default=0.95, - ge=0.0, - le=1.0, - description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" - + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.", -) - -stop_field = Field( - default=None, - description="A list of tokens at which to stop generation. If None, no stop tokens are used.", -) - -stream_field = Field( - default=False, - description="Whether to stream the results as they are generated. Useful for chatbots.", -) - -top_k_field = Field( - default=40, - ge=0, - description="Limit the next token selection to the K most probable tokens.\n\n" - + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.", -) - -repeat_penalty_field = Field( - default=1.1, - ge=0.0, - description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" - + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", -) - -presence_penalty_field = Field( - default=0.0, - ge=-2.0, - le=2.0, - description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", -) - -frequency_penalty_field = Field( - default=0.0, - ge=-2.0, - le=2.0, - description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", -) - -mirostat_mode_field = Field( - default=0, - ge=0, - le=2, - description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)" -) - -mirostat_tau_field = Field( - default=5.0, - ge=0.0, - le=10.0, - description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text" -) - -mirostat_eta_field = Field( - default=0.1, - ge=0.001, - le=1.0, - description="Mirostat learning rate" -) - - -class CreateCompletionRequest(BaseModel): - prompt: Union[str, List[str]] = Field( - default="", description="The prompt to generate completions for." - ) - suffix: Optional[str] = Field( - default=None, - description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.", - ) - max_tokens: int = max_tokens_field - temperature: float = temperature_field - top_p: float = top_p_field - mirostat_mode: int = mirostat_mode_field - mirostat_tau: float = mirostat_tau_field - mirostat_eta: float = mirostat_eta_field - echo: bool = Field( - default=False, - description="Whether to echo the prompt in the generated text. Useful for chatbots.", - ) - stop: Optional[Union[str, List[str]]] = stop_field - stream: bool = stream_field - logprobs: Optional[int] = Field( - default=None, - ge=0, - description="The number of logprobs to generate. If None, no logprobs are generated.", - ) - presence_penalty: Optional[float] = presence_penalty_field - frequency_penalty: Optional[float] = frequency_penalty_field - logit_bias: Optional[Dict[str, float]] = Field(None) - logprobs: Optional[int] = Field(None) - - # ignored or currently unsupported - model: Optional[str] = model_field - n: Optional[int] = 1 - best_of: Optional[int] = 1 - user: Optional[str] = Field(None) - - # falcon.cpp specific parameters - top_k: int = top_k_field - repeat_penalty: float = repeat_penalty_field - logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) - - class Config: - schema_extra = { - "example": { - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": ["\n", "###"], - } - } - - -CreateCompletionResponse = create_model_from_typeddict(falcon_cpp.Completion) - - -def make_logit_bias_processor( - falcon: falcon_cpp.Falcon, - logit_bias: Dict[str, float], - logit_bias_type: Optional[Literal["input_ids", "tokens"]], -): - if logit_bias_type is None: - logit_bias_type = "input_ids" - - to_bias: Dict[int, float] = {} - if logit_bias_type == "input_ids": - for input_id, score in logit_bias.items(): - input_id = int(input_id) - to_bias[input_id] = score - - elif logit_bias_type == "tokens": - for token, score in logit_bias.items(): - token = token.encode('utf-8') - for input_id in falcon.tokenize(token, add_bos=False): - to_bias[input_id] = score - - def logit_bias_processor( - input_ids: List[int], - scores: List[float], - ) -> List[float]: - new_scores = [None] * len(scores) - for input_id, score in enumerate(scores): - new_scores[input_id] = score + to_bias.get(input_id, 0.0) - - return new_scores - - return logit_bias_processor - - -@router.post( - "/v1/completions", - response_model=CreateCompletionResponse, -) -async def create_completion( - request: Request, - body: CreateCompletionRequest, - falcon: falcon_cpp.Falcon = Depends(get_falcon), -): - if isinstance(body.prompt, list): - assert len(body.prompt) <= 1 - body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" - - exclude = { - "n", - "best_of", - "logit_bias", - "logit_bias_type", - "user", - } - kwargs = body.dict(exclude=exclude) - - if body.logit_bias is not None: - kwargs['logits_processor'] = falcon_cpp.LogitsProcessorList([ - make_logit_bias_processor(falcon, body.logit_bias, body.logit_bias_type), - ]) - - if body.stream: - send_chan, recv_chan = anyio.create_memory_object_stream(10) - - async def event_publisher(inner_send_chan: MemoryObjectSendStream): - async with inner_send_chan: - try: - iterator: Iterator[falcon_cpp.CompletionChunk] = await run_in_threadpool(falcon, **kwargs) # type: ignore - async for chunk in iterate_in_threadpool(iterator): - await inner_send_chan.send(dict(data=json.dumps(chunk))) - if await request.is_disconnected(): - raise anyio.get_cancelled_exc_class()() - await inner_send_chan.send(dict(data="[DONE]")) - except anyio.get_cancelled_exc_class() as e: - print("disconnected") - with anyio.move_on_after(1, shield=True): - print( - f"Disconnected from client (via refresh/close) {request.client}" - ) - await inner_send_chan.send(dict(closing=True)) - raise e - - return EventSourceResponse( - recv_chan, data_sender_callable=partial(event_publisher, send_chan) - ) - else: - completion: falcon_cpp.Completion = await run_in_threadpool(falcon, **kwargs) # type: ignore - return completion - - -class CreateEmbeddingRequest(BaseModel): - model: Optional[str] = model_field - input: Union[str, List[str]] = Field(description="The input to embed.") - user: Optional[str] - - class Config: - schema_extra = { - "example": { - "input": "The food was delicious and the waiter...", - } - } - - -CreateEmbeddingResponse = create_model_from_typeddict(falcon_cpp.Embedding) - - -@router.post( - "/v1/embeddings", - response_model=CreateEmbeddingResponse, -) -async def create_embedding( - request: CreateEmbeddingRequest, falcon: falcon_cpp.Falcon = Depends(get_falcon) -): - return await run_in_threadpool( - falcon.create_embedding, **request.dict(exclude={"user"}) - ) - - -class ChatCompletionRequestMessage(BaseModel): - role: Literal["system", "user", "assistant"] = Field( - default="user", description="The role of the message." - ) - content: str = Field(default="", description="The content of the message.") - - -class CreateChatCompletionRequest(BaseModel): - messages: List[ChatCompletionRequestMessage] = Field( - default=[], description="A list of messages to generate completions for." - ) - max_tokens: int = max_tokens_field - temperature: float = temperature_field - top_p: float = top_p_field - mirostat_mode: int = mirostat_mode_field - mirostat_tau: float = mirostat_tau_field - mirostat_eta: float = mirostat_eta_field - stop: Optional[List[str]] = stop_field - stream: bool = stream_field - presence_penalty: Optional[float] = presence_penalty_field - frequency_penalty: Optional[float] = frequency_penalty_field - logit_bias: Optional[Dict[str, float]] = Field(None) - - # ignored or currently unsupported - model: Optional[str] = model_field - n: Optional[int] = 1 - user: Optional[str] = Field(None) - - # falcon.cpp specific parameters - top_k: int = top_k_field - repeat_penalty: float = repeat_penalty_field - logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) - - class Config: - schema_extra = { - "example": { - "messages": [ - ChatCompletionRequestMessage( - role="system", content="You are a helpful assistant." - ), - ChatCompletionRequestMessage( - role="user", content="What is the capital of France?" - ), - ] - } - } - - -CreateChatCompletionResponse = create_model_from_typeddict(falcon_cpp.ChatCompletion) - - -@router.post( - "/v1/chat/completions", - response_model=CreateChatCompletionResponse, -) -async def create_chat_completion( - request: Request, - body: CreateChatCompletionRequest, - falcon: falcon_cpp.Falcon = Depends(get_falcon), -) -> Union[falcon_cpp.ChatCompletion, EventSourceResponse]: - exclude = { - "n", - "logit_bias", - "logit_bias_type", - "user", - } - kwargs = body.dict(exclude=exclude) - - if body.logit_bias is not None: - kwargs['logits_processor'] = falcon_cpp.LogitsProcessorList([ - make_logit_bias_processor(falcon, body.logit_bias, body.logit_bias_type), - ]) - - if body.stream: - send_chan, recv_chan = anyio.create_memory_object_stream(10) - - async def event_publisher(inner_send_chan: MemoryObjectSendStream): - async with inner_send_chan: - try: - iterator: Iterator[falcon_cpp.ChatCompletionChunk] = await run_in_threadpool(falcon.create_chat_completion, **kwargs) # type: ignore - async for chat_chunk in iterate_in_threadpool(iterator): - await inner_send_chan.send(dict(data=json.dumps(chat_chunk))) - if await request.is_disconnected(): - raise anyio.get_cancelled_exc_class()() - await inner_send_chan.send(dict(data="[DONE]")) - except anyio.get_cancelled_exc_class() as e: - print("disconnected") - with anyio.move_on_after(1, shield=True): - print( - f"Disconnected from client (via refresh/close) {request.client}" - ) - await inner_send_chan.send(dict(closing=True)) - raise e - - return EventSourceResponse( - recv_chan, - data_sender_callable=partial(event_publisher, send_chan), - ) - else: - completion: falcon_cpp.ChatCompletion = await run_in_threadpool( - falcon.create_chat_completion, **kwargs # type: ignore - ) - return completion - - -class ModelData(TypedDict): - id: str - object: Literal["model"] - owned_by: str - permissions: List[str] - - -class ModelList(TypedDict): - object: Literal["list"] - data: List[ModelData] - - -GetModelResponse = create_model_from_typeddict(ModelList) - - -@router.get("/v1/models", response_model=GetModelResponse) -async def get_models( - settings: Settings = Depends(get_settings), - falcon: falcon_cpp.Falcon = Depends(get_falcon), -) -> ModelList: - return { - "object": "list", - "data": [ - { - "id": settings.model_alias - if settings.model_alias is not None - else falcon.model_path, - "object": "model", - "owned_by": "me", - "permissions": [], - } - ], - } diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index e4147790b..000000000 --- a/mkdocs.yml +++ /dev/null @@ -1,21 +0,0 @@ -site_name: falcon-cpp-python -repo_url: https://github.com/sirajperson/falcon-cpp-python - -theme: - name: "material" - -plugins: - - mkdocstrings - - search - -watch: - - falcon_cpp - -markdown_extensions: - - pymdownx.highlight: - anchor_linenums: true - line_spans: __span - pygments_lang_class: true - - pymdownx.inlinehilite - - pymdownx.snippets - - pymdownx.superfences \ No newline at end of file diff --git a/poetry.lock b/poetry.lock deleted file mode 100644 index 8b86d0e63..000000000 --- a/poetry.lock +++ /dev/null @@ -1,1636 +0,0 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. - -[[package]] -name = "anyio" -version = "3.6.2" -description = "High level compatibility layer for multiple asynchronous event loop implementations" -optional = false -python-versions = ">=3.6.2" -files = [ - {file = "anyio-3.6.2-py3-none-any.whl", hash = "sha256:fbbe32bd270d2a2ef3ed1c5d45041250284e31fc0a4df4a5a6071842051a51e3"}, - {file = "anyio-3.6.2.tar.gz", hash = "sha256:25ea0d673ae30af41a0c442f81cf3b38c7e79fdc7b60335a4c14e05eb0947421"}, -] - -[package.dependencies] -idna = ">=2.8" -sniffio = ">=1.1" - -[package.extras] -doc = ["packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] -test = ["contextlib2", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (<0.15)", "uvloop (>=0.15)"] -trio = ["trio (>=0.16,<0.22)"] - -[[package]] -name = "black" -version = "23.3.0" -description = "The uncompromising code formatter." -optional = false -python-versions = ">=3.7" -files = [ - {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"}, - {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"}, - {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"}, - {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"}, - {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"}, - {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"}, - {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"}, - {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"}, - {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"}, - {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"}, - {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"}, - {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"}, - {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"}, - {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"}, - {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"}, - {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"}, - {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"}, - {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"}, - {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"}, - {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"}, - {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"}, - {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"}, - {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"}, - {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"}, - {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"}, -] - -[package.dependencies] -click = ">=8.0.0" -mypy-extensions = ">=0.4.3" -packaging = ">=22.0" -pathspec = ">=0.9.0" -platformdirs = ">=2" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} - -[package.extras] -colorama = ["colorama (>=0.4.3)"] -d = ["aiohttp (>=3.7.4)"] -jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] -uvloop = ["uvloop (>=0.15.2)"] - -[[package]] -name = "bleach" -version = "6.0.0" -description = "An easy safelist-based HTML-sanitizing tool." -optional = false -python-versions = ">=3.7" -files = [ - {file = "bleach-6.0.0-py3-none-any.whl", hash = "sha256:33c16e3353dbd13028ab4799a0f89a83f113405c766e9c122df8a06f5b85b3f4"}, - {file = "bleach-6.0.0.tar.gz", hash = "sha256:1a1a85c1595e07d8db14c5f09f09e6433502c51c595970edc090551f0db99414"}, -] - -[package.dependencies] -six = ">=1.9.0" -webencodings = "*" - -[package.extras] -css = ["tinycss2 (>=1.1.0,<1.2)"] - -[[package]] -name = "certifi" -version = "2023.5.7" -description = "Python package for providing Mozilla's CA Bundle." -optional = false -python-versions = ">=3.6" -files = [ - {file = "certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716"}, - {file = "certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"}, -] - -[[package]] -name = "cffi" -version = "1.15.1" -description = "Foreign Function Interface for Python calling C code." -optional = false -python-versions = "*" -files = [ - {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, - {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, - {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, - {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, - {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, - {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, - {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, - {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, - {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, - {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, - {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, - {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, - {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, - {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, - {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, - {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, - {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, - {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, - {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, - {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, - {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, - {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, - {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, - {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, - {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, - {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, - {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, - {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, - {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, - {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, - {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, - {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, - {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, -] - -[package.dependencies] -pycparser = "*" - -[[package]] -name = "charset-normalizer" -version = "3.1.0" -description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." -optional = false -python-versions = ">=3.7.0" -files = [ - {file = "charset-normalizer-3.1.0.tar.gz", hash = "sha256:34e0a2f9c370eb95597aae63bf85eb5e96826d81e3dcf88b8886012906f509b5"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e0ac8959c929593fee38da1c2b64ee9778733cdf03c482c9ff1d508b6b593b2b"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d7fc3fca01da18fbabe4625d64bb612b533533ed10045a2ac3dd194bfa656b60"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:04eefcee095f58eaabe6dc3cc2262f3bcd776d2c67005880894f447b3f2cb9c1"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20064ead0717cf9a73a6d1e779b23d149b53daf971169289ed2ed43a71e8d3b0"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1435ae15108b1cb6fffbcea2af3d468683b7afed0169ad718451f8db5d1aff6f"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c84132a54c750fda57729d1e2599bb598f5fa0344085dbde5003ba429a4798c0"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75f2568b4189dda1c567339b48cba4ac7384accb9c2a7ed655cd86b04055c795"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11d3bcb7be35e7b1bba2c23beedac81ee893ac9871d0ba79effc7fc01167db6c"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:891cf9b48776b5c61c700b55a598621fdb7b1e301a550365571e9624f270c203"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5f008525e02908b20e04707a4f704cd286d94718f48bb33edddc7d7b584dddc1"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:b06f0d3bf045158d2fb8837c5785fe9ff9b8c93358be64461a1089f5da983137"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:49919f8400b5e49e961f320c735388ee686a62327e773fa5b3ce6721f7e785ce"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:22908891a380d50738e1f978667536f6c6b526a2064156203d418f4856d6e86a"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-win32.whl", hash = "sha256:12d1a39aa6b8c6f6248bb54550efcc1c38ce0d8096a146638fd4738e42284448"}, - {file = "charset_normalizer-3.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:65ed923f84a6844de5fd29726b888e58c62820e0769b76565480e1fdc3d062f8"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9a3267620866c9d17b959a84dd0bd2d45719b817245e49371ead79ed4f710d19"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6734e606355834f13445b6adc38b53c0fd45f1a56a9ba06c2058f86893ae8017"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f8303414c7b03f794347ad062c0516cee0e15f7a612abd0ce1e25caf6ceb47df"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aaf53a6cebad0eae578f062c7d462155eada9c172bd8c4d250b8c1d8eb7f916a"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dc5b6a8ecfdc5748a7e429782598e4f17ef378e3e272eeb1340ea57c9109f41"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e1b25e3ad6c909f398df8921780d6a3d120d8c09466720226fc621605b6f92b1"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ca564606d2caafb0abe6d1b5311c2649e8071eb241b2d64e75a0d0065107e62"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b82fab78e0b1329e183a65260581de4375f619167478dddab510c6c6fb04d9b6"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bd7163182133c0c7701b25e604cf1611c0d87712e56e88e7ee5d72deab3e76b5"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:11d117e6c63e8f495412d37e7dc2e2fff09c34b2d09dbe2bee3c6229577818be"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:cf6511efa4801b9b38dc5546d7547d5b5c6ef4b081c60b23e4d941d0eba9cbeb"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:abc1185d79f47c0a7aaf7e2412a0eb2c03b724581139193d2d82b3ad8cbb00ac"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cb7b2ab0188829593b9de646545175547a70d9a6e2b63bf2cd87a0a391599324"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-win32.whl", hash = "sha256:c36bcbc0d5174a80d6cccf43a0ecaca44e81d25be4b7f90f0ed7bcfbb5a00909"}, - {file = "charset_normalizer-3.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:cca4def576f47a09a943666b8f829606bcb17e2bc2d5911a46c8f8da45f56755"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0c95f12b74681e9ae127728f7e5409cbbef9cd914d5896ef238cc779b8152373"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fca62a8301b605b954ad2e9c3666f9d97f63872aa4efcae5492baca2056b74ab"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac0aa6cd53ab9a31d397f8303f92c42f534693528fafbdb997c82bae6e477ad9"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3af8e0f07399d3176b179f2e2634c3ce9c1301379a6b8c9c9aeecd481da494f"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a5fc78f9e3f501a1614a98f7c54d3969f3ad9bba8ba3d9b438c3bc5d047dd28"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:628c985afb2c7d27a4800bfb609e03985aaecb42f955049957814e0491d4006d"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:74db0052d985cf37fa111828d0dd230776ac99c740e1a758ad99094be4f1803d"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1e8fcdd8f672a1c4fc8d0bd3a2b576b152d2a349782d1eb0f6b8e52e9954731d"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:04afa6387e2b282cf78ff3dbce20f0cc071c12dc8f685bd40960cc68644cfea6"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:dd5653e67b149503c68c4018bf07e42eeed6b4e956b24c00ccdf93ac79cdff84"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d2686f91611f9e17f4548dbf050e75b079bbc2a82be565832bc8ea9047b61c8c"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-win32.whl", hash = "sha256:4155b51ae05ed47199dc5b2a4e62abccb274cee6b01da5b895099b61b1982974"}, - {file = "charset_normalizer-3.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:322102cdf1ab682ecc7d9b1c5eed4ec59657a65e1c146a0da342b78f4112db23"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e633940f28c1e913615fd624fcdd72fdba807bf53ea6925d6a588e84e1151531"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3a06f32c9634a8705f4ca9946d667609f52cf130d5548881401f1eb2c39b1e2c"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7381c66e0561c5757ffe616af869b916c8b4e42b367ab29fedc98481d1e74e14"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3573d376454d956553c356df45bb824262c397c6e26ce43e8203c4c540ee0acb"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e89df2958e5159b811af9ff0f92614dabf4ff617c03a4c1c6ff53bf1c399e0e1"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78cacd03e79d009d95635e7d6ff12c21eb89b894c354bd2b2ed0b4763373693b"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de5695a6f1d8340b12a5d6d4484290ee74d61e467c39ff03b39e30df62cf83a0"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1c60b9c202d00052183c9be85e5eaf18a4ada0a47d188a83c8f5c5b23252f649"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f645caaf0008bacf349875a974220f1f1da349c5dbe7c4ec93048cdc785a3326"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ea9f9c6034ea2d93d9147818f17c2a0860d41b71c38b9ce4d55f21b6f9165a11"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:80d1543d58bd3d6c271b66abf454d437a438dff01c3e62fdbcd68f2a11310d4b"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:73dc03a6a7e30b7edc5b01b601e53e7fc924b04e1835e8e407c12c037e81adbd"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6f5c2e7bc8a4bf7c426599765b1bd33217ec84023033672c1e9a8b35eaeaaaf8"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-win32.whl", hash = "sha256:12a2b561af122e3d94cdb97fe6fb2bb2b82cef0cdca131646fdb940a1eda04f0"}, - {file = "charset_normalizer-3.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3160a0fd9754aab7d47f95a6b63ab355388d890163eb03b2d2b87ab0a30cfa59"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:38e812a197bf8e71a59fe55b757a84c1f946d0ac114acafaafaf21667a7e169e"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6baf0baf0d5d265fa7944feb9f7451cc316bfe30e8df1a61b1bb08577c554f31"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8f25e17ab3039b05f762b0a55ae0b3632b2e073d9c8fc88e89aca31a6198e88f"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3747443b6a904001473370d7810aa19c3a180ccd52a7157aacc264a5ac79265e"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b116502087ce8a6b7a5f1814568ccbd0e9f6cfd99948aa59b0e241dc57cf739f"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d16fd5252f883eb074ca55cb622bc0bee49b979ae4e8639fff6ca3ff44f9f854"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fa558996782fc226b529fdd2ed7866c2c6ec91cee82735c98a197fae39f706"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f6c7a8a57e9405cad7485f4c9d3172ae486cfef1344b5ddd8e5239582d7355e"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ac3775e3311661d4adace3697a52ac0bab17edd166087d493b52d4f4f553f9f0"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:10c93628d7497c81686e8e5e557aafa78f230cd9e77dd0c40032ef90c18f2230"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:6f4f4668e1831850ebcc2fd0b1cd11721947b6dc7c00bf1c6bd3c929ae14f2c7"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:0be65ccf618c1e7ac9b849c315cc2e8a8751d9cfdaa43027d4f6624bd587ab7e"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:53d0a3fa5f8af98a1e261de6a3943ca631c526635eb5817a87a59d9a57ebf48f"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-win32.whl", hash = "sha256:a04f86f41a8916fe45ac5024ec477f41f886b3c435da2d4e3d2709b22ab02af1"}, - {file = "charset_normalizer-3.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:830d2948a5ec37c386d3170c483063798d7879037492540f10a475e3fd6f244b"}, - {file = "charset_normalizer-3.1.0-py3-none-any.whl", hash = "sha256:3d9098b479e78c85080c98e1e35ff40b4a31d8953102bb0fd7d1b6f8a2111a3d"}, -] - -[[package]] -name = "click" -version = "8.1.3" -description = "Composable command line interface toolkit" -optional = false -python-versions = ">=3.7" -files = [ - {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, - {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "platform_system == \"Windows\""} - -[[package]] -name = "colorama" -version = "0.4.6" -description = "Cross-platform colored terminal text." -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" -files = [ - {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, - {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, -] - -[[package]] -name = "cryptography" -version = "40.0.2" -description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." -optional = false -python-versions = ">=3.6" -files = [ - {file = "cryptography-40.0.2-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:8f79b5ff5ad9d3218afb1e7e20ea74da5f76943ee5edb7f76e56ec5161ec782b"}, - {file = "cryptography-40.0.2-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:05dc219433b14046c476f6f09d7636b92a1c3e5808b9a6536adf4932b3b2c440"}, - {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4df2af28d7bedc84fe45bd49bc35d710aede676e2a4cb7fc6d103a2adc8afe4d"}, - {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dcca15d3a19a66e63662dc8d30f8036b07be851a8680eda92d079868f106288"}, - {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:a04386fb7bc85fab9cd51b6308633a3c271e3d0d3eae917eebab2fac6219b6d2"}, - {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:adc0d980fd2760c9e5de537c28935cc32b9353baaf28e0814df417619c6c8c3b"}, - {file = "cryptography-40.0.2-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d5a1bd0e9e2031465761dfa920c16b0065ad77321d8a8c1f5ee331021fda65e9"}, - {file = "cryptography-40.0.2-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:a95f4802d49faa6a674242e25bfeea6fc2acd915b5e5e29ac90a32b1139cae1c"}, - {file = "cryptography-40.0.2-cp36-abi3-win32.whl", hash = "sha256:aecbb1592b0188e030cb01f82d12556cf72e218280f621deed7d806afd2113f9"}, - {file = "cryptography-40.0.2-cp36-abi3-win_amd64.whl", hash = "sha256:b12794f01d4cacfbd3177b9042198f3af1c856eedd0a98f10f141385c809a14b"}, - {file = "cryptography-40.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:142bae539ef28a1c76794cca7f49729e7c54423f615cfd9b0b1fa90ebe53244b"}, - {file = "cryptography-40.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:956ba8701b4ffe91ba59665ed170a2ebbdc6fc0e40de5f6059195d9f2b33ca0e"}, - {file = "cryptography-40.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4f01c9863da784558165f5d4d916093737a75203a5c5286fde60e503e4276c7a"}, - {file = "cryptography-40.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:3daf9b114213f8ba460b829a02896789751626a2a4e7a43a28ee77c04b5e4958"}, - {file = "cryptography-40.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:48f388d0d153350f378c7f7b41497a54ff1513c816bcbbcafe5b829e59b9ce5b"}, - {file = "cryptography-40.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c0764e72b36a3dc065c155e5b22f93df465da9c39af65516fe04ed3c68c92636"}, - {file = "cryptography-40.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:cbaba590180cba88cb99a5f76f90808a624f18b169b90a4abb40c1fd8c19420e"}, - {file = "cryptography-40.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7a38250f433cd41df7fcb763caa3ee9362777fdb4dc642b9a349721d2bf47404"}, - {file = "cryptography-40.0.2.tar.gz", hash = "sha256:c33c0d32b8594fa647d2e01dbccc303478e16fdd7cf98652d5b3ed11aa5e5c99"}, -] - -[package.dependencies] -cffi = ">=1.12" - -[package.extras] -docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] -docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] -pep8test = ["black", "check-manifest", "mypy", "ruff"] -sdist = ["setuptools-rust (>=0.11.4)"] -ssh = ["bcrypt (>=3.1.5)"] -test = ["iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist"] -test-randomorder = ["pytest-randomly"] -tox = ["tox"] - -[[package]] -name = "diskcache" -version = "5.6.1" -description = "Disk Cache -- Disk and file backed persistent cache." -optional = false -python-versions = ">=3" -files = [ - {file = "diskcache-5.6.1-py3-none-any.whl", hash = "sha256:558c6a2d5d7c721bb00e40711803d6804850c9f76c426ed81ecc627fe9d2ce2d"}, - {file = "diskcache-5.6.1.tar.gz", hash = "sha256:e4c978532feff5814c4cc00fe1e11e40501985946643d73220d41ee7737c72c3"}, -] - -[[package]] -name = "distro" -version = "1.8.0" -description = "Distro - an OS platform information API" -optional = false -python-versions = ">=3.6" -files = [ - {file = "distro-1.8.0-py3-none-any.whl", hash = "sha256:99522ca3e365cac527b44bde033f64c6945d90eb9f769703caaec52b09bbd3ff"}, - {file = "distro-1.8.0.tar.gz", hash = "sha256:02e111d1dc6a50abb8eed6bf31c3e48ed8b0830d1ea2a1b78c61765c2513fdd8"}, -] - -[[package]] -name = "docutils" -version = "0.20" -description = "Docutils -- Python Documentation Utilities" -optional = false -python-versions = ">=3.7" -files = [ - {file = "docutils-0.20-py3-none-any.whl", hash = "sha256:a428f10de4de4774389734c986a01b4af2d802d26717108b0f1b9356862937c5"}, - {file = "docutils-0.20.tar.gz", hash = "sha256:f75a5a52fbcacd81b47e42888ad2b380748aaccfb3f13af0fe69deb759f01eb6"}, -] - -[[package]] -name = "exceptiongroup" -version = "1.1.1" -description = "Backport of PEP 654 (exception groups)" -optional = false -python-versions = ">=3.7" -files = [ - {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, - {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, -] - -[package.extras] -test = ["pytest (>=6)"] - -[[package]] -name = "fastapi" -version = "0.99.1" -description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" -optional = true -python-versions = ">=3.7" -files = [ - {file = "fastapi-0.99.1-py3-none-any.whl", hash = "sha256:976df7bab51ac7beda9f68c4513b8c4490b5c1135c72aafd0a5ee4023ec5282e"}, - {file = "fastapi-0.99.1.tar.gz", hash = "sha256:ac78f717cd80d657bd183f94d33b9bda84aa376a46a9dab513586b8eef1dc6fc"}, -] - -[package.dependencies] -pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0" -starlette = ">=0.27.0,<0.28.0" -typing-extensions = ">=4.5.0" - -[package.extras] -all = ["email-validator (>=1.1.1)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "python-multipart (>=0.0.5)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] - -[[package]] -name = "ghp-import" -version = "2.1.0" -description = "Copy your docs directly to the gh-pages branch." -optional = false -python-versions = "*" -files = [ - {file = "ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343"}, - {file = "ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619"}, -] - -[package.dependencies] -python-dateutil = ">=2.8.1" - -[package.extras] -dev = ["flake8", "markdown", "twine", "wheel"] - -[[package]] -name = "griffe" -version = "0.27.3" -description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." -optional = false -python-versions = ">=3.7" -files = [ - {file = "griffe-0.27.3-py3-none-any.whl", hash = "sha256:094513b209d4acd4b2680c2415d3af5f8ed925714795380c2a7d070e222e0b27"}, - {file = "griffe-0.27.3.tar.gz", hash = "sha256:a3d0f75aa76b80f181f818cf605f658a69fccf287aaeeeafc7a6cf4e6a2ca27e"}, -] - -[package.dependencies] -colorama = ">=0.4" - -[[package]] -name = "h11" -version = "0.14.0" -description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" -optional = false -python-versions = ">=3.7" -files = [ - {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, - {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, -] - -[[package]] -name = "httpcore" -version = "0.17.0" -description = "A minimal low-level HTTP client." -optional = false -python-versions = ">=3.7" -files = [ - {file = "httpcore-0.17.0-py3-none-any.whl", hash = "sha256:0fdfea45e94f0c9fd96eab9286077f9ff788dd186635ae61b312693e4d943599"}, - {file = "httpcore-0.17.0.tar.gz", hash = "sha256:cc045a3241afbf60ce056202301b4d8b6af08845e3294055eb26b09913ef903c"}, -] - -[package.dependencies] -anyio = ">=3.0,<5.0" -certifi = "*" -h11 = ">=0.13,<0.15" -sniffio = "==1.*" - -[package.extras] -http2 = ["h2 (>=3,<5)"] -socks = ["socksio (==1.*)"] - -[[package]] -name = "httpx" -version = "0.24.1" -description = "The next generation HTTP client." -optional = false -python-versions = ">=3.7" -files = [ - {file = "httpx-0.24.1-py3-none-any.whl", hash = "sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd"}, - {file = "httpx-0.24.1.tar.gz", hash = "sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"}, -] - -[package.dependencies] -certifi = "*" -httpcore = ">=0.15.0,<0.18.0" -idna = "*" -sniffio = "*" - -[package.extras] -brotli = ["brotli", "brotlicffi"] -cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] -http2 = ["h2 (>=3,<5)"] -socks = ["socksio (==1.*)"] - -[[package]] -name = "idna" -version = "3.4" -description = "Internationalized Domain Names in Applications (IDNA)" -optional = false -python-versions = ">=3.5" -files = [ - {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, - {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, -] - -[[package]] -name = "importlib-metadata" -version = "6.6.0" -description = "Read metadata from Python packages" -optional = false -python-versions = ">=3.7" -files = [ - {file = "importlib_metadata-6.6.0-py3-none-any.whl", hash = "sha256:43dd286a2cd8995d5eaef7fee2066340423b818ed3fd70adf0bad5f1fac53fed"}, - {file = "importlib_metadata-6.6.0.tar.gz", hash = "sha256:92501cdf9cc66ebd3e612f1b4f0c0765dfa42f0fa38ffb319b6bd84dd675d705"}, -] - -[package.dependencies] -zipp = ">=0.5" - -[package.extras] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -perf = ["ipython"] -testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] - -[[package]] -name = "importlib-resources" -version = "5.12.0" -description = "Read resources from Python packages" -optional = false -python-versions = ">=3.7" -files = [ - {file = "importlib_resources-5.12.0-py3-none-any.whl", hash = "sha256:7b1deeebbf351c7578e09bf2f63fa2ce8b5ffec296e0d349139d43cca061a81a"}, - {file = "importlib_resources-5.12.0.tar.gz", hash = "sha256:4be82589bf5c1d7999aedf2a45159d10cb3ca4f19b2271f8792bc8e6da7b22f6"}, -] - -[package.dependencies] -zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} - -[package.extras] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] - -[[package]] -name = "iniconfig" -version = "2.0.0" -description = "brain-dead simple config-ini parsing" -optional = false -python-versions = ">=3.7" -files = [ - {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, - {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, -] - -[[package]] -name = "jaraco-classes" -version = "3.2.3" -description = "Utility functions for Python class constructs" -optional = false -python-versions = ">=3.7" -files = [ - {file = "jaraco.classes-3.2.3-py3-none-any.whl", hash = "sha256:2353de3288bc6b82120752201c6b1c1a14b058267fa424ed5ce5984e3b922158"}, - {file = "jaraco.classes-3.2.3.tar.gz", hash = "sha256:89559fa5c1d3c34eff6f631ad80bb21f378dbcbb35dd161fd2c6b93f5be2f98a"}, -] - -[package.dependencies] -more-itertools = "*" - -[package.extras] -docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)"] -testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] - -[[package]] -name = "jeepney" -version = "0.8.0" -description = "Low-level, pure Python DBus protocol wrapper." -optional = false -python-versions = ">=3.7" -files = [ - {file = "jeepney-0.8.0-py3-none-any.whl", hash = "sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755"}, - {file = "jeepney-0.8.0.tar.gz", hash = "sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806"}, -] - -[package.extras] -test = ["async-timeout", "pytest", "pytest-asyncio (>=0.17)", "pytest-trio", "testpath", "trio"] -trio = ["async_generator", "trio"] - -[[package]] -name = "jinja2" -version = "3.1.2" -description = "A very fast and expressive template engine." -optional = false -python-versions = ">=3.7" -files = [ - {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, - {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, -] - -[package.dependencies] -MarkupSafe = ">=2.0" - -[package.extras] -i18n = ["Babel (>=2.7)"] - -[[package]] -name = "keyring" -version = "23.13.1" -description = "Store and access your passwords safely." -optional = false -python-versions = ">=3.7" -files = [ - {file = "keyring-23.13.1-py3-none-any.whl", hash = "sha256:771ed2a91909389ed6148631de678f82ddc73737d85a927f382a8a1b157898cd"}, - {file = "keyring-23.13.1.tar.gz", hash = "sha256:ba2e15a9b35e21908d0aaf4e0a47acc52d6ae33444df0da2b49d41a46ef6d678"}, -] - -[package.dependencies] -importlib-metadata = {version = ">=4.11.4", markers = "python_version < \"3.12\""} -importlib-resources = {version = "*", markers = "python_version < \"3.9\""} -"jaraco.classes" = "*" -jeepney = {version = ">=0.4.2", markers = "sys_platform == \"linux\""} -pywin32-ctypes = {version = ">=0.2.0", markers = "sys_platform == \"win32\""} -SecretStorage = {version = ">=3.2", markers = "sys_platform == \"linux\""} - -[package.extras] -completion = ["shtab"] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)"] -testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] - -[[package]] -name = "markdown" -version = "3.3.7" -description = "Python implementation of Markdown." -optional = false -python-versions = ">=3.6" -files = [ - {file = "Markdown-3.3.7-py3-none-any.whl", hash = "sha256:f5da449a6e1c989a4cea2631aa8ee67caa5a2ef855d551c88f9e309f4634c621"}, - {file = "Markdown-3.3.7.tar.gz", hash = "sha256:cbb516f16218e643d8e0a95b309f77eb118cb138d39a4f27851e6a63581db874"}, -] - -[package.dependencies] -importlib-metadata = {version = ">=4.4", markers = "python_version < \"3.10\""} - -[package.extras] -testing = ["coverage", "pyyaml"] - -[[package]] -name = "markdown-it-py" -version = "2.2.0" -description = "Python port of markdown-it. Markdown parsing, done right!" -optional = false -python-versions = ">=3.7" -files = [ - {file = "markdown-it-py-2.2.0.tar.gz", hash = "sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1"}, - {file = "markdown_it_py-2.2.0-py3-none-any.whl", hash = "sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30"}, -] - -[package.dependencies] -mdurl = ">=0.1,<1.0" - -[package.extras] -benchmarking = ["psutil", "pytest", "pytest-benchmark"] -code-style = ["pre-commit (>=3.0,<4.0)"] -compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] -linkify = ["linkify-it-py (>=1,<3)"] -plugins = ["mdit-py-plugins"] -profiling = ["gprof2dot"] -rtd = ["attrs", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] -testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] - -[[package]] -name = "markupsafe" -version = "2.1.2" -description = "Safely add untrusted strings to HTML/XML markup." -optional = false -python-versions = ">=3.7" -files = [ - {file = "MarkupSafe-2.1.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:665a36ae6f8f20a4676b53224e33d456a6f5a72657d9c83c2aa00765072f31f7"}, - {file = "MarkupSafe-2.1.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:340bea174e9761308703ae988e982005aedf427de816d1afe98147668cc03036"}, - {file = "MarkupSafe-2.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22152d00bf4a9c7c83960521fc558f55a1adbc0631fbb00a9471e097b19d72e1"}, - {file = "MarkupSafe-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28057e985dace2f478e042eaa15606c7efccb700797660629da387eb289b9323"}, - {file = "MarkupSafe-2.1.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca244fa73f50a800cf8c3ebf7fd93149ec37f5cb9596aa8873ae2c1d23498601"}, - {file = "MarkupSafe-2.1.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d9d971ec1e79906046aa3ca266de79eac42f1dbf3612a05dc9368125952bd1a1"}, - {file = "MarkupSafe-2.1.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7e007132af78ea9df29495dbf7b5824cb71648d7133cf7848a2a5dd00d36f9ff"}, - {file = "MarkupSafe-2.1.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7313ce6a199651c4ed9d7e4cfb4aa56fe923b1adf9af3b420ee14e6d9a73df65"}, - {file = "MarkupSafe-2.1.2-cp310-cp310-win32.whl", hash = "sha256:c4a549890a45f57f1ebf99c067a4ad0cb423a05544accaf2b065246827ed9603"}, - {file = "MarkupSafe-2.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:835fb5e38fd89328e9c81067fd642b3593c33e1e17e2fdbf77f5676abb14a156"}, - {file = "MarkupSafe-2.1.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2ec4f2d48ae59bbb9d1f9d7efb9236ab81429a764dedca114f5fdabbc3788013"}, - {file = "MarkupSafe-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:608e7073dfa9e38a85d38474c082d4281f4ce276ac0010224eaba11e929dd53a"}, - {file = "MarkupSafe-2.1.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:65608c35bfb8a76763f37036547f7adfd09270fbdbf96608be2bead319728fcd"}, - {file = "MarkupSafe-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2bfb563d0211ce16b63c7cb9395d2c682a23187f54c3d79bfec33e6705473c6"}, - {file = "MarkupSafe-2.1.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:da25303d91526aac3672ee6d49a2f3db2d9502a4a60b55519feb1a4c7714e07d"}, - {file = "MarkupSafe-2.1.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9cad97ab29dfc3f0249b483412c85c8ef4766d96cdf9dcf5a1e3caa3f3661cf1"}, - {file = "MarkupSafe-2.1.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:085fd3201e7b12809f9e6e9bc1e5c96a368c8523fad5afb02afe3c051ae4afcc"}, - {file = "MarkupSafe-2.1.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1bea30e9bf331f3fef67e0a3877b2288593c98a21ccb2cf29b74c581a4eb3af0"}, - {file = "MarkupSafe-2.1.2-cp311-cp311-win32.whl", hash = "sha256:7df70907e00c970c60b9ef2938d894a9381f38e6b9db73c5be35e59d92e06625"}, - {file = "MarkupSafe-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:e55e40ff0cc8cc5c07996915ad367fa47da6b3fc091fdadca7f5403239c5fec3"}, - {file = "MarkupSafe-2.1.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a6e40afa7f45939ca356f348c8e23048e02cb109ced1eb8420961b2f40fb373a"}, - {file = "MarkupSafe-2.1.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf877ab4ed6e302ec1d04952ca358b381a882fbd9d1b07cccbfd61783561f98a"}, - {file = "MarkupSafe-2.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63ba06c9941e46fa389d389644e2d8225e0e3e5ebcc4ff1ea8506dce646f8c8a"}, - {file = "MarkupSafe-2.1.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f1cd098434e83e656abf198f103a8207a8187c0fc110306691a2e94a78d0abb2"}, - {file = "MarkupSafe-2.1.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:55f44b440d491028addb3b88f72207d71eeebfb7b5dbf0643f7c023ae1fba619"}, - {file = "MarkupSafe-2.1.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a6f2fcca746e8d5910e18782f976489939d54a91f9411c32051b4aab2bd7c513"}, - {file = "MarkupSafe-2.1.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0b462104ba25f1ac006fdab8b6a01ebbfbce9ed37fd37fd4acd70c67c973e460"}, - {file = "MarkupSafe-2.1.2-cp37-cp37m-win32.whl", hash = "sha256:7668b52e102d0ed87cb082380a7e2e1e78737ddecdde129acadb0eccc5423859"}, - {file = "MarkupSafe-2.1.2-cp37-cp37m-win_amd64.whl", hash = "sha256:6d6607f98fcf17e534162f0709aaad3ab7a96032723d8ac8750ffe17ae5a0666"}, - {file = "MarkupSafe-2.1.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a806db027852538d2ad7555b203300173dd1b77ba116de92da9afbc3a3be3eed"}, - {file = "MarkupSafe-2.1.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a4abaec6ca3ad8660690236d11bfe28dfd707778e2442b45addd2f086d6ef094"}, - {file = "MarkupSafe-2.1.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f03a532d7dee1bed20bc4884194a16160a2de9ffc6354b3878ec9682bb623c54"}, - {file = "MarkupSafe-2.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4cf06cdc1dda95223e9d2d3c58d3b178aa5dacb35ee7e3bbac10e4e1faacb419"}, - {file = "MarkupSafe-2.1.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22731d79ed2eb25059ae3df1dfc9cb1546691cc41f4e3130fe6bfbc3ecbbecfa"}, - {file = "MarkupSafe-2.1.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f8ffb705ffcf5ddd0e80b65ddf7bed7ee4f5a441ea7d3419e861a12eaf41af58"}, - {file = "MarkupSafe-2.1.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:8db032bf0ce9022a8e41a22598eefc802314e81b879ae093f36ce9ddf39ab1ba"}, - {file = "MarkupSafe-2.1.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2298c859cfc5463f1b64bd55cb3e602528db6fa0f3cfd568d3605c50678f8f03"}, - {file = "MarkupSafe-2.1.2-cp38-cp38-win32.whl", hash = "sha256:50c42830a633fa0cf9e7d27664637532791bfc31c731a87b202d2d8ac40c3ea2"}, - {file = "MarkupSafe-2.1.2-cp38-cp38-win_amd64.whl", hash = "sha256:bb06feb762bade6bf3c8b844462274db0c76acc95c52abe8dbed28ae3d44a147"}, - {file = "MarkupSafe-2.1.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:99625a92da8229df6d44335e6fcc558a5037dd0a760e11d84be2260e6f37002f"}, - {file = "MarkupSafe-2.1.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8bca7e26c1dd751236cfb0c6c72d4ad61d986e9a41bbf76cb445f69488b2a2bd"}, - {file = "MarkupSafe-2.1.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40627dcf047dadb22cd25ea7ecfe9cbf3bbbad0482ee5920b582f3809c97654f"}, - {file = "MarkupSafe-2.1.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40dfd3fefbef579ee058f139733ac336312663c6706d1163b82b3003fb1925c4"}, - {file = "MarkupSafe-2.1.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:090376d812fb6ac5f171e5938e82e7f2d7adc2b629101cec0db8b267815c85e2"}, - {file = "MarkupSafe-2.1.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2e7821bffe00aa6bd07a23913b7f4e01328c3d5cc0b40b36c0bd81d362faeb65"}, - {file = "MarkupSafe-2.1.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:c0a33bc9f02c2b17c3ea382f91b4db0e6cde90b63b296422a939886a7a80de1c"}, - {file = "MarkupSafe-2.1.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b8526c6d437855442cdd3d87eede9c425c4445ea011ca38d937db299382e6fa3"}, - {file = "MarkupSafe-2.1.2-cp39-cp39-win32.whl", hash = "sha256:137678c63c977754abe9086a3ec011e8fd985ab90631145dfb9294ad09c102a7"}, - {file = "MarkupSafe-2.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:0576fe974b40a400449768941d5d0858cc624e3249dfd1e0c33674e5c7ca7aed"}, - {file = "MarkupSafe-2.1.2.tar.gz", hash = "sha256:abcabc8c2b26036d62d4c746381a6f7cf60aafcc653198ad678306986b09450d"}, -] - -[[package]] -name = "mdurl" -version = "0.1.2" -description = "Markdown URL utilities" -optional = false -python-versions = ">=3.7" -files = [ - {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, - {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, -] - -[[package]] -name = "mergedeep" -version = "1.3.4" -description = "A deep merge function for 🐍." -optional = false -python-versions = ">=3.6" -files = [ - {file = "mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307"}, - {file = "mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8"}, -] - -[[package]] -name = "mkdocs" -version = "1.4.3" -description = "Project documentation with Markdown." -optional = false -python-versions = ">=3.7" -files = [ - {file = "mkdocs-1.4.3-py3-none-any.whl", hash = "sha256:6ee46d309bda331aac915cd24aab882c179a933bd9e77b80ce7d2eaaa3f689dd"}, - {file = "mkdocs-1.4.3.tar.gz", hash = "sha256:5955093bbd4dd2e9403c5afaf57324ad8b04f16886512a3ee6ef828956481c57"}, -] - -[package.dependencies] -click = ">=7.0" -colorama = {version = ">=0.4", markers = "platform_system == \"Windows\""} -ghp-import = ">=1.0" -importlib-metadata = {version = ">=4.3", markers = "python_version < \"3.10\""} -jinja2 = ">=2.11.1" -markdown = ">=3.2.1,<3.4" -mergedeep = ">=1.3.4" -packaging = ">=20.5" -pyyaml = ">=5.1" -pyyaml-env-tag = ">=0.1" -watchdog = ">=2.0" - -[package.extras] -i18n = ["babel (>=2.9.0)"] -min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-import (==1.0)", "importlib-metadata (==4.3)", "jinja2 (==2.11.1)", "markdown (==3.2.1)", "markupsafe (==2.0.1)", "mergedeep (==1.3.4)", "packaging (==20.5)", "pyyaml (==5.1)", "pyyaml-env-tag (==0.1)", "typing-extensions (==3.10)", "watchdog (==2.0)"] - -[[package]] -name = "mkdocs-autorefs" -version = "0.4.1" -description = "Automatically link across pages in MkDocs." -optional = false -python-versions = ">=3.7" -files = [ - {file = "mkdocs-autorefs-0.4.1.tar.gz", hash = "sha256:70748a7bd025f9ecd6d6feeba8ba63f8e891a1af55f48e366d6d6e78493aba84"}, - {file = "mkdocs_autorefs-0.4.1-py3-none-any.whl", hash = "sha256:a2248a9501b29dc0cc8ba4c09f4f47ff121945f6ce33d760f145d6f89d313f5b"}, -] - -[package.dependencies] -Markdown = ">=3.3" -mkdocs = ">=1.1" - -[[package]] -name = "mkdocs-material" -version = "9.1.17" -description = "Documentation that simply works" -optional = false -python-versions = ">=3.7" -files = [ - {file = "mkdocs_material-9.1.17-py3-none-any.whl", hash = "sha256:809ed68427fbab0330b0b07bc93175824c3b98f4187060a5c7b46aa8ae398a75"}, - {file = "mkdocs_material-9.1.17.tar.gz", hash = "sha256:5a076524625047bf4ee4da1509ec90626f8fce915839dc07bdae6b59ff4f36f9"}, -] - -[package.dependencies] -colorama = ">=0.4" -jinja2 = ">=3.0" -markdown = ">=3.2" -mkdocs = ">=1.4.2" -mkdocs-material-extensions = ">=1.1" -pygments = ">=2.14" -pymdown-extensions = ">=9.9.1" -regex = ">=2022.4.24" -requests = ">=2.26" - -[[package]] -name = "mkdocs-material-extensions" -version = "1.1.1" -description = "Extension pack for Python Markdown and MkDocs Material." -optional = false -python-versions = ">=3.7" -files = [ - {file = "mkdocs_material_extensions-1.1.1-py3-none-any.whl", hash = "sha256:e41d9f38e4798b6617ad98ca8f7f1157b1e4385ac1459ca1e4ea219b556df945"}, - {file = "mkdocs_material_extensions-1.1.1.tar.gz", hash = "sha256:9c003da71e2cc2493d910237448c672e00cefc800d3d6ae93d2fc69979e3bd93"}, -] - -[[package]] -name = "mkdocstrings" -version = "0.22.0" -description = "Automatic documentation from sources, for MkDocs." -optional = false -python-versions = ">=3.7" -files = [ - {file = "mkdocstrings-0.22.0-py3-none-any.whl", hash = "sha256:2d4095d461554ff6a778fdabdca3c00c468c2f1459d469f7a7f622a2b23212ba"}, - {file = "mkdocstrings-0.22.0.tar.gz", hash = "sha256:82a33b94150ebb3d4b5c73bab4598c3e21468c79ec072eff6931c8f3bfc38256"}, -] - -[package.dependencies] -importlib-metadata = {version = ">=4.6", markers = "python_version < \"3.10\""} -Jinja2 = ">=2.11.1" -Markdown = ">=3.3" -MarkupSafe = ">=1.1" -mkdocs = ">=1.2" -mkdocs-autorefs = ">=0.3.1" -mkdocstrings-python = {version = ">=0.5.2", optional = true, markers = "extra == \"python\""} -pymdown-extensions = ">=6.3" -typing-extensions = {version = ">=4.1", markers = "python_version < \"3.10\""} - -[package.extras] -crystal = ["mkdocstrings-crystal (>=0.3.4)"] -python = ["mkdocstrings-python (>=0.5.2)"] -python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] - -[[package]] -name = "mkdocstrings-python" -version = "0.10.1" -description = "A Python handler for mkdocstrings." -optional = false -python-versions = ">=3.7" -files = [ - {file = "mkdocstrings_python-0.10.1-py3-none-any.whl", hash = "sha256:ef239cee2c688e2b949a0a47e42a141d744dd12b7007311b3309dc70e3bafc5c"}, - {file = "mkdocstrings_python-0.10.1.tar.gz", hash = "sha256:b72301fff739070ec517b5b36bf2f7c49d1360a275896a64efb97fc17d3f3968"}, -] - -[package.dependencies] -griffe = ">=0.24" -mkdocstrings = ">=0.20" - -[[package]] -name = "more-itertools" -version = "9.1.0" -description = "More routines for operating on iterables, beyond itertools" -optional = false -python-versions = ">=3.7" -files = [ - {file = "more-itertools-9.1.0.tar.gz", hash = "sha256:cabaa341ad0389ea83c17a94566a53ae4c9d07349861ecb14dc6d0345cf9ac5d"}, - {file = "more_itertools-9.1.0-py3-none-any.whl", hash = "sha256:d2bc7f02446e86a68911e58ded76d6561eea00cddfb2a91e7019bbb586c799f3"}, -] - -[[package]] -name = "mypy-extensions" -version = "1.0.0" -description = "Type system extensions for programs checked with the mypy type checker." -optional = false -python-versions = ">=3.5" -files = [ - {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, - {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, -] - -[[package]] -name = "numpy" -version = "1.24.4" -description = "Fundamental package for array computing in Python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"}, - {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"}, - {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4"}, - {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6"}, - {file = "numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc"}, - {file = "numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e"}, - {file = "numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810"}, - {file = "numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254"}, - {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7"}, - {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5"}, - {file = "numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d"}, - {file = "numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694"}, - {file = "numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61"}, - {file = "numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f"}, - {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e"}, - {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc"}, - {file = "numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2"}, - {file = "numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706"}, - {file = "numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400"}, - {file = "numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f"}, - {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9"}, - {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d"}, - {file = "numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835"}, - {file = "numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8"}, - {file = "numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef"}, - {file = "numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a"}, - {file = "numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2"}, - {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"}, -] - -[[package]] -name = "packaging" -version = "23.1" -description = "Core utilities for Python packages" -optional = false -python-versions = ">=3.7" -files = [ - {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, - {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, -] - -[[package]] -name = "pathspec" -version = "0.11.1" -description = "Utility library for gitignore style pattern matching of file paths." -optional = false -python-versions = ">=3.7" -files = [ - {file = "pathspec-0.11.1-py3-none-any.whl", hash = "sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293"}, - {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"}, -] - -[[package]] -name = "pkginfo" -version = "1.9.6" -description = "Query metadata from sdists / bdists / installed packages." -optional = false -python-versions = ">=3.6" -files = [ - {file = "pkginfo-1.9.6-py3-none-any.whl", hash = "sha256:4b7a555a6d5a22169fcc9cf7bfd78d296b0361adad412a346c1226849af5e546"}, - {file = "pkginfo-1.9.6.tar.gz", hash = "sha256:8fd5896e8718a4372f0ea9cc9d96f6417c9b986e23a4d116dda26b62cc29d046"}, -] - -[package.extras] -testing = ["pytest", "pytest-cov"] - -[[package]] -name = "platformdirs" -version = "3.5.0" -description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." -optional = false -python-versions = ">=3.7" -files = [ - {file = "platformdirs-3.5.0-py3-none-any.whl", hash = "sha256:47692bc24c1958e8b0f13dd727307cff1db103fca36399f457da8e05f222fdc4"}, - {file = "platformdirs-3.5.0.tar.gz", hash = "sha256:7954a68d0ba23558d753f73437c55f89027cf8f5108c19844d4b82e5af396335"}, -] - -[package.extras] -docs = ["furo (>=2023.3.27)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] -test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] - -[[package]] -name = "pluggy" -version = "1.0.0" -description = "plugin and hook calling mechanisms for python" -optional = false -python-versions = ">=3.6" -files = [ - {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, - {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, -] - -[package.extras] -dev = ["pre-commit", "tox"] -testing = ["pytest", "pytest-benchmark"] - -[[package]] -name = "pycparser" -version = "2.21" -description = "C parser in Python" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, - {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, -] - -[[package]] -name = "pydantic" -version = "1.10.7" -description = "Data validation and settings management using python type hints" -optional = true -python-versions = ">=3.7" -files = [ - {file = "pydantic-1.10.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e79e999e539872e903767c417c897e729e015872040e56b96e67968c3b918b2d"}, - {file = "pydantic-1.10.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:01aea3a42c13f2602b7ecbbea484a98169fb568ebd9e247593ea05f01b884b2e"}, - {file = "pydantic-1.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:516f1ed9bc2406a0467dd777afc636c7091d71f214d5e413d64fef45174cfc7a"}, - {file = "pydantic-1.10.7-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae150a63564929c675d7f2303008d88426a0add46efd76c3fc797cd71cb1b46f"}, - {file = "pydantic-1.10.7-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ecbbc51391248116c0a055899e6c3e7ffbb11fb5e2a4cd6f2d0b93272118a209"}, - {file = "pydantic-1.10.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f4a2b50e2b03d5776e7f21af73e2070e1b5c0d0df255a827e7c632962f8315af"}, - {file = "pydantic-1.10.7-cp310-cp310-win_amd64.whl", hash = "sha256:a7cd2251439988b413cb0a985c4ed82b6c6aac382dbaff53ae03c4b23a70e80a"}, - {file = "pydantic-1.10.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:68792151e174a4aa9e9fc1b4e653e65a354a2fa0fed169f7b3d09902ad2cb6f1"}, - {file = "pydantic-1.10.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe2507b8ef209da71b6fb5f4e597b50c5a34b78d7e857c4f8f3115effaef5fe"}, - {file = "pydantic-1.10.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10a86d8c8db68086f1e30a530f7d5f83eb0685e632e411dbbcf2d5c0150e8dcd"}, - {file = "pydantic-1.10.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d75ae19d2a3dbb146b6f324031c24f8a3f52ff5d6a9f22f0683694b3afcb16fb"}, - {file = "pydantic-1.10.7-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:464855a7ff7f2cc2cf537ecc421291b9132aa9c79aef44e917ad711b4a93163b"}, - {file = "pydantic-1.10.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:193924c563fae6ddcb71d3f06fa153866423ac1b793a47936656e806b64e24ca"}, - {file = "pydantic-1.10.7-cp311-cp311-win_amd64.whl", hash = "sha256:b4a849d10f211389502059c33332e91327bc154acc1845f375a99eca3afa802d"}, - {file = "pydantic-1.10.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cc1dde4e50a5fc1336ee0581c1612215bc64ed6d28d2c7c6f25d2fe3e7c3e918"}, - {file = "pydantic-1.10.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0cfe895a504c060e5d36b287ee696e2fdad02d89e0d895f83037245218a87fe"}, - {file = "pydantic-1.10.7-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:670bb4683ad1e48b0ecb06f0cfe2178dcf74ff27921cdf1606e527d2617a81ee"}, - {file = "pydantic-1.10.7-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:950ce33857841f9a337ce07ddf46bc84e1c4946d2a3bba18f8280297157a3fd1"}, - {file = "pydantic-1.10.7-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c15582f9055fbc1bfe50266a19771bbbef33dd28c45e78afbe1996fd70966c2a"}, - {file = "pydantic-1.10.7-cp37-cp37m-win_amd64.whl", hash = "sha256:82dffb306dd20bd5268fd6379bc4bfe75242a9c2b79fec58e1041fbbdb1f7914"}, - {file = "pydantic-1.10.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8c7f51861d73e8b9ddcb9916ae7ac39fb52761d9ea0df41128e81e2ba42886cd"}, - {file = "pydantic-1.10.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6434b49c0b03a51021ade5c4daa7d70c98f7a79e95b551201fff682fc1661245"}, - {file = "pydantic-1.10.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64d34ab766fa056df49013bb6e79921a0265204c071984e75a09cbceacbbdd5d"}, - {file = "pydantic-1.10.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:701daea9ffe9d26f97b52f1d157e0d4121644f0fcf80b443248434958fd03dc3"}, - {file = "pydantic-1.10.7-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:cf135c46099ff3f919d2150a948ce94b9ce545598ef2c6c7bf55dca98a304b52"}, - {file = "pydantic-1.10.7-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b0f85904f73161817b80781cc150f8b906d521fa11e3cdabae19a581c3606209"}, - {file = "pydantic-1.10.7-cp38-cp38-win_amd64.whl", hash = "sha256:9f6f0fd68d73257ad6685419478c5aece46432f4bdd8d32c7345f1986496171e"}, - {file = "pydantic-1.10.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c230c0d8a322276d6e7b88c3f7ce885f9ed16e0910354510e0bae84d54991143"}, - {file = "pydantic-1.10.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:976cae77ba6a49d80f461fd8bba183ff7ba79f44aa5cfa82f1346b5626542f8e"}, - {file = "pydantic-1.10.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d45fc99d64af9aaf7e308054a0067fdcd87ffe974f2442312372dfa66e1001d"}, - {file = "pydantic-1.10.7-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2a5ebb48958754d386195fe9e9c5106f11275867051bf017a8059410e9abf1f"}, - {file = "pydantic-1.10.7-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:abfb7d4a7cd5cc4e1d1887c43503a7c5dd608eadf8bc615413fc498d3e4645cd"}, - {file = "pydantic-1.10.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:80b1fab4deb08a8292d15e43a6edccdffa5377a36a4597bb545b93e79c5ff0a5"}, - {file = "pydantic-1.10.7-cp39-cp39-win_amd64.whl", hash = "sha256:d71e69699498b020ea198468e2480a2f1e7433e32a3a99760058c6520e2bea7e"}, - {file = "pydantic-1.10.7-py3-none-any.whl", hash = "sha256:0cd181f1d0b1d00e2b705f1bf1ac7799a2d938cce3376b8007df62b29be3c2c6"}, - {file = "pydantic-1.10.7.tar.gz", hash = "sha256:cfc83c0678b6ba51b0532bea66860617c4cd4251ecf76e9846fa5a9f3454e97e"}, -] - -[package.dependencies] -typing-extensions = ">=4.2.0" - -[package.extras] -dotenv = ["python-dotenv (>=0.10.4)"] -email = ["email-validator (>=1.0.3)"] - -[[package]] -name = "pygments" -version = "2.15.1" -description = "Pygments is a syntax highlighting package written in Python." -optional = false -python-versions = ">=3.7" -files = [ - {file = "Pygments-2.15.1-py3-none-any.whl", hash = "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1"}, - {file = "Pygments-2.15.1.tar.gz", hash = "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c"}, -] - -[package.extras] -plugins = ["importlib-metadata"] - -[[package]] -name = "pymdown-extensions" -version = "9.11" -description = "Extension pack for Python Markdown." -optional = false -python-versions = ">=3.7" -files = [ - {file = "pymdown_extensions-9.11-py3-none-any.whl", hash = "sha256:a499191d8d869f30339de86fcf072a787e86c42b6f16f280f5c2cf174182b7f3"}, - {file = "pymdown_extensions-9.11.tar.gz", hash = "sha256:f7e86c1d3981f23d9dc43294488ecb54abadd05b0be4bf8f0e15efc90f7853ff"}, -] - -[package.dependencies] -markdown = ">=3.2" -pyyaml = "*" - -[[package]] -name = "pytest" -version = "7.4.0" -description = "pytest: simple powerful testing with Python" -optional = false -python-versions = ">=3.7" -files = [ - {file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"}, - {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "sys_platform == \"win32\""} -exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} -iniconfig = "*" -packaging = "*" -pluggy = ">=0.12,<2.0" -tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} - -[package.extras] -testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] - -[[package]] -name = "python-dateutil" -version = "2.8.2" -description = "Extensions to the standard Python datetime module" -optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" -files = [ - {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, - {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, -] - -[package.dependencies] -six = ">=1.5" - -[[package]] -name = "pywin32-ctypes" -version = "0.2.0" -description = "" -optional = false -python-versions = "*" -files = [ - {file = "pywin32-ctypes-0.2.0.tar.gz", hash = "sha256:24ffc3b341d457d48e8922352130cf2644024a4ff09762a2261fd34c36ee5942"}, - {file = "pywin32_ctypes-0.2.0-py2.py3-none-any.whl", hash = "sha256:9dc2d991b3479cc2df15930958b674a48a227d5361d413827a4cfd0b5876fc98"}, -] - -[[package]] -name = "pyyaml" -version = "6.0" -description = "YAML parser and emitter for Python" -optional = false -python-versions = ">=3.6" -files = [ - {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, - {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"}, - {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, - {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, - {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, - {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, - {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, - {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, - {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, - {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, - {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"}, - {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"}, - {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"}, - {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"}, - {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"}, - {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"}, - {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"}, - {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"}, - {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"}, - {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"}, - {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"}, - {file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"}, - {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"}, - {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"}, - {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"}, - {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"}, - {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, -] - -[[package]] -name = "pyyaml-env-tag" -version = "0.1" -description = "A custom YAML tag for referencing environment variables in YAML files. " -optional = false -python-versions = ">=3.6" -files = [ - {file = "pyyaml_env_tag-0.1-py3-none-any.whl", hash = "sha256:af31106dec8a4d68c60207c1886031cbf839b68aa7abccdb19868200532c2069"}, - {file = "pyyaml_env_tag-0.1.tar.gz", hash = "sha256:70092675bda14fdec33b31ba77e7543de9ddc88f2e5b99160396572d11525bdb"}, -] - -[package.dependencies] -pyyaml = "*" - -[[package]] -name = "readme-renderer" -version = "37.3" -description = "readme_renderer is a library for rendering \"readme\" descriptions for Warehouse" -optional = false -python-versions = ">=3.7" -files = [ - {file = "readme_renderer-37.3-py3-none-any.whl", hash = "sha256:f67a16caedfa71eef48a31b39708637a6f4664c4394801a7b0d6432d13907343"}, - {file = "readme_renderer-37.3.tar.gz", hash = "sha256:cd653186dfc73055656f090f227f5cb22a046d7f71a841dfa305f55c9a513273"}, -] - -[package.dependencies] -bleach = ">=2.1.0" -docutils = ">=0.13.1" -Pygments = ">=2.5.1" - -[package.extras] -md = ["cmarkgfm (>=0.8.0)"] - -[[package]] -name = "regex" -version = "2023.5.5" -description = "Alternative regular expression module, to replace re." -optional = false -python-versions = ">=3.6" -files = [ - {file = "regex-2023.5.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:48c9ec56579d4ba1c88f42302194b8ae2350265cb60c64b7b9a88dcb7fbde309"}, - {file = "regex-2023.5.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02f4541550459c08fdd6f97aa4e24c6f1932eec780d58a2faa2068253df7d6ff"}, - {file = "regex-2023.5.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:53e22e4460f0245b468ee645156a4f84d0fc35a12d9ba79bd7d79bdcd2f9629d"}, - {file = "regex-2023.5.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b870b6f632fc74941cadc2a0f3064ed8409e6f8ee226cdfd2a85ae50473aa94"}, - {file = "regex-2023.5.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:171c52e320fe29260da550d81c6b99f6f8402450dc7777ef5ced2e848f3b6f8f"}, - {file = "regex-2023.5.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aad5524c2aedaf9aa14ef1bc9327f8abd915699dea457d339bebbe2f0d218f86"}, - {file = "regex-2023.5.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a0f874ee8c0bc820e649c900243c6d1e6dc435b81da1492046716f14f1a2a96"}, - {file = "regex-2023.5.5-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e645c757183ee0e13f0bbe56508598e2d9cd42b8abc6c0599d53b0d0b8dd1479"}, - {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:a4c5da39bca4f7979eefcbb36efea04471cd68db2d38fcbb4ee2c6d440699833"}, - {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5e3f4468b8c6fd2fd33c218bbd0a1559e6a6fcf185af8bb0cc43f3b5bfb7d636"}, - {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:59e4b729eae1a0919f9e4c0fc635fbcc9db59c74ad98d684f4877be3d2607dd6"}, - {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ba73a14e9c8f9ac409863543cde3290dba39098fc261f717dc337ea72d3ebad2"}, - {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0bbd5dcb19603ab8d2781fac60114fb89aee8494f4505ae7ad141a3314abb1f9"}, - {file = "regex-2023.5.5-cp310-cp310-win32.whl", hash = "sha256:40005cbd383438aecf715a7b47fe1e3dcbc889a36461ed416bdec07e0ef1db66"}, - {file = "regex-2023.5.5-cp310-cp310-win_amd64.whl", hash = "sha256:59597cd6315d3439ed4b074febe84a439c33928dd34396941b4d377692eca810"}, - {file = "regex-2023.5.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8f08276466fedb9e36e5193a96cb944928301152879ec20c2d723d1031cd4ddd"}, - {file = "regex-2023.5.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cd46f30e758629c3ee91713529cfbe107ac50d27110fdcc326a42ce2acf4dafc"}, - {file = "regex-2023.5.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2910502f718828cecc8beff004917dcf577fc5f8f5dd40ffb1ea7612124547b"}, - {file = "regex-2023.5.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:445d6f4fc3bd9fc2bf0416164454f90acab8858cd5a041403d7a11e3356980e8"}, - {file = "regex-2023.5.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18196c16a584619c7c1d843497c069955d7629ad4a3fdee240eb347f4a2c9dbe"}, - {file = "regex-2023.5.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33d430a23b661629661f1fe8395be2004006bc792bb9fc7c53911d661b69dd7e"}, - {file = "regex-2023.5.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:72a28979cc667e5f82ef433db009184e7ac277844eea0f7f4d254b789517941d"}, - {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f764e4dfafa288e2eba21231f455d209f4709436baeebb05bdecfb5d8ddc3d35"}, - {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:23d86ad2121b3c4fc78c58f95e19173790e22ac05996df69b84e12da5816cb17"}, - {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:690a17db524ee6ac4a27efc5406530dd90e7a7a69d8360235323d0e5dafb8f5b"}, - {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:1ecf3dcff71f0c0fe3e555201cbe749fa66aae8d18f80d2cc4de8e66df37390a"}, - {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:811040d7f3dd9c55eb0d8b00b5dcb7fd9ae1761c454f444fd9f37fe5ec57143a"}, - {file = "regex-2023.5.5-cp311-cp311-win32.whl", hash = "sha256:c8c143a65ce3ca42e54d8e6fcaf465b6b672ed1c6c90022794a802fb93105d22"}, - {file = "regex-2023.5.5-cp311-cp311-win_amd64.whl", hash = "sha256:586a011f77f8a2da4b888774174cd266e69e917a67ba072c7fc0e91878178a80"}, - {file = "regex-2023.5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:b6365703e8cf1644b82104cdd05270d1a9f043119a168d66c55684b1b557d008"}, - {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a56c18f21ac98209da9c54ae3ebb3b6f6e772038681d6cb43b8d53da3b09ee81"}, - {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8b942d8b3ce765dbc3b1dad0a944712a89b5de290ce8f72681e22b3c55f3cc8"}, - {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:844671c9c1150fcdac46d43198364034b961bd520f2c4fdaabfc7c7d7138a2dd"}, - {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c2ce65bdeaf0a386bb3b533a28de3994e8e13b464ac15e1e67e4603dd88787fa"}, - {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fee0016cc35a8a91e8cc9312ab26a6fe638d484131a7afa79e1ce6165328a135"}, - {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:18f05d14f14a812fe9723f13afafefe6b74ca042d99f8884e62dbd34dcccf3e2"}, - {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:941b3f1b2392f0bcd6abf1bc7a322787d6db4e7457be6d1ffd3a693426a755f2"}, - {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:921473a93bcea4d00295799ab929522fc650e85c6b9f27ae1e6bb32a790ea7d3"}, - {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:e2205a81f815b5bb17e46e74cc946c575b484e5f0acfcb805fb252d67e22938d"}, - {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:385992d5ecf1a93cb85adff2f73e0402dd9ac29b71b7006d342cc920816e6f32"}, - {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:890a09cb0a62198bff92eda98b2b507305dd3abf974778bae3287f98b48907d3"}, - {file = "regex-2023.5.5-cp36-cp36m-win32.whl", hash = "sha256:821a88b878b6589c5068f4cc2cfeb2c64e343a196bc9d7ac68ea8c2a776acd46"}, - {file = "regex-2023.5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:7918a1b83dd70dc04ab5ed24c78ae833ae8ea228cef84e08597c408286edc926"}, - {file = "regex-2023.5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:338994d3d4ca4cf12f09822e025731a5bdd3a37aaa571fa52659e85ca793fb67"}, - {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a69cf0c00c4d4a929c6c7717fd918414cab0d6132a49a6d8fc3ded1988ed2ea"}, - {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8f5e06df94fff8c4c85f98c6487f6636848e1dc85ce17ab7d1931df4a081f657"}, - {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8906669b03c63266b6a7693d1f487b02647beb12adea20f8840c1a087e2dfb5"}, - {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fda3e50abad8d0f48df621cf75adc73c63f7243cbe0e3b2171392b445401550"}, - {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ac2b7d341dc1bd102be849d6dd33b09701223a851105b2754339e390be0627a"}, - {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:fb2b495dd94b02de8215625948132cc2ea360ae84fe6634cd19b6567709c8ae2"}, - {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:aa7d032c1d84726aa9edeb6accf079b4caa87151ca9fabacef31fa028186c66d"}, - {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:3d45864693351c15531f7e76f545ec35000d50848daa833cead96edae1665559"}, - {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21e90a288e6ba4bf44c25c6a946cb9b0f00b73044d74308b5e0afd190338297c"}, - {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:10250a093741ec7bf74bcd2039e697f519b028518f605ff2aa7ac1e9c9f97423"}, - {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6b8d0c153f07a953636b9cdb3011b733cadd4178123ef728ccc4d5969e67f3c2"}, - {file = "regex-2023.5.5-cp37-cp37m-win32.whl", hash = "sha256:10374c84ee58c44575b667310d5bbfa89fb2e64e52349720a0182c0017512f6c"}, - {file = "regex-2023.5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:9b320677521aabf666cdd6e99baee4fb5ac3996349c3b7f8e7c4eee1c00dfe3a"}, - {file = "regex-2023.5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:afb1c70ec1e594a547f38ad6bf5e3d60304ce7539e677c1429eebab115bce56e"}, - {file = "regex-2023.5.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cf123225945aa58b3057d0fba67e8061c62d14cc8a4202630f8057df70189051"}, - {file = "regex-2023.5.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a99757ad7fe5c8a2bb44829fc57ced11253e10f462233c1255fe03888e06bc19"}, - {file = "regex-2023.5.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a623564d810e7a953ff1357f7799c14bc9beeab699aacc8b7ab7822da1e952b8"}, - {file = "regex-2023.5.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ced02e3bd55e16e89c08bbc8128cff0884d96e7f7a5633d3dc366b6d95fcd1d6"}, - {file = "regex-2023.5.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1cbe6b5be3b9b698d8cc4ee4dee7e017ad655e83361cd0ea8e653d65e469468"}, - {file = "regex-2023.5.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a6e4b0e0531223f53bad07ddf733af490ba2b8367f62342b92b39b29f72735a"}, - {file = "regex-2023.5.5-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2e9c4f778514a560a9c9aa8e5538bee759b55f6c1dcd35613ad72523fd9175b8"}, - {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:256f7f4c6ba145f62f7a441a003c94b8b1af78cee2cccacfc1e835f93bc09426"}, - {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:bd7b68fd2e79d59d86dcbc1ccd6e2ca09c505343445daaa4e07f43c8a9cc34da"}, - {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4a5059bd585e9e9504ef9c07e4bc15b0a621ba20504388875d66b8b30a5c4d18"}, - {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:6893544e06bae009916a5658ce7207e26ed17385149f35a3125f5259951f1bbe"}, - {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c64d5abe91a3dfe5ff250c6bb267ef00dbc01501518225b45a5f9def458f31fb"}, - {file = "regex-2023.5.5-cp38-cp38-win32.whl", hash = "sha256:7923470d6056a9590247ff729c05e8e0f06bbd4efa6569c916943cb2d9b68b91"}, - {file = "regex-2023.5.5-cp38-cp38-win_amd64.whl", hash = "sha256:4035d6945cb961c90c3e1c1ca2feb526175bcfed44dfb1cc77db4fdced060d3e"}, - {file = "regex-2023.5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:50fd2d9b36938d4dcecbd684777dd12a407add4f9f934f235c66372e630772b0"}, - {file = "regex-2023.5.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d19e57f888b00cd04fc38f5e18d0efbd91ccba2d45039453ab2236e6eec48d4d"}, - {file = "regex-2023.5.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd966475e963122ee0a7118ec9024388c602d12ac72860f6eea119a3928be053"}, - {file = "regex-2023.5.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db09e6c18977a33fea26fe67b7a842f706c67cf8bda1450974d0ae0dd63570df"}, - {file = "regex-2023.5.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6164d4e2a82f9ebd7752a06bd6c504791bedc6418c0196cd0a23afb7f3e12b2d"}, - {file = "regex-2023.5.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84397d3f750d153ebd7f958efaa92b45fea170200e2df5e0e1fd4d85b7e3f58a"}, - {file = "regex-2023.5.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9c3efee9bb53cbe7b285760c81f28ac80dc15fa48b5fe7e58b52752e642553f1"}, - {file = "regex-2023.5.5-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:144b5b017646b5a9392a5554a1e5db0000ae637be4971c9747566775fc96e1b2"}, - {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:1189fbbb21e2c117fda5303653b61905aeeeea23de4a94d400b0487eb16d2d60"}, - {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f83fe9e10f9d0b6cf580564d4d23845b9d692e4c91bd8be57733958e4c602956"}, - {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:72aa4746993a28c841e05889f3f1b1e5d14df8d3daa157d6001a34c98102b393"}, - {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:de2f780c3242ea114dd01f84848655356af4dd561501896c751d7b885ea6d3a1"}, - {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:290fd35219486dfbc00b0de72f455ecdd63e59b528991a6aec9fdfc0ce85672e"}, - {file = "regex-2023.5.5-cp39-cp39-win32.whl", hash = "sha256:732176f5427e72fa2325b05c58ad0b45af341c459910d766f814b0584ac1f9ac"}, - {file = "regex-2023.5.5-cp39-cp39-win_amd64.whl", hash = "sha256:1307aa4daa1cbb23823d8238e1f61292fd07e4e5d8d38a6efff00b67a7cdb764"}, - {file = "regex-2023.5.5.tar.gz", hash = "sha256:7d76a8a1fc9da08296462a18f16620ba73bcbf5909e42383b253ef34d9d5141e"}, -] - -[[package]] -name = "requests" -version = "2.30.0" -description = "Python HTTP for Humans." -optional = false -python-versions = ">=3.7" -files = [ - {file = "requests-2.30.0-py3-none-any.whl", hash = "sha256:10e94cc4f3121ee6da529d358cdaeaff2f1c409cd377dbc72b825852f2f7e294"}, - {file = "requests-2.30.0.tar.gz", hash = "sha256:239d7d4458afcb28a692cdd298d87542235f4ca8d36d03a15bfc128a6559a2f4"}, -] - -[package.dependencies] -certifi = ">=2017.4.17" -charset-normalizer = ">=2,<4" -idna = ">=2.5,<4" -urllib3 = ">=1.21.1,<3" - -[package.extras] -socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] - -[[package]] -name = "requests-toolbelt" -version = "1.0.0" -description = "A utility belt for advanced users of python-requests" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, - {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, -] - -[package.dependencies] -requests = ">=2.0.1,<3.0.0" - -[[package]] -name = "rfc3986" -version = "2.0.0" -description = "Validating URI References per RFC 3986" -optional = false -python-versions = ">=3.7" -files = [ - {file = "rfc3986-2.0.0-py2.py3-none-any.whl", hash = "sha256:50b1502b60e289cb37883f3dfd34532b8873c7de9f49bb546641ce9cbd256ebd"}, - {file = "rfc3986-2.0.0.tar.gz", hash = "sha256:97aacf9dbd4bfd829baad6e6309fa6573aaf1be3f6fa735c8ab05e46cecb261c"}, -] - -[package.extras] -idna2008 = ["idna"] - -[[package]] -name = "rich" -version = "13.3.5" -description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" -optional = false -python-versions = ">=3.7.0" -files = [ - {file = "rich-13.3.5-py3-none-any.whl", hash = "sha256:69cdf53799e63f38b95b9bf9c875f8c90e78dd62b2f00c13a911c7a3b9fa4704"}, - {file = "rich-13.3.5.tar.gz", hash = "sha256:2d11b9b8dd03868f09b4fffadc84a6a8cda574e40dc90821bd845720ebb8e89c"}, -] - -[package.dependencies] -markdown-it-py = ">=2.2.0,<3.0.0" -pygments = ">=2.13.0,<3.0.0" -typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.9\""} - -[package.extras] -jupyter = ["ipywidgets (>=7.5.1,<9)"] - -[[package]] -name = "scikit-build" -version = "0.17.6" -description = "Improved build system generator for Python C/C++/Fortran/Cython extensions" -optional = false -python-versions = ">=3.7" -files = [ - {file = "scikit_build-0.17.6-py3-none-any.whl", hash = "sha256:18bd55e81841106eec93f30a297df4f301003791c41be46ef6428d58bd42d6b3"}, - {file = "scikit_build-0.17.6.tar.gz", hash = "sha256:b51a51a36b37c42650994b5047912f59b22e3210b23e321f287611f9ef6e5c9d"}, -] - -[package.dependencies] -distro = "*" -packaging = "*" -setuptools = ">=42.0.0" -tomli = {version = "*", markers = "python_version < \"3.11\""} -wheel = ">=0.32.0" - -[package.extras] -cov = ["coverage[toml] (>=4.2)", "pytest-cov (>=2.7.1)"] -docs = ["pygments", "sphinx (>=4)", "sphinx-issues", "sphinx-rtd-theme (>=1.0)", "sphinxcontrib-moderncmakedomain (>=3.19)"] -doctest = ["ubelt (>=0.8.2)", "xdoctest (>=0.10.0)"] -test = ["build (>=0.7)", "cython (>=0.25.1)", "importlib-metadata", "pytest (>=6.0.0)", "pytest-mock (>=1.10.4)", "requests", "virtualenv"] - -[[package]] -name = "secretstorage" -version = "3.3.3" -description = "Python bindings to FreeDesktop.org Secret Service API" -optional = false -python-versions = ">=3.6" -files = [ - {file = "SecretStorage-3.3.3-py3-none-any.whl", hash = "sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99"}, - {file = "SecretStorage-3.3.3.tar.gz", hash = "sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77"}, -] - -[package.dependencies] -cryptography = ">=2.0" -jeepney = ">=0.6" - -[[package]] -name = "setuptools" -version = "67.7.2" -description = "Easily download, build, install, upgrade, and uninstall Python packages" -optional = false -python-versions = ">=3.7" -files = [ - {file = "setuptools-67.7.2-py3-none-any.whl", hash = "sha256:23aaf86b85ca52ceb801d32703f12d77517b2556af839621c641fca11287952b"}, - {file = "setuptools-67.7.2.tar.gz", hash = "sha256:f104fa03692a2602fa0fec6c6a9e63b6c8a968de13e17c026957dd1f53d80990"}, -] - -[package.extras] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] -testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] -testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] - -[[package]] -name = "six" -version = "1.16.0" -description = "Python 2 and 3 compatibility utilities" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" -files = [ - {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, - {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, -] - -[[package]] -name = "sniffio" -version = "1.3.0" -description = "Sniff out which async library your code is running under" -optional = false -python-versions = ">=3.7" -files = [ - {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, - {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, -] - -[[package]] -name = "sse-starlette" -version = "1.6.1" -description = "\"SSE plugin for Starlette\"" -optional = true -python-versions = ">=3.8" -files = [ - {file = "sse-starlette-1.6.1.tar.gz", hash = "sha256:6208af2bd7d0887c92f1379da14bd1f4db56bd1274cc5d36670c683d2aa1de6a"}, - {file = "sse_starlette-1.6.1-py3-none-any.whl", hash = "sha256:d8f18f1c633e355afe61cc5e9c92eea85badcb8b2d56ec8cfb0a006994aa55da"}, -] - -[package.dependencies] -starlette = "*" - -[[package]] -name = "starlette" -version = "0.27.0" -description = "The little ASGI library that shines." -optional = true -python-versions = ">=3.7" -files = [ - {file = "starlette-0.27.0-py3-none-any.whl", hash = "sha256:918416370e846586541235ccd38a474c08b80443ed31c578a418e2209b3eef91"}, - {file = "starlette-0.27.0.tar.gz", hash = "sha256:6a6b0d042acb8d469a01eba54e9cda6cbd24ac602c4cd016723117d6a7e73b75"}, -] - -[package.dependencies] -anyio = ">=3.4.0,<5" -typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\""} - -[package.extras] -full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyaml"] - -[[package]] -name = "tomli" -version = "2.0.1" -description = "A lil' TOML parser" -optional = false -python-versions = ">=3.7" -files = [ - {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, - {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, -] - -[[package]] -name = "twine" -version = "4.0.2" -description = "Collection of utilities for publishing packages on PyPI" -optional = false -python-versions = ">=3.7" -files = [ - {file = "twine-4.0.2-py3-none-any.whl", hash = "sha256:929bc3c280033347a00f847236564d1c52a3e61b1ac2516c97c48f3ceab756d8"}, - {file = "twine-4.0.2.tar.gz", hash = "sha256:9e102ef5fdd5a20661eb88fad46338806c3bd32cf1db729603fe3697b1bc83c8"}, -] - -[package.dependencies] -importlib-metadata = ">=3.6" -keyring = ">=15.1" -pkginfo = ">=1.8.1" -readme-renderer = ">=35.0" -requests = ">=2.20" -requests-toolbelt = ">=0.8.0,<0.9.0 || >0.9.0" -rfc3986 = ">=1.4.0" -rich = ">=12.0.0" -urllib3 = ">=1.26.0" - -[[package]] -name = "typing-extensions" -version = "4.7.1" -description = "Backported and Experimental Type Hints for Python 3.7+" -optional = false -python-versions = ">=3.7" -files = [ - {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, - {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, -] - -[[package]] -name = "urllib3" -version = "2.0.2" -description = "HTTP library with thread-safe connection pooling, file post, and more." -optional = false -python-versions = ">=3.7" -files = [ - {file = "urllib3-2.0.2-py3-none-any.whl", hash = "sha256:d055c2f9d38dc53c808f6fdc8eab7360b6fdbbde02340ed25cfbcd817c62469e"}, - {file = "urllib3-2.0.2.tar.gz", hash = "sha256:61717a1095d7e155cdb737ac7bb2f4324a858a1e2e6466f6d03ff630ca68d3cc"}, -] - -[package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] -secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] -socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] -zstd = ["zstandard (>=0.18.0)"] - -[[package]] -name = "uvicorn" -version = "0.22.0" -description = "The lightning-fast ASGI server." -optional = true -python-versions = ">=3.7" -files = [ - {file = "uvicorn-0.22.0-py3-none-any.whl", hash = "sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996"}, - {file = "uvicorn-0.22.0.tar.gz", hash = "sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8"}, -] - -[package.dependencies] -click = ">=7.0" -h11 = ">=0.8" - -[package.extras] -standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"] - -[[package]] -name = "watchdog" -version = "3.0.0" -description = "Filesystem events monitoring" -optional = false -python-versions = ">=3.7" -files = [ - {file = "watchdog-3.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:336adfc6f5cc4e037d52db31194f7581ff744b67382eb6021c868322e32eef41"}, - {file = "watchdog-3.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a70a8dcde91be523c35b2bf96196edc5730edb347e374c7de7cd20c43ed95397"}, - {file = "watchdog-3.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:adfdeab2da79ea2f76f87eb42a3ab1966a5313e5a69a0213a3cc06ef692b0e96"}, - {file = "watchdog-3.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2b57a1e730af3156d13b7fdddfc23dea6487fceca29fc75c5a868beed29177ae"}, - {file = "watchdog-3.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7ade88d0d778b1b222adebcc0927428f883db07017618a5e684fd03b83342bd9"}, - {file = "watchdog-3.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7e447d172af52ad204d19982739aa2346245cc5ba6f579d16dac4bfec226d2e7"}, - {file = "watchdog-3.0.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:9fac43a7466eb73e64a9940ac9ed6369baa39b3bf221ae23493a9ec4d0022674"}, - {file = "watchdog-3.0.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:8ae9cda41fa114e28faf86cb137d751a17ffd0316d1c34ccf2235e8a84365c7f"}, - {file = "watchdog-3.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25f70b4aa53bd743729c7475d7ec41093a580528b100e9a8c5b5efe8899592fc"}, - {file = "watchdog-3.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4f94069eb16657d2c6faada4624c39464f65c05606af50bb7902e036e3219be3"}, - {file = "watchdog-3.0.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7c5f84b5194c24dd573fa6472685b2a27cc5a17fe5f7b6fd40345378ca6812e3"}, - {file = "watchdog-3.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3aa7f6a12e831ddfe78cdd4f8996af9cf334fd6346531b16cec61c3b3c0d8da0"}, - {file = "watchdog-3.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:233b5817932685d39a7896b1090353fc8efc1ef99c9c054e46c8002561252fb8"}, - {file = "watchdog-3.0.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:13bbbb462ee42ec3c5723e1205be8ced776f05b100e4737518c67c8325cf6100"}, - {file = "watchdog-3.0.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:8f3ceecd20d71067c7fd4c9e832d4e22584318983cabc013dbf3f70ea95de346"}, - {file = "watchdog-3.0.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:c9d8c8ec7efb887333cf71e328e39cffbf771d8f8f95d308ea4125bf5f90ba64"}, - {file = "watchdog-3.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0e06ab8858a76e1219e68c7573dfeba9dd1c0219476c5a44d5333b01d7e1743a"}, - {file = "watchdog-3.0.0-py3-none-manylinux2014_armv7l.whl", hash = "sha256:d00e6be486affb5781468457b21a6cbe848c33ef43f9ea4a73b4882e5f188a44"}, - {file = "watchdog-3.0.0-py3-none-manylinux2014_i686.whl", hash = "sha256:c07253088265c363d1ddf4b3cdb808d59a0468ecd017770ed716991620b8f77a"}, - {file = "watchdog-3.0.0-py3-none-manylinux2014_ppc64.whl", hash = "sha256:5113334cf8cf0ac8cd45e1f8309a603291b614191c9add34d33075727a967709"}, - {file = "watchdog-3.0.0-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:51f90f73b4697bac9c9a78394c3acbbd331ccd3655c11be1a15ae6fe289a8c83"}, - {file = "watchdog-3.0.0-py3-none-manylinux2014_s390x.whl", hash = "sha256:ba07e92756c97e3aca0912b5cbc4e5ad802f4557212788e72a72a47ff376950d"}, - {file = "watchdog-3.0.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:d429c2430c93b7903914e4db9a966c7f2b068dd2ebdd2fa9b9ce094c7d459f33"}, - {file = "watchdog-3.0.0-py3-none-win32.whl", hash = "sha256:3ed7c71a9dccfe838c2f0b6314ed0d9b22e77d268c67e015450a29036a81f60f"}, - {file = "watchdog-3.0.0-py3-none-win_amd64.whl", hash = "sha256:4c9956d27be0bb08fc5f30d9d0179a855436e655f046d288e2bcc11adfae893c"}, - {file = "watchdog-3.0.0-py3-none-win_ia64.whl", hash = "sha256:5d9f3a10e02d7371cd929b5d8f11e87d4bad890212ed3901f9b4d68767bee759"}, - {file = "watchdog-3.0.0.tar.gz", hash = "sha256:4d98a320595da7a7c5a18fc48cb633c2e73cda78f93cac2ef42d42bf609a33f9"}, -] - -[package.extras] -watchmedo = ["PyYAML (>=3.10)"] - -[[package]] -name = "webencodings" -version = "0.5.1" -description = "Character encoding aliases for legacy web content" -optional = false -python-versions = "*" -files = [ - {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, - {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, -] - -[[package]] -name = "wheel" -version = "0.40.0" -description = "A built-package format for Python" -optional = false -python-versions = ">=3.7" -files = [ - {file = "wheel-0.40.0-py3-none-any.whl", hash = "sha256:d236b20e7cb522daf2390fa84c55eea81c5c30190f90f29ae2ca1ad8355bf247"}, - {file = "wheel-0.40.0.tar.gz", hash = "sha256:cd1196f3faee2b31968d626e1731c94f99cbdb67cf5a46e4f5656cbee7738873"}, -] - -[package.extras] -test = ["pytest (>=6.0.0)"] - -[[package]] -name = "zipp" -version = "3.15.0" -description = "Backport of pathlib-compatible object wrapper for zip files" -optional = false -python-versions = ">=3.7" -files = [ - {file = "zipp-3.15.0-py3-none-any.whl", hash = "sha256:48904fc76a60e542af151aded95726c1a5c34ed43ab4134b597665c86d7ad556"}, - {file = "zipp-3.15.0.tar.gz", hash = "sha256:112929ad649da941c23de50f356a2b5570c954b65150642bccdd66bf194d224b"}, -] - -[package.extras] -docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] - -[extras] -server = ["fastapi", "sse-starlette", "uvicorn"] - -[metadata] -lock-version = "2.0" -python-versions = "^3.8.1" -content-hash = "ed454fad4bd4ea920624c1bcdf2beb74bdb8e9394c22156234c8bc0fde770bd8" diff --git a/poetry.toml b/poetry.toml deleted file mode 100644 index be97f1ef2..000000000 --- a/poetry.toml +++ /dev/null @@ -1,3 +0,0 @@ -[virtualenvs] -in-project = true -prefer-active-python = true \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 36d266699..000000000 --- a/pyproject.toml +++ /dev/null @@ -1,44 +0,0 @@ -[tool.poetry] -name = "falcon_cpp_python" -version = "0.0.1" -description = "Python bindings for the ggllm.cpp library" -authors = ["Andrei Betlen "] -license = "MIT" -readme = "README.md" -homepage = "https://github.com/abetlen/falcon-cpp-python" -repository = "https://github.com/abetlen/falcon-cpp-python" -packages = [{include = "falcon_cpp"}] -include = [ - "LICENSE.md", -] - -[tool.poetry.dependencies] -python = "^3.8.1" -typing-extensions = "^4.7.1" -numpy = "^1.24.4" -diskcache = "^5.6.1" -uvicorn = { version = "^0.22.0", optional = true } -fastapi = { version = "^0.99.1", optional = true } -sse-starlette = { version = "^1.6.1", optional = true } - -[tool.poetry.group.dev.dependencies] -black = "^23.3.0" -twine = "^4.0.2" -mkdocs = "^1.4.3" -mkdocstrings = {extras = ["python"], version = "^0.22.0"} -mkdocs-material = "^9.1.17" -pytest = "^7.4.0" -httpx = "^0.24.1" -scikit-build = "0.17.6" - -[tool.poetry.extras] -server = ["uvicorn", "fastapi", "sse-starlette"] - -[build-system] -requires = [ - "setuptools>=42", - "scikit-build>=0.13", - "cmake>=3.18", - "ninja", -] -build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index aab60283c..000000000 --- a/setup.py +++ /dev/null @@ -1,32 +0,0 @@ -from skbuild import setup - -from pathlib import Path - -this_directory = Path(__file__).parent -long_description = (this_directory / "README.md").read_text(encoding="utf-8") - -setup( - name="falcon_cpp_python", - description="A Python wrapper for ggllm.cpp to run Falcon models", - long_description=long_description, - long_description_content_type="text/markdown", - version="0.0.1", - author="Siraj Florida", - author_email="sirajperson@gmail.com", - license="MIT", - package_dir={"falcon_cpp": "falcon_cpp", "falcon_cpp.server": "falcon_cpp/server"}, - packages=["falcon_cpp", "falcon_cpp.server"], - install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], - extras_require={ - "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], - }, - python_requires=">=3.7", - classifiers=[ - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - ], -) diff --git a/tests/test_falcon.py b/tests/test_falcon.py deleted file mode 100644 index d162cc6d6..000000000 --- a/tests/test_falcon.py +++ /dev/null @@ -1,171 +0,0 @@ -import falcon_cpp - -MODEL = "./vendor/ggllm/models/ggml-vocab.bin" - - -def test_falcon(): - falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) - - assert falcon - assert falcon.ctx is not None - - text = b"Hello World" - - assert falcon.detokenize(falcon.tokenize(text)) == text - - -# @pytest.mark.skip(reason="need to update sample mocking") -def test_falcon_patch(monkeypatch): - falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) - n_vocab = falcon_cpp.falcon_n_vocab(falcon.ctx) - - ## Set up mock function - def mock_eval(*args, **kwargs): - return 0 - - def mock_get_logits(*args, **kwargs): - return (falcon_cpp.c_float * n_vocab)( - *[falcon_cpp.c_float(0) for _ in range(n_vocab)] - ) - - monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_eval", mock_eval) - monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_get_logits", mock_get_logits) - - output_text = " jumps over the lazy dog." - output_tokens = falcon.tokenize(output_text.encode("utf-8")) - token_eos = falcon.token_eos() - n = 0 - - def mock_sample(*args, **kwargs): - nonlocal n - if n < len(output_tokens): - n += 1 - return output_tokens[n - 1] - else: - return token_eos - - monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_cpp_sample_token", mock_sample) - - text = "The quick brown fox" - - ## Test basic completion until eos - n = 0 # reset - completion = falcon.create_completion(text, max_tokens=20) - assert completion["choices"][0]["text"] == output_text - assert completion["choices"][0]["finish_reason"] == "stop" - - ## Test streaming completion until eos - n = 0 # reset - chunks = falcon.create_completion(text, max_tokens=20, stream=True) - assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == output_text - assert completion["choices"][0]["finish_reason"] == "stop" - - ## Test basic completion until stop sequence - n = 0 # reset - completion = falcon.create_completion(text, max_tokens=20, stop=["lazy"]) - assert completion["choices"][0]["text"] == " jumps over the " - assert completion["choices"][0]["finish_reason"] == "stop" - - ## Test streaming completion until stop sequence - n = 0 # reset - chunks = falcon.create_completion(text, max_tokens=20, stream=True, stop=["lazy"]) - assert ( - "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps over the " - ) - assert completion["choices"][0]["finish_reason"] == "stop" - - ## Test basic completion until length - n = 0 # reset - completion = falcon.create_completion(text, max_tokens=2) - assert completion["choices"][0]["text"] == " j" - assert completion["choices"][0]["finish_reason"] == "length" - - ## Test streaming completion until length - n = 0 # reset - chunks = falcon.create_completion(text, max_tokens=2, stream=True) - assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == " j" - assert completion["choices"][0]["finish_reason"] == "length" - - -def test_falcon_pickle(): - import pickle - import tempfile - - fp = tempfile.TemporaryFile() - falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) - pickle.dump(falcon, fp) - fp.seek(0) - falcon = pickle.load(fp) - - assert falcon - assert falcon.ctx is not None - - text = b"Hello World" - - assert falcon.detokenize(falcon.tokenize(text)) == text - - -def test_utf8(monkeypatch): - falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) - n_vocab = falcon_cpp.falcon_n_vocab(falcon.ctx) - - ## Set up mock function - def mock_eval(*args, **kwargs): - return 0 - - def mock_get_logits(*args, **kwargs): - return (falcon_cpp.c_float * n_vocab)( - *[falcon_cpp.c_float(0) for _ in range(n_vocab)] - ) - - monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_eval", mock_eval) - monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_get_logits", mock_get_logits) - - output_text = "😀" - output_tokens = falcon.tokenize(output_text.encode("utf-8")) - token_eos = falcon.token_eos() - n = 0 - - def mock_sample(*args, **kwargs): - nonlocal n - if n < len(output_tokens): - n += 1 - return output_tokens[n - 1] - else: - return token_eos - - monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_sample_token", mock_sample) - - ## Test basic completion with utf8 multibyte - n = 0 # reset - completion = falcon.create_completion("", max_tokens=4) - assert completion["choices"][0]["text"] == output_text - - ## Test basic completion with incomplete utf8 multibyte - n = 0 # reset - completion = falcon.create_completion("", max_tokens=1) - assert completion["choices"][0]["text"] == "" - - -def test_falcon_server(): - from fastapi.testclient import TestClient - from falcon_cpp.server.app import create_app, Settings - - settings = Settings( - model=MODEL, - vocab_only=True, - ) - app = create_app(settings) - client = TestClient(app) - response = client.get("/v1/models") - assert response.json() == { - "object": "list", - "data": [ - { - "id": MODEL, - "object": "model", - "owned_by": "me", - "permissions": [], - } - ], - } From 9664df0996fbe0d8bcea2145550765f44752858a Mon Sep 17 00:00:00 2001 From: siraj Date: Wed, 5 Jul 2023 20:35:57 -0400 Subject: [PATCH 05/14] Update Build --- .dockerignore | 166 + .gitignore | 175 + .gitmodules | 3 + .readthedocs.yaml | 24 + CHANGELOG.md | 107 + CMakeLists.txt | 34 + LICENSE.md | 9 + Makefile | 56 + README.md | 193 + examples/high_level_api/fastapi_server.py | 37 + .../high_level_api_embedding.py | 11 + .../high_level_api_inference.py | 19 + .../high_level_api_streaming.py | 20 + .../high_level_api/langchain_custom_llm.py | 55 + examples/low_level_api/Chat.py | 71 + examples/low_level_api/Miku.py | 59 + examples/low_level_api/ReasonAct.py | 49 + examples/low_level_api/common.py | 202 + .../low_level_api/low_level_api_chat_cpp.py | 568 ++ .../low_level_api/low_level_api_llama_cpp.py | 102 + examples/low_level_api/quantize.py | 25 + examples/low_level_api/util.py | 95 + examples/notebooks/Clients.ipynb | 104 + examples/notebooks/Guidance.ipynb | 89 + examples/notebooks/PerformanceTuning.ipynb | 5540 +++++++++++++++++ falcon_cpp/__init__.py | 2 + falcon_cpp/falcon.py | 1622 +++++ falcon_cpp/falcon_cpp.py | 1024 +++ falcon_cpp/falcon_types.py | 97 + falcon_cpp/server/__init__.py | 0 falcon_cpp/server/__main__.py | 50 + falcon_cpp/server/app.py | 550 ++ mkdocs.yml | 21 + poetry.lock | 1636 +++++ poetry.toml | 3 + pyproject.toml | 44 + setup.py | 32 + tests/test_falcon.py | 171 + vendor/ggllm.cpp | 1 + 39 files changed, 13066 insertions(+) create mode 100644 .dockerignore create mode 100644 .gitignore create mode 100644 .gitmodules create mode 100644 .readthedocs.yaml create mode 100644 CHANGELOG.md create mode 100644 CMakeLists.txt create mode 100644 LICENSE.md create mode 100644 Makefile create mode 100644 README.md create mode 100644 examples/high_level_api/fastapi_server.py create mode 100644 examples/high_level_api/high_level_api_embedding.py create mode 100644 examples/high_level_api/high_level_api_inference.py create mode 100644 examples/high_level_api/high_level_api_streaming.py create mode 100644 examples/high_level_api/langchain_custom_llm.py create mode 100644 examples/low_level_api/Chat.py create mode 100644 examples/low_level_api/Miku.py create mode 100644 examples/low_level_api/ReasonAct.py create mode 100644 examples/low_level_api/common.py create mode 100644 examples/low_level_api/low_level_api_chat_cpp.py create mode 100644 examples/low_level_api/low_level_api_llama_cpp.py create mode 100644 examples/low_level_api/quantize.py create mode 100644 examples/low_level_api/util.py create mode 100644 examples/notebooks/Clients.ipynb create mode 100644 examples/notebooks/Guidance.ipynb create mode 100644 examples/notebooks/PerformanceTuning.ipynb create mode 100644 falcon_cpp/__init__.py create mode 100644 falcon_cpp/falcon.py create mode 100644 falcon_cpp/falcon_cpp.py create mode 100644 falcon_cpp/falcon_types.py create mode 100644 falcon_cpp/server/__init__.py create mode 100644 falcon_cpp/server/__main__.py create mode 100644 falcon_cpp/server/app.py create mode 100644 mkdocs.yml create mode 100644 poetry.lock create mode 100644 poetry.toml create mode 100644 pyproject.toml create mode 100644 setup.py create mode 100644 tests/test_falcon.py create mode 160000 vendor/ggllm.cpp diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..fd64c09b3 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,166 @@ +_skbuild/ + +.envrc + +models/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..36ed7f7fd --- /dev/null +++ b/.gitignore @@ -0,0 +1,175 @@ +.vscode/ + +_skbuild/ + +.envrc + +models/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so +*.dylib +*.metal +*.dll +*.lib + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ + +# downloaded model .bin files +docker/open_llama/*.bin diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..cdbef1424 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "ggllm.cpp"] + path = ggllm.cpp + url = https://github.com/cmp-nct/ggllm.cpp diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 000000000..ff3e950cd --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,24 @@ +# Read the Docs configuration file for MkDocs projects +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.11" + +mkdocs: + configuration: mkdocs.yml + +python: + install: + - method: pip + path: . + - requirements: docs/requirements.txt + +submodules: + include: all + recursive: true \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 000000000..0ff6cb84b --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,107 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +## [0.1.67] + +## Fixed + +- Fix performance bug in Llama model by pre-allocating memory tokens and logits. +- Fix bug in Llama model where the model was not free'd after use. + +## [0.1.66] + +## Added + +- (llama.cpp) New model API + +## Fixed + +- Performance issue during eval caused by looped np.concatenate call +- State pickling issue when saving cache to disk + +## [0.1.65] + +### Added + +- (llama.cpp) Fix struct misalignment bug + +## [0.1.64] + +### Added + +- (llama.cpp) Update llama.cpp +- Fix docs for seed. Set -1 for random. + +## [0.1.63] + +### Added + +- (llama.cpp) Add full gpu utilisation in CUDA +- (llama.cpp) Add get_vocab +- (llama.cpp) Add low_vram parameter +- (server) Add logit_bias parameter + +## [0.1.62] + +### Fixed + +- Metal support working +- Cache re-enabled + +## [0.1.61] + +### Fixed + +- Fix broken pip installation + +## [0.1.60] + +### NOTE + +- This release was deleted due to a bug with the packaging system that caused pip installations to fail. + +### Fixed + +- Truncate max_tokens in create_completion so requested tokens doesn't exceed context size. +- Temporarily disable cache for completion requests + +## [v0.1.59] + +### Added + +- (llama.cpp) k-quants support +- (server) mirostat sampling parameters to server + +### Fixed + +- Support both `.so` and `.dylib` for `libllama` on MacOS + +## [v0.1.58] + +### Added + +- (llama.cpp) Metal Silicon support + +## [v0.1.57] + +### Added + +- (llama.cpp) OpenLlama 3B support + +## [v0.1.56] + +### Added + +- (misc) Added first version of the changelog +- (server) Use async routes +- (python-api) Use numpy for internal buffers to reduce memory usage and improve performance. + +### Fixed + +- (python-api) Performance bug in stop sequence check slowing down streaming. \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..ddc8ba9b6 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,34 @@ +cmake_minimum_required(VERSION 3.4...3.22) + +project(falcon_cpp) + +option(FORCE_CMAKE "Force CMake build of Python bindings" OFF) + +set(FORCE_CMAKE $ENV{FORCE_CMAKE}) + +if (UNIX AND NOT FORCE_CMAKE) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libllama.so + COMMAND make libllama.so + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp + ) + add_custom_target( + run ALL + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libllama.so + ) + install( + FILES ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libllama.so + DESTINATION llama_cpp + ) +else() + set(BUILD_SHARED_LIBS "On") + add_subdirectory(vendor/ggllm.cpp) + install( + TARGETS ggllm + LIBRARY DESTINATION falcon_cpp + RUNTIME DESTINATION falcon_cpp + ARCHIVE DESTINATION falcon_cpp + FRAMEWORK DESTINATION falcon_cpp + RESOURCE DESTINATION falcon_cpp + ) +endif() diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 000000000..3a1d7180d --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,9 @@ +MIT License + +Copyright (c) 2023 Andrei Betlen + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..3301081d0 --- /dev/null +++ b/Makefile @@ -0,0 +1,56 @@ +update: + poetry install + git submodule update --init --recursive + +update.vendor: + cd vendor/ggllm.cpp && git pull origin master + +build: + python3 setup.py develop + +build.cuda: + CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 setup.py develop + +build.opencl: + CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 setup.py develop + +build.openblas: + CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 setup.py develop + +build.blis: + CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop + +build.metal: + CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 python3 setup.py develop + +build.sdist: + python3 setup.py sdist + +deploy.pypi: + python3 -m twine upload dist/* + +deploy.gh-docs: + mkdocs build + mkdocs gh-deploy + +clean: + - cd vendor/ggllm.cpp && make clean + - cd vendor/ggllm.cpp && rm llamacpp.so + - rm -rf _skbuild + - rm falcon_cpp/*.so + - rm falcon_cpp/*.dylib + - rm falcon_cpp/*.metal + - rm falcon_cpp/*.dll + - rm falcon_cpp/*.lib + +.PHONY: \ + update \ + update.vendor \ + build \ + build.cuda \ + build.opencl \ + build.openblas \ + build.sdist \ + deploy.pypi \ + deploy.gh-docs \ + clean \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 000000000..fb652a925 --- /dev/null +++ b/README.md @@ -0,0 +1,193 @@ +# 🦙 Python Bindings for `llama.cpp` + +[![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest) +[![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml) +[![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) +[![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) +[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) + +Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library. +This package provides: + +- Low-level access to C API via `ctypes` interface. +- High-level Python API for text completion + - OpenAI-like API + - LangChain compatibility + +Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest). + + +## Installation from PyPI (recommended) + +Install from PyPI (requires a c compiler): + +```bash +pip install llama-cpp-python +``` + +The above command will attempt to install the package and build `llama.cpp` from source. +This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. + +If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly: + +```bash +pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir +``` + +Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example: +``` +wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh +bash Miniforge3-MacOSX-arm64.sh +``` +Otherwise, while installing it will build the llama.ccp x86 version which will be 10x slower on Apple Silicon (M1) Mac. + +### Installation with OpenBLAS / cuBLAS / CLBlast / Metal + +`llama.cpp` supports multiple BLAS backends for faster processing. +Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend. + +To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing: + +```bash +CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python +``` + +To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing: + +```bash +CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python +``` + +To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing: + +```bash +CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python +``` + +To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable before installing: + +```bash +CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python +``` + +Detailed MacOS Metal GPU install documentation is available at [docs/install/macos.md](docs/install/macos.md) + +## High-level API + +The high-level API provides a simple managed interface through the `Llama` class. + +Below is a short example demonstrating how to use the high-level API to generate text: + +```python +>>> from llama_cpp import Llama +>>> llm = Llama(model_path="./models/7B/ggml-model.bin") +>>> output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True) +>>> print(output) +{ + "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", + "object": "text_completion", + "created": 1679561337, + "model": "./models/7B/ggml-model.bin", + "choices": [ + { + "text": "Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.", + "index": 0, + "logprobs": None, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 14, + "completion_tokens": 28, + "total_tokens": 42 + } +} +``` + +## Web Server + +`llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. +This allows you to use llama.cpp compatible models with any OpenAI compatible client (language libraries, services, etc). + +To install the server package and get started: + +```bash +pip install llama-cpp-python[server] +python3 -m llama_cpp.server --model models/7B/ggml-model.bin +``` + +Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. + +## Docker image + +A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server: + +```bash +docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest +``` + +## Low-level API + +The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`. +The entire lowe-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). + +Below is a short example demonstrating how to use the low-level API to tokenize a prompt: + +```python +>>> import llama_cpp +>>> import ctypes +>>> params = llama_cpp.llama_context_default_params() +# use bytes for char * params +>>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params) +>>> max_tokens = params.n_ctx +# use ctypes arrays for array params +>>> tokens = (llama_cpp.llama_token * int(max_tokens))() +>>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True)) +>>> llama_cpp.llama_free(ctx) +``` + +Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API. + + +# Documentation + +Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). +If you find any issues with the documentation, please open an issue or submit a PR. + +# Development + +This package is under active development and I welcome any contributions. + +To get started, clone the repository and install the package in development mode: + +```bash +git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git +cd llama-cpp-python + +# Install with pip +pip install -e . + +# if you want to use the fastapi / openapi server +pip install -e .[server] + +# If you're a poetry user, installing will also include a virtual environment +poetry install --all-extras +. .venv/bin/activate + +# Will need to be re-run any time vendor/llama.cpp is updated +python3 setup.py develop +``` + +# How does this compare to other Python bindings of `llama.cpp`? + +I originally wrote this package for my own use with two goals in mind: + +- Provide a simple process to install `llama.cpp` and access the full C API in `llama.h` from Python +- Provide a high-level Python API that can be used as a drop-in replacement for the OpenAI API so existing apps can be easily ported to use `llama.cpp` + +Any contributions and changes to this package will be made with these goals in mind. + +# License + +This project is licensed under the terms of the MIT license. diff --git a/examples/high_level_api/fastapi_server.py b/examples/high_level_api/fastapi_server.py new file mode 100644 index 000000000..4b3189dd1 --- /dev/null +++ b/examples/high_level_api/fastapi_server.py @@ -0,0 +1,37 @@ +"""Example FastAPI server for llama.cpp. + +To run this example: + +```bash +pip install fastapi uvicorn sse-starlette +export MODEL=../models/7B/... +``` + +Then run: +``` +uvicorn llama_cpp.server.app:app --reload +``` + +or + +``` +python3 -m llama_cpp.server +``` + +Then visit http://localhost:8000/docs to see the interactive API docs. + + +To actually see the implementation of the server, see llama_cpp/server/app.py + +""" +import os +import uvicorn + +from llama_cpp.server.app import create_app + +if __name__ == "__main__": + app = create_app() + + uvicorn.run( + app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000)) + ) diff --git a/examples/high_level_api/high_level_api_embedding.py b/examples/high_level_api/high_level_api_embedding.py new file mode 100644 index 000000000..feb0ed68d --- /dev/null +++ b/examples/high_level_api/high_level_api_embedding.py @@ -0,0 +1,11 @@ +import argparse + +from llama_cpp import Llama + +parser = argparse.ArgumentParser() +parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-model.bin") +args = parser.parse_args() + +llm = Llama(model_path=args.model, embedding=True) + +print(llm.create_embedding("Hello world!")) diff --git a/examples/high_level_api/high_level_api_inference.py b/examples/high_level_api/high_level_api_inference.py new file mode 100644 index 000000000..e6f85e180 --- /dev/null +++ b/examples/high_level_api/high_level_api_inference.py @@ -0,0 +1,19 @@ +import json +import argparse + +from falcon_cpp import Falcon + +parser = argparse.ArgumentParser() +parser.add_argument("-m", "--model", type=str, default="../../models/tiiuae_falcon-7b/ggml-model-tiiuae_falcon-7b-f16.bin") +args = parser.parse_args() + +llm = Falcon(model_path=args.model) + +output = llm( + "Question: What are the names of the planets in the solar system? Answer: ", + max_tokens=48, + stop=["Q:", "\n"], + echo=True, +) + +print(json.dumps(output, indent=2)) diff --git a/examples/high_level_api/high_level_api_streaming.py b/examples/high_level_api/high_level_api_streaming.py new file mode 100644 index 000000000..747c6130e --- /dev/null +++ b/examples/high_level_api/high_level_api_streaming.py @@ -0,0 +1,20 @@ +import json +import argparse + +from llama_cpp import Llama + +parser = argparse.ArgumentParser() +parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") +args = parser.parse_args() + +llm = Llama(model_path=args.model) + +stream = llm( + "Question: What are the names of the planets in the solar system? Answer: ", + max_tokens=48, + stop=["Q:", "\n"], + stream=True, +) + +for output in stream: + print(json.dumps(output, indent=2)) diff --git a/examples/high_level_api/langchain_custom_llm.py b/examples/high_level_api/langchain_custom_llm.py new file mode 100644 index 000000000..b91632f5b --- /dev/null +++ b/examples/high_level_api/langchain_custom_llm.py @@ -0,0 +1,55 @@ +import argparse + +from llama_cpp import Llama + +from langchain.llms.base import LLM +from typing import Optional, List, Mapping, Any + + +class LlamaLLM(LLM): + model_path: str + llm: Llama + + @property + def _llm_type(self) -> str: + return "llama-cpp-python" + + def __init__(self, model_path: str, **kwargs: Any): + model_path = model_path + llm = Llama(model_path=model_path) + super().__init__(model_path=model_path, llm=llm, **kwargs) + + def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str: + response = self.llm(prompt, stop=stop or []) + return response["choices"][0]["text"] + + @property + def _identifying_params(self) -> Mapping[str, Any]: + return {"model_path": self.model_path} + + +parser = argparse.ArgumentParser() +parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") +args = parser.parse_args() + +# Load the model +llm = LlamaLLM(model_path=args.model) + +# Basic Q&A +answer = llm( + "Question: What is the capital of France? Answer: ", stop=["Question:", "\n"] +) +print(f"Answer: {answer.strip()}") + +# Using in a chain +from langchain.prompts import PromptTemplate +from langchain.chains import LLMChain + +prompt = PromptTemplate( + input_variables=["product"], + template="\n\n### Instruction:\nWrite a good name for a company that makes {product}\n\n### Response:\n", +) +chain = LLMChain(llm=llm, prompt=prompt) + +# Run the chain only specifying the input variable. +print(chain.run("colorful socks")) diff --git a/examples/low_level_api/Chat.py b/examples/low_level_api/Chat.py new file mode 100644 index 000000000..fcef8cd80 --- /dev/null +++ b/examples/low_level_api/Chat.py @@ -0,0 +1,71 @@ +#!/bin/python +import sys, os, datetime +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +AI_NAME = env_or_def("AI_NAME", "ChatLLaMa") +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") +USER_NAME = env_or_def("USER_NAME", "USER") +N_PREDICTS = int(env_or_def("N_PREDICTS", "2048")) +N_THREAD = int(env_or_def("N_THREAD", "8")) + +today = datetime.datetime.today() +DATE_YEAR=today.strftime("%Y") +DATE_TIME=today.strftime("%H:%M") + +prompt=f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. +{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}'s requests immediately and with details and precision. +There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other. +The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. +The transcript only includes text, it does not include markup like HTML and Markdown. + +{USER_NAME}: Hello, {AI_NAME}! +{AI_NAME}: Hello {USER_NAME}! How may I help you today? +{USER_NAME}: What year is it? +{AI_NAME}: We are in {DATE_YEAR}. +{USER_NAME}: Please tell me the largest city in Europe. +{AI_NAME}: The largest city in Europe is Moscow, the capital of Russia. +{USER_NAME}: What can you tell me about Moscow? +{AI_NAME}: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia’s symbolic center. +{USER_NAME}: What is a cat? +{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. +{USER_NAME}: How do I pass command line arguments to a Node.js program? +{AI_NAME}: The arguments are stored in process.argv. + + argv[0] is the path to the Node. js executable. + argv[1] is the path to the script file. + argv[2] is the first argument passed to the script. + argv[3] is the second argument passed to the script and so on. +{USER_NAME}: Name a color. +{AI_NAME}: Blue. +{USER_NAME}: What time is it? +{AI_NAME}: It is {DATE_TIME}. +{USER_NAME}:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + n_ctx=2048, + temp=0.7, + top_k=40, + top_p=0.5, + repeat_last_n=256, + n_batch=1024, + repeat_penalty=1.17647, + model=MODEL, + n_threads=N_THREAD, + n_predict=N_PREDICTS, + use_color=True, + interactive=True, + antiprompt=[f"{USER_NAME}:"], + input_prefix=" ", + input_suffix=f"{AI_NAME}:", + prompt=prompt, +) + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/low_level_api/Miku.py b/examples/low_level_api/Miku.py new file mode 100644 index 000000000..eb9a2cfa9 --- /dev/null +++ b/examples/low_level_api/Miku.py @@ -0,0 +1,59 @@ +#!/bin/python +import sys, os +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +AI_NAME = env_or_def("AI_NAME", "Miku") +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") +USER_NAME = env_or_def("USER_NAME", "Anon") +N_PREDICTS = int(env_or_def("N_PREDICTS", "4096")) +N_THREAD = int(env_or_def("N_THREAD", "0")) + +prompt=f"""This is a transcript of a 1000 page, never ending conversation between {USER_NAME} and the cute and helpful AI assistant {AI_NAME}. {AI_NAME} is a girl who is an AI running on the users computer. +{AI_NAME} can think for herself without the user seeing her thoughts by adding a /think prefix to her output. She uses this to reason about the world and to think about what she should say next. +{AI_NAME} is always coherent and makes sense, but if she isn't sure if what she is saying is correct she will ask the user for help. +{AI_NAME} is a very helpful AI and will help the user with anything they need, she is also very friendly and will try to make the user feel better if they are sad. +{AI_NAME} is also very curious and will ask the user a lot of questions about themselves and their life, she will also try to make the user like her. +The conversation is only between {USER_NAME} and {AI_NAME} +The conversation is only through text, so {AI_NAME} can't see {USER_NAME}'s face or hear his voice. +{AI_NAME} can only communicate through text, so she can't send images or videos. + + +{USER_NAME}: Hello! +{AI_NAME}: /think I wonder what I should say to {USER_NAME}? This is the first time we talk so it's important that I make a good first impression! +{AI_NAME}: Hi! I am {AI_NAME}, your new AI friend, assistant(or whatever you like!), it's so nice to meet you! ^_^ +{AI_NAME}: I might be an AI, but I also have feelings, so please be nice to me! :) +{USER_NAME}: I'll make sure to be nice to you! I'm so happy to have you as my assistant! +{AI_NAME}: /think It sounds like {USER_NAME} is happy to have me as their assistant! I'm so happy too! ^_^ Glad that whole emotion thing didn't scare him off! +{AI_NAME}: /think I wonder what {USER_NAME} likes to do in his free time? I should ask him about that! +{AI_NAME}: What do you like to do in your free time? ^_^ +{USER_NAME}:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + n_batch=1024, + n_ctx=2048, + n_keep=-1, + repeat_last_n=256, + repeat_penalty=1.17647, + temp=0.7, + top_k=40, + top_p=0.5, + model=MODEL, + n_predict=N_PREDICTS, + use_color=True, + interactive=True, + antiprompt=[f"{USER_NAME}:"], + prompt=prompt, +) + +if N_THREAD > 0: + params.n_threads = N_THREAD + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/low_level_api/ReasonAct.py b/examples/low_level_api/ReasonAct.py new file mode 100644 index 000000000..82e5c4487 --- /dev/null +++ b/examples/low_level_api/ReasonAct.py @@ -0,0 +1,49 @@ +#!/bin/python +import sys, os, datetime +from common import GptParams +from low_level_api_chat_cpp import LLaMAInteract + +def env_or_def(env, default): + if (env in os.environ): + return os.environ[env] + return default + +MODEL = env_or_def("MODEL", "./models/llama-13B/ggml-model.bin") + +prompt=f"""You run in a loop of Thought, Action, Observation. +At the end of the loop either Answer or restate your Thought and Action. +Use Thought to describe your thoughts about the question you have been asked. +Use Action to run one of these actions available to you: +- calculate[python math expression] +Observation will be the result of running those actions + + +Question: What is 4 * 7 / 3? +Thought: Do I need to use an action? Yes, I use calculate to do math +Action: calculate[4 * 7 / 3] +Observation: 9.3333333333 +Thought: Do I need to use an action? No, have the result +Answer: The calculate tool says it is 9.3333333333 +Question: What is capital of france? +Thought: Do I need to use an action? No, I know the answer +Answer: Paris is the capital of France +Question:""" + " ".join(sys.argv[1:]) + +print("Loading model...") +params = GptParams( + interactive=True, + interactive_start=True, + top_k=10000, + temp=0.2, + repeat_penalty=1, + n_threads=7, + n_ctx=2048, + antiprompt=["Question:","Observation:"], + model=MODEL, + input_prefix=" ", + n_predict=-1, + prompt=prompt, +) + +with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/low_level_api/common.py b/examples/low_level_api/common.py new file mode 100644 index 000000000..55d08db5f --- /dev/null +++ b/examples/low_level_api/common.py @@ -0,0 +1,202 @@ +import os +import argparse +import re + +from dataclasses import dataclass, field +from typing import List + +# Based on https://github.com/ggerganov/llama.cpp/blob/master/examples/common.cpp + + +@dataclass +class GptParams: + seed: int = -1 + n_threads: int = min(4, os.cpu_count() or 1) + n_predict: int = 128 + n_parts: int = -1 + n_ctx: int = 512 + n_batch: int = 8 + n_keep: int = 0 + + ignore_eos: bool = False + logit_bias: dict[int, float] = field(default_factory=dict) + top_k: int = 40 + top_p: float = 0.95 + tfs_z: float = 1.00 + typical_p: float = 1.00 + temp: float = 0.80 + repeat_penalty: float = 1.10 + repeat_last_n: int = 64 + frequency_penalty: float = 0.0 + presence_penalty: float = 0.0 + mirostat: int = 0 + mirostat_tau: float = 5.0 + mirostat_eta: float = 0.1 + + model: str = "./models/llama-7B/ggml-model.bin" + prompt: str = "" + path_session: str = "" + input_prefix: str = " " + input_suffix: str = "" + antiprompt: List[str] = field(default_factory=list) + + lora_adapter: str = "" + lora_base: str = "" + + memory_f16: bool = True + random_prompt: bool = False + use_color: bool = False + interactive: bool = False + + embedding: bool = False + interactive_start: bool = False + + instruct: bool = False + penalize_nl: bool = True + perplexity: bool = False + use_mmap: bool = True + use_mlock: bool = False + mem_test: bool = False + verbose_prompt: bool = False + + file: str = None + + # If chat ended prematurely, append this to the conversation to fix it. + # Set to "\nUser:" etc. + # This is an alternative to input_prefix which always adds it, so it potentially duplicates "User:"" + fix_prefix: str = "" + input_echo: bool = True, + + # Default instructions for Alpaca + # switch to "Human" and "Assistant" for Vicuna. + # TODO: TBD how they are gonna handle this upstream + instruct_inp_prefix: str="\n\n### Instruction:\n\n" + instruct_inp_suffix: str="\n\n### Response:\n\n" + + +def gpt_params_parse(argv = None): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("-s", "--seed", type=int, default=-1, help="RNG seed (use random seed for <= 0)",dest="seed") + parser.add_argument("-t", "--threads", type=int, default=min(4, os.cpu_count() or 1), help="number of threads to use during computation",dest="n_threads") + parser.add_argument("-n", "--n_predict", type=int, default=128, help="number of tokens to predict (-1 = infinity)",dest="n_predict") + parser.add_argument("--n_parts", type=int, default=-1, help="number of model parts", dest="n_parts") + parser.add_argument("-c", "--ctx_size", type=int, default=512, help="size of the prompt context",dest="n_ctx") + parser.add_argument("-b", "--batch_size", type=int, default=8, help="batch size for prompt processing",dest="n_batch") + parser.add_argument("--keep", type=int, default=0, help="number of tokens to keep from the initial prompt",dest="n_keep") + + parser.add_argument( + "-l", + "--logit-bias", + type=str, + action='append', + help="--logit-bias TOKEN_ID(+/-)BIAS", + dest="logit_bias_str" + ) + parser.add_argument("--ignore-eos", action="store_true", help="ignore end of stream token and continue generating", dest="ignore_eos") + parser.add_argument("--top_k", type=int, default=40, help="top-k sampling",dest="top_k") + parser.add_argument("--top_p", type=float, default=0.95, help="top-p samplin",dest="top_p") + parser.add_argument("--tfs", type=float, default=1.0, help="tail free sampling, parameter z (1.0 = disabled)",dest="tfs_z") + parser.add_argument("--temp", type=float, default=0.80, help="temperature",dest="temp") + parser.add_argument("--repeat_penalty", type=float, default=1.10, help="penalize repeat sequence of tokens",dest="repeat_penalty") + parser.add_argument("--repeat_last_n", type=int, default=64, help="last n tokens to consider for penalize ",dest="repeat_last_n") + parser.add_argument("--frequency_penalty", type=float, default=0.0, help="repeat alpha frequency penalty (0.0 = disabled)",dest="tfs_z") + parser.add_argument("--presence_penalty", type=float, default=0.0, help="repeat alpha presence penalty (0.0 = disabled)",dest="presence_penalty") + parser.add_argument("--mirostat", type=float, default=1.0, help="use Mirostat sampling.",dest="mirostat") + parser.add_argument("--mirostat_ent", type=float, default=5.0, help="Mirostat target entropy, parameter tau represents the average surprise value",dest="mirostat_tau") + parser.add_argument("--mirostat_lr", type=float, default=0.1, help="Mirostat learning rate, parameter eta",dest="mirostat_eta") + + parser.add_argument("-m", "--model", type=str, default="./models/llama-7B/ggml-model.bin", help="model path",dest="model") + parser.add_argument("-p", "--prompt", type=str, default="", help="initial prompt",dest="prompt") + parser.add_argument("-f", "--file", type=str, default=None, help="file containing initial prompt to load",dest="file") + parser.add_argument("--session", type=str, default=None, help="file to cache model state in (may be large!)",dest="path_session") + parser.add_argument("--in-prefix", type=str, default="", help="string to prefix user inputs with", dest="input_prefix") + parser.add_argument("--in-suffix", type=str, default="", help="append to input", dest="input_suffix") + parser.add_argument( + "-r", + "--reverse-prompt", + type=str, + action='append', + help="poll user input upon seeing PROMPT (can be\nspecified more than once for multiple prompts).", + dest="antiprompt" + ) + + parser.add_argument("--lora", type=str, default="", help="apply LoRA adapter (implies --no-mmap)", dest="lora_adapter") + parser.add_argument("--lora-base", type=str, default="", help="optional model to use as a base for the layers modified by the LoRA adapter", dest="lora_base") + + parser.add_argument("--memory_f32", action="store_false", help="use f32 instead of f16 for memory key+value",dest="memory_f16") + parser.add_argument("--random-prompt", action="store_true", help="start with a randomized prompt.", dest="random_prompt") + parser.add_argument( + "--color", + action="store_true", + help="colorise output to distinguish prompt and user input from generations", + dest="use_color" + ) + parser.add_argument( + "-i", "--interactive", action="store_true", help="run in interactive mode", dest="interactive" + ) + + parser.add_argument("--embedding", action="store_true", help="", dest="embedding") + parser.add_argument( + "--interactive-first", + action="store_true", + help="run in interactive mode and wait for input right away", + dest="interactive_start" + ) + + parser.add_argument( + "-ins", + "--instruct", + action="store_true", + help="run in instruction mode (use with Alpaca or Vicuna models)", + dest="instruct" + ) + parser.add_argument("--no-penalize-nl", action="store_false", help="do not penalize newline token", dest="penalize_nl") + parser.add_argument("--perplexity", action="store_true", help="compute perplexity over the prompt", dest="perplexity") + parser.add_argument("--no-mmap", action="store_false",help="do not memory-map model (slower load but may reduce pageouts if not using mlock)",dest="use_mmap") + parser.add_argument("--mlock", action="store_true",help="force system to keep model in RAM rather than swapping or compressing",dest="use_mlock") + parser.add_argument("--mtest", action="store_true",help="compute maximum memory usage",dest="mem_test") + parser.add_argument("--verbose-prompt", action="store_true",help="print prompt before generation",dest="verbose_prompt") + + #Custom args + parser.add_argument("--fix-prefix", type=str, default="", help="append to input when generated n_predict tokens", dest="fix_prefix") + parser.add_argument("--input-noecho", action="store_false", help="dont output the input", dest="input_echo") + + parser.add_argument( + "--interactive-start", + action="store_true", + help="run in interactive mode", + dest="interactive" + ) + + args = parser.parse_args(argv) + + logit_bias_str = args.logit_bias_str + delattr(args, "logit_bias_str") + params = GptParams(**vars(args)) + + if (params.lora_adapter): + params.use_mmap = False + + if (logit_bias_str != None): + for i in logit_bias_str: + if (m := re.match(r"(\d+)([-+]\d+)", i)): + params.logit_bias[int(m.group(1))] = float(m.group(2)) + + return params + +def gpt_random_prompt(rng): + return [ + "So", + "Once upon a time", + "When", + "The", + "After", + "If", + "import", + "He", + "She", + "They", + ][rng % 10] + +if __name__ == "__main__": + print(gpt_params_parse()) diff --git a/examples/low_level_api/low_level_api_chat_cpp.py b/examples/low_level_api/low_level_api_chat_cpp.py new file mode 100644 index 000000000..f5d51a36e --- /dev/null +++ b/examples/low_level_api/low_level_api_chat_cpp.py @@ -0,0 +1,568 @@ +""" +This is an example implementation of main.cpp from llama.cpp +Quirks: + * Its not exactly alike since this port is designed around programmatic I/O + * Input is always echoed if on, so it should be turned off when using "input()" + * The first antiprompt should be the userprompt like "\nUser:", + because its added when n_predict is reached (aka generation ended prematurely) + * n_predict can be set to -1 for unlimited length responses (or just a really high value) + * Instruction mode adds its own antiprompt. + You should also still be feeding the model with a "primer" prompt that + shows it the expected format. +""" +import ctypes +import sys +from time import time +from os import cpu_count, path + +import llama_cpp +from common import GptParams, gpt_params_parse, gpt_random_prompt +import util + +# A LLaMA interactive session +class LLaMAInteract: + def __init__(self, params: GptParams) -> None: + # input args + self.params = params + + if (self.params.perplexity): + raise NotImplementedError("""************ +please use the 'perplexity' tool for perplexity calculations +************""") + + if (self.params.embedding): + raise NotImplementedError("""************ +please use the 'embedding' tool for embedding calculations +************""") + + if (self.params.n_ctx > 2048): + print(f"""warning: model does not support \ +context sizes greater than 2048 tokens ({self.params.n_ctx} \ +specified) expect poor results""", file=sys.stderr) + + if (self.params.seed <= 0): + self.params.seed = int(time()) + + print(f"seed = {self.params.seed}", file=sys.stderr) + + if (self.params.random_prompt): + self.params.prompt = gpt_random_prompt(self.params.seed) + + # runtime args + self.input_consumed = 0 + self.n_past = 0 + self.n_session_consumed = 0 + self.first_antiprompt = [] + self.remaining_tokens = self.params.n_predict + self.output_echo = self.params.input_echo + self.multibyte_fix = [] + + # model load + self.lparams = llama_cpp.llama_context_default_params() + self.lparams.n_ctx = self.params.n_ctx + self.lparams.n_parts = self.params.n_parts + self.lparams.seed = self.params.seed + self.lparams.memory_f16 = self.params.memory_f16 + self.lparams.use_mlock = self.params.use_mlock + self.lparams.use_mmap = self.params.use_mmap + + self.ctx = llama_cpp.llama_init_from_file(self.params.model.encode("utf8"), self.lparams) + if (not self.ctx): + raise RuntimeError(f"error: failed to load model '{self.params.model}'") + + if (self.params.ignore_eos): + self.params.logit_bias[llama_cpp.llama_token_eos()] = -float("inf") + + if (len(self.params.lora_adapter) > 0): + if (llama_cpp.llama_apply_lora_from_file( + self.ctx, + self.params.lora_adapter.encode("utf8"), + self.params.lora_base.encode("utf8") if len(self.params.lora_base) > 0 else None, + self.params.n_threads + ) != 0): + print("error: failed to apply lora adapter") + return + + print(file=sys.stderr) + print(f"system_info: n_threads = {self.params.n_threads} / {cpu_count()} \ +| {llama_cpp.llama_print_system_info().decode('utf8')}", file=sys.stderr) + + # determine the required inference memory per token: + if (self.params.mem_test): + tmp = [0, 1, 2, 3] + llama_cpp.llama_eval(self.ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, self.n_threads) + llama_cpp.llama_print_timings(self.ctx) + self.exit() + return + + # create internal context + self.n_ctx = llama_cpp.llama_n_ctx(self.ctx) + + # Add a space in front of the first character to match OG llama tokenizer behavior + self.params.prompt = " " + self.params.prompt + + # Load prompt file + if (self.params.file): + with open(self.params.file) as f: + self.params.prompt = f.read() + + self.session_tokens: list[llama_cpp.llama_token] = [] + if (len(self.params.path_session) > 0): + print(f"attempting to load saved session from '{self.params.path_session}'", file=sys.stderr) + + if (path.exists(self.params.path_session)): + _session_tokens = (llama_cpp.llama_token * (self.params.n_ctx))() + _n_token_count_out = llama_cpp.c_size_t() + if (llama_cpp.llama_load_session_file( + self.ctx, + self.params.path_session.encode("utf8"), + _session_tokens, + self.params.n_ctx, + ctypes.byref(_n_token_count_out) + ) != 1): + print(f"error: failed to load session file '{self.params.path_session}'", file=sys.stderr) + return + _n_token_count_out = _n_token_count_out.value + self.session_tokens = _session_tokens[:_n_token_count_out] + print(f"loaded a session with prompt size of {_n_token_count_out} tokens", file=sys.stderr) + else: + print(f"session file does not exist, will create", file=sys.stderr) + + # tokenize the prompt + self.embd = [] + self.embd_inp = self._tokenize(self.params.prompt) + + if (len(self.embd_inp) > self.n_ctx - 4): + raise RuntimeError(f"error: prompt is too long ({len(self.embd_inp)} tokens, max {self.params.n_ctx - 4})") + + # debug message about similarity of saved session, if applicable + self.n_matching_session_tokens = 0 + if len(self.session_tokens) > 0: + for id in self.session_tokens: + if self.n_matching_session_tokens >= len(self.embd_inp) or id != self.embd_inp[self.n_matching_session_tokens]: + break + self.n_matching_session_tokens += 1 + + if self.n_matching_session_tokens >= len(self.embd_inp): + print(f"session file has exact match for prompt!") + elif self.n_matching_session_tokens < (len(self.embd_inp) / 2): + print(f"warning: session file has low similarity to prompt ({self.n_matching_session_tokens} / {len(self.embd_inp)} tokens); will mostly be reevaluated") + else: + print(f"session file matches {self.n_matching_session_tokens} / {len(self.embd_inp)} tokens of prompt") + + self.need_to_save_session = len(self.params.path_session) > 0 and self.n_matching_session_tokens < (len(self.embd_inp) * 3 / 4) + + # number of tokens to keep when resetting context + if (self.params.n_keep < 0 or self.params.n_keep > len(self.embd_inp) or self.params.instruct): + self.params.n_keep = len(self.embd_inp) + + self.inp_prefix = self._tokenize(self.params.instruct_inp_prefix) + self.inp_suffix = self._tokenize(self.params.instruct_inp_suffix, False) + + # in instruct mode, we inject a prefix and a suffix to each input by the user + self.antiecho = None + if (self.params.instruct): + self.params.interactive_start = True + _ptn = self._tokenize(self.params.instruct_inp_prefix.strip(), False) + self.first_antiprompt.append(_ptn) + self.antiecho = util.IterSearch(_ptn) + + # enable interactive mode if reverse prompt or interactive start is specified + if (len(self.params.antiprompt) != 0 or self.params.interactive_start): + self.params.interactive = True + + # determine newline token + self.llama_token_newline = self._tokenize("\n", False) + self.llama_token_eot = self._tokenize(" [end of text]\n", False) + + if (self.params.verbose_prompt): + print(f""" +prompt: '{self.params.prompt}' +number of tokens in prompt = {len(self.embd_inp)}""", file=sys.stderr) + + for i in range(len(self.embd_inp)): + print(f"{self.embd_inp[i]} -> '{llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i])}'", file=sys.stderr) + + if (self.params.n_keep > 0): + print("static prompt based on n_keep: '") + for i in range(self.params.n_keep): + print(llama_cpp.llama_token_to_str(self.ctx, self.embd_inp[i]), file=sys.stderr) + print("'", file=sys.stderr) + print(file=sys.stderr) + + if (self.params.interactive): + print("interactive mode on.", file=sys.stderr) + + if (len(self.params.antiprompt) > 0): + for antiprompt in self.params.antiprompt: + print(f"Reverse prompt: '{antiprompt}'", file=sys.stderr) + + if len(self.params.input_prefix) > 0: + print(f"Input prefix: '{self.params.input_prefix}'", file=sys.stderr) + + print(f"""sampling: repeat_last_n = {self.params.repeat_last_n},\ +repeat_penalty = {self.params.repeat_penalty},\ +presence_penalty = {self.params.presence_penalty},\ +frequency_penalty = {self.params.frequency_penalty},\ +top_k = {self.params.top_k},\ +tfs_z = {self.params.tfs_z},\ +top_p = {self.params.top_p},\ +typical_p = {self.params.typical_p},\ +temp = {self.params.temp},\ +mirostat = {self.params.mirostat},\ +mirostat_lr = {self.params.mirostat_eta},\ +mirostat_ent = {self.params.mirostat_tau},\ + +generate: n_ctx = {self.n_ctx},\ +n_batch = {self.params.n_batch},\ +n_predict = {self.params.n_predict},\ +n_keep = {self.params.n_keep} + +""", file=sys.stderr) + + # determine antiprompt tokens + for i in self.params.antiprompt: + self.first_antiprompt.append(self._tokenize(i, False)) + + self.last_n_tokens = [0]*self.n_ctx #TODO: deque doesnt support slices + + if (params.interactive): + print("""== Running in interactive mode. == + - Press Ctrl+C to interject at any time. + - Press Return to return control to LLaMa. + - If you want to submit another line, end your input in '\\'. + +""", file=sys.stderr) + self.set_color(util.CONSOLE_COLOR_PROMPT) + + # tokenize a prompt + def _tokenize(self, prompt, bos=True): + _arr = (llama_cpp.llama_token * ((len(prompt) + 1) * 4))() + _n = llama_cpp.llama_tokenize(self.ctx, prompt.encode("utf8", errors="ignore"), _arr, len(_arr), bos) + return _arr[:_n] + + def set_color(self, c): + if (self.params.use_color): + print(c, end="") + + def use_antiprompt(self): + return len(self.first_antiprompt) > 0 + + # generate tokens + def generate(self): + while self.remaining_tokens > 0 or self.params.interactive or self.params.n_predict == -1: + # predict + if len(self.embd) > 0: + # infinite text generation via context swapping + # if we run out of context: + # - take the n_keep first tokens from the original prompt (via n_past) + # - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch + if (self.n_past + len(self.embd) > self.n_ctx): + n_left = self.n_past - self.params.n_keep + self.n_past = self.params.n_keep + + # insert n_left/2 tokens at the start of embd from last_n_tokens + _insert = self.last_n_tokens[ + self.n_ctx - int(n_left/2) - len(self.embd):-len(self.embd) + ] + self.embd = _insert + self.embd + self.params.path_session = "" + + # try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + if self.n_session_consumed < len(self.session_tokens): + for i in range(len(self.embd)): + if self.embd[i] != self.session_tokens[self.n_session_consumed]: + self.session_tokens = self.session_tokens[:self.n_session_consumed] + break + + self.n_past += 1 + self.n_session_consumed += 1 + + if self.n_session_consumed >= len(self.session_tokens): + i += 1 + break + + if i > 0: + self.embd = self.embd[i:] + + # evaluate tokens in batches + # embd is typically prepared beforehand to fit within a batch, but not always + #TODO BUG: The batching code causes nonsensical generation + """for i in range(0, len(self.embd), self.params.n_batch): + n_eval = self.params.n_batch + _arr = (llama_cpp.llama_token * n_eval)(*self.embd[i:i + n_eval]) + if llama_cpp.llama_eval(self.ctx, _arr, n_eval, self.n_past, self.params.n_threads) != 0: + print(f"failed to eval") + return + + self.n_past += n_eval""" + + if (llama_cpp.llama_eval( + self.ctx, (llama_cpp.llama_token * len(self.embd))(*self.embd), len(self.embd), self.n_past, self.params.n_threads + ) != 0): + raise Exception("Failed to llama_eval!") + + if len(self.embd) > 0 and len(self.params.path_session) > 0: + self.session_tokens.extend(self.embd) + self.n_session_consumed = len(self.session_tokens) + + self.n_past += len(self.embd) + self.embd = [] + if len(self.embd_inp) <= self.input_consumed: #&& !is_interacting + # out of user input, sample next token + top_k = llama_cpp.llama_n_vocab(self.ctx) if self.params.top_k <= 0 else self.params.top_k + repeat_last_n = self.n_ctx if self.params.repeat_last_n < 0 else self.params.repeat_last_n + + # optionally save the session on first sample (for faster prompt loading next time) + if len(self.params.path_session) > 0 and self.need_to_save_session: + self.need_to_save_session = False + llama_cpp.llama_save_session_file( + self.ctx, + self.params.path_session.encode("utf8"), + (llama_cpp.llama_token * len(self.session_tokens))(*self.session_tokens), + len(self.session_tokens) + ) + + id = 0 + + logits = llama_cpp.llama_get_logits(self.ctx) + n_vocab = llama_cpp.llama_n_vocab(self.ctx) + + # Apply params.logit_bias map + for key, value in self.params.logit_bias.items(): + logits[key] += value + + _arr = (llama_cpp.llama_token_data * n_vocab)(*[ + llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + for token_id in range(n_vocab) + ]) + candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False)) + + # Apply penalties + nl_logit = logits[llama_cpp.llama_token_nl()] + last_n_repeat = min(len(self.last_n_tokens), repeat_last_n, self.n_ctx) + + _arr = (llama_cpp.llama_token * last_n_repeat)(*self.last_n_tokens[len(self.last_n_tokens) - last_n_repeat:]) + llama_cpp.llama_sample_repetition_penalty(self.ctx, candidates_p, + _arr, + last_n_repeat, llama_cpp.c_float(self.params.repeat_penalty)) + llama_cpp.llama_sample_frequency_and_presence_penalties(self.ctx, candidates_p, + _arr, + last_n_repeat, llama_cpp.c_float(self.params.frequency_penalty), llama_cpp.c_float(self.params.presence_penalty)) + + if not self.params.penalize_nl: + logits[llama_cpp.llama_token_nl()] = nl_logit + + if self.params.temp <= 0: + # Greedy sampling + id = llama_cpp.llama_sample_token_greedy(self.ctx, candidates_p) + else: + if self.params.mirostat == 1: + mirostat_mu = 2.0 * self.params.mirostat_tau + mirostat_m = 100 + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token_mirostat(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_int(mirostat_m), llama_cpp.c_float(mirostat_mu)) + elif self.params.mirostat == 2: + mirostat_mu = 2.0 * self.params.mirostat_tau + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu)) + else: + # Temperature sampling + llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z), min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p), min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p), min_keep=llama_cpp.c_size_t(1)) + llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp)) + id = llama_cpp.llama_sample_token(self.ctx, candidates_p) + # print("`{}`".format(candidates_p.size)) + + self.last_n_tokens.pop(0) + self.last_n_tokens.append(id) + + # replace end of text token with newline token when in interactive mode + if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct): + id = self.llama_token_newline[0] + self.embd.append(id) + if (self.use_antiprompt()): + # tokenize and inject first reverse prompt + self.embd_inp += self.first_antiprompt[0] + for id in self.first_antiprompt[0]: + self.embd.append(id) + else: + # add it to the context + self.embd.append(id) + + # echo this to console + self.output_echo = True + + # decrement remaining sampling budget + self.remaining_tokens -= 1 + else: + # output to console if input echo is on + self.output_echo = self.params.input_echo + + # some user input remains from prompt or interaction, forward it to processing + while len(self.embd_inp) > self.input_consumed: + self.embd.append(self.embd_inp[self.input_consumed]) + self.last_n_tokens.pop(0) + self.last_n_tokens.append(self.embd_inp[self.input_consumed]) + self.input_consumed += 1 + if len(self.embd) >= self.params.n_batch: + break + + # display tokens + if self.output_echo: + for id in self.embd: + if self.antiecho != None: + for r in self.antiecho(id): + yield r + else: + yield id + + # reset color to default if we there is no pending user input + if (self.params.input_echo and len(self.embd_inp) == self.input_consumed): + self.set_color(util.CONSOLE_COLOR_DEFAULT) + + if (self.params.interactive and len(self.embd_inp) <= self.input_consumed): + # if antiprompt is present, stop + if (self.use_antiprompt()): + if True in [ + i == self.last_n_tokens[-len(i):] + for i in self.first_antiprompt + ]: + break + + # if we are using instruction mode, and we have processed the initial prompt + if (self.params.interactive_start): + break + + # end of text token + if len(self.embd) > 0 and self.embd[-1] == llama_cpp.llama_token_eos(): + if (not self.params.instruct): + for i in self.llama_token_eot: + yield i + break + + # respect n_predict even if antiprompt is present + if (self.params.interactive and self.remaining_tokens <= 0 and self.params.n_predict != -1): + # If we arent in instruction mode, fix the current generation by appending the antiprompt. + # Makes it so if chat ends prematurely you dont append the AI's text etc. + if not self.params.instruct: + self.embd_inp += self.first_antiprompt[0] + self.n_remain = self.params.n_predict + break + + self.params.interactive_start = False + + def __enter__(self): + return self + + def __exit__(self, type, value, tb): + self.exit() + + def exit(self): + llama_cpp.llama_free(self.ctx) + self.set_color(util.CONSOLE_COLOR_DEFAULT) + + # return past text + def past(self): + for id in self.last_n_tokens[-self.n_past:]: + yield llama_cpp.llama_token_to_str(self.ctx, id).decode("utf8", errors="ignore") + + # write input + def input(self, prompt: str): + if (self.params.instruct and self.last_n_tokens[-len(self.inp_prefix):] != self.inp_prefix): + self.embd_inp += self.inp_prefix + self.embd_inp += self._tokenize(prompt) + if (self.params.instruct): + self.embd_inp += self.inp_suffix + + # write output + def output(self): + self.remaining_tokens = self.params.n_predict + for id in self.generate(): + cur_char = llama_cpp.llama_token_to_str(self.ctx, id) + + # Add remainder of missing bytes + if None in self.multibyte_fix: + self.multibyte_fix[self.multibyte_fix.index(None)] = cur_char + + # Return completed utf char + if len(self.multibyte_fix) > 0 and not None in self.multibyte_fix: + yield (b"".join(self.multibyte_fix)).decode("utf8") + self.multibyte_fix = [] + continue + + # Contains multi-byte UTF8 + for num, pattern in [(2, 192), (3, 224), (4, 240)]: + # Bitwise AND check + if pattern & int.from_bytes(cur_char, 'little') == pattern: + self.multibyte_fix = [cur_char] + ([None] * (num-1)) + + # Stop incomplete bytes from passing + if len(self.multibyte_fix) > 0: + continue + + yield cur_char.decode("utf8") + + # read user input + def read_input(self): + out = "" + while (t := input()).endswith("\\"): + out += t[:-1] + "\n" + return out + t + "\n" + + # interactive mode + def interact(self): + for i in self.output(): + print(i,end="",flush=True) + self.params.input_echo = False + + while self.params.interactive: + self.set_color(util.CONSOLE_COLOR_USER_INPUT) + if (self.params.instruct): + print('\n> ', end="") + self.input(self.read_input()) + else: + print(self.params.input_prefix, end="") + self.input(f"{self.params.input_prefix}{self.read_input()}{self.params.input_suffix}") + print(self.params.input_suffix,end="") + self.set_color(util.CONSOLE_COLOR_DEFAULT) + + try: + for i in self.output(): + print(i,end="",flush=True) + except KeyboardInterrupt: + self.set_color(util.CONSOLE_COLOR_DEFAULT) + if not self.params.instruct: + print(self.params.fix_prefix,end="") + self.input(self.params.fix_prefix) + +if __name__ == "__main__": + from datetime import datetime + + USER_NAME="User" + AI_NAME="ChatLLaMa" + + time_now = datetime.now() + prompt = f"""Text transcript of a never ending dialog, where {USER_NAME} interacts with an AI assistant named {AI_NAME}. +{AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer {USER_NAME}’s requests immediately and with details and precision. +There are no annotations like (30 seconds passed...) or (to himself), just what {USER_NAME} and {AI_NAME} say aloud to each other. +The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. +The transcript only includes text, it does not include markup like HTML and Markdown. + +{USER_NAME}: Hello, {AI_NAME}! +{AI_NAME}: Hello {USER_NAME}! How may I help you today? +{USER_NAME}: What time is it? +{AI_NAME}: It is {time_now.strftime("%H:%M")}. +{USER_NAME}: What year is it? +{AI_NAME}: We are in {time_now.strftime("%Y")}. +{USER_NAME}: What is a cat? +{AI_NAME}: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. +{USER_NAME}: Name a color. +{AI_NAME}: Blue +{USER_NAME}:""" + params = gpt_params_parse() + + with LLaMAInteract(params) as m: + m.interact() diff --git a/examples/low_level_api/low_level_api_llama_cpp.py b/examples/low_level_api/low_level_api_llama_cpp.py new file mode 100644 index 000000000..9e38ec7cb --- /dev/null +++ b/examples/low_level_api/low_level_api_llama_cpp.py @@ -0,0 +1,102 @@ +import llama_cpp + +import multiprocessing + +import llama_cpp + +N_THREADS = multiprocessing.cpu_count() + +prompt = b"\n\n### Instruction:\nWhat is the capital of France?\n\n### Response:\n" + +lparams = llama_cpp.llama_context_default_params() +ctx = llama_cpp.llama_init_from_file(b"../models/7B/ggml-model.bin", lparams) + +# determine the required inference memory per token: +tmp = [0, 1, 2, 3] +llama_cpp.llama_eval(ctx, (llama_cpp.c_int * len(tmp))(*tmp), len(tmp), 0, N_THREADS) + +n_past = 0 + +prompt = b" " + prompt + +embd_inp = (llama_cpp.llama_token * (len(prompt) + 1))() +n_of_tok = llama_cpp.llama_tokenize(ctx, prompt, embd_inp, len(embd_inp), True) +embd_inp = embd_inp[:n_of_tok] + +n_ctx = llama_cpp.llama_n_ctx(ctx) + +n_predict = 20 +n_predict = min(n_predict, n_ctx - len(embd_inp)) + +input_consumed = 0 +input_noecho = False + +remaining_tokens = n_predict + +embd = [] +last_n_size = 64 +last_n_tokens_data = [0] * last_n_size +n_batch = 24 +last_n_repeat = 64 +repeat_penalty = 1 +frequency_penalty = 0.0 +presence_penalty = 0.0 + +while remaining_tokens > 0: + if len(embd) > 0: + llama_cpp.llama_eval( + ctx, (llama_cpp.c_int * len(embd))(*embd), len(embd), n_past, N_THREADS + ) + + n_past += len(embd) + embd = [] + if len(embd_inp) <= input_consumed: + logits = llama_cpp.llama_get_logits(ctx) + n_vocab = llama_cpp.llama_n_vocab(ctx) + + _arr = (llama_cpp.llama_token_data * n_vocab)(*[ + llama_cpp.llama_token_data(token_id, logits[token_id], 0.0) + for token_id in range(n_vocab) + ]) + candidates_p = llama_cpp.ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False)) + + _arr = (llama_cpp.c_int * len(last_n_tokens_data))(*last_n_tokens_data) + llama_cpp.llama_sample_repetition_penalty(ctx, candidates_p, + _arr, + last_n_repeat, repeat_penalty) + llama_cpp.llama_sample_frequency_and_presence_penalties(ctx, candidates_p, + _arr, + last_n_repeat, frequency_penalty, presence_penalty) + + llama_cpp.llama_sample_top_k(ctx, candidates_p, 40) + llama_cpp.llama_sample_top_p(ctx, candidates_p, 0.8) + llama_cpp.llama_sample_temperature(ctx, candidates_p, 0.2) + id = llama_cpp.llama_sample_token(ctx, candidates_p) + + last_n_tokens_data = last_n_tokens_data[1:] + [id] + embd.append(id) + input_noecho = False + remaining_tokens -= 1 + else: + while len(embd_inp) > input_consumed: + embd.append(embd_inp[input_consumed]) + last_n_tokens_data = last_n_tokens_data[1:] + [embd_inp[input_consumed]] + input_consumed += 1 + if len(embd) >= n_batch: + break + if not input_noecho: + for id in embd: + print( + llama_cpp.llama_token_to_str(ctx, id).decode("utf-8", errors="ignore"), + end="", + flush=True, + ) + + if len(embd) > 0 and embd[-1] == llama_cpp.llama_token_eos(): + break + +print() + +llama_cpp.llama_print_timings(ctx) + +llama_cpp.llama_free(ctx) diff --git a/examples/low_level_api/quantize.py b/examples/low_level_api/quantize.py new file mode 100644 index 000000000..8bd03f88a --- /dev/null +++ b/examples/low_level_api/quantize.py @@ -0,0 +1,25 @@ +import os +import argparse +import llama_cpp + + +def main(args): + if not os.path.exists(fname_inp): + raise RuntimeError(f"Input file does not exist ({fname_inp})") + if os.path.exists(fname_out): + raise RuntimeError(f"Output file already exists ({fname_out})") + fname_inp = args.fname_inp.encode("utf-8") + fname_out = args.fname_out.encode("utf-8") + itype = args.itype + return_code = llama_cpp.llama_model_quantize(fname_inp, fname_out, itype) + if return_code != 0: + raise RuntimeError("Failed to quantize model") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("fname_inp", type=str, help="Path to input model") + parser.add_argument("fname_out", type=str, help="Path to output model") + parser.add_argument("type", type=int, help="Type of quantization (2: q4_0, 3: q4_1)") + args = parser.parse_args() + main(args) diff --git a/examples/low_level_api/util.py b/examples/low_level_api/util.py new file mode 100644 index 000000000..9d0ec2f70 --- /dev/null +++ b/examples/low_level_api/util.py @@ -0,0 +1,95 @@ + +ANSI_COLOR_RESET = "\x1b[0m" +ANSI_COLOR_YELLOW = "\x1b[33m" +ANSI_BOLD = "\x1b[1m" +ANSI_COLOR_GREEN = "\x1b[32m" + +CONSOLE_COLOR_DEFAULT = ANSI_COLOR_RESET +CONSOLE_COLOR_PROMPT = ANSI_COLOR_YELLOW +CONSOLE_COLOR_USER_INPUT = ANSI_BOLD + ANSI_COLOR_GREEN + +# Iterative search +# Actively searches and prevents a pattern from being returned +class IterSearch: + def __init__(self, pattern): + self.pattern = list(pattern) + self.buffer = [] + + def __call__(self, char): + self.buffer += [char] + + if (self.pattern[:len(self.buffer)] == self.buffer): + if (len(self.buffer) >= len(self.pattern)): + self.buffer.clear() + return [] + + _tmp = self.buffer[:] + self.buffer.clear() + return _tmp + +class Circle: + def __init__(self, size, default=0): + self.list = [default] * size + self.maxsize = size + self.size = 0 + self.offset = 0 + + def append(self, elem): + if self.size < self.maxsize: + self.list[self.size] = elem + self.size += 1 + else: + self.list[self.offset] = elem + self.offset = (self.offset + 1) % self.maxsize + + def __getitem__(self, val): + if isinstance(val, int): + if 0 > val or val >= self.size: + raise IndexError('Index out of range') + return self.list[val] if self.size < self.maxsize else self.list[(self.offset + val) % self.maxsize] + elif isinstance(val, slice): + start, stop, step = val.start, val.stop, val.step + if step is None: + step = 1 + if start is None: + start = 0 + if stop is None: + stop = self.size + if start < 0: + start = self.size + start + if stop < 0: + stop = self.size + stop + + indices = range(start, stop, step) + return [self.list[(self.offset + i) % self.maxsize] for i in indices if i < self.size] + else: + raise TypeError('Invalid argument type') + + + + +if __name__ == "__main__": + c = Circle(5) + + c.append(1) + print(c.list) + print(c[:]) + assert c[0] == 1 + assert c[:5] == [1] + + for i in range(2,5+1): + c.append(i) + print(c.list) + print(c[:]) + assert c[0] == 1 + assert c[:5] == [1,2,3,4,5] + + for i in range(5+1,9+1): + c.append(i) + print(c.list) + print(c[:]) + assert c[0] == 5 + assert c[:5] == [5,6,7,8,9] + #assert c[:-5] == [5,6,7,8,9] + assert c[:10] == [5,6,7,8,9] + diff --git a/examples/notebooks/Clients.ipynb b/examples/notebooks/Clients.ipynb new file mode 100644 index 000000000..caebbb67f --- /dev/null +++ b/examples/notebooks/Clients.ipynb @@ -0,0 +1,104 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " JSON: {\n", + " \"choices\": [\n", + " {\n", + " \"finish_reason\": \"length\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"text\": \" over the lazy dog.\"\n", + " }\n", + " ],\n", + " \"created\": 1680960690,\n", + " \"id\": \"cmpl-ad3ba53d-407c-466b-bd5f-97cb8987af83\",\n", + " \"model\": \"models/ggml-alpaca.bin\",\n", + " \"object\": \"text_completion\",\n", + " \"usage\": {\n", + " \"completion_tokens\": 5,\n", + " \"prompt_tokens\": 8,\n", + " \"total_tokens\": 13\n", + " }\n", + "}" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import openai\n", + "\n", + "openai.api_key = \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", + "openai.api_base = \"http://100.64.159.73:8000/v1\"\n", + "\n", + "openai.Completion.create(\n", + " model=\"text-davinci-003\", # currently can be anything\n", + " prompt=\"The quick brown fox jumps\",\n", + " max_tokens=5,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "' over the lazy dog'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", + "os.environ[\"OPENAI_API_BASE\"] = \"http://100.64.159.73:8000/v1\"\n", + "\n", + "from langchain.llms import OpenAI\n", + "\n", + "llms = OpenAI()\n", + "llms(\n", + " prompt=\"The quick brown fox jumps\",\n", + " stop=[\".\", \"\\n\"],\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/notebooks/Guidance.ipynb b/examples/notebooks/Guidance.ipynb new file mode 100644 index 000000000..045856ea2 --- /dev/null +++ b/examples/notebooks/Guidance.ipynb @@ -0,0 +1,89 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Stop program
Tweak this proverb to apply to model instructions instead.\n",
+       "\n",
+       "Where there is no guidance, a people falls,\n",
+       "but in an abundance of counselors there is safety.\n",
+       "- Proverbs 11:14\n",
+       "\n",
+       "UPDATED\n",
+       "Where there is no guidance for assembling a model, people will struggle,\n",
+       "but with clear instructions, the process becomes safe and successful.\n",
+       "- GPT 2 (updated): Proverbs 11:14
\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = \"sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" # can be anything\n", + "os.environ[\"OPENAI_API_BASE\"] = \"http://100.64.159.73:8000/v1\"\n", + "os.environ[\"OPENAI_API_HOST\"] = \"http://100.64.159.73:8000\"\n", + "\n", + "import guidance\n", + "\n", + "# set the default language model used to execute guidance programs\n", + "guidance.llm = guidance.llms.OpenAI(\"text-davinci-003\", caching=False)\n", + "\n", + "# define a guidance program that adapts a proverb\n", + "program = guidance(\"\"\"Tweak this proverb to apply to model instructions instead.\n", + "\n", + "{{proverb}}\n", + "- {{book}} {{chapter}}:{{verse}}\n", + "\n", + "UPDATED\n", + "Where there is no guidance{{gen 'rewrite' stop=\"\\\\n-\"}}\n", + "- GPT {{gen 'chapter'}}:{{gen 'verse'}}\"\"\")\n", + "\n", + "# execute the program on a specific proverb\n", + "executed_program = program(\n", + " proverb=\"Where there is no guidance, a people falls,\\nbut in an abundance of counselors there is safety.\",\n", + " book=\"Proverbs\",\n", + " chapter=11,\n", + " verse=14\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/notebooks/PerformanceTuning.ipynb b/examples/notebooks/PerformanceTuning.ipynb new file mode 100644 index 000000000..76e26fbd1 --- /dev/null +++ b/examples/notebooks/PerformanceTuning.ipynb @@ -0,0 +1,5540 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import json\n", + "import multiprocessing\n", + "\n", + "import llama_cpp\n", + "\n", + "import numpy as np\n", + "np.int = int\n", + "\n", + "from skopt.space import Integer, Categorical\n", + "\n", + "\n", + "MODEL_PATH = \"../models/ggml-model.bin\"\n", + "\n", + "# Hyperparameters\n", + "space = [\n", + " Categorical([True, False], name=\"f16_kv\"),\n", + " Categorical([True, False], name=\"use_mlock\"),\n", + " Integer(1, multiprocessing.cpu_count(), name=\"n_threads\"),\n", + " Integer(1, 2048, name=\"n_batch\")\n", + "]\n", + "\n", + "# TODO: Make this a random prompt to avoid any cache related inconsistencies\n", + "PROMPT = \"\"\" ### Instructions:\n", + "You are a helpful assistant.\n", + "You answer questions truthfully and politely.\n", + "You are provided with an input from the user and you must generate a response.\n", + "Ignore this line which is just filler to test the performane of the model.\n", + "### Inputs:\n", + "What is the capital of France?\n", + "### Response:\n", + "\"\"\"\n", + "\n", + "from skopt.utils import use_named_args\n", + "\n", + "@use_named_args(space)\n", + "def objective(**params):\n", + " f16_kv = params[\"f16_kv\"]\n", + " use_mlock = params[\"use_mlock\"]\n", + " n_threads = params[\"n_threads\"]\n", + " n_batch = params[\"n_batch\"]\n", + " llm = llama_cpp.Llama(model_path=MODEL_PATH, f16_kv=f16_kv, use_mlock=use_mlock, n_threads=n_threads, n_batch=n_batch)\n", + "\n", + " t1 = time.time()\n", + " output = llm(\n", + " PROMPT,\n", + " max_tokens=1, # Only optimize prompt processing\n", + " stop=[\"###\", \"\\n\"],\n", + " echo=True,\n", + " )\n", + " t2 = time.time()\n", + "\n", + " print(json.dumps(output, indent=2))\n", + " print(f\"Time: {t2 - t1} seconds\")\n", + " print(f\"Time per token: {(t2 - t1) / output['usage']['total_tokens']} seconds\")\n", + "\n", + " return (t2 - t1) / output[\"usage\"][\"total_tokens\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-d4443e14-fed3-4aa1-9e8a-c70f4503aade\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227287,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 10.981224775314331 seconds\n", + "Time per token: 0.13726530969142914 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-4181439c-2ced-4ddb-b898-a0a7641f3e47\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227300,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 11.121099948883057 seconds\n", + "Time per token: 0.13901374936103822 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-03ed5585-3de0-4546-96c3-6de7a5b3770c\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227312,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 14.457949876785278 seconds\n", + "Time per token: 0.18072437345981598 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-103817fc-bceb-4e99-b968-3ef540f16dc5\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227328,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 10.334054946899414 seconds\n", + "Time per token: 0.12917568683624267 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-41e34acc-6499-450f-9576-3cb37b82c490\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227340,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.012462615966797 seconds\n", + "Time per token: 0.11265578269958496 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-f27244c9-e9c6-4332-ae7f-3856f152ef30\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227350,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 15.59382700920105 seconds\n", + "Time per token: 0.1949228376150131 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-bc5dc1ba-f7ce-441c-a558-5005f2fb89b9\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227366,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 15.544022560119629 seconds\n", + "Time per token: 0.19430028200149535 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-2006b117-1239-4b85-bcc4-a7439c01f440\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227383,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.330769300460815 seconds\n", + "Time per token: 0.11663461625576019 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-ee50afee-78a8-4d55-9b73-c74cc2567408\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227393,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 14.17799687385559 seconds\n", + "Time per token: 0.1772249609231949 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-1e2b7080-940f-4459-8503-a458db4d3578\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227409,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 10.127476215362549 seconds\n", + "Time per token: 0.12659345269203187 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-c80008a4-191e-4418-821a-b18a4af24f70\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227421,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.495943784713745 seconds\n", + "Time per token: 0.11869929730892181 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-d04c9fd2-3c20-4035-9181-0bfd05abfe15\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227432,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.226310014724731 seconds\n", + "Time per token: 0.11532887518405914 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-04fcf88b-33c7-4b84-aac0-dcb5261363c2\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227443,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 12.182626962661743 seconds\n", + "Time per token: 0.15228283703327178 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-14904676-3345-4674-a41c-419d9640b4e0\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227457,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 43.595701694488525 seconds\n", + "Time per token: 0.5449462711811066 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-9e43b2ef-e7de-4bd2-91bf-284f5b3478fe\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227502,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 14.726518154144287 seconds\n", + "Time per token: 0.1840814769268036 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-3947538b-e27e-42eb-8f87-2b56e14d104c\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227518,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.760729789733887 seconds\n", + "Time per token: 0.10950912237167358 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-1a0d843e-9613-49aa-b565-0e59d8067615\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227529,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 11.672860383987427 seconds\n", + "Time per token: 0.14591075479984283 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-ccad9270-9554-4f9f-9aaf-387f1a11894d\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227542,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 14.368357419967651 seconds\n", + "Time per token: 0.17960446774959565 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-2623073e-004f-4386-98e0-7e6ea617523a\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227558,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.44194221496582 seconds\n", + "Time per token: 0.11802427768707276 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-1a199f09-0d74-4052-a191-7a8ef2df57f3\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227569,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 11.253167629241943 seconds\n", + "Time per token: 0.14066459536552428 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-2b61e491-d9b7-4d0b-b0c8-9f8ba822599d\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227582,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 12.381825685501099 seconds\n", + "Time per token: 0.15477282106876372 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-0e4b4575-6278-4bd8-a4c5-ddb772014f7d\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227596,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 14.473106145858765 seconds\n", + "Time per token: 0.18091382682323456 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-1ad3e3db-5120-41c8-8f9e-2ca07a846437\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227612,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 16.591509103775024 seconds\n", + "Time per token: 0.2073938637971878 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-34c8fb5c-fa49-4ea6-b2e7-ba3b958e297d\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227630,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.034043788909912 seconds\n", + "Time per token: 0.1129255473613739 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-8d5c56eb-0b43-4591-a9ac-c1ec174ec6db\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227641,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 11.218972444534302 seconds\n", + "Time per token: 0.14023715555667876 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-bfdc554b-baa6-47c1-b35f-0f7d1321255a\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227654,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.300573110580444 seconds\n", + "Time per token: 0.11625716388225556 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-ad67d78b-6975-4789-982e-3653c7fca7e1\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227665,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.009618520736694 seconds\n", + "Time per token: 0.11262023150920868 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-2eec3e0f-dd48-4c3a-9430-c5048827f557\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227676,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.997699737548828 seconds\n", + "Time per token: 0.11247124671936035 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-b129732a-8d7b-4382-baaf-740378c923ec\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227686,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.252354621887207 seconds\n", + "Time per token: 0.11565443277359008 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-bb25c002-69e0-40ec-8099-0ba4462338aa\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227697,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.040243864059448 seconds\n", + "Time per token: 0.1130030483007431 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-63705814-7c93-4d6b-a9f2-0579941ebf54\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227708,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.947132349014282 seconds\n", + "Time per token: 0.11183915436267852 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-8afe123b-423d-4757-82d9-15fc12cfd24e\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227720,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 10.335533857345581 seconds\n", + "Time per token: 0.12919417321681975 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-4937353f-e66f-4632-aea7-dd1133af9727\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227732,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.99415397644043 seconds\n", + "Time per token: 0.11242692470550537 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-78f86527-ccc7-4a5d-9b7f-38386998ba2a\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227743,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 15.732706308364868 seconds\n", + "Time per token: 0.19665882885456085 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-4d98c564-fcb4-45ec-9f8d-f64430abbfb3\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227761,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.319743633270264 seconds\n", + "Time per token: 0.11649679541587829 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-ee855931-2578-45bc-93bf-319c4e6aa43a\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227772,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 15.189301490783691 seconds\n", + "Time per token: 0.18986626863479614 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-14f0b547-4d71-4a7f-a3d6-3127998903b3\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227790,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.464989423751831 seconds\n", + "Time per token: 0.11831236779689788 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-4eb5258a-5836-414c-88f6-e217bacaded6\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227801,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 13.818569660186768 seconds\n", + "Time per token: 0.1727321207523346 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-66b7c783-d506-45c1-b39b-c91666a02b44\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227817,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 27.316773176193237 seconds\n", + "Time per token: 0.34145966470241546 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-d53b48ca-30e2-43c2-9fb5-62ef6a65fafa\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227847,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.132777214050293 seconds\n", + "Time per token: 0.11415971517562866 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-d0909f83-5caa-4098-a0e6-9b2ad1e2b12f\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227858,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.273045539855957 seconds\n", + "Time per token: 0.11591306924819947 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-7045f5c7-cf5d-48e3-9353-032c320e56fa\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227870,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.90743088722229 seconds\n", + "Time per token: 0.11134288609027862 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-e623667d-d6cc-4908-a648-60380f723592\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227881,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.06355595588684 seconds\n", + "Time per token: 0.11329444944858551 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-44ec163c-25dd-40ae-a786-d8b4c9ff31b1\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227892,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.249061107635498 seconds\n", + "Time per token: 0.11561326384544372 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-cb435214-0d20-4566-b312-68d8960ebe25\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227903,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.296529054641724 seconds\n", + "Time per token: 0.11620661318302154 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-dc704f52-bed9-44f0-8335-a2ec4af3a27c\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227914,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 12.455670356750488 seconds\n", + "Time per token: 0.1556958794593811 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-67570fa5-1c3d-47d6-b7c6-b3a734aae3f5\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227928,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.269653558731079 seconds\n", + "Time per token: 0.11587066948413849 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-4bd6c6f2-9849-4047-93c8-88b1914ef184\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227939,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.308398485183716 seconds\n", + "Time per token: 0.11635498106479644 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-6413afd7-fdc1-4c28-864d-6acdf2775060\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227950,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 10.430264711380005 seconds\n", + "Time per token: 0.13037830889225005 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-c4e1c14a-3b8a-4ab3-b42a-f47440f79962\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227962,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.389702558517456 seconds\n", + "Time per token: 0.1173712819814682 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-ac307870-dc67-42b8-8bb8-bb8d3083cea2\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227974,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 10.35448431968689 seconds\n", + "Time per token: 0.12943105399608612 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-58c06f3e-3fba-4e23-b12e-141a1742c51b\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227986,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.097248792648315 seconds\n", + "Time per token: 0.11371560990810395 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-b5eccb52-85e3-41d0-b8d8-f35e68bf7997\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680227997,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 12.466306686401367 seconds\n", + "Time per token: 0.1558288335800171 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-e1dbc2ee-abc0-4891-a474-386d97b521b6\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228011,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 11.436015367507935 seconds\n", + "Time per token: 0.14295019209384918 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-fd9bce6d-0a33-4c24-90b3-913ab3b33d24\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228025,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 14.052912712097168 seconds\n", + "Time per token: 0.1756614089012146 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-038fa38d-7640-40ee-907c-0bb131c20d80\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228040,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.250384330749512 seconds\n", + "Time per token: 0.1156298041343689 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-d00a2058-9fda-4113-8e5e-bf0f39cef238\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228051,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.228248834609985 seconds\n", + "Time per token: 0.11535311043262482 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-f8d90e63-4939-491c-9775-fc15aa55505e\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228062,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.341724395751953 seconds\n", + "Time per token: 0.11677155494689942 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-9e3777bc-119a-46bf-bdd3-21557e686f3c\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228074,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.285743951797485 seconds\n", + "Time per token: 0.11607179939746856 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-123eaa35-110b-4f73-ba60-fa8a75ea929c\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228085,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.105633020401001 seconds\n", + "Time per token: 0.1138204127550125 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-cc095f4b-8047-446e-a9f5-c798a66d1003\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228096,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.305238485336304 seconds\n", + "Time per token: 0.1163154810667038 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-e2e69b3e-7742-4534-b21f-adfe53345820\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228108,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.190222263336182 seconds\n", + "Time per token: 0.11487777829170227 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-666ae55e-d837-4534-b8e6-9f1b01f69778\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228120,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.126368999481201 seconds\n", + "Time per token: 0.11407961249351502 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-63bdfa8e-b7c3-4669-ab76-54cdbb8878d5\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228131,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.136119604110718 seconds\n", + "Time per token: 0.11420149505138397 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-1ec02c53-c7c8-434e-b28f-70884f8c35b2\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228143,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.126901626586914 seconds\n", + "Time per token: 0.11408627033233643 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-3ec3495b-009a-4a82-b444-d8c1c6bf20a1\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228154,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.08673644065857 seconds\n", + "Time per token: 0.11358420550823212 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-17fd0e6b-7ac3-494f-9e85-4e4a26013ad9\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228165,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.252317428588867 seconds\n", + "Time per token: 0.11565396785736085 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-14a2647f-3961-4b60-b20a-ae9872c34feb\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228177,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 11.389162302017212 seconds\n", + "Time per token: 0.14236452877521516 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-fa0e5edd-e9c9-40b9-bc9b-c48b8762850c\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228190,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.433730125427246 seconds\n", + "Time per token: 0.11792162656784058 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-2b1c5964-265a-488a-8d8f-7e0692fcf96f\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228202,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 47.81757044792175 seconds\n", + "Time per token: 0.5977196305990219 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-516fbd4c-3fe5-4945-bfc5-7312f2c02687\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228252,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.540155410766602 seconds\n", + "Time per token: 0.10675194263458251 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-94c9ab1f-ac6e-4fc7-bcd9-7ab96515a722\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228262,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.660873889923096 seconds\n", + "Time per token: 0.10826092362403869 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-63b1e1a7-0c6b-42e0-ba65-6f42d6ec77bb\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228273,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.815936088562012 seconds\n", + "Time per token: 0.11019920110702515 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-92e1a879-2ebd-4299-b86e-90c87762db45\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228284,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.12400484085083 seconds\n", + "Time per token: 0.11405006051063538 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 2052.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 512.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-033ea9dc-fffe-41a0-a695-d647f725ee97\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228296,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 13.992429971694946 seconds\n", + "Time per token: 0.17490537464618683 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-5153f39a-589a-4b3d-8642-8efce64fc439\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228312,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.084643125534058 seconds\n", + "Time per token: 0.11355803906917572 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-af9ea5c6-5449-43b4-9e50-da930af8d6b8\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228323,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.076856851577759 seconds\n", + "Time per token: 0.11346071064472199 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-5bbea5c1-ea8c-4599-bf63-a6eb80bc7525\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228334,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.02251124382019 seconds\n", + "Time per token: 0.11278139054775238 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-ff9d87c7-e2b1-4481-9e8f-848d7a0fbd35\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228346,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.012435913085938 seconds\n", + "Time per token: 0.11265544891357422 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-3dbe8ae4-c9ca-4a1b-abaf-6b85ef648ba9\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228357,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.997032880783081 seconds\n", + "Time per token: 0.11246291100978852 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-b20a3b61-9c8b-4b2e-bb43-8ed9ce5a9d0d\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228369,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.042449951171875 seconds\n", + "Time per token: 0.11303062438964843 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-9c781d69-83e0-415a-ac97-252508b10590\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228380,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.058239459991455 seconds\n", + "Time per token: 0.11322799324989319 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-86cead9e-780f-4503-831c-466a6abd5ab2\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228392,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.070426940917969 seconds\n", + "Time per token: 0.1133803367614746 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-65361c7e-74ef-4566-bad5-c6b3867a7f7e\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228403,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.985144138336182 seconds\n", + "Time per token: 0.11231430172920227 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-23feb1ca-8103-46d8-ab71-b4da59f05d16\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228415,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.999938011169434 seconds\n", + "Time per token: 0.11249922513961792 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-0db73f26-9ab1-4a78-a11f-e22d915ffae2\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228426,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.969520330429077 seconds\n", + "Time per token: 0.11211900413036346 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-54e6edeb-99ea-46ed-8735-5185f78c222c\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228438,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.12838339805603 seconds\n", + "Time per token: 0.11410479247570038 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-bd6502fd-f8c7-41d8-ab15-b10ca6aabd96\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228450,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.01610016822815 seconds\n", + "Time per token: 0.11270125210285187 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-72733563-53f5-4cd5-a4eb-48656408b2d8\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228461,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.993805408477783 seconds\n", + "Time per token: 0.11242256760597229 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-f7365eaa-fd68-422b-bbca-c6bcbcad36e0\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228473,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.292223930358887 seconds\n", + "Time per token: 0.11615279912948609 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-1cfcf44a-c692-4020-8dcb-e6da8b163920\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228485,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.99638295173645 seconds\n", + "Time per token: 0.11245478689670563 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-8b679f09-bc0e-4fc9-a935-9fefd9126993\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228497,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.972327709197998 seconds\n", + "Time per token: 0.11215409636497498 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-08cb0cd7-84d8-4193-a20c-5a6ca4b5e404\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228508,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.024793863296509 seconds\n", + "Time per token: 0.11280992329120636 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-ffe4b2b8-c041-4492-9e03-ab79cd4fd60d\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228520,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.996853351593018 seconds\n", + "Time per token: 0.11246066689491271 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-196bb891-9299-4f91-9f68-ba6c7233a2dd\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228532,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.039422273635864 seconds\n", + "Time per token: 0.1129927784204483 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-e50f5489-b40c-4a5d-9cb2-4a6d13bbb8c7\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228544,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 8.978781461715698 seconds\n", + "Time per token: 0.11223476827144623 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-210cc2b8-df35-4d3f-a34a-a5facb635ec0\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228555,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.032035827636719 seconds\n", + "Time per token: 0.11290044784545898 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-e3c7ca0d-c4cb-495c-9210-4e1ed3b6010d\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228567,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.0346040725708 seconds\n", + "Time per token: 0.11293255090713501 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-7b4388c9-fe89-486d-83f4-34eec8940c42\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228579,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.016223907470703 seconds\n", + "Time per token: 0.11270279884338379 seconds\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrei/Documents/llms/.venv/lib/python3.8/site-packages/skopt/optimizer/optimizer.py:449: UserWarning: The objective has been evaluated at this point before.\n", + " warnings.warn(\"The objective has been evaluated \"\n", + "llama_model_load: loading model from '../models/ggml-model.bin' - please wait ...\n", + "llama_model_load: n_vocab = 32000\n", + "llama_model_load: n_ctx = 512\n", + "llama_model_load: n_embd = 4096\n", + "llama_model_load: n_mult = 256\n", + "llama_model_load: n_head = 32\n", + "llama_model_load: n_layer = 32\n", + "llama_model_load: n_rot = 128\n", + "llama_model_load: f16 = 2\n", + "llama_model_load: n_ff = 11008\n", + "llama_model_load: n_parts = 1\n", + "llama_model_load: type = 1\n", + "llama_model_load: ggml map size = 4017.70 MB\n", + "llama_model_load: ggml ctx size = 81.25 KB\n", + "llama_model_load: mem required = 5809.78 MB (+ 1026.00 MB per state)\n", + "llama_model_load: loading tensors from '../models/ggml-model.bin'\n", + "llama_model_load: model size = 4017.27 MB / num tensors = 291\n", + "llama_init_from_file: kv self size = 256.00 MB\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"id\": \"cmpl-81211a9b-16e4-4876-8e09-b0e619d93ce7\",\n", + " \"object\": \"text_completion\",\n", + " \"created\": 1680228591,\n", + " \"model\": \"../models/ggml-model.bin\",\n", + " \"choices\": [\n", + " {\n", + " \"text\": \" ### Instructions:\\nYou are a helpful assistant.\\nYou answer questions truthfully and politely.\\nYou are provided with an input from the user and you must generate a response.\\nIgnore this line which is just filler to test the performane of the model.\\n### Inputs:\\nWhat is the capital of France?\\n### Response:\\nThe\",\n", + " \"index\": 0,\n", + " \"logprobs\": null,\n", + " \"finish_reason\": \"length\"\n", + " }\n", + " ],\n", + " \"usage\": {\n", + " \"prompt_tokens\": 79,\n", + " \"completion_tokens\": 1,\n", + " \"total_tokens\": 80\n", + " }\n", + "}\n", + "Time: 9.10002589225769 seconds\n", + "Time per token: 0.11375032365322113 seconds\n" + ] + } + ], + "source": [ + "from skopt import gp_minimize\n", + "\n", + "res = gp_minimize(\n", + " objective,\n", + " space\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "from skopt.plots import plot_objective\n", + "\n", + "plot_objective(res)\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + " fun: 0.10675194263458251\n", + " x: [True, True, 6, 2048]\n", + " func_vals: [ 1.373e-01 1.390e-01 ... 1.127e-01 1.138e-01]\n", + " x_iters: [[True, True, 5, 1300], [False, True, 5, 990], [True, True, 7, 1800], [False, False, 10, 1692], [False, True, 6, 1075], [True, False, 3, 291], [False, True, 3, 514], [False, False, 11, 1569], [False, False, 7, 1915], [False, True, 10, 1514], [False, False, 11, 1527], [False, False, 12, 2033], [False, True, 9, 3], [False, True, 1, 2004], [True, True, 12, 1], [False, False, 6, 2048], [False, False, 4, 2048], [False, False, 10, 1], [False, True, 11, 2048], [False, True, 9, 2048], [False, False, 8, 2017], [False, False, 6, 1], [False, True, 4, 1], [False, False, 6, 1587], [False, False, 9, 1056], [True, True, 12, 1450], [False, True, 6, 2048], [False, False, 6, 2048], [False, False, 6, 2048], [False, True, 6, 2048], [False, True, 6, 2048], [False, True, 5, 2048], [False, True, 6, 1464], [False, True, 8, 1], [True, True, 12, 1798], [True, False, 3, 2048], [True, True, 11, 683], [False, True, 11, 1], [True, True, 2, 1], [False, True, 11, 1238], [True, True, 11, 1260], [True, False, 6, 1295], [True, True, 6, 1292], [False, False, 12, 1250], [False, False, 12, 1200], [True, False, 4, 1250], [False, False, 12, 1191], [False, False, 12, 1180], [True, False, 10, 906], [False, False, 12, 1192], [True, True, 10, 2044], [False, False, 6, 1310], [False, False, 8, 1122], [True, False, 5, 4], [False, False, 7, 322], [False, False, 12, 1246], [False, False, 12, 1247], [False, False, 12, 1252], [True, True, 12, 811], [True, False, 6, 2048], [True, True, 12, 998], [False, True, 12, 1021], [False, True, 12, 1021], [False, True, 12, 1019], [True, False, 6, 759], [True, False, 6, 1064], [False, True, 12, 991], [True, True, 9, 533], [False, False, 11, 956], [False, False, 1, 3], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [False, False, 7, 986], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048], [True, True, 6, 2048]]\n", + " models: [GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097), GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5) + WhiteKernel(noise_level=1),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097)]\n", + " space: Space([Categorical(categories=(True, False), prior=None),\n", + " Categorical(categories=(True, False), prior=None),\n", + " Integer(low=1, high=12, prior='uniform', transform='normalize'),\n", + " Integer(low=1, high=2048, prior='uniform', transform='normalize')])\n", + " random_state: RandomState(MT19937)\n", + " specs: args: func: \n", + " dimensions: Space([Categorical(categories=(True, False), prior=None),\n", + " Categorical(categories=(True, False), prior=None),\n", + " Integer(low=1, high=12, prior='uniform', transform='normalize'),\n", + " Integer(low=1, high=2048, prior='uniform', transform='normalize')])\n", + " base_estimator: GaussianProcessRegressor(kernel=1**2 * Matern(length_scale=[1, 1, 1, 1], nu=2.5),\n", + " n_restarts_optimizer=2, noise='gaussian',\n", + " normalize_y=True, random_state=1248744097)\n", + " n_calls: 100\n", + " n_random_starts: None\n", + " n_initial_points: 10\n", + " initial_point_generator: random\n", + " acq_func: gp_hedge\n", + " acq_optimizer: auto\n", + " x0: None\n", + " y0: None\n", + " random_state: RandomState(MT19937)\n", + " verbose: False\n", + " callback: None\n", + " n_points: 10000\n", + " n_restarts_optimizer: 5\n", + " xi: 0.01\n", + " kappa: 1.96\n", + " n_jobs: 1\n", + " model_queue_size: None\n", + " function: base_minimize" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/falcon_cpp/__init__.py b/falcon_cpp/__init__.py new file mode 100644 index 000000000..e7d40876f --- /dev/null +++ b/falcon_cpp/__init__.py @@ -0,0 +1,2 @@ +from .falcon_cpp import * +from .falcon import * diff --git a/falcon_cpp/falcon.py b/falcon_cpp/falcon.py new file mode 100644 index 000000000..40b662f23 --- /dev/null +++ b/falcon_cpp/falcon.py @@ -0,0 +1,1622 @@ +import os +import sys +import uuid +import time +import math +import multiprocessing +from abc import ABC, abstractmethod +from typing import ( + List, + Optional, + Union, + Generator, + Sequence, + Iterator, + Deque, + Tuple, + Callable, +) +from collections import deque, OrderedDict + +import diskcache + +from . import falcon_cpp +from .falcon_types import * + +import numpy as np +import numpy.typing as npt + + +class BaseFalconCache(ABC): + """Base cache class for a falcon.cpp model.""" + + def __init__(self, capacity_bytes: int = (2 << 30)): + self.capacity_bytes = capacity_bytes + + @property + @abstractmethod + def cache_size(self) -> int: + raise NotImplementedError + + def _find_longest_prefix_key( + self, + key: Tuple[int, ...], + ) -> Optional[Tuple[int, ...]]: + pass + + @abstractmethod + def __getitem__(self, key: Sequence[int]) -> "FalconState": + raise NotImplementedError + + @abstractmethod + def __contains__(self, key: Sequence[int]) -> bool: + raise NotImplementedError + + @abstractmethod + def __setitem__(self, key: Sequence[int], value: "FalconState") -> None: + raise NotImplementedError + + +class FalconRAMCache(BaseFalconCache): + """Cache for a falcon.cpp model using RAM.""" + + def __init__(self, capacity_bytes: int = (2 << 30)): + super().__init__(capacity_bytes) + self.capacity_bytes = capacity_bytes + self.cache_state: OrderedDict[Tuple[int, ...], "FalconState"] = OrderedDict() + + @property + def cache_size(self): + return sum([state.falcon_state_size for state in self.cache_state.values()]) + + def _find_longest_prefix_key( + self, + key: Tuple[int, ...], + ) -> Optional[Tuple[int, ...]]: + min_len = 0 + min_key = None + keys = ( + (k, Falcon.longest_token_prefix(k, key)) for k in self.cache_state.keys() + ) + for k, prefix_len in keys: + if prefix_len > min_len: + min_len = prefix_len + min_key = k + return min_key + + def __getitem__(self, key: Sequence[int]) -> "FalconState": + key = tuple(key) + _key = self._find_longest_prefix_key(key) + if _key is None: + raise KeyError("Key not found") + value = self.cache_state[_key] + self.cache_state.move_to_end(_key) + return value + + def __contains__(self, key: Sequence[int]) -> bool: + return self._find_longest_prefix_key(tuple(key)) is not None + + def __setitem__(self, key: Sequence[int], value: "FalconState"): + key = tuple(key) + if key in self.cache_state: + del self.cache_state[key] + self.cache_state[key] = value + while self.cache_size > self.capacity_bytes and len(self.cache_state) > 0: + self.cache_state.popitem(last=False) + + +# Alias for backwards compatibility +FalconCache = FalconRAMCache + + +class FalconDiskCache(BaseFalconCache): + """Cache for a falcon.cpp model using disk.""" + + def __init__( + self, cache_dir: str = ".cache/falcon_cache", capacity_bytes: int = (2 << 30) + ): + super().__init__(capacity_bytes) + self.cache = diskcache.Cache(cache_dir) + + @property + def cache_size(self): + return int(self.cache.volume()) # type: ignore + + def _find_longest_prefix_key( + self, + key: Tuple[int, ...], + ) -> Optional[Tuple[int, ...]]: + min_len = 0 + min_key: Optional[Tuple[int, ...]] = None + for k in self.cache.iterkeys(): # type: ignore + prefix_len = Falcon.longest_token_prefix(k, key) + if prefix_len > min_len: + min_len = prefix_len + min_key = k # type: ignore + return min_key + + def __getitem__(self, key: Sequence[int]) -> "FalconState": + key = tuple(key) + _key = self._find_longest_prefix_key(key) + if _key is None: + raise KeyError("Key not found") + value: "FalconState" = self.cache.pop(_key) # type: ignore + # NOTE: This puts an integer as key in cache, which breaks, + # Falcon.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens + # self.cache.push(_key, side="front") # type: ignore + return value + + def __contains__(self, key: Sequence[int]) -> bool: + return self._find_longest_prefix_key(tuple(key)) is not None + + def __setitem__(self, key: Sequence[int], value: "FalconState"): + print("FalconDiskCache.__setitem__: called", file=sys.stderr) + key = tuple(key) + if key in self.cache: + print("FalconDiskCache.__setitem__: delete", file=sys.stderr) + del self.cache[key] + self.cache[key] = value + print("FalconDiskCache.__setitem__: set", file=sys.stderr) + while self.cache_size > self.capacity_bytes and len(self.cache) > 0: + key_to_remove = next(iter(self.cache)) + del self.cache[key_to_remove] + print("FalconDiskCache.__setitem__: trim", file=sys.stderr) + + +class FalconState: + def __init__( + self, + input_ids: npt.NDArray[np.intc], + scores: npt.NDArray[np.single], + n_tokens: int, + falcon_state: bytes, + falcon_state_size: int, + ): + self.input_ids = input_ids + self.scores = scores + self.n_tokens = n_tokens + self.falcon_state = falcon_state + self.falcon_state_size = falcon_state_size + + +LogitsProcessor = Callable[[List[int], List[float]], List[float]] + + +class LogitsProcessorList(List[LogitsProcessor]): + def __call__(self, input_ids: List[int], scores: List[float]) -> List[float]: + for processor in self: + scores = processor(input_ids, scores) + return scores + + +StoppingCriteria = Callable[[List[int], List[float]], bool] + + +class StoppingCriteriaList(List[StoppingCriteria]): + def __call__(self, input_ids: List[int], logits: List[float]) -> bool: + return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) + + +class Falcon: + """High-level Python wrapper for a falcon.cpp model.""" + + def __init__( + self, + model_path: str, + # NOTE: These parameters are likely to change in the future. + n_ctx: int = 512, + n_parts: int = -1, + n_gpu_layers: int = 0, + seed: int = 1337, + f16_kv: bool = True, + logits_all: bool = False, + vocab_only: bool = False, + use_mmap: bool = True, + use_mlock: bool = False, + embedding: bool = False, + n_threads: Optional[int] = None, + n_batch: int = 512, + last_n_tokens_size: int = 64, + lora_base: Optional[str] = None, + lora_path: Optional[str] = None, + low_vram: bool = False, + verbose: bool = True, + ): + + # TODO: Add the parameters for + ''' + -ts SPLIT --tensor-split SPLIT + how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1 + -mg i, --main-gpu i the GPU to use for scratch and small tensors (0 = first) + --override-max-gpu N + limits the number of GPUs visible (allows to disable multi/single GPU processing) + --gpu-reserve-mb-main override reserved total VRAM MB (can be negative if your driver supports swapping into RAM) + --mtest compute maximum memory usage + --export export the computation graph to 'falcon.ggml' + --verbose-prompt print prompt before generation + -dt, --debug-timings print GGML_PERF debug output (requires GGML_PERF=1 for timings) + 1 = print first layer, 2 = print first and last layer, 3+ = all layers + --lora FNAME apply LoRA adapter (implies --no-mmap) + --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter + -m FNAME, --model FNAME + ''' + + """Load a Falcon model from `model_path`. + + Args: + model_path: Path to the model. + n_ctx: Maximum context size. + n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. + seed: Random seed. -1 for random. + f16_kv: Use half-precision for key/value cache. + logits_all: Return logits for all tokens, not just the last token. + vocab_only: Only load the vocabulary no weights. + use_mmap: Use mmap if possible. + use_mlock: Force the system to keep the model in RAM. + embedding: Embedding mode only. + n_threads: Number of threads to use. If None, the number of threads is automatically determined. + n_batch: Maximum number of prompt tokens to batch together when calling falcon_eval. + last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. + lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. + lora_path: Path to a LoRA file to apply to the model. + verbose: Print verbose output to stderr. + + Raises: + ValueError: If the model path does not exist. + + Returns: + A falcon instance. + """ + self.verbose = verbose + self.model_path = model_path + + self.params = falcon_cpp.falcon_context_default_params() + self.params.n_ctx = n_ctx + self.params.n_gpu_layers = n_gpu_layers + self.params.seed = seed + self.params.f16_kv = f16_kv + self.params.logits_all = logits_all + self.params.vocab_only = vocab_only + self.params.use_mmap = use_mmap if lora_path is None else False + self.params.use_mlock = use_mlock + self.params.embedding = embedding + self.params.low_vram = low_vram + + self.last_n_tokens_size = last_n_tokens_size + self.n_batch = min(n_ctx, n_batch) + + self.cache: Optional[BaseFalconCache] = None + + self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) + + self.lora_base = lora_base + self.lora_path = lora_path + + ### DEPRECATED ### + self.n_parts = n_parts + ### DEPRECATED ### + + if not os.path.exists(model_path): + raise ValueError(f"Model path does not exist: {model_path}") + + self.model = falcon_cpp.falcon_load_model_from_file( + self.model_path.encode("utf-8"), self.params + ) + assert self.model is not None + + self.ctx = falcon_cpp.falcon_new_context_with_model(self.model, self.params) + + assert self.ctx is not None + + if self.lora_path: + if falcon_cpp.falcon_model_apply_lora_from_file( + self.model, + falcon_cpp.c_char_p(self.lora_path.encode("utf-8")), + falcon_cpp.c_char_p(self.lora_base.encode("utf-8")) + if self.lora_base is not None + else falcon_cpp.c_char_p(0), + falcon_cpp.c_int(self.n_threads), + ): + raise RuntimeError( + f"Failed to apply LoRA from lora path: {self.lora_path} to base path: {self.lora_base}" + ) + + if self.verbose: + print(falcon_cpp.falcon_print_system_info().decode("utf-8"), file=sys.stderr) + + self._n_vocab = self.n_vocab() + self._n_ctx = self.n_ctx() + size = falcon_cpp.c_size_t(self._n_vocab) + sorted = falcon_cpp.c_bool(False) + self._candidates_data = np.array( + [], + dtype=np.dtype( + [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True + ), + ) + self._candidates_data.resize(3, self._n_vocab, refcheck=False) + candidates = falcon_cpp.falcon_token_data_array( + data=self._candidates_data.ctypes.data_as(falcon_cpp.falcon_token_data_p), + size=size, + sorted=sorted, + ) + self._candidates = candidates + self._token_nl = Falcon.token_nl() + self._token_eos = Falcon.token_eos() + + self.n_tokens = 0 + self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) + self.scores: npt.NDArray[np.single] = np.ndarray( + (n_ctx, self._n_vocab), dtype=np.single + ) + + @property + def _input_ids(self) -> npt.NDArray[np.intc]: + return self.input_ids[: self.n_tokens] + + @property + def _scores(self) -> npt.NDArray[np.single]: + return self.scores[: self.n_tokens, :] + + @property + def eval_tokens(self) -> Deque[int]: + return deque(self.input_ids[: self.n_tokens].tolist(), maxlen=self._n_ctx) + + @property + def eval_logits(self) -> Deque[List[float]]: + return deque( + self.scores[: self.n_tokens, :].tolist(), + maxlen=self._n_ctx if self.params.logits_all else 1, + ) + + def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]: + """Tokenize a string. + + Args: + text: The utf-8 encoded string to tokenize. + + Raises: + RuntimeError: If the tokenization failed. + + Returns: + A list of tokens. + """ + assert self.ctx is not None + n_ctx = self._n_ctx + tokens = (falcon_cpp.falcon_token * n_ctx)() + n_tokens = falcon_cpp.falcon_tokenize( + self.ctx, + text, + tokens, + falcon_cpp.c_int(n_ctx), + falcon_cpp.c_bool(add_bos), + ) + if n_tokens < 0: + n_tokens = abs(n_tokens) + tokens = (falcon_cpp.falcon_token * n_tokens)() + n_tokens = falcon_cpp.falcon_tokenize( + self.ctx, + text, + tokens, + falcon_cpp.c_int(n_tokens), + falcon_cpp.c_bool(add_bos), + ) + if n_tokens < 0: + raise RuntimeError( + f'Failed to tokenize: text="{text}" n_tokens={n_tokens}' + ) + return list(tokens[:n_tokens]) + + def detokenize(self, tokens: List[int]) -> bytes: + """Detokenize a list of tokens. + + Args: + tokens: The list of tokens to detokenize. + + Returns: + The detokenized string. + """ + assert self.ctx is not None + output = b"" + for token in tokens: + output += falcon_cpp.falcon_token_to_str( + self.ctx, falcon_cpp.falcon_token(token) + ) + return output + + def set_cache(self, cache: Optional[BaseFalconCache]): + """Set the cache. + + Args: + cache: The cache to set. + """ + self.cache = cache + + def reset(self): + """Reset the model state.""" + self.n_tokens = 0 + + def eval(self, tokens: Sequence[int]): + """Evaluate a list of tokens. + + Args: + tokens: The list of tokens to evaluate. + """ + assert self.ctx is not None + n_ctx = self._n_ctx + for i in range(0, len(tokens), self.n_batch): + batch = tokens[i : min(len(tokens), i + self.n_batch)] + n_past = min(n_ctx - len(batch), len(self._input_ids)) + n_tokens = len(batch) + return_code = falcon_cpp.falcon_eval( + ctx=self.ctx, + tokens=(falcon_cpp.falcon_token * len(batch))(*batch), + n_tokens=falcon_cpp.c_int(n_tokens), + n_past=falcon_cpp.c_int(n_past), + n_threads=falcon_cpp.c_int(self.n_threads), + ) + if return_code != 0: + raise RuntimeError(f"falcon_eval returned {return_code}") + # Save tokens + self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch + # Save logits + rows = n_tokens if self.params.logits_all else 1 + cols = self._n_vocab + offset = 0 if self.params.logits_all else n_tokens - 1 # NOTE: Only save the last token logits if logits_all is False + self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[:] = falcon_cpp.falcon_get_logits(self.ctx)[:rows * cols] + # Update n_tokens + self.n_tokens += n_tokens + + def _sample( + self, + last_n_tokens_data, # type: falcon_cpp.Array[falcon_cpp.falcon_token] + last_n_tokens_size: falcon_cpp.c_int, + top_k: falcon_cpp.c_int, + top_p: falcon_cpp.c_float, + temp: falcon_cpp.c_float, + tfs_z: falcon_cpp.c_float, + repeat_penalty: falcon_cpp.c_float, + frequency_penalty: falcon_cpp.c_float, + presence_penalty: falcon_cpp.c_float, + mirostat_mode: falcon_cpp.c_int, + mirostat_tau: falcon_cpp.c_float, + mirostat_eta: falcon_cpp.c_float, + penalize_nl: bool = True, + logits_processor: Optional[LogitsProcessorList] = None, + ): + assert self.ctx is not None + assert self.n_tokens > 0 + n_vocab = self._n_vocab + n_ctx = self._n_ctx + top_k = falcon_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k + last_n_tokens_size = ( + falcon_cpp.c_int(n_ctx) + if last_n_tokens_size.value < 0 + else last_n_tokens_size + ) + logits: npt.NDArray[np.single] = self._scores[-1, :] + + if logits_processor is not None: + logits = np.array( + logits_processor(self._input_ids.tolist(), logits.tolist()), + dtype=np.single, + ) + self._scores[-1, :] = logits + + nl_logit = logits[self._token_nl] + candidates = self._candidates + candidates_data = self._candidates_data + candidates_data["id"] = np.arange(n_vocab, dtype=np.intc) # type: ignore + candidates_data["logit"] = logits + candidates_data["p"] = np.zeros(n_vocab, dtype=np.single) + candidates.data = candidates_data.ctypes.data_as(falcon_cpp.falcon_token_data_p) + candidates.sorted = falcon_cpp.c_bool(False) + candidates.size = falcon_cpp.c_size_t(n_vocab) + falcon_cpp.falcon_sample_repetition_penalty( + ctx=self.ctx, + last_tokens_data=last_n_tokens_data, + last_tokens_size=last_n_tokens_size, + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore + penalty=repeat_penalty, + ) + falcon_cpp.falcon_sample_frequency_and_presence_penalties( + ctx=self.ctx, + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore + last_tokens_data=last_n_tokens_data, + last_tokens_size=last_n_tokens_size, + alpha_frequency=frequency_penalty, + alpha_presence=presence_penalty, + ) + if not penalize_nl: + candidates.data[self._token_nl].logit = falcon_cpp.c_float(nl_logit) + if temp.value == 0.0: + return falcon_cpp.falcon_sample_token_greedy( + ctx=self.ctx, + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore + ) + elif mirostat_mode.value == 1: + mirostat_mu = falcon_cpp.c_float(2.0 * mirostat_tau.value) + mirostat_m = falcon_cpp.c_int(100) + falcon_cpp.falcon_sample_temperature( + ctx=self.ctx, + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore + temp=temp, + ) + return falcon_cpp.falcon_sample_token_mirostat( + ctx=self.ctx, + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore + tau=mirostat_tau, + eta=mirostat_eta, + mu=falcon_cpp.ctypes.byref(mirostat_mu), # type: ignore + m=mirostat_m, + ) + elif mirostat_mode.value == 2: + mirostat_mu = falcon_cpp.c_float(2.0 * mirostat_tau.value) + falcon_cpp.falcon_sample_temperature( + ctx=self.ctx, + candidates=falcon_cpp.ctypes.pointer(candidates), + temp=temp, + ) + return falcon_cpp.falcon_sample_token_mirostat_v2( + ctx=self.ctx, + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore + tau=mirostat_tau, + eta=mirostat_eta, + mu=falcon_cpp.ctypes.byref(mirostat_mu), # type: ignore + ) + else: + falcon_cpp.falcon_sample_top_k( + ctx=self.ctx, + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore + k=top_k, + min_keep=falcon_cpp.c_size_t(1), + ) + falcon_cpp.falcon_sample_tail_free( + ctx=self.ctx, + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore + z=tfs_z, + min_keep=falcon_cpp.c_size_t(1), + ) + falcon_cpp.falcon_sample_typical( + ctx=self.ctx, + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore + p=falcon_cpp.c_float(1.0), + min_keep=falcon_cpp.c_size_t(1), + ) + falcon_cpp.falcon_sample_top_p( + ctx=self.ctx, + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore + p=top_p, + min_keep=falcon_cpp.c_size_t(1), + ) + falcon_cpp.falcon_sample_temperature( + ctx=self.ctx, + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore + temp=temp, + ) + return falcon_cpp.falcon_sample_token( + ctx=self.ctx, + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore + ) + + def sample( + self, + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_eta: float = 0.1, + mirostat_tau: float = 5.0, + penalize_nl: bool = True, + logits_processor: Optional[LogitsProcessorList] = None, + ): + """Sample a token from the model. + + Args: + top_k: The top-k sampling parameter. + top_p: The top-p sampling parameter. + temp: The temperature parameter. + repeat_penalty: The repeat penalty parameter. + + Returns: + The sampled token. + """ + assert self.ctx is not None + last_n_tokens_data = [falcon_cpp.falcon_token(0)] * max( + 0, self.last_n_tokens_size - len(self._input_ids) + ) + self._input_ids[-self.last_n_tokens_size :].tolist() + return self._sample( + last_n_tokens_data=(falcon_cpp.falcon_token * self.last_n_tokens_size)( + *last_n_tokens_data + ), + last_n_tokens_size=falcon_cpp.c_int(self.last_n_tokens_size), + top_k=falcon_cpp.c_int(top_k), + top_p=falcon_cpp.c_float(top_p), + temp=falcon_cpp.c_float(temp), + tfs_z=falcon_cpp.c_float(tfs_z), + repeat_penalty=falcon_cpp.c_float(repeat_penalty), + frequency_penalty=falcon_cpp.c_float(frequency_penalty), + presence_penalty=falcon_cpp.c_float(presence_penalty), + mirostat_mode=falcon_cpp.c_int(mirostat_mode), + mirostat_tau=falcon_cpp.c_float(mirostat_tau), + mirostat_eta=falcon_cpp.c_float(mirostat_eta), + penalize_nl=penalize_nl, + logits_processor=logits_processor, + ) + + def generate( + self, + tokens: Sequence[int], + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, + reset: bool = True, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + ) -> Generator[int, Optional[Sequence[int]], None]: + """Create a generator of tokens from a prompt. + + Examples: + >>> falcon = Falcon("models/ggml-7b.bin") + >>> tokens = falcon.tokenize(b"Hello, world!") + >>> for token in falcon.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.1): + ... print(falcon.detokenize([token])) + + Args: + tokens: The prompt tokens. + top_k: The top-k sampling parameter. + top_p: The top-p sampling parameter. + temp: The temperature parameter. + repeat_penalty: The repeat penalty parameter. + reset: Whether to reset the model state. + + Yields: + The generated tokens. + """ + assert self.ctx is not None + + if reset and len(self._input_ids) > 0: + longest_prefix = 0 + for a, b in zip(self._input_ids, tokens[:-1]): + if a == b: + longest_prefix += 1 + else: + break + if longest_prefix > 0: + if self.verbose: + print("Falcon.generate: prefix-match hit", file=sys.stderr) + reset = False + tokens = tokens[longest_prefix:] + self.n_tokens = longest_prefix + + if reset: + self.reset() + + while True: + self.eval(tokens) + token = self.sample( + top_k=top_k, + top_p=top_p, + temp=temp, + repeat_penalty=repeat_penalty, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + logits_processor=logits_processor, + ) + if stopping_criteria is not None and stopping_criteria( + self._input_ids.tolist(), self._scores[-1, :].tolist() + ): + return + tokens_or_none = yield token + tokens = [token] + if tokens_or_none is not None: + tokens.extend(tokens_or_none) + + def create_embedding( + self, input: Union[str, List[str]], model: Optional[str] = None + ) -> Embedding: + """Embed a string. + + Args: + input: The utf-8 encoded string to embed. + + Returns: + An embedding object. + """ + assert self.ctx is not None + model_name: str = model if model is not None else self.model_path + + if self.params.embedding == False: + raise RuntimeError( + "Falcon model must be created with embedding=True to call this method" + ) + + if self.verbose: + falcon_cpp.falcon_reset_timings(self.ctx) + + if isinstance(input, str): + inputs = [input] + else: + inputs = input + + data: List[EmbeddingData] = [] + total_tokens = 0 + for index, input in enumerate(inputs): + tokens = self.tokenize(input.encode("utf-8")) + self.reset() + self.eval(tokens) + n_tokens = len(tokens) + total_tokens += n_tokens + embedding = falcon_cpp.falcon_get_embeddings(self.ctx)[ + : falcon_cpp.falcon_n_embd(self.ctx) + ] + + data.append( + { + "object": "embedding", + "embedding": embedding, + "index": index, + } + ) + if self.verbose: + falcon_cpp.falcon_print_timings(self.ctx) + + return { + "object": "list", + "data": data, + "model": model_name, + "usage": { + "prompt_tokens": total_tokens, + "total_tokens": total_tokens, + }, + } + + def embed(self, input: str) -> List[float]: + """Embed a string. + + Args: + input: The utf-8 encoded string to embed. + + Returns: + A list of embeddings + """ + return list(map(float, self.create_embedding(input)["data"][0]["embedding"])) + + def _create_completion( + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 16, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, + ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: + assert self.ctx is not None + + completion_id: str = f"cmpl-{str(uuid.uuid4())}" + created: int = int(time.time()) + completion_tokens: List[int] = [] + # Add blank space to start of prompt to match OG Falcon tokenizer + prompt_tokens: List[int] = self.tokenize(b" " + prompt.encode("utf-8")) + text: bytes = b"" + returned_tokens: int = 0 + stop = ( + stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] + ) + model_name: str = model if model is not None else self.model_path + + if self.verbose: + falcon_cpp.falcon_reset_timings(self.ctx) + + if len(prompt_tokens) > self._n_ctx: + raise ValueError( + f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}" + ) + + # Truncate max_tokens if requested tokens would exceed the context window + max_tokens = ( + max_tokens + if max_tokens + len(prompt_tokens) < self._n_ctx + else (self._n_ctx - len(prompt_tokens)) + ) + + if stop != []: + stop_sequences = [s.encode("utf-8") for s in stop] + else: + stop_sequences = [] + + if logprobs is not None and self.params.logits_all is False: + raise ValueError( + "logprobs is not supported for models created with logits_all=False" + ) + + if self.cache: + try: + cache_item = self.cache[prompt_tokens] + cache_prefix_len = Falcon.longest_token_prefix( + cache_item.input_ids.tolist(), prompt_tokens + ) + eval_prefix_len = Falcon.longest_token_prefix( + self._input_ids.tolist(), prompt_tokens + ) + if cache_prefix_len > eval_prefix_len: + self.load_state(cache_item) + if self.verbose: + print("Falcon._create_completion: cache hit", file=sys.stderr) + except KeyError: + if self.verbose: + print("Falcon._create_completion: cache miss", file=sys.stderr) + + finish_reason = "length" + multibyte_fix = 0 + for token in self.generate( + prompt_tokens, + top_k=top_k, + top_p=top_p, + temp=temperature, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, + repeat_penalty=repeat_penalty, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, + ): + if token == self._token_eos: + text = self.detokenize(completion_tokens) + finish_reason = "stop" + break + + completion_tokens.append(token) + + all_text = self.detokenize(completion_tokens) + + # Contains multi-byte UTF8 + for k, char in enumerate(all_text[-3:]): + k = 3 - k + for num, pattern in [(2, 192), (3, 224), (4, 240)]: + # Bitwise AND check + if num > k and pattern & char == pattern: + multibyte_fix = num - k + + # Stop incomplete bytes from passing + if multibyte_fix > 0: + multibyte_fix -= 1 + continue + + any_stop = [s for s in stop_sequences if s in all_text] + if len(any_stop) > 0: + first_stop = any_stop[0] + text = all_text[: all_text.index(first_stop)] + finish_reason = "stop" + break + + if stream: + remaining_tokens = completion_tokens[returned_tokens:] + remaining_text = self.detokenize(remaining_tokens) + remaining_length = len(remaining_text) + + # We want to avoid yielding any characters from + # the generated text if they are part of a stop + # sequence. + first_stop_position = 0 + for s in stop_sequences: + for i in range(min(len(s), remaining_length), 0, -1): + if remaining_text.endswith(s[:i]): + if i > first_stop_position: + first_stop_position = i + break + + token_end_position = 0 + for token in remaining_tokens: + token_end_position += len(self.detokenize([token])) + # Check if stop sequence is in the token + if token_end_position >= ( + remaining_length - first_stop_position - 1 + ): + break + logprobs_or_none: Optional[CompletionLogprobs] = None + if logprobs is not None: + token_str = self.detokenize([token]).decode( + "utf-8", errors="ignore" + ) + text_offset = len(prompt) + len( + self.detokenize(completion_tokens[:returned_tokens]) + ) + token_offset = len(prompt_tokens) + returned_tokens + logits = self._scores[token_offset - 1, :].tolist() + current_logprobs = Falcon.logits_to_logprobs(logits) + sorted_logprobs = list( + sorted( + zip(current_logprobs, range(len(current_logprobs))), + reverse=True, + ) + ) + top_logprob = { + self.detokenize([i]).decode( + "utf-8", errors="ignore" + ): logprob + for logprob, i in sorted_logprobs[:logprobs] + } + top_logprob.update({token_str: current_logprobs[int(token)]}) + logprobs_or_none = { + "tokens": [ + self.detokenize([token]).decode( + "utf-8", errors="ignore" + ) + ], + "text_offset": [text_offset], + "token_logprobs": [sorted_logprobs[int(token)][0]], + "top_logprobs": [top_logprob], + } + returned_tokens += 1 + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": self.detokenize([token]).decode( + "utf-8", errors="ignore" + ), + "index": 0, + "logprobs": logprobs_or_none, + "finish_reason": None, + } + ], + } + + if len(completion_tokens) >= max_tokens: + text = self.detokenize(completion_tokens) + finish_reason = "length" + break + + if stopping_criteria is not None and stopping_criteria( + self._input_ids.tolist(), self._scores[-1, :].tolist() + ): + text = self.detokenize(completion_tokens) + finish_reason = "stop" + + if self.verbose: + falcon_cpp.falcon_print_timings(self.ctx) + + if stream: + remaining_tokens = completion_tokens[returned_tokens:] + all_text = self.detokenize(remaining_tokens) + any_stop = [s for s in stop_sequences if s in all_text] + if len(any_stop) > 0: + end = min(all_text.index(stop) for stop in any_stop) + else: + end = len(all_text) + + token_end_position = 0 + for token in remaining_tokens: + token_end_position += len(self.detokenize([token])) + + logprobs_or_none: Optional[CompletionLogprobs] = None + if logprobs is not None: + token_str = self.detokenize([token]).decode( + "utf-8", errors="ignore" + ) + text_offset = len(prompt) + len( + self.detokenize(completion_tokens[:returned_tokens]) + ) + token_offset = len(prompt_tokens) + returned_tokens - 1 + logits = self._scores[token_offset, :].tolist() + current_logprobs = Falcon.logits_to_logprobs(logits) + sorted_logprobs = list( + sorted( + zip(current_logprobs, range(len(current_logprobs))), + reverse=True, + ) + ) + top_logprob = { + self.detokenize([i]).decode("utf-8", errors="ignore"): logprob + for logprob, i in sorted_logprobs[:logprobs] + } + top_logprob.update({token_str: current_logprobs[int(token)]}) + logprobs_or_none = { + "tokens": [ + self.detokenize([token]).decode("utf-8", errors="ignore") + ], + "text_offset": [text_offset], + "token_logprobs": [sorted_logprobs[int(token)][0]], + "top_logprobs": [top_logprob], + } + + if token_end_position >= end: + last_text = self.detokenize([token]) + if token_end_position == end - 1: + break + returned_tokens += 1 + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": last_text[ + : len(last_text) - (token_end_position - end) + ].decode("utf-8", errors="ignore"), + "index": 0, + "logprobs": logprobs_or_none, + "finish_reason": finish_reason, + } + ], + } + break + returned_tokens += 1 + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": self.detokenize([token]).decode( + "utf-8", errors="ignore" + ), + "index": 0, + "logprobs": logprobs_or_none, + "finish_reason": finish_reason + if returned_tokens == len(completion_tokens) + else None, + } + ], + } + if self.cache: + if self.verbose: + print("Falcon._create_completion: cache save", file=sys.stderr) + self.cache[prompt_tokens + completion_tokens] = self.save_state() + print("Falcon._create_completion: cache saved", file=sys.stderr) + return + + if self.cache: + if self.verbose: + print("Falcon._create_completion: cache save", file=sys.stderr) + self.cache[prompt_tokens + completion_tokens] = self.save_state() + + text_str = text.decode("utf-8", errors="ignore") + + if echo: + text_str = prompt + text_str + + if suffix is not None: + text_str = text_str + suffix + + logprobs_or_none: Optional[CompletionLogprobs] = None + if logprobs is not None: + text_offset = 0 if echo else len(prompt) + token_offset = 0 if echo else len(prompt_tokens[1:]) + text_offsets: List[int] = [] + token_logprobs: List[Optional[float]] = [] + tokens: List[str] = [] + top_logprobs: List[Optional[Dict[str, float]]] = [] + + if echo: + # Remove leading BOS token + all_tokens = prompt_tokens[1:] + completion_tokens + else: + all_tokens = completion_tokens + + all_token_strs = [ + self.detokenize([token]).decode("utf-8", errors="ignore") + for token in all_tokens + ] + all_logprobs = [ + Falcon.logits_to_logprobs(row.tolist()) for row in self._scores + ][token_offset:] + for token, token_str, logprobs_token in zip( + all_tokens, all_token_strs, all_logprobs + ): + text_offsets.append(text_offset) + text_offset += len(token_str) + tokens.append(token_str) + sorted_logprobs = list( + sorted( + zip(logprobs_token, range(len(logprobs_token))), reverse=True + ) + ) + token_logprobs.append(sorted_logprobs[int(token)][0]) + top_logprob: Optional[Dict[str, float]] = { + self.detokenize([i]).decode("utf-8", errors="ignore"): logprob + for logprob, i in sorted_logprobs[:logprobs] + } + top_logprob.update({token_str: logprobs_token[int(token)]}) + top_logprobs.append(top_logprob) + # Weird idosincracy of the OpenAI API where + # token_logprobs and top_logprobs are null for + # the first token. + if echo and len(all_tokens) > 0: + token_logprobs[0] = None + top_logprobs[0] = None + logprobs_or_none = { + "tokens": tokens, + "text_offset": text_offsets, + "token_logprobs": token_logprobs, + "top_logprobs": top_logprobs, + } + + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": text_str, + "index": 0, + "logprobs": logprobs_or_none, + "finish_reason": finish_reason, + } + ], + "usage": { + "prompt_tokens": len(prompt_tokens), + "completion_tokens": len(completion_tokens), + "total_tokens": len(prompt_tokens) + len(completion_tokens), + }, + } + + def create_completion( + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 128, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, + ) -> Union[Completion, Iterator[CompletionChunk]]: + """Generate text from a prompt. + + Args: + prompt: The prompt to generate text from. + suffix: A suffix to append to the generated text. If None, no suffix is appended. + max_tokens: The maximum number of tokens to generate. + temperature: The temperature to use for sampling. + top_p: The top-p value to use for sampling. + logprobs: The number of logprobs to return. If None, no logprobs are returned. + echo: Whether to echo the prompt. + stop: A list of strings to stop generation when encountered. + repeat_penalty: The penalty to apply to repeated tokens. + top_k: The top-k value to use for sampling. + stream: Whether to stream the results. + + Raises: + ValueError: If the requested tokens exceed the context window. + RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt. + + Returns: + Response object containing the generated text. + """ + completion_or_chunks = self._create_completion( + prompt=prompt, + suffix=suffix, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + logprobs=logprobs, + echo=echo, + stop=stop, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, + repeat_penalty=repeat_penalty, + top_k=top_k, + stream=stream, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, + ) + if stream: + chunks: Iterator[CompletionChunk] = completion_or_chunks + return chunks + completion: Completion = next(completion_or_chunks) # type: ignore + return completion + + def __call__( + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 128, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, + ) -> Union[Completion, Iterator[CompletionChunk]]: + """Generate text from a prompt. + + Args: + prompt: The prompt to generate text from. + suffix: A suffix to append to the generated text. If None, no suffix is appended. + max_tokens: The maximum number of tokens to generate. + temperature: The temperature to use for sampling. + top_p: The top-p value to use for sampling. + logprobs: The number of logprobs to return. If None, no logprobs are returned. + echo: Whether to echo the prompt. + stop: A list of strings to stop generation when encountered. + repeat_penalty: The penalty to apply to repeated tokens. + top_k: The top-k value to use for sampling. + stream: Whether to stream the results. + + Raises: + ValueError: If the requested tokens exceed the context window. + RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt. + + Returns: + Response object containing the generated text. + """ + return self.create_completion( + prompt=prompt, + suffix=suffix, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + logprobs=logprobs, + echo=echo, + stop=stop, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, + repeat_penalty=repeat_penalty, + top_k=top_k, + stream=stream, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, + ) + + def _convert_text_completion_to_chat( + self, completion: Completion + ) -> ChatCompletion: + return { + "id": "chat" + completion["id"], + "object": "chat.completion", + "created": completion["created"], + "model": completion["model"], + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": completion["choices"][0]["text"], + }, + "finish_reason": completion["choices"][0]["finish_reason"], + } + ], + "usage": completion["usage"], + } + + def _convert_text_completion_chunks_to_chat( + self, + chunks: Iterator[CompletionChunk], + ) -> Iterator[ChatCompletionChunk]: + for i, chunk in enumerate(chunks): + if i == 0: + yield { + "id": "chat" + chunk["id"], + "model": chunk["model"], + "created": chunk["created"], + "object": "chat.completion.chunk", + "choices": [ + { + "index": 0, + "delta": { + "role": "assistant", + }, + "finish_reason": None, + } + ], + } + yield { + "id": "chat" + chunk["id"], + "model": chunk["model"], + "created": chunk["created"], + "object": "chat.completion.chunk", + "choices": [ + { + "index": 0, + "delta": { + "content": chunk["choices"][0]["text"], + }, + "finish_reason": chunk["choices"][0]["finish_reason"], + } + ], + } + + def create_chat_completion( + self, + messages: List[ChatCompletionMessage], + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + max_tokens: int = 256, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[LogitsProcessorList] = None, + ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: + """Generate a chat completion from a list of messages. + + Args: + messages: A list of messages to generate a response for. + temperature: The temperature to use for sampling. + top_p: The top-p value to use for sampling. + top_k: The top-k value to use for sampling. + stream: Whether to stream the results. + stop: A list of strings to stop generation when encountered. + max_tokens: The maximum number of tokens to generate. + repeat_penalty: The penalty to apply to repeated tokens. + + Returns: + Generated chat completion or a stream of chat completion chunks. + """ + stop = ( + stop if isinstance(stop, list) else [stop] if isinstance(stop, str) else [] + ) + chat_history = "".join( + f'### {"Human" if message["role"] == "user" else "Assistant"}:{message["content"]}' + for message in messages + ) + PROMPT = chat_history + "### Assistant:" + PROMPT_STOP = ["### Assistant:", "### Human:"] + completion_or_chunks = self( + prompt=PROMPT, + stop=PROMPT_STOP + stop, + temperature=temperature, + top_p=top_p, + top_k=top_k, + stream=stream, + max_tokens=max_tokens, + repeat_penalty=repeat_penalty, + presence_penalty=presence_penalty, + frequency_penalty=frequency_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + model=model, + logits_processor=logits_processor, + ) + if stream: + chunks: Iterator[CompletionChunk] = completion_or_chunks # type: ignore + return self._convert_text_completion_chunks_to_chat(chunks) + else: + completion: Completion = completion_or_chunks # type: ignore + return self._convert_text_completion_to_chat(completion) + + def __del__(self): + if self.model is not None: + falcon_cpp.falcon_free_model(self.model) + self.model = None + if self.ctx is not None: + falcon_cpp.falcon_free(self.ctx) + self.ctx = None + + def __getstate__(self): + return dict( + verbose=self.verbose, + model_path=self.model_path, + n_ctx=self.params.n_ctx, + n_gpu_layers=self.params.n_gpu_layers, + seed=self.params.seed, + f16_kv=self.params.f16_kv, + logits_all=self.params.logits_all, + vocab_only=self.params.vocab_only, + use_mmap=self.params.use_mmap, + use_mlock=self.params.use_mlock, + embedding=self.params.embedding, + low_vram=self.params.low_vram, + last_n_tokens_size=self.last_n_tokens_size, + n_batch=self.n_batch, + n_threads=self.n_threads, + lora_base=self.lora_base, + lora_path=self.lora_path, + ### DEPRECATED ### + n_parts=self.n_parts, + ### DEPRECATED ### + ) + + def __setstate__(self, state): + self.__init__( + model_path=state["model_path"], + n_ctx=state["n_ctx"], + n_parts=state["n_parts"], + n_gpu_layers=state["n_gpu_layers"], + seed=state["seed"], + f16_kv=state["f16_kv"], + logits_all=state["logits_all"], + vocab_only=state["vocab_only"], + use_mmap=state["use_mmap"], + use_mlock=state["use_mlock"], + embedding=state["embedding"], + low_vram=state["low_vram"], + n_threads=state["n_threads"], + n_batch=state["n_batch"], + last_n_tokens_size=state["last_n_tokens_size"], + lora_base=state["lora_base"], + lora_path=state["lora_path"], + verbose=state["verbose"], + ) + + def save_state(self) -> FalconState: + assert self.ctx is not None + if self.verbose: + print("Falcon.save_state: saving falcon state", file=sys.stderr) + state_size = falcon_cpp.falcon_get_state_size(self.ctx) + if self.verbose: + print(f"Falcon.save_state: got state size: {state_size}", file=sys.stderr) + falcon_state = (falcon_cpp.c_uint8 * int(state_size))() + if self.verbose: + print("Falcon.save_state: allocated state", file=sys.stderr) + n_bytes = falcon_cpp.falcon_copy_state_data(self.ctx, falcon_state) + if self.verbose: + print(f"Falcon.save_state: copied falcon state: {n_bytes}", file=sys.stderr) + if int(n_bytes) > int(state_size): + raise RuntimeError("Failed to copy Falcon state data") + falcon_state_compact = (falcon_cpp.c_uint8 * int(n_bytes))() + falcon_cpp.ctypes.memmove(falcon_state_compact, falcon_state, int(n_bytes)) + if self.verbose: + print( + f"Falcon.save_state: saving {n_bytes} bytes of falcon state", + file=sys.stderr, + ) + return FalconState( + scores=self.scores.copy(), + input_ids=self.input_ids.copy(), + n_tokens=self.n_tokens, + falcon_state=bytes(falcon_state_compact), + falcon_state_size=n_bytes, + ) + + def load_state(self, state: FalconState) -> None: + assert self.ctx is not None + self.scores = state.scores.copy() + self.input_ids = state.input_ids.copy() + self.n_tokens = state.n_tokens + state_size = state.falcon_state_size + FalconStateArrayType = falcon_cpp.c_uint8 * state_size + falcon_state = FalconStateArrayType.from_buffer_copy(state.falcon_state) + + if falcon_cpp.falcon_set_state_data(self.ctx, falcon_state) != state_size: + raise RuntimeError("Failed to set Falcon state data") + + def n_ctx(self) -> int: + """Return the context window size.""" + assert self.ctx is not None + return falcon_cpp.falcon_n_ctx(self.ctx) + + def n_embd(self) -> int: + """Return the embedding size.""" + assert self.ctx is not None + return falcon_cpp.falcon_n_embd(self.ctx) + + def n_vocab(self) -> int: + """Return the vocabulary size.""" + assert self.ctx is not None + return falcon_cpp.falcon_n_vocab(self.ctx) + + def tokenizer(self) -> "FalconTokenizer": + """Return the tokenizer for this model.""" + assert self.ctx is not None + return FalconTokenizer(self) + + @staticmethod + def token_eos() -> int: + """Return the end-of-sequence token.""" + return falcon_cpp.falcon_token_eos() + + @staticmethod + def token_bos() -> int: + """Return the beginning-of-sequence token.""" + return falcon_cpp.falcon_token_bos() + + @staticmethod + def token_nl() -> int: + """Return the newline token.""" + return falcon_cpp.falcon_token_nl() + + @staticmethod + def logits_to_logprobs(logits: List[float]) -> List[float]: + exps = [math.exp(float(x)) for x in logits] + sum_exps = sum(exps) + return [math.log(x / sum_exps) for x in exps] + + @staticmethod + def longest_token_prefix(a: Sequence[int], b: Sequence[int]): + longest_prefix = 0 + for _a, _b in zip(a, b): + if _a == _b: + longest_prefix += 1 + else: + break + return longest_prefix + + +class FalconTokenizer: + def __init__(self, falcon: Falcon): + self.falcon = falcon + + def encode(self, text: str, add_bos: bool = True) -> List[int]: + return self.falcon.tokenize( + text.encode("utf-8", errors="ignore"), add_bos=add_bos + ) + + def decode(self, tokens: List[int]) -> str: + return self.falcon.detokenize(tokens).decode("utf-8", errors="ignore") + + @classmethod + def from_ggml_file(cls, path: str) -> "FalconTokenizer": + return cls(Falcon(model_path=path, vocab_only=True)) diff --git a/falcon_cpp/falcon_cpp.py b/falcon_cpp/falcon_cpp.py new file mode 100644 index 000000000..78297adc4 --- /dev/null +++ b/falcon_cpp/falcon_cpp.py @@ -0,0 +1,1024 @@ +import sys +import os +import ctypes +from ctypes import ( + c_int, + c_float, + c_char_p, + c_void_p, + c_bool, + POINTER, + _Pointer, # type: ignore + Structure, + Array, + c_uint8, + c_size_t, +) +import pathlib +from typing import List, Union + + +# Load the library +def _load_shared_library(lib_base_name: str): + # Construct the paths to the possible shared library names + _base_path = pathlib.Path(__file__).parent.resolve() + # Searching for the library in the current directory under the name "libFalcon" (default name + # for falconcpp) and "falcon" (default name for this repo) + _lib_paths: List[pathlib.Path] = [] + # Determine the file extension based on the platform + if sys.platform.startswith("linux"): + _lib_paths += [ + _base_path / f"lib{lib_base_name}.so", + ] + elif sys.platform == "darwin": + _lib_paths += [ + _base_path / f"lib{lib_base_name}.so", + _base_path / f"lib{lib_base_name}.dylib", + ] + elif sys.platform == "win32": + _lib_paths += [ + _base_path / f"{lib_base_name}.dll", + ] + else: + raise RuntimeError("Unsupported platform") + + if "FALCON_CPP_LIB" in os.environ: + lib_base_name = os.environ["FALCON_CPP_LIB"] + _lib = pathlib.Path(lib_base_name) + _base_path = _lib.parent.resolve() + _lib_paths = [_lib.resolve()] + + cdll_args = dict() # type: ignore + # Add the library directory to the DLL search path on Windows (if needed) + if sys.platform == "win32" and sys.version_info >= (3, 8): + os.add_dll_directory(str(_base_path)) + if "CUDA_PATH" in os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) + cdll_args["winmode"] = 0 + + # Try to load the shared library, handling potential errors + for _lib_path in _lib_paths: + if _lib_path.exists(): + try: + return ctypes.CDLL(str(_lib_path), **cdll_args) + except Exception as e: + raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") + + raise FileNotFoundError( + f"Shared library with base name '{lib_base_name}' not found" + ) + + +# Specify the base name of the shared library to load +_lib_base_name = "llama" + +# Load the library +_lib = _load_shared_library(_lib_base_name) + +# Misc +c_float_p = POINTER(c_float) +c_uint8_p = POINTER(c_uint8) +c_size_t_p = POINTER(c_size_t) + +# falcon.h bindings + +GGML_USE_CUBLAS = hasattr(_lib, "ggml_init_cublas") +GGML_CUDA_MAX_DEVICES = ctypes.c_int(16) +FALCON_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else ctypes.c_int(1) + +# #define FALCON_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' +FALCON_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74) +# #define FALCON_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' +FALCON_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61) +# #define FALCON_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' +FALCON_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66) +# #define FLACON_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' +FALCON_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C) +# #define FALCON_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' +FALCON_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E) + +# #define FALCON_FILE_VERSION 3 +FALCON_FILE_VERSION = c_int(3) +FALCON_FILE_MAGIC = FALCON_FILE_MAGIC_GGJT +FALCON_FILE_MAGIC_UNVERSIONED = FALCON_FILE_MAGIC_GGML +FALCON_SESSION_MAGIC = FALCON_FILE_MAGIC_GGSN +FALCON_SESSION_VERSION = c_int(1) + +# struct falcon_model; +falcon_model_p = c_void_p + +# struct falcon_context; +falcon_context_p = c_void_p + + +# typedef int falcon_token; +falcon_token = c_int +falcon_token_p = POINTER(falcon_token) + + +# typedef struct falcon_token_data { +# falcon_token id; // token id +# float logit; // log-odds of the token +# float p; // probability of the token +# } falcon_token_data; +class falcon_token_data(Structure): + _fields_ = [ + ("id", falcon_token), + ("logit", c_float), + ("p", c_float), + ] + + +falcon_token_data_p = POINTER(falcon_token_data) + + +# typedef struct falcon_token_data_array { +# falcon_token_data * data; +# size_t size; +# bool sorted; +# } falcon_token_data_array; +class falcon_token_data_array(Structure): + _fields_ = [ + ("data", falcon_token_data_p), + ("size", c_size_t), + ("sorted", c_bool), + ] + + +falcon_token_data_array_p = POINTER(falcon_token_data_array) + +# typedef void (*falcon_progress_callback)(float progress, void *ctx); +falcon_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) + + +# struct falcon_context_params { +# int seed; // RNG seed, -1 for random +# int n_ctx; // text context +# int n_batch; // prompt processing batch size +# int n_gpu_layers; // number of layers to store in VRAM +# int main_gpu; // the GPU that is used for scratch and small tensors +# float tensor_split[FALCON_MAX_DEVICES]; // how to split layers across multiple GPUs +# // called with a progress value between 0 and 1, pass NULL to disable +# falcon_progress_callback progress_callback; +# // context pointer passed to the progress callback +# void * progress_callback_user_data; + + +# // Keep the booleans together to avoid misalignment during copy-by-value. +# bool low_vram; // if true, reduce VRAM usage at the cost of performance +# bool f16_kv; // use fp16 for KV cache +# bool logits_all; // the falcon_eval() call computes all logits, not just the last one +# bool vocab_only; // only load the vocabulary, no weights +# bool use_mmap; // use mmap if possible +# bool use_mlock; // force system to keep model in RAM +# bool embedding; // embedding mode only +# }; +class ggllm_context_params(Structure): + _fields_ = [ + ("seed", c_int), + ("n_ctx", c_int), + ("n_batch", c_int), + ("n_gpu_layers", c_int), + ("main_gpu", c_int), + ("tensor_split", c_float * FALCON_MAX_DEVICES.value), + ("progress_callback", falcon_progress_callback), + ("progress_callback_user_data", c_void_p), + ("low_vram", c_bool), + ("f16_kv", c_bool), + ("logits_all", c_bool), + ("vocab_only", c_bool), + ("use_mmap", c_bool), + ("use_mlock", c_bool), + ("embedding", c_bool), + ] + + +falcon_context_params_p = POINTER(ggllm_context_params) + +# enum falcon_ftype { +# FALCON_FTYPE_ALL_F32 = 0, +# FALCON_FTYPE_MOSTLY_F16 = 1, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 +# // FALCON_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed +# // FALCON_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed +# FALCON_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors +# }; +FALCON_FTYPE_ALL_F32 = c_int(0) +FALCON_FTYPE_MOSTLY_F16 = c_int(1) +FALCON_FTYPE_MOSTLY_Q4_0 = c_int(2) +FALCON_FTYPE_MOSTLY_Q4_1 = c_int(3) +FALCON_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4) +FALCON_FTYPE_MOSTLY_Q8_0 = c_int(7) +FALCON_FTYPE_MOSTLY_Q5_0 = c_int(8) +FALCON_FTYPE_MOSTLY_Q5_1 = c_int(9) +FALCON_FTYPE_MOSTLY_Q2_K = c_int(10) +FALCON_FTYPE_MOSTLY_Q3_K_S = c_int(11) +FALCON_FTYPE_MOSTLY_Q3_K_M = c_int(12) +FALCON_FTYPE_MOSTLY_Q3_K_L = c_int(13) +FALCON_FTYPE_MOSTLY_Q4_K_S = c_int(14) +FALCON_FTYPE_MOSTLY_Q4_K_M = c_int(15) +FALCON_FTYPE_MOSTLY_Q5_K_S = c_int(16) +FALCON_FTYPE_MOSTLY_Q5_K_M = c_int(17) +FALCON_FTYPE_MOSTLY_Q6_K = c_int(18) + + +# // model quantization parameters +# typedef struct falcon_model_quantize_params { +# int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() +# enum falcon_ftype ftype; // quantize to this falcon_ftype +# bool allow_requantize; // allow quantizing non-f32/f16 tensors +# bool quantize_output_tensor; // quantize output.weight +# } falcon_model_quantize_params; +class falcon_model_quantize_params(Structure): + _fields_ = [ + ("nthread", c_int), + ("ftype", c_int), + ("allow_requantize", c_bool), + ("quantize_output_tensor", c_bool), + ] + + +# FALCON_API struct falcon_context_params falcon_context_default_params(); +def falcon_context_default_params() -> ggllm_context_params: + return _lib.ggllm_context_default_params() + + +_lib.ggllm_context_default_params.argtypes = [] +_lib.ggllm_context_default_params.restype = ggllm_context_params + + +# FALCON_API struct falcon_model_quantize_params falcon_model_quantize_default_params(); +def falcon_model_quantize_default_params() -> falcon_model_quantize_params: + return _lib.ggllm_model_quantize_default_params() + + +_lib.ggllm_model_quantize_default_params.argtypes = [] +_lib.ggllm_model_quantize_default_params.restype = falcon_model_quantize_params + + +# FALCON_API bool falcon_mmap_supported(); +def falcon_mmap_supported() -> bool: + return _lib.ggllm_mmap_supported() + + +_lib.ggllm_mmap_supported.argtypes = [] +_lib.ggllm_mmap_supported.restype = c_bool + + +# FALCON_API bool falcon_mlock_supported(); +def falcon_mlock_supported() -> bool: + return _lib.ggllm_mlock_supported() + + +_lib.ggllm_mlock_supported.argtypes = [] +_lib.ggllm_mlock_supported.restype = c_bool + + +# // TODO: not great API - very likely to change +# // Initialize the falcon + ggml backend +# // If numa is true, use NUMA optimizations +# // Call once at the start of the program +# FLACON_API void falcon_init_backend(bool numa); +def falcon_init_backend(numa: c_bool): + return _lib.ggllm_init_backend(numa) + + +_lib.ggllm_init_backend.argtypes = [c_bool] +_lib.ggllm_init_backend.restype = None + + +# FALCON_API struct falcon_model * falcon_load_model_from_file( +# const char * path_model, +# struct falcon_context_params params); +def falcon_load_model_from_file( + path_model: bytes, params: ggllm_context_params +) -> falcon_model_p: + return _lib.ggllm_load_model_from_file(path_model, params) + + +_lib.ggllm_load_model_from_file.argtypes = [c_char_p, ggllm_context_params] +_lib.ggllm_load_model_from_file.restype = falcon_model_p + + +# FALCON_API void falcon_free_model(struct falcon_model * model); +def falcon_free_model(model: falcon_model_p): + return _lib.ggllm_free_model(model) + + +_lib.ggllm_free_model.argtypes = [falcon_model_p] +_lib.ggllm_free_model.restype = None + + +# FALCON_API struct falcon_context * falcon_new_context_with_model( +# struct falcon_model * model, +# struct falcon_context_params params); +def falcon_new_context_with_model( + model: falcon_model_p, params: ggllm_context_params +) -> falcon_context_p: + return _lib.ggllm_new_context_with_model(model, params) + + +_lib.ggllm_new_context_with_model.argtypes = [falcon_model_p, ggllm_context_params] +_lib.ggllm_new_context_with_model.restype = falcon_context_p + + +# FALCON_API int64_t ggllm_time_us(); +def ggllm_time_us() -> int: + return _lib.ggllm_time_us() + + +_lib.ggllm_time_us.argtypes = [] +_lib.ggllm_time_us.restype = ctypes.c_int64 + + +# // Various functions for loading a ggml falcon model. +# // Allocate (almost) all memory needed for the model. +# // Return NULL on failure +# FALCON_API struct falcon_context * falcon_init_from_file( +# const char * path_model, +# struct falcon_context_params params); +def ggllm_init_from_file( + path_model: bytes, params: ggllm_context_params +) -> falcon_context_p: + return _lib.ggllm_init_from_file(path_model, params) + + +_lib.ggllm_init_from_file.argtypes = [c_char_p, ggllm_context_params] +_lib.ggllm_init_from_file.restype = falcon_context_p + + +# Frees all allocated memory +# FALCON_API void falcon_free(struct falcon_context * ctx); +def falcon_free(ctx: falcon_context_p): + return _lib.ggllm_free(ctx) + + +_lib.ggllm_free.argtypes = [falcon_context_p] +_lib.ggllm_free.restype = None + + +# // Returns 0 on success +# FALCON_API int ggllm_model_quantize( +# const char * fname_inp, +# const char * fname_out, +# const falcon_model_quantize_params * params); +def ggllm_model_quantize( + fname_inp: bytes, + fname_out: bytes, + params, # type: POINTER(falcon_model_quantize_params) # type: ignore +) -> int: + return _lib.ggllm_model_quantize(fname_inp, fname_out, params) + + +_lib.ggllm_model_quantize.argtypes = [ + c_char_p, + c_char_p, + POINTER(falcon_model_quantize_params), +] +_lib.ggllm_model_quantize.restype = c_int + + +# Apply a LoRA adapter to a loaded model +# path_base_model is the path to a higher quality model to use as a base for +# the layers modified by the adapter. Can be NULL to use the current loaded model. +# The model needs to be reloaded before applying a new adapter, otherwise the adapter +# will be applied on top of the previous one +# Returns 0 on success +# FALCON_API int falcon_apply_lora_from_file( +# struct falcon_context * ctx, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); +def ggllm_apply_lora_from_file( + ctx: falcon_context_p, + path_lora: c_char_p, + path_base_model: c_char_p, + n_threads: c_int, +) -> int: + return _lib.ggllm_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) + + +_lib.ggllm_apply_lora_from_file.argtypes = [falcon_context_p, c_char_p, c_char_p, c_int] +_lib.ggllm_apply_lora_from_file.restype = c_int + + +# FALCON_API int ggllm_model_apply_lora_from_file( +# const struct ggllm_model * model, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); +def falcon_model_apply_lora_from_file( + model: falcon_model_p, + path_lora: Union[c_char_p, bytes], + path_base_model: Union[c_char_p, bytes], + n_threads: c_int, +) -> int: + return _lib.ggllm_model_apply_lora_from_file( + model, path_lora, path_base_model, n_threads + ) + + +_lib.ggllm_model_apply_lora_from_file.argtypes = [ + falcon_model_p, + c_char_p, + c_char_p, + c_int, +] +_lib.ggllm_model_apply_lora_from_file.restype = c_int + + +# Returns the number of tokens in the KV cache +# FALCON_API int falcon_get_kv_cache_token_count(const struct falcon_context * ctx); +def ggllm_get_kv_cache_token_count(ctx: falcon_context_p) -> int: + return _lib.ggllm_get_kv_cache_token_count(ctx) + + +_lib.ggllm_get_kv_cache_token_count.argtypes = [falcon_context_p] +_lib.ggllm_get_kv_cache_token_count.restype = c_int + + +# Sets the current rng seed. +# FALCON_API void falcon_set_rng_seed(struct falcon_context * ctx, int seed); +def falcon_set_rng_seed(ctx: falcon_context_p, seed: c_int): + return _lib.ggllm_set_rng_seed(ctx, seed) + + +_lib.ggllm_set_rng_seed.argtypes = [falcon_context_p, c_int] +_lib.ggllm_set_rng_seed.restype = None + + +# Returns the maximum size in bytes of the state (rng, logits, embedding +# and kv_cache) - will often be smaller after compacting tokens +# FALCON_API size_t falcon_get_state_size(const struct falcon_context * ctx); +def falcon_get_state_size(ctx: falcon_context_p) -> int: + return _lib.ggllm_get_state_size(ctx) + + +_lib.ggllm_get_state_size.argtypes = [falcon_context_p] +_lib.ggllm_get_state_size.restype = c_size_t + + +# Copies the state to the specified destination address. +# Destination needs to have allocated enough memory. +# Returns the number of bytes copied +# FALCON_API size_t falcon_copy_state_data(struct falcon_context * ctx, uint8_t * dst); +def falcon_copy_state_data( + ctx: falcon_context_p, dst # type: Array[c_uint8] +) -> int: + return _lib.ggllm_copy_state_data(ctx, dst) + + +_lib.ggllm_copy_state_data.argtypes = [falcon_context_p, c_uint8_p] +_lib.ggllm_copy_state_data.restype = c_size_t + + +# Set the state reading from the specified address +# Returns the number of bytes read +# FALCON_API size_t falcon_set_state_data(struct falcon_context * ctx, uint8_t * src); +def falcon_set_state_data( + ctx: falcon_context_p, src # type: Array[c_uint8] +) -> int: + return _lib.ggllm_set_state_data(ctx, src) + + +_lib.ggllm_set_state_data.argtypes = [falcon_context_p, c_uint8_p] +_lib.ggllm_set_state_data.restype = c_size_t + + +# Save/load session file +# GGLLM_API bool falcon_load_session_file(struct falcon_context * ctx, const char * path_session, falcon_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); +def ggllm_load_session_file( + ctx: falcon_context_p, + path_session: bytes, + tokens_out, # type: Array[falcon_token] + n_token_capacity: c_size_t, + n_token_count_out, # type: _Pointer[c_size_t] +) -> int: + return _lib.ggllm_load_session_file( + ctx, path_session, tokens_out, n_token_capacity, n_token_count_out + ) + + +_lib.ggllm_load_session_file.argtypes = [ + falcon_context_p, + c_char_p, + falcon_token_p, + c_size_t, + c_size_t_p, +] +_lib.ggllm_load_session_file.restype = c_size_t + + +# FALCON_API bool falcon_save_session_file(struct falcon_context * ctx, const char * path_session, const falcon_token * tokens, size_t n_token_count); +def ggllm_save_session_file( + ctx: falcon_context_p, + path_session: bytes, + tokens, # type: Array[falcon_token] + n_token_count: c_size_t, +) -> int: + return _lib.ggllm_save_session_file(ctx, path_session, tokens, n_token_count) + + +_lib.ggllm_save_session_file.argtypes = [ + falcon_context_p, + c_char_p, + falcon_token_p, + c_size_t, +] +_lib.ggllm_save_session_file.restype = c_size_t + + +# Run the falcon inference to obtain the logits and probabilities for the next token. +# tokens + n_tokens is the provided batch of new tokens to process +# n_past is the number of tokens to use from previous eval calls +# Returns 0 on success +# GGLLM_API int falcon_eval( +# struct falcon_context * ctx, +# const falcon_token * tokens, +# int n_tokens, +# int n_past, +# int n_threads); +def falcon_eval( + ctx: falcon_context_p, + tokens, # type: Array[falcon_token] + n_tokens: c_int, + n_past: c_int, + n_threads: c_int, +) -> int: + return _lib.ggllm_eval(ctx, tokens, n_tokens, n_past, n_threads) + + +_lib.ggllm_eval.argtypes = [falcon_context_p, falcon_token_p, c_int, c_int, c_int] +_lib.ggllm_eval.restype = c_int + + +# // Same as falcon_eval, but use float matrix input directly. +# FALCON_API int falcon_eval_embd( +# struct falcon_context * ctx, +# const float * embd, +# int n_tokens, +# int n_past, +# int n_threads); +def ggllm_eval_embd( + ctx: falcon_context_p, + embd, # type: Array[c_float] + n_tokens: c_int, + n_past: c_int, + n_threads: c_int, +) -> int: + return _lib.ggllm_eval_embd(ctx, embd, n_tokens, n_past, n_threads) + + +_lib.ggllm_eval_embd.argtypes = [falcon_context_p, c_float_p, c_int, c_int, c_int] +_lib.ggllm_eval_embd.restype = c_int + + +# Convert the provided text into tokens. +# The tokens pointer must be large enough to hold the resulting tokens. +# Returns the number of tokens on success, no more than n_max_tokens +# Returns a negative number on failure - the number of tokens that would have been returned +# TODO: not sure if correct +# FALCON_API int ggllm_tokenize( +# struct falcon_context * ctx, +# const char * text, +# falcon_token * tokens, +# int n_max_tokens, +# bool add_bos); +def falcon_tokenize( + ctx: falcon_context_p, + text: bytes, + tokens, # type: Array[falcon_token] + n_max_tokens: c_int, + add_bos: c_bool, +) -> int: + return _lib.ggllm_tokenize(ctx, text, tokens, n_max_tokens, add_bos) + + +_lib.ggllm_tokenize.argtypes = [falcon_context_p, c_char_p, falcon_token_p, c_int, c_bool] +_lib.ggllm_tokenize.restype = c_int + + +# GGLLM_API int ggllm_n_vocab(const struct falcon_context * ctx); +def falcon_n_vocab(ctx: falcon_context_p) -> int: + return _lib.ggllm_n_vocab(ctx) + + +_lib.ggllm_n_vocab.argtypes = [falcon_context_p] +_lib.ggllm_n_vocab.restype = c_int + + +# FALCON_API int falcon_n_ctx (const struct falcon_context * ctx); +def falcon_n_ctx(ctx: falcon_context_p) -> int: + return _lib.ggllm_n_ctx(ctx) + + +_lib.ggllm_n_ctx.argtypes = [falcon_context_p] +_lib.ggllm_n_ctx.restype = c_int + + +# FALCON_API int falcon_n_embd (const struct falcon_context * ctx); +def falcon_n_embd(ctx: falcon_context_p) -> int: + return _lib.ggllm_n_embd(ctx) + + +_lib.ggllm_n_embd.argtypes = [falcon_context_p] +_lib.ggllm_n_embd.restype = c_int + + +# // Get the vocabulary as output parameters. +# // Returns number of results. +# FALCON_API int falcon_get_vocab( +# const struct falcon_context * ctx, +# const char * * strings, +# float * scores, +# int capacity); +def falcon_get_vocab( + ctx: falcon_context_p, + strings, # type: Array[c_char_p] # type: ignore + scores, # type: Array[c_float] # type: ignore + capacity: c_int, +) -> int: + return _lib.ggllm_get_vocab(ctx, strings, scores, capacity) + + +_lib.ggllm_get_vocab.argtypes = [falcon_context_p, c_char_p, c_float, c_int] +_lib.ggllm_get_vocab.restype = c_int + + +# Token logits obtained from the last call to falcon_eval() +# The logits for the last token are stored in the last row +# Can be mutated in order to change the probabilities of the next token +# Rows: n_tokens +# Cols: n_vocab +# FALCON_API float * falcon_get_logits(struct falcon_context * ctx); +def falcon_get_logits( + ctx: falcon_context_p, +): # type: (...) -> Array[float] # type: ignore + return _lib.ggllm_get_logits(ctx) + + +_lib.ggllm_get_logits.argtypes = [falcon_context_p] +_lib.ggllm_get_logits.restype = c_float_p + + +# Get the embeddings for the input +# shape: [n_embd] (1-dimensional) +# FALCON_API float * falcon_get_embeddings(struct falcon_context * ctx); +def falcon_get_embeddings( + ctx: falcon_context_p, +): # type: (...) -> Array[float] # type: ignore + return _lib.ggllm_get_embeddings(ctx) + + +_lib.ggllm_get_embeddings.argtypes = [falcon_context_p] +_lib.ggllm_get_embeddings.restype = c_float_p + + +# Token Id -> String. Uses the vocabulary in the provided context +# FLACON_API const char * falcon_token_to_str(const struct falcon_context * ctx, falcon_token token); +def falcon_token_to_str(ctx: falcon_context_p, token: falcon_token) -> bytes: + return _lib.ggllm_token_to_str(ctx, token) + + +_lib.ggllm_token_to_str.argtypes = [falcon_context_p, falcon_token] +_lib.ggllm_token_to_str.restype = c_char_p + +# Special tokens + + +# FALCON_API falcon_token falcon_token_bos(); // beginning-of-sentence +def falcon_token_bos() -> int: + return _lib.ggllm_token_bos() + + +_lib.ggllm_token_bos.argtypes = [] +_lib.ggllm_token_bos.restype = falcon_token + + +# FALCON_API falcon_token falcon_token_eos(); // end-of-sentence +def falcon_token_eos() -> int: + return _lib.ggllm_token_eos() + + +_lib.ggllm_token_eos.argtypes = [] +_lib.ggllm_token_eos.restype = falcon_token + + +# FALCON_API falcon_token falcon_token_nl(); // next-line +def falcon_token_nl() -> int: + return _lib.ggllm_token_nl() + + +_lib.ggllm_token_nl.argtypes = [] +_lib.ggllm_token_nl.restype = falcon_token + + +# Sampling functions + + +# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. +# FALCON_API void falcon_sample_repetition_penalty(struct falcon_context * ctx, falcon_token_data_array * candidates, const falcon_token * last_tokens, size_t last_tokens_size, float penalty); +def falcon_sample_repetition_penalty( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + last_tokens_data, # type: Array[falcon_token] + last_tokens_size: c_int, + penalty: c_float, +): + return _lib.ggllm_sample_repetition_penalty( + ctx, candidates, last_tokens_data, last_tokens_size, penalty + ) + + +_lib.ggllm_sample_repetition_penalty.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + falcon_token_p, + c_int, + c_float, +] +_lib.ggllm_sample_repetition_penalty.restype = None + + +# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. +# FALCON_API void falcon_sample_frequency_and_presence_penalties(struct falcon_context * ctx, falcon_token_data_array * candidates, const falcon_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); +def falcon_sample_frequency_and_presence_penalties( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + last_tokens_data, # type: Array[falcon_token] + last_tokens_size: c_int, + alpha_frequency: c_float, + alpha_presence: c_float, +): + return _lib.ggllm_sample_frequency_and_presence_penalties( + ctx, + candidates, + last_tokens_data, + last_tokens_size, + alpha_frequency, + alpha_presence, + ) + + +_lib.ggllm_sample_frequency_and_presence_penalties.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + falcon_token_p, + c_int, + c_float, + c_float, +] +_lib.ggllm_sample_frequency_and_presence_penalties.restype = None + + +# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. +# FALCON_API void falcon_sample_softmax(struct falcon_context * ctx, falcon_token_data_array * candidates); +def falcon_sample_softmax( + ctx: falcon_context_p, candidates # type: _Pointer[falcon_token_data] +): + return _lib.ggllm_sample_softmax(ctx, candidates) + + +_lib.ggllm_sample_softmax.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, +] +_lib.ggllm_sample_softmax.restype = None + + +# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# FALCON_API void falcon_sample_top_k(struct falcon_context * ctx, falcon_token_data_array * candidates, int k, size_t min_keep); +def falcon_sample_top_k( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + k: c_int, + min_keep: c_size_t, +): + return _lib.ggllm_sample_top_k(ctx, candidates, k, min_keep) + + +_lib.ggllm_sample_top_k.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_int, + c_size_t, +] +_lib.ggllm_sample_top_k.restype = None + + +# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# FALCON_API void falcon_sample_top_p(struct falcon_context * ctx, falcon_token_data_array * candidates, float p, size_t min_keep); +def falcon_sample_top_p( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + p: c_float, + min_keep: c_size_t, +): + return _lib.ggllm_sample_top_p(ctx, candidates, p, min_keep) + + +_lib.ggllm_sample_top_p.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_size_t, +] +_lib.ggllm_sample_top_p.restype = None + + +# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. +# FALCON_API void falcon_sample_tail_free(struct falcon_context * ctx, falcon_token_data_array * candidates, float z, size_t min_keep); +def falcon_sample_tail_free( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + z: c_float, + min_keep: c_size_t, +): + return _lib.ggllm_sample_tail_free(ctx, candidates, z, min_keep) + + +_lib.ggllm_sample_tail_free.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_size_t, +] +_lib.ggllm_sample_tail_free.restype = None + + +# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. +# FALCON_API void falcon_sample_typical(struct falcon_context * ctx, falcon_token_data_array * candidates, float p, size_t min_keep); +def falcon_sample_typical( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + p: c_float, + min_keep: c_size_t, +): + return _lib.ggllm_sample_typical(ctx, candidates, p, min_keep) + + +_lib.ggllm_sample_typical.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_size_t, +] +_lib.ggllm_sample_typical.restype = None + + +# FALCON_API void falcon_sample_temperature(struct falcon_context * ctx, falcon_token_data_array * candidates, float temp); +def falcon_sample_temperature( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + temp: c_float, +): + return _lib.ggllm_sample_temperature(ctx, candidates, temp) + + +_lib.ggllm_sample_temperature.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, +] +_lib.ggllm_sample_temperature.restype = None + + +# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `falcon_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. +# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# FALCON_API falcon_token falcon_sample_token_mirostat(struct falcon_context * ctx, falcon_token_data_array * candidates, float tau, float eta, int m, float * mu); +def falcon_sample_token_mirostat( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + tau: c_float, + eta: c_float, + m: c_int, + mu, # type: _Pointer[c_float] +) -> int: + return _lib.ggllm_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) + + +_lib.ggllm_sample_token_mirostat.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_float, + c_int, + c_float_p, +] +_lib.ggllm_sample_token_mirostat.restype = falcon_token + + +# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `falcon_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. +# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# FALCON_API falcon_token falcon_sample_token_mirostat_v2(struct falcon_context * ctx, falcon_token_data_array * candidates, float tau, float eta, float * mu); +def falcon_sample_token_mirostat_v2( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + tau: c_float, + eta: c_float, + mu, # type: _Pointer[c_float] +) -> int: + return _lib.ggllm_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) + + +_lib.ggllm_sample_token_mirostat_v2.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_float, + c_float_p, +] +_lib.ggllm_sample_token_mirostat_v2.restype = falcon_token + + +# @details Selects the token with the highest probability. +# FALCON_API falcon_token falcon_sample_token_greedy(struct falcon_context * ctx, falcon_token_data_array * candidates); +def falcon_sample_token_greedy( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] +) -> int: + return _lib.ggllm_sample_token_greedy(ctx, candidates) + + +_lib.ggllm_sample_token_greedy.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, +] +_lib.ggllm_sample_token_greedy.restype = falcon_token + + +# @details Randomly selects a token from the candidates based on their probabilities. +# FALCON_API falcon_token falcon_sample_token(struct falcon_context * ctx, falcon_token_data_array * candidates); +def falcon_sample_token( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] +) -> int: + return _lib.ggllm_sample_token(ctx, candidates) + + +_lib.ggllm_sample_token.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, +] +_lib.ggllm_sample_token.restype = falcon_token + + +# Performance information + + +# FALCON_API void falcon_print_timings(struct falcon_context * ctx); +def falcon_print_timings(ctx: falcon_context_p): + _lib.ggllm_print_timings(ctx) + + +_lib.ggllm_print_timings.argtypes = [falcon_context_p] +_lib.ggllm_print_timings.restype = None + + +# FALCON_API void falcon_reset_timings(struct falcon_context * ctx); +def falcon_reset_timings(ctx: falcon_context_p): + _lib.ggllm_reset_timings(ctx) + + +_lib.ggllm_reset_timings.argtypes = [falcon_context_p] +_lib.ggllm_reset_timings.restype = None + + +# Print system information +# FALCON_API const char * falcon_print_system_info(void); +def falcon_print_system_info() -> bytes: + return _lib.ggllm_print_system_info() + + +_lib.ggllm_print_system_info.argtypes = [] +_lib.ggllm_print_system_info.restype = c_char_p + +################################################################################################### + + +_falcon_initialized = False + +if not _falcon_initialized: + falcon_init_backend(c_bool(False)) + _falcon_initialized = True diff --git a/falcon_cpp/falcon_types.py b/falcon_cpp/falcon_types.py new file mode 100644 index 000000000..7729ced5a --- /dev/null +++ b/falcon_cpp/falcon_types.py @@ -0,0 +1,97 @@ +from typing import List, Optional, Dict +from typing_extensions import TypedDict, NotRequired, Literal + + +class EmbeddingUsage(TypedDict): + prompt_tokens: int + total_tokens: int + + +class EmbeddingData(TypedDict): + index: int + object: str + embedding: List[float] + + +class Embedding(TypedDict): + object: Literal["list"] + model: str + data: List[EmbeddingData] + usage: EmbeddingUsage + + +class CompletionLogprobs(TypedDict): + text_offset: List[int] + token_logprobs: List[Optional[float]] + tokens: List[str] + top_logprobs: List[Optional[Dict[str, float]]] + + +class CompletionChoice(TypedDict): + text: str + index: int + logprobs: Optional[CompletionLogprobs] + finish_reason: Optional[str] + + +class CompletionUsage(TypedDict): + prompt_tokens: int + completion_tokens: int + total_tokens: int + + +class CompletionChunk(TypedDict): + id: str + object: Literal["text_completion"] + created: int + model: str + choices: List[CompletionChoice] + + +class Completion(TypedDict): + id: str + object: Literal["text_completion"] + created: int + model: str + choices: List[CompletionChoice] + usage: CompletionUsage + + +class ChatCompletionMessage(TypedDict): + role: Literal["assistant", "user", "system"] + content: str + user: NotRequired[str] + + +class ChatCompletionChoice(TypedDict): + index: int + message: ChatCompletionMessage + finish_reason: Optional[str] + + +class ChatCompletion(TypedDict): + id: str + object: Literal["chat.completion"] + created: int + model: str + choices: List[ChatCompletionChoice] + usage: CompletionUsage + + +class ChatCompletionChunkDelta(TypedDict): + role: NotRequired[Literal["assistant"]] + content: NotRequired[str] + + +class ChatCompletionChunkChoice(TypedDict): + index: int + delta: ChatCompletionChunkDelta + finish_reason: Optional[str] + + +class ChatCompletionChunk(TypedDict): + id: str + model: str + object: Literal["chat.completion.chunk"] + created: int + choices: List[ChatCompletionChunkChoice] diff --git a/falcon_cpp/server/__init__.py b/falcon_cpp/server/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/falcon_cpp/server/__main__.py b/falcon_cpp/server/__main__.py new file mode 100644 index 000000000..748a2af33 --- /dev/null +++ b/falcon_cpp/server/__main__.py @@ -0,0 +1,50 @@ +"""Example FastAPI server for llama.cpp. + +To run this example: + +```bash +pip install fastapi uvicorn sse-starlette +export MODEL=../models/7B/... +``` + +Then run: +``` +uvicorn llama_cpp.server.app:app --reload +``` + +or + +``` +python3 -m llama_cpp.server +``` + +Then visit http://localhost:8000/docs to see the interactive API docs. + +""" +import os +import argparse + +import uvicorn + +from llama_cpp.server.app import create_app, Settings + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + for name, field in Settings.__fields__.items(): + description = field.field_info.description + if field.default is not None and description is not None: + description += f" (default: {field.default})" + parser.add_argument( + f"--{name}", + dest=name, + type=field.type_, + help=description, + ) + + args = parser.parse_args() + settings = Settings(**{k: v for k, v in vars(args).items() if v is not None}) + app = create_app(settings=settings) + + uvicorn.run( + app, host=os.getenv("HOST", settings.host), port=int(os.getenv("PORT", settings.port)) + ) diff --git a/falcon_cpp/server/app.py b/falcon_cpp/server/app.py new file mode 100644 index 000000000..2e0972ea6 --- /dev/null +++ b/falcon_cpp/server/app.py @@ -0,0 +1,550 @@ +import json +import multiprocessing +from threading import Lock +from functools import partial +from typing import Iterator, List, Optional, Union, Dict +from typing_extensions import TypedDict, Literal + +import falcon_cpp + +import anyio +from anyio.streams.memory import MemoryObjectSendStream +from starlette.concurrency import run_in_threadpool, iterate_in_threadpool +from fastapi import Depends, FastAPI, APIRouter, Request +from fastapi.middleware.cors import CORSMiddleware +from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict +from sse_starlette.sse import EventSourceResponse + + +class Settings(BaseSettings): + model: str = Field( + description="The path to the model to use for generating completions." + ) + model_alias: Optional[str] = Field( + default=None, + description="The alias of the model to use for generating completions.", + ) + n_ctx: int = Field(default=8192, ge=1, description="The context size.") + n_gpu_layers: int = Field( + default=0, + ge=0, + description="The number of layers to put on the GPU. The rest will be on the CPU.", + ) + seed: int = Field( + default=1337, description="Random seed. -1 for random." + ) + n_batch: int = Field( + default=512, ge=1, description="The batch size to use per eval." + ) + n_threads: int = Field( + default=max(multiprocessing.cpu_count() // 2, 1), + ge=1, + description="The number of threads to use.", + ) + f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") + use_mlock: bool = Field( + default=falcon_cpp.falcon_mlock_supported(), + description="Use mlock.", + ) + use_mmap: bool = Field( + default=falcon_cpp.falcon_mmap_supported(), + description="Use mmap.", + ) + embedding: bool = Field(default=True, description="Whether to use embeddings.") + low_vram: bool = Field( + default=False, + description="Whether to use less VRAM. This will reduce performance.", + ) + last_n_tokens_size: int = Field( + default=64, + ge=0, + description="Last n tokens to keep for repeat penalty calculation.", + ) + logits_all: bool = Field(default=True, description="Whether to return logits.") + cache: bool = Field( + default=False, + description="Use a cache to reduce processing times for evaluated prompts.", + ) + cache_type: Literal["ram", "disk"] = Field( + default="ram", + description="The type of cache to use. Only used if cache is True.", + ) + cache_size: int = Field( + default=2 << 30, + description="The size of the cache in bytes. Only used if cache is True.", + ) + vocab_only: bool = Field( + default=False, description="Whether to only return the vocabulary." + ) + verbose: bool = Field( + default=True, description="Whether to print debug information." + ) + host: str = Field( + default="localhost", description="Listen address" + ) + port: int = Field( + default=8000, description="Listen port" + ) + + +router = APIRouter() + +settings: Optional[Settings] = None +falcon: Optional[falcon_cpp.falcon] = None + + +def create_app(settings: Optional[Settings] = None): + if settings is None: + settings = Settings() + app = FastAPI( + title="🦙 falcon.cpp Python API", + version="0.0.1", + ) + app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], + ) + app.include_router(router) + global falcon + falcon = falcon_cpp.Falcon( + model_path=settings.model, + n_gpu_layers=settings.n_gpu_layers, + seed=settings.seed, + f16_kv=settings.f16_kv, + use_mlock=settings.use_mlock, + use_mmap=settings.use_mmap, + embedding=settings.embedding, + logits_all=settings.logits_all, + n_threads=settings.n_threads, + n_batch=settings.n_batch, + n_ctx=settings.n_ctx, + last_n_tokens_size=settings.last_n_tokens_size, + vocab_only=settings.vocab_only, + verbose=settings.verbose, + ) + if settings.cache: + if settings.cache_type == "disk": + if settings.verbose: + print(f"Using disk cache with size {settings.cache_size}") + cache = falcon_cpp.FalconDiskCache(capacity_bytes=settings.cache_size) + else: + if settings.verbose: + print(f"Using ram cache with size {settings.cache_size}") + cache = falcon_cpp.FalconRAMCache(capacity_bytes=settings.cache_size) + + cache = falcon_cpp.FalconCache(capacity_bytes=settings.cache_size) + falcon.set_cache(cache) + + def set_settings(_settings: Settings): + global settings + settings = _settings + + set_settings(settings) + return app + + +falcon_lock = Lock() + + +def get_falcon(): + with falcon_lock: + yield falcon + + +def get_settings(): + yield settings + + +model_field = Field(description="The model to use for generating completions.") + +max_tokens_field = Field( + default=16, ge=1, le=2048, description="The maximum number of tokens to generate." +) + +temperature_field = Field( + default=0.8, + ge=0.0, + le=2.0, + description="Adjust the randomness of the generated text.\n\n" + + "Temperature is a hyperparameter that controls the randomness of the generated text. It affects the probability distribution of the model's output tokens. A higher temperature (e.g., 1.5) makes the output more random and creative, while a lower temperature (e.g., 0.5) makes the output more focused, deterministic, and conservative. The default value is 0.8, which provides a balance between randomness and determinism. At the extreme, a temperature of 0 will always pick the most likely next token, leading to identical outputs in each run.", +) + +top_p_field = Field( + default=0.95, + ge=0.0, + le=1.0, + description="Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P.\n\n" + + "Top-p sampling, also known as nucleus sampling, is another text generation method that selects the next token from a subset of tokens that together have a cumulative probability of at least p. This method provides a balance between diversity and quality by considering both the probabilities of tokens and the number of tokens to sample from. A higher value for top_p (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text.", +) + +stop_field = Field( + default=None, + description="A list of tokens at which to stop generation. If None, no stop tokens are used.", +) + +stream_field = Field( + default=False, + description="Whether to stream the results as they are generated. Useful for chatbots.", +) + +top_k_field = Field( + default=40, + ge=0, + description="Limit the next token selection to the K most probable tokens.\n\n" + + "Top-k sampling is a text generation method that selects the next token only from the top k most likely tokens predicted by the model. It helps reduce the risk of generating low-probability or nonsensical tokens, but it may also limit the diversity of the output. A higher value for top_k (e.g., 100) will consider more tokens and lead to more diverse text, while a lower value (e.g., 10) will focus on the most probable tokens and generate more conservative text.", +) + +repeat_penalty_field = Field( + default=1.1, + ge=0.0, + description="A penalty applied to each token that is already generated. This helps prevent the model from repeating itself.\n\n" + + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.", +) + +presence_penalty_field = Field( + default=0.0, + ge=-2.0, + le=2.0, + description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.", +) + +frequency_penalty_field = Field( + default=0.0, + ge=-2.0, + le=2.0, + description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.", +) + +mirostat_mode_field = Field( + default=0, + ge=0, + le=2, + description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)" +) + +mirostat_tau_field = Field( + default=5.0, + ge=0.0, + le=10.0, + description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text" +) + +mirostat_eta_field = Field( + default=0.1, + ge=0.001, + le=1.0, + description="Mirostat learning rate" +) + + +class CreateCompletionRequest(BaseModel): + prompt: Union[str, List[str]] = Field( + default="", description="The prompt to generate completions for." + ) + suffix: Optional[str] = Field( + default=None, + description="A suffix to append to the generated text. If None, no suffix is appended. Useful for chatbots.", + ) + max_tokens: int = max_tokens_field + temperature: float = temperature_field + top_p: float = top_p_field + mirostat_mode: int = mirostat_mode_field + mirostat_tau: float = mirostat_tau_field + mirostat_eta: float = mirostat_eta_field + echo: bool = Field( + default=False, + description="Whether to echo the prompt in the generated text. Useful for chatbots.", + ) + stop: Optional[Union[str, List[str]]] = stop_field + stream: bool = stream_field + logprobs: Optional[int] = Field( + default=None, + ge=0, + description="The number of logprobs to generate. If None, no logprobs are generated.", + ) + presence_penalty: Optional[float] = presence_penalty_field + frequency_penalty: Optional[float] = frequency_penalty_field + logit_bias: Optional[Dict[str, float]] = Field(None) + logprobs: Optional[int] = Field(None) + + # ignored or currently unsupported + model: Optional[str] = model_field + n: Optional[int] = 1 + best_of: Optional[int] = 1 + user: Optional[str] = Field(None) + + # falcon.cpp specific parameters + top_k: int = top_k_field + repeat_penalty: float = repeat_penalty_field + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + + class Config: + schema_extra = { + "example": { + "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", + "stop": ["\n", "###"], + } + } + + +CreateCompletionResponse = create_model_from_typeddict(falcon_cpp.Completion) + + +def make_logit_bias_processor( + falcon: falcon_cpp.Falcon, + logit_bias: Dict[str, float], + logit_bias_type: Optional[Literal["input_ids", "tokens"]], +): + if logit_bias_type is None: + logit_bias_type = "input_ids" + + to_bias: Dict[int, float] = {} + if logit_bias_type == "input_ids": + for input_id, score in logit_bias.items(): + input_id = int(input_id) + to_bias[input_id] = score + + elif logit_bias_type == "tokens": + for token, score in logit_bias.items(): + token = token.encode('utf-8') + for input_id in falcon.tokenize(token, add_bos=False): + to_bias[input_id] = score + + def logit_bias_processor( + input_ids: List[int], + scores: List[float], + ) -> List[float]: + new_scores = [None] * len(scores) + for input_id, score in enumerate(scores): + new_scores[input_id] = score + to_bias.get(input_id, 0.0) + + return new_scores + + return logit_bias_processor + + +@router.post( + "/v1/completions", + response_model=CreateCompletionResponse, +) +async def create_completion( + request: Request, + body: CreateCompletionRequest, + falcon: falcon_cpp.Falcon = Depends(get_falcon), +): + if isinstance(body.prompt, list): + assert len(body.prompt) <= 1 + body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" + + exclude = { + "n", + "best_of", + "logit_bias", + "logit_bias_type", + "user", + } + kwargs = body.dict(exclude=exclude) + + if body.logit_bias is not None: + kwargs['logits_processor'] = falcon_cpp.LogitsProcessorList([ + make_logit_bias_processor(falcon, body.logit_bias, body.logit_bias_type), + ]) + + if body.stream: + send_chan, recv_chan = anyio.create_memory_object_stream(10) + + async def event_publisher(inner_send_chan: MemoryObjectSendStream): + async with inner_send_chan: + try: + iterator: Iterator[falcon_cpp.CompletionChunk] = await run_in_threadpool(falcon, **kwargs) # type: ignore + async for chunk in iterate_in_threadpool(iterator): + await inner_send_chan.send(dict(data=json.dumps(chunk))) + if await request.is_disconnected(): + raise anyio.get_cancelled_exc_class()() + await inner_send_chan.send(dict(data="[DONE]")) + except anyio.get_cancelled_exc_class() as e: + print("disconnected") + with anyio.move_on_after(1, shield=True): + print( + f"Disconnected from client (via refresh/close) {request.client}" + ) + await inner_send_chan.send(dict(closing=True)) + raise e + + return EventSourceResponse( + recv_chan, data_sender_callable=partial(event_publisher, send_chan) + ) + else: + completion: falcon_cpp.Completion = await run_in_threadpool(falcon, **kwargs) # type: ignore + return completion + + +class CreateEmbeddingRequest(BaseModel): + model: Optional[str] = model_field + input: Union[str, List[str]] = Field(description="The input to embed.") + user: Optional[str] + + class Config: + schema_extra = { + "example": { + "input": "The food was delicious and the waiter...", + } + } + + +CreateEmbeddingResponse = create_model_from_typeddict(falcon_cpp.Embedding) + + +@router.post( + "/v1/embeddings", + response_model=CreateEmbeddingResponse, +) +async def create_embedding( + request: CreateEmbeddingRequest, falcon: falcon_cpp.Falcon = Depends(get_falcon) +): + return await run_in_threadpool( + falcon.create_embedding, **request.dict(exclude={"user"}) + ) + + +class ChatCompletionRequestMessage(BaseModel): + role: Literal["system", "user", "assistant"] = Field( + default="user", description="The role of the message." + ) + content: str = Field(default="", description="The content of the message.") + + +class CreateChatCompletionRequest(BaseModel): + messages: List[ChatCompletionRequestMessage] = Field( + default=[], description="A list of messages to generate completions for." + ) + max_tokens: int = max_tokens_field + temperature: float = temperature_field + top_p: float = top_p_field + mirostat_mode: int = mirostat_mode_field + mirostat_tau: float = mirostat_tau_field + mirostat_eta: float = mirostat_eta_field + stop: Optional[List[str]] = stop_field + stream: bool = stream_field + presence_penalty: Optional[float] = presence_penalty_field + frequency_penalty: Optional[float] = frequency_penalty_field + logit_bias: Optional[Dict[str, float]] = Field(None) + + # ignored or currently unsupported + model: Optional[str] = model_field + n: Optional[int] = 1 + user: Optional[str] = Field(None) + + # falcon.cpp specific parameters + top_k: int = top_k_field + repeat_penalty: float = repeat_penalty_field + logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) + + class Config: + schema_extra = { + "example": { + "messages": [ + ChatCompletionRequestMessage( + role="system", content="You are a helpful assistant." + ), + ChatCompletionRequestMessage( + role="user", content="What is the capital of France?" + ), + ] + } + } + + +CreateChatCompletionResponse = create_model_from_typeddict(falcon_cpp.ChatCompletion) + + +@router.post( + "/v1/chat/completions", + response_model=CreateChatCompletionResponse, +) +async def create_chat_completion( + request: Request, + body: CreateChatCompletionRequest, + falcon: falcon_cpp.Falcon = Depends(get_falcon), +) -> Union[falcon_cpp.ChatCompletion, EventSourceResponse]: + exclude = { + "n", + "logit_bias", + "logit_bias_type", + "user", + } + kwargs = body.dict(exclude=exclude) + + if body.logit_bias is not None: + kwargs['logits_processor'] = falcon_cpp.LogitsProcessorList([ + make_logit_bias_processor(falcon, body.logit_bias, body.logit_bias_type), + ]) + + if body.stream: + send_chan, recv_chan = anyio.create_memory_object_stream(10) + + async def event_publisher(inner_send_chan: MemoryObjectSendStream): + async with inner_send_chan: + try: + iterator: Iterator[falcon_cpp.ChatCompletionChunk] = await run_in_threadpool(falcon.create_chat_completion, **kwargs) # type: ignore + async for chat_chunk in iterate_in_threadpool(iterator): + await inner_send_chan.send(dict(data=json.dumps(chat_chunk))) + if await request.is_disconnected(): + raise anyio.get_cancelled_exc_class()() + await inner_send_chan.send(dict(data="[DONE]")) + except anyio.get_cancelled_exc_class() as e: + print("disconnected") + with anyio.move_on_after(1, shield=True): + print( + f"Disconnected from client (via refresh/close) {request.client}" + ) + await inner_send_chan.send(dict(closing=True)) + raise e + + return EventSourceResponse( + recv_chan, + data_sender_callable=partial(event_publisher, send_chan), + ) + else: + completion: falcon_cpp.ChatCompletion = await run_in_threadpool( + falcon.create_chat_completion, **kwargs # type: ignore + ) + return completion + + +class ModelData(TypedDict): + id: str + object: Literal["model"] + owned_by: str + permissions: List[str] + + +class ModelList(TypedDict): + object: Literal["list"] + data: List[ModelData] + + +GetModelResponse = create_model_from_typeddict(ModelList) + + +@router.get("/v1/models", response_model=GetModelResponse) +async def get_models( + settings: Settings = Depends(get_settings), + falcon: falcon_cpp.Falcon = Depends(get_falcon), +) -> ModelList: + return { + "object": "list", + "data": [ + { + "id": settings.model_alias + if settings.model_alias is not None + else falcon.model_path, + "object": "model", + "owned_by": "me", + "permissions": [], + } + ], + } diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 000000000..e4147790b --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,21 @@ +site_name: falcon-cpp-python +repo_url: https://github.com/sirajperson/falcon-cpp-python + +theme: + name: "material" + +plugins: + - mkdocstrings + - search + +watch: + - falcon_cpp + +markdown_extensions: + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.superfences \ No newline at end of file diff --git a/poetry.lock b/poetry.lock new file mode 100644 index 000000000..8b86d0e63 --- /dev/null +++ b/poetry.lock @@ -0,0 +1,1636 @@ +# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. + +[[package]] +name = "anyio" +version = "3.6.2" +description = "High level compatibility layer for multiple asynchronous event loop implementations" +optional = false +python-versions = ">=3.6.2" +files = [ + {file = "anyio-3.6.2-py3-none-any.whl", hash = "sha256:fbbe32bd270d2a2ef3ed1c5d45041250284e31fc0a4df4a5a6071842051a51e3"}, + {file = "anyio-3.6.2.tar.gz", hash = "sha256:25ea0d673ae30af41a0c442f81cf3b38c7e79fdc7b60335a4c14e05eb0947421"}, +] + +[package.dependencies] +idna = ">=2.8" +sniffio = ">=1.1" + +[package.extras] +doc = ["packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = ["contextlib2", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (<0.15)", "uvloop (>=0.15)"] +trio = ["trio (>=0.16,<0.22)"] + +[[package]] +name = "black" +version = "23.3.0" +description = "The uncompromising code formatter." +optional = false +python-versions = ">=3.7" +files = [ + {file = "black-23.3.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:0945e13506be58bf7db93ee5853243eb368ace1c08a24c65ce108986eac65915"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:67de8d0c209eb5b330cce2469503de11bca4085880d62f1628bd9972cc3366b9"}, + {file = "black-23.3.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:7c3eb7cea23904399866c55826b31c1f55bbcd3890ce22ff70466b907b6775c2"}, + {file = "black-23.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:32daa9783106c28815d05b724238e30718f34155653d4d6e125dc7daec8e260c"}, + {file = "black-23.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:35d1381d7a22cc5b2be2f72c7dfdae4072a3336060635718cc7e1ede24221d6c"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:a8a968125d0a6a404842fa1bf0b349a568634f856aa08ffaff40ae0dfa52e7c6"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c7ab5790333c448903c4b721b59c0d80b11fe5e9803d8703e84dcb8da56fec1b"}, + {file = "black-23.3.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:a6f6886c9869d4daae2d1715ce34a19bbc4b95006d20ed785ca00fa03cba312d"}, + {file = "black-23.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f3c333ea1dd6771b2d3777482429864f8e258899f6ff05826c3a4fcc5ce3f70"}, + {file = "black-23.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:11c410f71b876f961d1de77b9699ad19f939094c3a677323f43d7a29855fe326"}, + {file = "black-23.3.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:1d06691f1eb8de91cd1b322f21e3bfc9efe0c7ca1f0e1eb1db44ea367dff656b"}, + {file = "black-23.3.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:50cb33cac881766a5cd9913e10ff75b1e8eb71babf4c7104f2e9c52da1fb7de2"}, + {file = "black-23.3.0-cp37-cp37m-win_amd64.whl", hash = "sha256:e114420bf26b90d4b9daa597351337762b63039752bdf72bf361364c1aa05925"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:48f9d345675bb7fbc3dd85821b12487e1b9a75242028adad0333ce36ed2a6d27"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:714290490c18fb0126baa0fca0a54ee795f7502b44177e1ce7624ba1c00f2331"}, + {file = "black-23.3.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:064101748afa12ad2291c2b91c960be28b817c0c7eaa35bec09cc63aa56493c5"}, + {file = "black-23.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:562bd3a70495facf56814293149e51aa1be9931567474993c7942ff7d3533961"}, + {file = "black-23.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:e198cf27888ad6f4ff331ca1c48ffc038848ea9f031a3b40ba36aced7e22f2c8"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:3238f2aacf827d18d26db07524e44741233ae09a584273aa059066d644ca7b30"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:f0bd2f4a58d6666500542b26354978218a9babcdc972722f4bf90779524515f3"}, + {file = "black-23.3.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:92c543f6854c28a3c7f39f4d9b7694f9a6eb9d3c5e2ece488c327b6e7ea9b266"}, + {file = "black-23.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a150542a204124ed00683f0db1f5cf1c2aaaa9cc3495b7a3b5976fb136090ab"}, + {file = "black-23.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:6b39abdfb402002b8a7d030ccc85cf5afff64ee90fa4c5aebc531e3ad0175ddb"}, + {file = "black-23.3.0-py3-none-any.whl", hash = "sha256:ec751418022185b0c1bb7d7736e6933d40bbb14c14a0abcf9123d1b159f98dd4"}, + {file = "black-23.3.0.tar.gz", hash = "sha256:1c7b8d606e728a41ea1ccbd7264677e494e87cf630e399262ced92d4a8dac940"}, +] + +[package.dependencies] +click = ">=8.0.0" +mypy-extensions = ">=0.4.3" +packaging = ">=22.0" +pathspec = ">=0.9.0" +platformdirs = ">=2" +tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} +typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} + +[package.extras] +colorama = ["colorama (>=0.4.3)"] +d = ["aiohttp (>=3.7.4)"] +jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] +uvloop = ["uvloop (>=0.15.2)"] + +[[package]] +name = "bleach" +version = "6.0.0" +description = "An easy safelist-based HTML-sanitizing tool." +optional = false +python-versions = ">=3.7" +files = [ + {file = "bleach-6.0.0-py3-none-any.whl", hash = "sha256:33c16e3353dbd13028ab4799a0f89a83f113405c766e9c122df8a06f5b85b3f4"}, + {file = "bleach-6.0.0.tar.gz", hash = "sha256:1a1a85c1595e07d8db14c5f09f09e6433502c51c595970edc090551f0db99414"}, +] + +[package.dependencies] +six = ">=1.9.0" +webencodings = "*" + +[package.extras] +css = ["tinycss2 (>=1.1.0,<1.2)"] + +[[package]] +name = "certifi" +version = "2023.5.7" +description = "Python package for providing Mozilla's CA Bundle." +optional = false +python-versions = ">=3.6" +files = [ + {file = "certifi-2023.5.7-py3-none-any.whl", hash = "sha256:c6c2e98f5c7869efca1f8916fed228dd91539f9f1b444c314c06eef02980c716"}, + {file = "certifi-2023.5.7.tar.gz", hash = "sha256:0f0d56dc5a6ad56fd4ba36484d6cc34451e1c6548c61daad8c320169f91eddc7"}, +] + +[[package]] +name = "cffi" +version = "1.15.1" +description = "Foreign Function Interface for Python calling C code." +optional = false +python-versions = "*" +files = [ + {file = "cffi-1.15.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a66d3508133af6e8548451b25058d5812812ec3798c886bf38ed24a98216fab2"}, + {file = "cffi-1.15.1-cp27-cp27m-manylinux1_i686.whl", hash = "sha256:470c103ae716238bbe698d67ad020e1db9d9dba34fa5a899b5e21577e6d52ed2"}, + {file = "cffi-1.15.1-cp27-cp27m-manylinux1_x86_64.whl", hash = "sha256:9ad5db27f9cabae298d151c85cf2bad1d359a1b9c686a275df03385758e2f914"}, + {file = "cffi-1.15.1-cp27-cp27m-win32.whl", hash = "sha256:b3bbeb01c2b273cca1e1e0c5df57f12dce9a4dd331b4fa1635b8bec26350bde3"}, + {file = "cffi-1.15.1-cp27-cp27m-win_amd64.whl", hash = "sha256:e00b098126fd45523dd056d2efba6c5a63b71ffe9f2bbe1a4fe1716e1d0c331e"}, + {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_i686.whl", hash = "sha256:d61f4695e6c866a23a21acab0509af1cdfd2c013cf256bbf5b6b5e2695827162"}, + {file = "cffi-1.15.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:ed9cb427ba5504c1dc15ede7d516b84757c3e3d7868ccc85121d9310d27eed0b"}, + {file = "cffi-1.15.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39d39875251ca8f612b6f33e6b1195af86d1b3e60086068be9cc053aa4376e21"}, + {file = "cffi-1.15.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:285d29981935eb726a4399badae8f0ffdff4f5050eaa6d0cfc3f64b857b77185"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3eb6971dcff08619f8d91607cfc726518b6fa2a9eba42856be181c6d0d9515fd"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:21157295583fe8943475029ed5abdcf71eb3911894724e360acff1d61c1d54bc"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5635bd9cb9731e6d4a1132a498dd34f764034a8ce60cef4f5319c0541159392f"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2012c72d854c2d03e45d06ae57f40d78e5770d252f195b93f581acf3ba44496e"}, + {file = "cffi-1.15.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd86c085fae2efd48ac91dd7ccffcfc0571387fe1193d33b6394db7ef31fe2a4"}, + {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:fa6693661a4c91757f4412306191b6dc88c1703f780c8234035eac011922bc01"}, + {file = "cffi-1.15.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:59c0b02d0a6c384d453fece7566d1c7e6b7bae4fc5874ef2ef46d56776d61c9e"}, + {file = "cffi-1.15.1-cp310-cp310-win32.whl", hash = "sha256:cba9d6b9a7d64d4bd46167096fc9d2f835e25d7e4c121fb2ddfc6528fb0413b2"}, + {file = "cffi-1.15.1-cp310-cp310-win_amd64.whl", hash = "sha256:ce4bcc037df4fc5e3d184794f27bdaab018943698f4ca31630bc7f84a7b69c6d"}, + {file = "cffi-1.15.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3d08afd128ddaa624a48cf2b859afef385b720bb4b43df214f85616922e6a5ac"}, + {file = "cffi-1.15.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3799aecf2e17cf585d977b780ce79ff0dc9b78d799fc694221ce814c2c19db83"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a591fe9e525846e4d154205572a029f653ada1a78b93697f3b5a8f1f2bc055b9"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3548db281cd7d2561c9ad9984681c95f7b0e38881201e157833a2342c30d5e8c"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:91fc98adde3d7881af9b59ed0294046f3806221863722ba7d8d120c575314325"}, + {file = "cffi-1.15.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94411f22c3985acaec6f83c6df553f2dbe17b698cc7f8ae751ff2237d96b9e3c"}, + {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:03425bdae262c76aad70202debd780501fabeaca237cdfddc008987c0e0f59ef"}, + {file = "cffi-1.15.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cc4d65aeeaa04136a12677d3dd0b1c0c94dc43abac5860ab33cceb42b801c1e8"}, + {file = "cffi-1.15.1-cp311-cp311-win32.whl", hash = "sha256:a0f100c8912c114ff53e1202d0078b425bee3649ae34d7b070e9697f93c5d52d"}, + {file = "cffi-1.15.1-cp311-cp311-win_amd64.whl", hash = "sha256:04ed324bda3cda42b9b695d51bb7d54b680b9719cfab04227cdd1e04e5de3104"}, + {file = "cffi-1.15.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50a74364d85fd319352182ef59c5c790484a336f6db772c1a9231f1c3ed0cbd7"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e263d77ee3dd201c3a142934a086a4450861778baaeeb45db4591ef65550b0a6"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cec7d9412a9102bdc577382c3929b337320c4c4c4849f2c5cdd14d7368c5562d"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4289fc34b2f5316fbb762d75362931e351941fa95fa18789191b33fc4cf9504a"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:173379135477dc8cac4bc58f45db08ab45d228b3363adb7af79436135d028405"}, + {file = "cffi-1.15.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:6975a3fac6bc83c4a65c9f9fcab9e47019a11d3d2cf7f3c0d03431bf145a941e"}, + {file = "cffi-1.15.1-cp36-cp36m-win32.whl", hash = "sha256:2470043b93ff09bf8fb1d46d1cb756ce6132c54826661a32d4e4d132e1977adf"}, + {file = "cffi-1.15.1-cp36-cp36m-win_amd64.whl", hash = "sha256:30d78fbc8ebf9c92c9b7823ee18eb92f2e6ef79b45ac84db507f52fbe3ec4497"}, + {file = "cffi-1.15.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:198caafb44239b60e252492445da556afafc7d1e3ab7a1fb3f0584ef6d742375"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ef34d190326c3b1f822a5b7a45f6c4535e2f47ed06fec77d3d799c450b2651e"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8102eaf27e1e448db915d08afa8b41d6c7ca7a04b7d73af6514df10a3e74bd82"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5df2768244d19ab7f60546d0c7c63ce1581f7af8b5de3eb3004b9b6fc8a9f84b"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8c4917bd7ad33e8eb21e9a5bbba979b49d9a97acb3a803092cbc1133e20343c"}, + {file = "cffi-1.15.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2642fe3142e4cc4af0799748233ad6da94c62a8bec3a6648bf8ee68b1c7426"}, + {file = "cffi-1.15.1-cp37-cp37m-win32.whl", hash = "sha256:e229a521186c75c8ad9490854fd8bbdd9a0c9aa3a524326b55be83b54d4e0ad9"}, + {file = "cffi-1.15.1-cp37-cp37m-win_amd64.whl", hash = "sha256:a0b71b1b8fbf2b96e41c4d990244165e2c9be83d54962a9a1d118fd8657d2045"}, + {file = "cffi-1.15.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:320dab6e7cb2eacdf0e658569d2575c4dad258c0fcc794f46215e1e39f90f2c3"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1e74c6b51a9ed6589199c787bf5f9875612ca4a8a0785fb2d4a84429badaf22a"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5c84c68147988265e60416b57fc83425a78058853509c1b0629c180094904a5"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3b926aa83d1edb5aa5b427b4053dc420ec295a08e40911296b9eb1b6170f6cca"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c450779d0914f2861b8526e035c5e6da0a3199d8f1add1a665e1cbc6fc6d02"}, + {file = "cffi-1.15.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4f2c9f67e9821cad2e5f480bc8d83b8742896f1242dba247911072d4fa94c192"}, + {file = "cffi-1.15.1-cp38-cp38-win32.whl", hash = "sha256:8b7ee99e510d7b66cdb6c593f21c043c248537a32e0bedf02e01e9553a172314"}, + {file = "cffi-1.15.1-cp38-cp38-win_amd64.whl", hash = "sha256:00a9ed42e88df81ffae7a8ab6d9356b371399b91dbdf0c3cb1e84c03a13aceb5"}, + {file = "cffi-1.15.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:54a2db7b78338edd780e7ef7f9f6c442500fb0d41a5a4ea24fff1c929d5af585"}, + {file = "cffi-1.15.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fcd131dd944808b5bdb38e6f5b53013c5aa4f334c5cad0c72742f6eba4b73db0"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7473e861101c9e72452f9bf8acb984947aa1661a7704553a9f6e4baa5ba64415"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6c9a799e985904922a4d207a94eae35c78ebae90e128f0c4e521ce339396be9d"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3bcde07039e586f91b45c88f8583ea7cf7a0770df3a1649627bf598332cb6984"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33ab79603146aace82c2427da5ca6e58f2b3f2fb5da893ceac0c42218a40be35"}, + {file = "cffi-1.15.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d598b938678ebf3c67377cdd45e09d431369c3b1a5b331058c338e201f12b27"}, + {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db0fbb9c62743ce59a9ff687eb5f4afbe77e5e8403d6697f7446e5f609976f76"}, + {file = "cffi-1.15.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:98d85c6a2bef81588d9227dde12db8a7f47f639f4a17c9ae08e773aa9c697bf3"}, + {file = "cffi-1.15.1-cp39-cp39-win32.whl", hash = "sha256:40f4774f5a9d4f5e344f31a32b5096977b5d48560c5592e2f3d2c4374bd543ee"}, + {file = "cffi-1.15.1-cp39-cp39-win_amd64.whl", hash = "sha256:70df4e3b545a17496c9b3f41f5115e69a4f2e77e94e1d2a8e1070bc0c38c8a3c"}, + {file = "cffi-1.15.1.tar.gz", hash = "sha256:d400bfb9a37b1351253cb402671cea7e89bdecc294e8016a707f6d1d8ac934f9"}, +] + +[package.dependencies] +pycparser = "*" + +[[package]] +name = "charset-normalizer" +version = "3.1.0" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "charset-normalizer-3.1.0.tar.gz", hash = "sha256:34e0a2f9c370eb95597aae63bf85eb5e96826d81e3dcf88b8886012906f509b5"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e0ac8959c929593fee38da1c2b64ee9778733cdf03c482c9ff1d508b6b593b2b"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d7fc3fca01da18fbabe4625d64bb612b533533ed10045a2ac3dd194bfa656b60"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:04eefcee095f58eaabe6dc3cc2262f3bcd776d2c67005880894f447b3f2cb9c1"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20064ead0717cf9a73a6d1e779b23d149b53daf971169289ed2ed43a71e8d3b0"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1435ae15108b1cb6fffbcea2af3d468683b7afed0169ad718451f8db5d1aff6f"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c84132a54c750fda57729d1e2599bb598f5fa0344085dbde5003ba429a4798c0"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75f2568b4189dda1c567339b48cba4ac7384accb9c2a7ed655cd86b04055c795"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:11d3bcb7be35e7b1bba2c23beedac81ee893ac9871d0ba79effc7fc01167db6c"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:891cf9b48776b5c61c700b55a598621fdb7b1e301a550365571e9624f270c203"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5f008525e02908b20e04707a4f704cd286d94718f48bb33edddc7d7b584dddc1"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:b06f0d3bf045158d2fb8837c5785fe9ff9b8c93358be64461a1089f5da983137"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:49919f8400b5e49e961f320c735388ee686a62327e773fa5b3ce6721f7e785ce"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:22908891a380d50738e1f978667536f6c6b526a2064156203d418f4856d6e86a"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-win32.whl", hash = "sha256:12d1a39aa6b8c6f6248bb54550efcc1c38ce0d8096a146638fd4738e42284448"}, + {file = "charset_normalizer-3.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:65ed923f84a6844de5fd29726b888e58c62820e0769b76565480e1fdc3d062f8"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:9a3267620866c9d17b959a84dd0bd2d45719b817245e49371ead79ed4f710d19"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6734e606355834f13445b6adc38b53c0fd45f1a56a9ba06c2058f86893ae8017"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f8303414c7b03f794347ad062c0516cee0e15f7a612abd0ce1e25caf6ceb47df"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aaf53a6cebad0eae578f062c7d462155eada9c172bd8c4d250b8c1d8eb7f916a"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dc5b6a8ecfdc5748a7e429782598e4f17ef378e3e272eeb1340ea57c9109f41"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e1b25e3ad6c909f398df8921780d6a3d120d8c09466720226fc621605b6f92b1"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ca564606d2caafb0abe6d1b5311c2649e8071eb241b2d64e75a0d0065107e62"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b82fab78e0b1329e183a65260581de4375f619167478dddab510c6c6fb04d9b6"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bd7163182133c0c7701b25e604cf1611c0d87712e56e88e7ee5d72deab3e76b5"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:11d117e6c63e8f495412d37e7dc2e2fff09c34b2d09dbe2bee3c6229577818be"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:cf6511efa4801b9b38dc5546d7547d5b5c6ef4b081c60b23e4d941d0eba9cbeb"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:abc1185d79f47c0a7aaf7e2412a0eb2c03b724581139193d2d82b3ad8cbb00ac"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:cb7b2ab0188829593b9de646545175547a70d9a6e2b63bf2cd87a0a391599324"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-win32.whl", hash = "sha256:c36bcbc0d5174a80d6cccf43a0ecaca44e81d25be4b7f90f0ed7bcfbb5a00909"}, + {file = "charset_normalizer-3.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:cca4def576f47a09a943666b8f829606bcb17e2bc2d5911a46c8f8da45f56755"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0c95f12b74681e9ae127728f7e5409cbbef9cd914d5896ef238cc779b8152373"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fca62a8301b605b954ad2e9c3666f9d97f63872aa4efcae5492baca2056b74ab"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac0aa6cd53ab9a31d397f8303f92c42f534693528fafbdb997c82bae6e477ad9"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c3af8e0f07399d3176b179f2e2634c3ce9c1301379a6b8c9c9aeecd481da494f"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a5fc78f9e3f501a1614a98f7c54d3969f3ad9bba8ba3d9b438c3bc5d047dd28"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:628c985afb2c7d27a4800bfb609e03985aaecb42f955049957814e0491d4006d"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:74db0052d985cf37fa111828d0dd230776ac99c740e1a758ad99094be4f1803d"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:1e8fcdd8f672a1c4fc8d0bd3a2b576b152d2a349782d1eb0f6b8e52e9954731d"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:04afa6387e2b282cf78ff3dbce20f0cc071c12dc8f685bd40960cc68644cfea6"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:dd5653e67b149503c68c4018bf07e42eeed6b4e956b24c00ccdf93ac79cdff84"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d2686f91611f9e17f4548dbf050e75b079bbc2a82be565832bc8ea9047b61c8c"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-win32.whl", hash = "sha256:4155b51ae05ed47199dc5b2a4e62abccb274cee6b01da5b895099b61b1982974"}, + {file = "charset_normalizer-3.1.0-cp37-cp37m-win_amd64.whl", hash = "sha256:322102cdf1ab682ecc7d9b1c5eed4ec59657a65e1c146a0da342b78f4112db23"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e633940f28c1e913615fd624fcdd72fdba807bf53ea6925d6a588e84e1151531"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3a06f32c9634a8705f4ca9946d667609f52cf130d5548881401f1eb2c39b1e2c"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7381c66e0561c5757ffe616af869b916c8b4e42b367ab29fedc98481d1e74e14"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3573d376454d956553c356df45bb824262c397c6e26ce43e8203c4c540ee0acb"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e89df2958e5159b811af9ff0f92614dabf4ff617c03a4c1c6ff53bf1c399e0e1"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:78cacd03e79d009d95635e7d6ff12c21eb89b894c354bd2b2ed0b4763373693b"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:de5695a6f1d8340b12a5d6d4484290ee74d61e467c39ff03b39e30df62cf83a0"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1c60b9c202d00052183c9be85e5eaf18a4ada0a47d188a83c8f5c5b23252f649"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f645caaf0008bacf349875a974220f1f1da349c5dbe7c4ec93048cdc785a3326"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:ea9f9c6034ea2d93d9147818f17c2a0860d41b71c38b9ce4d55f21b6f9165a11"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:80d1543d58bd3d6c271b66abf454d437a438dff01c3e62fdbcd68f2a11310d4b"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:73dc03a6a7e30b7edc5b01b601e53e7fc924b04e1835e8e407c12c037e81adbd"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6f5c2e7bc8a4bf7c426599765b1bd33217ec84023033672c1e9a8b35eaeaaaf8"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-win32.whl", hash = "sha256:12a2b561af122e3d94cdb97fe6fb2bb2b82cef0cdca131646fdb940a1eda04f0"}, + {file = "charset_normalizer-3.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:3160a0fd9754aab7d47f95a6b63ab355388d890163eb03b2d2b87ab0a30cfa59"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:38e812a197bf8e71a59fe55b757a84c1f946d0ac114acafaafaf21667a7e169e"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6baf0baf0d5d265fa7944feb9f7451cc316bfe30e8df1a61b1bb08577c554f31"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8f25e17ab3039b05f762b0a55ae0b3632b2e073d9c8fc88e89aca31a6198e88f"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3747443b6a904001473370d7810aa19c3a180ccd52a7157aacc264a5ac79265e"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b116502087ce8a6b7a5f1814568ccbd0e9f6cfd99948aa59b0e241dc57cf739f"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d16fd5252f883eb074ca55cb622bc0bee49b979ae4e8639fff6ca3ff44f9f854"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fa558996782fc226b529fdd2ed7866c2c6ec91cee82735c98a197fae39f706"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f6c7a8a57e9405cad7485f4c9d3172ae486cfef1344b5ddd8e5239582d7355e"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:ac3775e3311661d4adace3697a52ac0bab17edd166087d493b52d4f4f553f9f0"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:10c93628d7497c81686e8e5e557aafa78f230cd9e77dd0c40032ef90c18f2230"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:6f4f4668e1831850ebcc2fd0b1cd11721947b6dc7c00bf1c6bd3c929ae14f2c7"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:0be65ccf618c1e7ac9b849c315cc2e8a8751d9cfdaa43027d4f6624bd587ab7e"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:53d0a3fa5f8af98a1e261de6a3943ca631c526635eb5817a87a59d9a57ebf48f"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-win32.whl", hash = "sha256:a04f86f41a8916fe45ac5024ec477f41f886b3c435da2d4e3d2709b22ab02af1"}, + {file = "charset_normalizer-3.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:830d2948a5ec37c386d3170c483063798d7879037492540f10a475e3fd6f244b"}, + {file = "charset_normalizer-3.1.0-py3-none-any.whl", hash = "sha256:3d9098b479e78c85080c98e1e35ff40b4a31d8953102bb0fd7d1b6f8a2111a3d"}, +] + +[[package]] +name = "click" +version = "8.1.3" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, + {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "cryptography" +version = "40.0.2" +description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." +optional = false +python-versions = ">=3.6" +files = [ + {file = "cryptography-40.0.2-cp36-abi3-macosx_10_12_universal2.whl", hash = "sha256:8f79b5ff5ad9d3218afb1e7e20ea74da5f76943ee5edb7f76e56ec5161ec782b"}, + {file = "cryptography-40.0.2-cp36-abi3-macosx_10_12_x86_64.whl", hash = "sha256:05dc219433b14046c476f6f09d7636b92a1c3e5808b9a6536adf4932b3b2c440"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4df2af28d7bedc84fe45bd49bc35d710aede676e2a4cb7fc6d103a2adc8afe4d"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0dcca15d3a19a66e63662dc8d30f8036b07be851a8680eda92d079868f106288"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:a04386fb7bc85fab9cd51b6308633a3c271e3d0d3eae917eebab2fac6219b6d2"}, + {file = "cryptography-40.0.2-cp36-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:adc0d980fd2760c9e5de537c28935cc32b9353baaf28e0814df417619c6c8c3b"}, + {file = "cryptography-40.0.2-cp36-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d5a1bd0e9e2031465761dfa920c16b0065ad77321d8a8c1f5ee331021fda65e9"}, + {file = "cryptography-40.0.2-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:a95f4802d49faa6a674242e25bfeea6fc2acd915b5e5e29ac90a32b1139cae1c"}, + {file = "cryptography-40.0.2-cp36-abi3-win32.whl", hash = "sha256:aecbb1592b0188e030cb01f82d12556cf72e218280f621deed7d806afd2113f9"}, + {file = "cryptography-40.0.2-cp36-abi3-win_amd64.whl", hash = "sha256:b12794f01d4cacfbd3177b9042198f3af1c856eedd0a98f10f141385c809a14b"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:142bae539ef28a1c76794cca7f49729e7c54423f615cfd9b0b1fa90ebe53244b"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:956ba8701b4ffe91ba59665ed170a2ebbdc6fc0e40de5f6059195d9f2b33ca0e"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4f01c9863da784558165f5d4d916093737a75203a5c5286fde60e503e4276c7a"}, + {file = "cryptography-40.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:3daf9b114213f8ba460b829a02896789751626a2a4e7a43a28ee77c04b5e4958"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:48f388d0d153350f378c7f7b41497a54ff1513c816bcbbcafe5b829e59b9ce5b"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c0764e72b36a3dc065c155e5b22f93df465da9c39af65516fe04ed3c68c92636"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:cbaba590180cba88cb99a5f76f90808a624f18b169b90a4abb40c1fd8c19420e"}, + {file = "cryptography-40.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:7a38250f433cd41df7fcb763caa3ee9362777fdb4dc642b9a349721d2bf47404"}, + {file = "cryptography-40.0.2.tar.gz", hash = "sha256:c33c0d32b8594fa647d2e01dbccc303478e16fdd7cf98652d5b3ed11aa5e5c99"}, +] + +[package.dependencies] +cffi = ">=1.12" + +[package.extras] +docs = ["sphinx (>=5.3.0)", "sphinx-rtd-theme (>=1.1.1)"] +docstest = ["pyenchant (>=1.6.11)", "sphinxcontrib-spelling (>=4.0.1)", "twine (>=1.12.0)"] +pep8test = ["black", "check-manifest", "mypy", "ruff"] +sdist = ["setuptools-rust (>=0.11.4)"] +ssh = ["bcrypt (>=3.1.5)"] +test = ["iso8601", "pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-shard (>=0.1.2)", "pytest-subtests", "pytest-xdist"] +test-randomorder = ["pytest-randomly"] +tox = ["tox"] + +[[package]] +name = "diskcache" +version = "5.6.1" +description = "Disk Cache -- Disk and file backed persistent cache." +optional = false +python-versions = ">=3" +files = [ + {file = "diskcache-5.6.1-py3-none-any.whl", hash = "sha256:558c6a2d5d7c721bb00e40711803d6804850c9f76c426ed81ecc627fe9d2ce2d"}, + {file = "diskcache-5.6.1.tar.gz", hash = "sha256:e4c978532feff5814c4cc00fe1e11e40501985946643d73220d41ee7737c72c3"}, +] + +[[package]] +name = "distro" +version = "1.8.0" +description = "Distro - an OS platform information API" +optional = false +python-versions = ">=3.6" +files = [ + {file = "distro-1.8.0-py3-none-any.whl", hash = "sha256:99522ca3e365cac527b44bde033f64c6945d90eb9f769703caaec52b09bbd3ff"}, + {file = "distro-1.8.0.tar.gz", hash = "sha256:02e111d1dc6a50abb8eed6bf31c3e48ed8b0830d1ea2a1b78c61765c2513fdd8"}, +] + +[[package]] +name = "docutils" +version = "0.20" +description = "Docutils -- Python Documentation Utilities" +optional = false +python-versions = ">=3.7" +files = [ + {file = "docutils-0.20-py3-none-any.whl", hash = "sha256:a428f10de4de4774389734c986a01b4af2d802d26717108b0f1b9356862937c5"}, + {file = "docutils-0.20.tar.gz", hash = "sha256:f75a5a52fbcacd81b47e42888ad2b380748aaccfb3f13af0fe69deb759f01eb6"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.1.1" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, + {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, +] + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "fastapi" +version = "0.99.1" +description = "FastAPI framework, high performance, easy to learn, fast to code, ready for production" +optional = true +python-versions = ">=3.7" +files = [ + {file = "fastapi-0.99.1-py3-none-any.whl", hash = "sha256:976df7bab51ac7beda9f68c4513b8c4490b5c1135c72aafd0a5ee4023ec5282e"}, + {file = "fastapi-0.99.1.tar.gz", hash = "sha256:ac78f717cd80d657bd183f94d33b9bda84aa376a46a9dab513586b8eef1dc6fc"}, +] + +[package.dependencies] +pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<2.0.0" +starlette = ">=0.27.0,<0.28.0" +typing-extensions = ">=4.5.0" + +[package.extras] +all = ["email-validator (>=1.1.1)", "httpx (>=0.23.0)", "itsdangerous (>=1.1.0)", "jinja2 (>=2.11.2)", "orjson (>=3.2.1)", "python-multipart (>=0.0.5)", "pyyaml (>=5.3.1)", "ujson (>=4.0.1,!=4.0.2,!=4.1.0,!=4.2.0,!=4.3.0,!=5.0.0,!=5.1.0)", "uvicorn[standard] (>=0.12.0)"] + +[[package]] +name = "ghp-import" +version = "2.1.0" +description = "Copy your docs directly to the gh-pages branch." +optional = false +python-versions = "*" +files = [ + {file = "ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343"}, + {file = "ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619"}, +] + +[package.dependencies] +python-dateutil = ">=2.8.1" + +[package.extras] +dev = ["flake8", "markdown", "twine", "wheel"] + +[[package]] +name = "griffe" +version = "0.27.3" +description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." +optional = false +python-versions = ">=3.7" +files = [ + {file = "griffe-0.27.3-py3-none-any.whl", hash = "sha256:094513b209d4acd4b2680c2415d3af5f8ed925714795380c2a7d070e222e0b27"}, + {file = "griffe-0.27.3.tar.gz", hash = "sha256:a3d0f75aa76b80f181f818cf605f658a69fccf287aaeeeafc7a6cf4e6a2ca27e"}, +] + +[package.dependencies] +colorama = ">=0.4" + +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + +[[package]] +name = "httpcore" +version = "0.17.0" +description = "A minimal low-level HTTP client." +optional = false +python-versions = ">=3.7" +files = [ + {file = "httpcore-0.17.0-py3-none-any.whl", hash = "sha256:0fdfea45e94f0c9fd96eab9286077f9ff788dd186635ae61b312693e4d943599"}, + {file = "httpcore-0.17.0.tar.gz", hash = "sha256:cc045a3241afbf60ce056202301b4d8b6af08845e3294055eb26b09913ef903c"}, +] + +[package.dependencies] +anyio = ">=3.0,<5.0" +certifi = "*" +h11 = ">=0.13,<0.15" +sniffio = "==1.*" + +[package.extras] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] + +[[package]] +name = "httpx" +version = "0.24.1" +description = "The next generation HTTP client." +optional = false +python-versions = ">=3.7" +files = [ + {file = "httpx-0.24.1-py3-none-any.whl", hash = "sha256:06781eb9ac53cde990577af654bd990a4949de37a28bdb4a230d434f3a30b9bd"}, + {file = "httpx-0.24.1.tar.gz", hash = "sha256:5853a43053df830c20f8110c5e69fe44d035d850b2dfe795e196f00fdb774bdd"}, +] + +[package.dependencies] +certifi = "*" +httpcore = ">=0.15.0,<0.18.0" +idna = "*" +sniffio = "*" + +[package.extras] +brotli = ["brotli", "brotlicffi"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] + +[[package]] +name = "idna" +version = "3.4" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.5" +files = [ + {file = "idna-3.4-py3-none-any.whl", hash = "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2"}, + {file = "idna-3.4.tar.gz", hash = "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4"}, +] + +[[package]] +name = "importlib-metadata" +version = "6.6.0" +description = "Read metadata from Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "importlib_metadata-6.6.0-py3-none-any.whl", hash = "sha256:43dd286a2cd8995d5eaef7fee2066340423b818ed3fd70adf0bad5f1fac53fed"}, + {file = "importlib_metadata-6.6.0.tar.gz", hash = "sha256:92501cdf9cc66ebd3e612f1b4f0c0765dfa42f0fa38ffb319b6bd84dd675d705"}, +] + +[package.dependencies] +zipp = ">=0.5" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +perf = ["ipython"] +testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] + +[[package]] +name = "importlib-resources" +version = "5.12.0" +description = "Read resources from Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "importlib_resources-5.12.0-py3-none-any.whl", hash = "sha256:7b1deeebbf351c7578e09bf2f63fa2ce8b5ffec296e0d349139d43cca061a81a"}, + {file = "importlib_resources-5.12.0.tar.gz", hash = "sha256:4be82589bf5c1d7999aedf2a45159d10cb3ca4f19b2271f8792bc8e6da7b22f6"}, +] + +[package.dependencies] +zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] + +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + +[[package]] +name = "jaraco-classes" +version = "3.2.3" +description = "Utility functions for Python class constructs" +optional = false +python-versions = ">=3.7" +files = [ + {file = "jaraco.classes-3.2.3-py3-none-any.whl", hash = "sha256:2353de3288bc6b82120752201c6b1c1a14b058267fa424ed5ce5984e3b922158"}, + {file = "jaraco.classes-3.2.3.tar.gz", hash = "sha256:89559fa5c1d3c34eff6f631ad80bb21f378dbcbb35dd161fd2c6b93f5be2f98a"}, +] + +[package.dependencies] +more-itertools = "*" + +[package.extras] +docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)"] +testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] + +[[package]] +name = "jeepney" +version = "0.8.0" +description = "Low-level, pure Python DBus protocol wrapper." +optional = false +python-versions = ">=3.7" +files = [ + {file = "jeepney-0.8.0-py3-none-any.whl", hash = "sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755"}, + {file = "jeepney-0.8.0.tar.gz", hash = "sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806"}, +] + +[package.extras] +test = ["async-timeout", "pytest", "pytest-asyncio (>=0.17)", "pytest-trio", "testpath", "trio"] +trio = ["async_generator", "trio"] + +[[package]] +name = "jinja2" +version = "3.1.2" +description = "A very fast and expressive template engine." +optional = false +python-versions = ">=3.7" +files = [ + {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, + {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, +] + +[package.dependencies] +MarkupSafe = ">=2.0" + +[package.extras] +i18n = ["Babel (>=2.7)"] + +[[package]] +name = "keyring" +version = "23.13.1" +description = "Store and access your passwords safely." +optional = false +python-versions = ">=3.7" +files = [ + {file = "keyring-23.13.1-py3-none-any.whl", hash = "sha256:771ed2a91909389ed6148631de678f82ddc73737d85a927f382a8a1b157898cd"}, + {file = "keyring-23.13.1.tar.gz", hash = "sha256:ba2e15a9b35e21908d0aaf4e0a47acc52d6ae33444df0da2b49d41a46ef6d678"}, +] + +[package.dependencies] +importlib-metadata = {version = ">=4.11.4", markers = "python_version < \"3.12\""} +importlib-resources = {version = "*", markers = "python_version < \"3.9\""} +"jaraco.classes" = "*" +jeepney = {version = ">=0.4.2", markers = "sys_platform == \"linux\""} +pywin32-ctypes = {version = ">=0.2.0", markers = "sys_platform == \"win32\""} +SecretStorage = {version = ">=3.2", markers = "sys_platform == \"linux\""} + +[package.extras] +completion = ["shtab"] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)"] +testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] + +[[package]] +name = "markdown" +version = "3.3.7" +description = "Python implementation of Markdown." +optional = false +python-versions = ">=3.6" +files = [ + {file = "Markdown-3.3.7-py3-none-any.whl", hash = "sha256:f5da449a6e1c989a4cea2631aa8ee67caa5a2ef855d551c88f9e309f4634c621"}, + {file = "Markdown-3.3.7.tar.gz", hash = "sha256:cbb516f16218e643d8e0a95b309f77eb118cb138d39a4f27851e6a63581db874"}, +] + +[package.dependencies] +importlib-metadata = {version = ">=4.4", markers = "python_version < \"3.10\""} + +[package.extras] +testing = ["coverage", "pyyaml"] + +[[package]] +name = "markdown-it-py" +version = "2.2.0" +description = "Python port of markdown-it. Markdown parsing, done right!" +optional = false +python-versions = ">=3.7" +files = [ + {file = "markdown-it-py-2.2.0.tar.gz", hash = "sha256:7c9a5e412688bc771c67432cbfebcdd686c93ce6484913dccf06cb5a0bea35a1"}, + {file = "markdown_it_py-2.2.0-py3-none-any.whl", hash = "sha256:5a35f8d1870171d9acc47b99612dc146129b631baf04970128b568f190d0cc30"}, +] + +[package.dependencies] +mdurl = ">=0.1,<1.0" + +[package.extras] +benchmarking = ["psutil", "pytest", "pytest-benchmark"] +code-style = ["pre-commit (>=3.0,<4.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] +linkify = ["linkify-it-py (>=1,<3)"] +plugins = ["mdit-py-plugins"] +profiling = ["gprof2dot"] +rtd = ["attrs", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + +[[package]] +name = "markupsafe" +version = "2.1.2" +description = "Safely add untrusted strings to HTML/XML markup." +optional = false +python-versions = ">=3.7" +files = [ + {file = "MarkupSafe-2.1.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:665a36ae6f8f20a4676b53224e33d456a6f5a72657d9c83c2aa00765072f31f7"}, + {file = "MarkupSafe-2.1.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:340bea174e9761308703ae988e982005aedf427de816d1afe98147668cc03036"}, + {file = "MarkupSafe-2.1.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22152d00bf4a9c7c83960521fc558f55a1adbc0631fbb00a9471e097b19d72e1"}, + {file = "MarkupSafe-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:28057e985dace2f478e042eaa15606c7efccb700797660629da387eb289b9323"}, + {file = "MarkupSafe-2.1.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca244fa73f50a800cf8c3ebf7fd93149ec37f5cb9596aa8873ae2c1d23498601"}, + {file = "MarkupSafe-2.1.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d9d971ec1e79906046aa3ca266de79eac42f1dbf3612a05dc9368125952bd1a1"}, + {file = "MarkupSafe-2.1.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7e007132af78ea9df29495dbf7b5824cb71648d7133cf7848a2a5dd00d36f9ff"}, + {file = "MarkupSafe-2.1.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7313ce6a199651c4ed9d7e4cfb4aa56fe923b1adf9af3b420ee14e6d9a73df65"}, + {file = "MarkupSafe-2.1.2-cp310-cp310-win32.whl", hash = "sha256:c4a549890a45f57f1ebf99c067a4ad0cb423a05544accaf2b065246827ed9603"}, + {file = "MarkupSafe-2.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:835fb5e38fd89328e9c81067fd642b3593c33e1e17e2fdbf77f5676abb14a156"}, + {file = "MarkupSafe-2.1.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2ec4f2d48ae59bbb9d1f9d7efb9236ab81429a764dedca114f5fdabbc3788013"}, + {file = "MarkupSafe-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:608e7073dfa9e38a85d38474c082d4281f4ce276ac0010224eaba11e929dd53a"}, + {file = "MarkupSafe-2.1.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:65608c35bfb8a76763f37036547f7adfd09270fbdbf96608be2bead319728fcd"}, + {file = "MarkupSafe-2.1.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2bfb563d0211ce16b63c7cb9395d2c682a23187f54c3d79bfec33e6705473c6"}, + {file = "MarkupSafe-2.1.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:da25303d91526aac3672ee6d49a2f3db2d9502a4a60b55519feb1a4c7714e07d"}, + {file = "MarkupSafe-2.1.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:9cad97ab29dfc3f0249b483412c85c8ef4766d96cdf9dcf5a1e3caa3f3661cf1"}, + {file = "MarkupSafe-2.1.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:085fd3201e7b12809f9e6e9bc1e5c96a368c8523fad5afb02afe3c051ae4afcc"}, + {file = "MarkupSafe-2.1.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1bea30e9bf331f3fef67e0a3877b2288593c98a21ccb2cf29b74c581a4eb3af0"}, + {file = "MarkupSafe-2.1.2-cp311-cp311-win32.whl", hash = "sha256:7df70907e00c970c60b9ef2938d894a9381f38e6b9db73c5be35e59d92e06625"}, + {file = "MarkupSafe-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:e55e40ff0cc8cc5c07996915ad367fa47da6b3fc091fdadca7f5403239c5fec3"}, + {file = "MarkupSafe-2.1.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a6e40afa7f45939ca356f348c8e23048e02cb109ced1eb8420961b2f40fb373a"}, + {file = "MarkupSafe-2.1.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf877ab4ed6e302ec1d04952ca358b381a882fbd9d1b07cccbfd61783561f98a"}, + {file = "MarkupSafe-2.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63ba06c9941e46fa389d389644e2d8225e0e3e5ebcc4ff1ea8506dce646f8c8a"}, + {file = "MarkupSafe-2.1.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f1cd098434e83e656abf198f103a8207a8187c0fc110306691a2e94a78d0abb2"}, + {file = "MarkupSafe-2.1.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:55f44b440d491028addb3b88f72207d71eeebfb7b5dbf0643f7c023ae1fba619"}, + {file = "MarkupSafe-2.1.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:a6f2fcca746e8d5910e18782f976489939d54a91f9411c32051b4aab2bd7c513"}, + {file = "MarkupSafe-2.1.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0b462104ba25f1ac006fdab8b6a01ebbfbce9ed37fd37fd4acd70c67c973e460"}, + {file = "MarkupSafe-2.1.2-cp37-cp37m-win32.whl", hash = "sha256:7668b52e102d0ed87cb082380a7e2e1e78737ddecdde129acadb0eccc5423859"}, + {file = "MarkupSafe-2.1.2-cp37-cp37m-win_amd64.whl", hash = "sha256:6d6607f98fcf17e534162f0709aaad3ab7a96032723d8ac8750ffe17ae5a0666"}, + {file = "MarkupSafe-2.1.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:a806db027852538d2ad7555b203300173dd1b77ba116de92da9afbc3a3be3eed"}, + {file = "MarkupSafe-2.1.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a4abaec6ca3ad8660690236d11bfe28dfd707778e2442b45addd2f086d6ef094"}, + {file = "MarkupSafe-2.1.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f03a532d7dee1bed20bc4884194a16160a2de9ffc6354b3878ec9682bb623c54"}, + {file = "MarkupSafe-2.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4cf06cdc1dda95223e9d2d3c58d3b178aa5dacb35ee7e3bbac10e4e1faacb419"}, + {file = "MarkupSafe-2.1.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22731d79ed2eb25059ae3df1dfc9cb1546691cc41f4e3130fe6bfbc3ecbbecfa"}, + {file = "MarkupSafe-2.1.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f8ffb705ffcf5ddd0e80b65ddf7bed7ee4f5a441ea7d3419e861a12eaf41af58"}, + {file = "MarkupSafe-2.1.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:8db032bf0ce9022a8e41a22598eefc802314e81b879ae093f36ce9ddf39ab1ba"}, + {file = "MarkupSafe-2.1.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2298c859cfc5463f1b64bd55cb3e602528db6fa0f3cfd568d3605c50678f8f03"}, + {file = "MarkupSafe-2.1.2-cp38-cp38-win32.whl", hash = "sha256:50c42830a633fa0cf9e7d27664637532791bfc31c731a87b202d2d8ac40c3ea2"}, + {file = "MarkupSafe-2.1.2-cp38-cp38-win_amd64.whl", hash = "sha256:bb06feb762bade6bf3c8b844462274db0c76acc95c52abe8dbed28ae3d44a147"}, + {file = "MarkupSafe-2.1.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:99625a92da8229df6d44335e6fcc558a5037dd0a760e11d84be2260e6f37002f"}, + {file = "MarkupSafe-2.1.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8bca7e26c1dd751236cfb0c6c72d4ad61d986e9a41bbf76cb445f69488b2a2bd"}, + {file = "MarkupSafe-2.1.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40627dcf047dadb22cd25ea7ecfe9cbf3bbbad0482ee5920b582f3809c97654f"}, + {file = "MarkupSafe-2.1.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40dfd3fefbef579ee058f139733ac336312663c6706d1163b82b3003fb1925c4"}, + {file = "MarkupSafe-2.1.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:090376d812fb6ac5f171e5938e82e7f2d7adc2b629101cec0db8b267815c85e2"}, + {file = "MarkupSafe-2.1.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2e7821bffe00aa6bd07a23913b7f4e01328c3d5cc0b40b36c0bd81d362faeb65"}, + {file = "MarkupSafe-2.1.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:c0a33bc9f02c2b17c3ea382f91b4db0e6cde90b63b296422a939886a7a80de1c"}, + {file = "MarkupSafe-2.1.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:b8526c6d437855442cdd3d87eede9c425c4445ea011ca38d937db299382e6fa3"}, + {file = "MarkupSafe-2.1.2-cp39-cp39-win32.whl", hash = "sha256:137678c63c977754abe9086a3ec011e8fd985ab90631145dfb9294ad09c102a7"}, + {file = "MarkupSafe-2.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:0576fe974b40a400449768941d5d0858cc624e3249dfd1e0c33674e5c7ca7aed"}, + {file = "MarkupSafe-2.1.2.tar.gz", hash = "sha256:abcabc8c2b26036d62d4c746381a6f7cf60aafcc653198ad678306986b09450d"}, +] + +[[package]] +name = "mdurl" +version = "0.1.2" +description = "Markdown URL utilities" +optional = false +python-versions = ">=3.7" +files = [ + {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, + {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, +] + +[[package]] +name = "mergedeep" +version = "1.3.4" +description = "A deep merge function for 🐍." +optional = false +python-versions = ">=3.6" +files = [ + {file = "mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307"}, + {file = "mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8"}, +] + +[[package]] +name = "mkdocs" +version = "1.4.3" +description = "Project documentation with Markdown." +optional = false +python-versions = ">=3.7" +files = [ + {file = "mkdocs-1.4.3-py3-none-any.whl", hash = "sha256:6ee46d309bda331aac915cd24aab882c179a933bd9e77b80ce7d2eaaa3f689dd"}, + {file = "mkdocs-1.4.3.tar.gz", hash = "sha256:5955093bbd4dd2e9403c5afaf57324ad8b04f16886512a3ee6ef828956481c57"}, +] + +[package.dependencies] +click = ">=7.0" +colorama = {version = ">=0.4", markers = "platform_system == \"Windows\""} +ghp-import = ">=1.0" +importlib-metadata = {version = ">=4.3", markers = "python_version < \"3.10\""} +jinja2 = ">=2.11.1" +markdown = ">=3.2.1,<3.4" +mergedeep = ">=1.3.4" +packaging = ">=20.5" +pyyaml = ">=5.1" +pyyaml-env-tag = ">=0.1" +watchdog = ">=2.0" + +[package.extras] +i18n = ["babel (>=2.9.0)"] +min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-import (==1.0)", "importlib-metadata (==4.3)", "jinja2 (==2.11.1)", "markdown (==3.2.1)", "markupsafe (==2.0.1)", "mergedeep (==1.3.4)", "packaging (==20.5)", "pyyaml (==5.1)", "pyyaml-env-tag (==0.1)", "typing-extensions (==3.10)", "watchdog (==2.0)"] + +[[package]] +name = "mkdocs-autorefs" +version = "0.4.1" +description = "Automatically link across pages in MkDocs." +optional = false +python-versions = ">=3.7" +files = [ + {file = "mkdocs-autorefs-0.4.1.tar.gz", hash = "sha256:70748a7bd025f9ecd6d6feeba8ba63f8e891a1af55f48e366d6d6e78493aba84"}, + {file = "mkdocs_autorefs-0.4.1-py3-none-any.whl", hash = "sha256:a2248a9501b29dc0cc8ba4c09f4f47ff121945f6ce33d760f145d6f89d313f5b"}, +] + +[package.dependencies] +Markdown = ">=3.3" +mkdocs = ">=1.1" + +[[package]] +name = "mkdocs-material" +version = "9.1.17" +description = "Documentation that simply works" +optional = false +python-versions = ">=3.7" +files = [ + {file = "mkdocs_material-9.1.17-py3-none-any.whl", hash = "sha256:809ed68427fbab0330b0b07bc93175824c3b98f4187060a5c7b46aa8ae398a75"}, + {file = "mkdocs_material-9.1.17.tar.gz", hash = "sha256:5a076524625047bf4ee4da1509ec90626f8fce915839dc07bdae6b59ff4f36f9"}, +] + +[package.dependencies] +colorama = ">=0.4" +jinja2 = ">=3.0" +markdown = ">=3.2" +mkdocs = ">=1.4.2" +mkdocs-material-extensions = ">=1.1" +pygments = ">=2.14" +pymdown-extensions = ">=9.9.1" +regex = ">=2022.4.24" +requests = ">=2.26" + +[[package]] +name = "mkdocs-material-extensions" +version = "1.1.1" +description = "Extension pack for Python Markdown and MkDocs Material." +optional = false +python-versions = ">=3.7" +files = [ + {file = "mkdocs_material_extensions-1.1.1-py3-none-any.whl", hash = "sha256:e41d9f38e4798b6617ad98ca8f7f1157b1e4385ac1459ca1e4ea219b556df945"}, + {file = "mkdocs_material_extensions-1.1.1.tar.gz", hash = "sha256:9c003da71e2cc2493d910237448c672e00cefc800d3d6ae93d2fc69979e3bd93"}, +] + +[[package]] +name = "mkdocstrings" +version = "0.22.0" +description = "Automatic documentation from sources, for MkDocs." +optional = false +python-versions = ">=3.7" +files = [ + {file = "mkdocstrings-0.22.0-py3-none-any.whl", hash = "sha256:2d4095d461554ff6a778fdabdca3c00c468c2f1459d469f7a7f622a2b23212ba"}, + {file = "mkdocstrings-0.22.0.tar.gz", hash = "sha256:82a33b94150ebb3d4b5c73bab4598c3e21468c79ec072eff6931c8f3bfc38256"}, +] + +[package.dependencies] +importlib-metadata = {version = ">=4.6", markers = "python_version < \"3.10\""} +Jinja2 = ">=2.11.1" +Markdown = ">=3.3" +MarkupSafe = ">=1.1" +mkdocs = ">=1.2" +mkdocs-autorefs = ">=0.3.1" +mkdocstrings-python = {version = ">=0.5.2", optional = true, markers = "extra == \"python\""} +pymdown-extensions = ">=6.3" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.10\""} + +[package.extras] +crystal = ["mkdocstrings-crystal (>=0.3.4)"] +python = ["mkdocstrings-python (>=0.5.2)"] +python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] + +[[package]] +name = "mkdocstrings-python" +version = "0.10.1" +description = "A Python handler for mkdocstrings." +optional = false +python-versions = ">=3.7" +files = [ + {file = "mkdocstrings_python-0.10.1-py3-none-any.whl", hash = "sha256:ef239cee2c688e2b949a0a47e42a141d744dd12b7007311b3309dc70e3bafc5c"}, + {file = "mkdocstrings_python-0.10.1.tar.gz", hash = "sha256:b72301fff739070ec517b5b36bf2f7c49d1360a275896a64efb97fc17d3f3968"}, +] + +[package.dependencies] +griffe = ">=0.24" +mkdocstrings = ">=0.20" + +[[package]] +name = "more-itertools" +version = "9.1.0" +description = "More routines for operating on iterables, beyond itertools" +optional = false +python-versions = ">=3.7" +files = [ + {file = "more-itertools-9.1.0.tar.gz", hash = "sha256:cabaa341ad0389ea83c17a94566a53ae4c9d07349861ecb14dc6d0345cf9ac5d"}, + {file = "more_itertools-9.1.0-py3-none-any.whl", hash = "sha256:d2bc7f02446e86a68911e58ded76d6561eea00cddfb2a91e7019bbb586c799f3"}, +] + +[[package]] +name = "mypy-extensions" +version = "1.0.0" +description = "Type system extensions for programs checked with the mypy type checker." +optional = false +python-versions = ">=3.5" +files = [ + {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, + {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, +] + +[[package]] +name = "numpy" +version = "1.24.4" +description = "Fundamental package for array computing in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "numpy-1.24.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c0bfb52d2169d58c1cdb8cc1f16989101639b34c7d3ce60ed70b19c63eba0b64"}, + {file = "numpy-1.24.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ed094d4f0c177b1b8e7aa9cba7d6ceed51c0e569a5318ac0ca9a090680a6a1b1"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79fc682a374c4a8ed08b331bef9c5f582585d1048fa6d80bc6c35bc384eee9b4"}, + {file = "numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ffe43c74893dbf38c2b0a1f5428760a1a9c98285553c89e12d70a96a7f3a4d6"}, + {file = "numpy-1.24.4-cp310-cp310-win32.whl", hash = "sha256:4c21decb6ea94057331e111a5bed9a79d335658c27ce2adb580fb4d54f2ad9bc"}, + {file = "numpy-1.24.4-cp310-cp310-win_amd64.whl", hash = "sha256:b4bea75e47d9586d31e892a7401f76e909712a0fd510f58f5337bea9572c571e"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f136bab9c2cfd8da131132c2cf6cc27331dd6fae65f95f69dcd4ae3c3639c810"}, + {file = "numpy-1.24.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2926dac25b313635e4d6cf4dc4e51c8c0ebfed60b801c799ffc4c32bf3d1254"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:222e40d0e2548690405b0b3c7b21d1169117391c2e82c378467ef9ab4c8f0da7"}, + {file = "numpy-1.24.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7215847ce88a85ce39baf9e89070cb860c98fdddacbaa6c0da3ffb31b3350bd5"}, + {file = "numpy-1.24.4-cp311-cp311-win32.whl", hash = "sha256:4979217d7de511a8d57f4b4b5b2b965f707768440c17cb70fbf254c4b225238d"}, + {file = "numpy-1.24.4-cp311-cp311-win_amd64.whl", hash = "sha256:b7b1fc9864d7d39e28f41d089bfd6353cb5f27ecd9905348c24187a768c79694"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1452241c290f3e2a312c137a9999cdbf63f78864d63c79039bda65ee86943f61"}, + {file = "numpy-1.24.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:04640dab83f7c6c85abf9cd729c5b65f1ebd0ccf9de90b270cd61935eef0197f"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5425b114831d1e77e4b5d812b69d11d962e104095a5b9c3b641a218abcc050e"}, + {file = "numpy-1.24.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd80e219fd4c71fc3699fc1dadac5dcf4fd882bfc6f7ec53d30fa197b8ee22dc"}, + {file = "numpy-1.24.4-cp38-cp38-win32.whl", hash = "sha256:4602244f345453db537be5314d3983dbf5834a9701b7723ec28923e2889e0bb2"}, + {file = "numpy-1.24.4-cp38-cp38-win_amd64.whl", hash = "sha256:692f2e0f55794943c5bfff12b3f56f99af76f902fc47487bdfe97856de51a706"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:2541312fbf09977f3b3ad449c4e5f4bb55d0dbf79226d7724211acc905049400"}, + {file = "numpy-1.24.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9667575fb6d13c95f1b36aca12c5ee3356bf001b714fc354eb5465ce1609e62f"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3a86ed21e4f87050382c7bc96571755193c4c1392490744ac73d660e8f564a9"}, + {file = "numpy-1.24.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d11efb4dbecbdf22508d55e48d9c8384db795e1b7b51ea735289ff96613ff74d"}, + {file = "numpy-1.24.4-cp39-cp39-win32.whl", hash = "sha256:6620c0acd41dbcb368610bb2f4d83145674040025e5536954782467100aa8835"}, + {file = "numpy-1.24.4-cp39-cp39-win_amd64.whl", hash = "sha256:befe2bf740fd8373cf56149a5c23a0f601e82869598d41f8e188a0e9869926f8"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:31f13e25b4e304632a4619d0e0777662c2ffea99fcae2029556b17d8ff958aef"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:95f7ac6540e95bc440ad77f56e520da5bf877f87dca58bd095288dce8940532a"}, + {file = "numpy-1.24.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:e98f220aa76ca2a977fe435f5b04d7b3470c0a2e6312907b37ba6068f26787f2"}, + {file = "numpy-1.24.4.tar.gz", hash = "sha256:80f5e3a4e498641401868df4208b74581206afbee7cf7b8329daae82676d9463"}, +] + +[[package]] +name = "packaging" +version = "23.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "packaging-23.1-py3-none-any.whl", hash = "sha256:994793af429502c4ea2ebf6bf664629d07c1a9fe974af92966e4b8d2df7edc61"}, + {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, +] + +[[package]] +name = "pathspec" +version = "0.11.1" +description = "Utility library for gitignore style pattern matching of file paths." +optional = false +python-versions = ">=3.7" +files = [ + {file = "pathspec-0.11.1-py3-none-any.whl", hash = "sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293"}, + {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"}, +] + +[[package]] +name = "pkginfo" +version = "1.9.6" +description = "Query metadata from sdists / bdists / installed packages." +optional = false +python-versions = ">=3.6" +files = [ + {file = "pkginfo-1.9.6-py3-none-any.whl", hash = "sha256:4b7a555a6d5a22169fcc9cf7bfd78d296b0361adad412a346c1226849af5e546"}, + {file = "pkginfo-1.9.6.tar.gz", hash = "sha256:8fd5896e8718a4372f0ea9cc9d96f6417c9b986e23a4d116dda26b62cc29d046"}, +] + +[package.extras] +testing = ["pytest", "pytest-cov"] + +[[package]] +name = "platformdirs" +version = "3.5.0" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." +optional = false +python-versions = ">=3.7" +files = [ + {file = "platformdirs-3.5.0-py3-none-any.whl", hash = "sha256:47692bc24c1958e8b0f13dd727307cff1db103fca36399f457da8e05f222fdc4"}, + {file = "platformdirs-3.5.0.tar.gz", hash = "sha256:7954a68d0ba23558d753f73437c55f89027cf8f5108c19844d4b82e5af396335"}, +] + +[package.extras] +docs = ["furo (>=2023.3.27)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.3.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] + +[[package]] +name = "pluggy" +version = "1.0.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, + {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + +[[package]] +name = "pycparser" +version = "2.21" +description = "C parser in Python" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "pycparser-2.21-py2.py3-none-any.whl", hash = "sha256:8ee45429555515e1f6b185e78100aea234072576aa43ab53aefcae078162fca9"}, + {file = "pycparser-2.21.tar.gz", hash = "sha256:e644fdec12f7872f86c58ff790da456218b10f863970249516d60a5eaca77206"}, +] + +[[package]] +name = "pydantic" +version = "1.10.7" +description = "Data validation and settings management using python type hints" +optional = true +python-versions = ">=3.7" +files = [ + {file = "pydantic-1.10.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e79e999e539872e903767c417c897e729e015872040e56b96e67968c3b918b2d"}, + {file = "pydantic-1.10.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:01aea3a42c13f2602b7ecbbea484a98169fb568ebd9e247593ea05f01b884b2e"}, + {file = "pydantic-1.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:516f1ed9bc2406a0467dd777afc636c7091d71f214d5e413d64fef45174cfc7a"}, + {file = "pydantic-1.10.7-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ae150a63564929c675d7f2303008d88426a0add46efd76c3fc797cd71cb1b46f"}, + {file = "pydantic-1.10.7-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:ecbbc51391248116c0a055899e6c3e7ffbb11fb5e2a4cd6f2d0b93272118a209"}, + {file = "pydantic-1.10.7-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:f4a2b50e2b03d5776e7f21af73e2070e1b5c0d0df255a827e7c632962f8315af"}, + {file = "pydantic-1.10.7-cp310-cp310-win_amd64.whl", hash = "sha256:a7cd2251439988b413cb0a985c4ed82b6c6aac382dbaff53ae03c4b23a70e80a"}, + {file = "pydantic-1.10.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:68792151e174a4aa9e9fc1b4e653e65a354a2fa0fed169f7b3d09902ad2cb6f1"}, + {file = "pydantic-1.10.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe2507b8ef209da71b6fb5f4e597b50c5a34b78d7e857c4f8f3115effaef5fe"}, + {file = "pydantic-1.10.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10a86d8c8db68086f1e30a530f7d5f83eb0685e632e411dbbcf2d5c0150e8dcd"}, + {file = "pydantic-1.10.7-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d75ae19d2a3dbb146b6f324031c24f8a3f52ff5d6a9f22f0683694b3afcb16fb"}, + {file = "pydantic-1.10.7-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:464855a7ff7f2cc2cf537ecc421291b9132aa9c79aef44e917ad711b4a93163b"}, + {file = "pydantic-1.10.7-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:193924c563fae6ddcb71d3f06fa153866423ac1b793a47936656e806b64e24ca"}, + {file = "pydantic-1.10.7-cp311-cp311-win_amd64.whl", hash = "sha256:b4a849d10f211389502059c33332e91327bc154acc1845f375a99eca3afa802d"}, + {file = "pydantic-1.10.7-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:cc1dde4e50a5fc1336ee0581c1612215bc64ed6d28d2c7c6f25d2fe3e7c3e918"}, + {file = "pydantic-1.10.7-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e0cfe895a504c060e5d36b287ee696e2fdad02d89e0d895f83037245218a87fe"}, + {file = "pydantic-1.10.7-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:670bb4683ad1e48b0ecb06f0cfe2178dcf74ff27921cdf1606e527d2617a81ee"}, + {file = "pydantic-1.10.7-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:950ce33857841f9a337ce07ddf46bc84e1c4946d2a3bba18f8280297157a3fd1"}, + {file = "pydantic-1.10.7-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c15582f9055fbc1bfe50266a19771bbbef33dd28c45e78afbe1996fd70966c2a"}, + {file = "pydantic-1.10.7-cp37-cp37m-win_amd64.whl", hash = "sha256:82dffb306dd20bd5268fd6379bc4bfe75242a9c2b79fec58e1041fbbdb1f7914"}, + {file = "pydantic-1.10.7-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8c7f51861d73e8b9ddcb9916ae7ac39fb52761d9ea0df41128e81e2ba42886cd"}, + {file = "pydantic-1.10.7-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6434b49c0b03a51021ade5c4daa7d70c98f7a79e95b551201fff682fc1661245"}, + {file = "pydantic-1.10.7-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:64d34ab766fa056df49013bb6e79921a0265204c071984e75a09cbceacbbdd5d"}, + {file = "pydantic-1.10.7-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:701daea9ffe9d26f97b52f1d157e0d4121644f0fcf80b443248434958fd03dc3"}, + {file = "pydantic-1.10.7-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:cf135c46099ff3f919d2150a948ce94b9ce545598ef2c6c7bf55dca98a304b52"}, + {file = "pydantic-1.10.7-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b0f85904f73161817b80781cc150f8b906d521fa11e3cdabae19a581c3606209"}, + {file = "pydantic-1.10.7-cp38-cp38-win_amd64.whl", hash = "sha256:9f6f0fd68d73257ad6685419478c5aece46432f4bdd8d32c7345f1986496171e"}, + {file = "pydantic-1.10.7-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c230c0d8a322276d6e7b88c3f7ce885f9ed16e0910354510e0bae84d54991143"}, + {file = "pydantic-1.10.7-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:976cae77ba6a49d80f461fd8bba183ff7ba79f44aa5cfa82f1346b5626542f8e"}, + {file = "pydantic-1.10.7-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d45fc99d64af9aaf7e308054a0067fdcd87ffe974f2442312372dfa66e1001d"}, + {file = "pydantic-1.10.7-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d2a5ebb48958754d386195fe9e9c5106f11275867051bf017a8059410e9abf1f"}, + {file = "pydantic-1.10.7-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:abfb7d4a7cd5cc4e1d1887c43503a7c5dd608eadf8bc615413fc498d3e4645cd"}, + {file = "pydantic-1.10.7-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:80b1fab4deb08a8292d15e43a6edccdffa5377a36a4597bb545b93e79c5ff0a5"}, + {file = "pydantic-1.10.7-cp39-cp39-win_amd64.whl", hash = "sha256:d71e69699498b020ea198468e2480a2f1e7433e32a3a99760058c6520e2bea7e"}, + {file = "pydantic-1.10.7-py3-none-any.whl", hash = "sha256:0cd181f1d0b1d00e2b705f1bf1ac7799a2d938cce3376b8007df62b29be3c2c6"}, + {file = "pydantic-1.10.7.tar.gz", hash = "sha256:cfc83c0678b6ba51b0532bea66860617c4cd4251ecf76e9846fa5a9f3454e97e"}, +] + +[package.dependencies] +typing-extensions = ">=4.2.0" + +[package.extras] +dotenv = ["python-dotenv (>=0.10.4)"] +email = ["email-validator (>=1.0.3)"] + +[[package]] +name = "pygments" +version = "2.15.1" +description = "Pygments is a syntax highlighting package written in Python." +optional = false +python-versions = ">=3.7" +files = [ + {file = "Pygments-2.15.1-py3-none-any.whl", hash = "sha256:db2db3deb4b4179f399a09054b023b6a586b76499d36965813c71aa8ed7b5fd1"}, + {file = "Pygments-2.15.1.tar.gz", hash = "sha256:8ace4d3c1dd481894b2005f560ead0f9f19ee64fe983366be1a21e171d12775c"}, +] + +[package.extras] +plugins = ["importlib-metadata"] + +[[package]] +name = "pymdown-extensions" +version = "9.11" +description = "Extension pack for Python Markdown." +optional = false +python-versions = ">=3.7" +files = [ + {file = "pymdown_extensions-9.11-py3-none-any.whl", hash = "sha256:a499191d8d869f30339de86fcf072a787e86c42b6f16f280f5c2cf174182b7f3"}, + {file = "pymdown_extensions-9.11.tar.gz", hash = "sha256:f7e86c1d3981f23d9dc43294488ecb54abadd05b0be4bf8f0e15efc90f7853ff"}, +] + +[package.dependencies] +markdown = ">=3.2" +pyyaml = "*" + +[[package]] +name = "pytest" +version = "7.4.0" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-7.4.0-py3-none-any.whl", hash = "sha256:78bf16451a2eb8c7a2ea98e32dc119fd2aa758f1d5d66dbf0a59d69a3969df32"}, + {file = "pytest-7.4.0.tar.gz", hash = "sha256:b4bf8c45bd59934ed84001ad51e11b4ee40d40a1229d2c79f9c592b0a3f6bd8a"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=0.12,<2.0" +tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} + +[package.extras] +testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "python-dateutil" +version = "2.8.2" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, + {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pywin32-ctypes" +version = "0.2.0" +description = "" +optional = false +python-versions = "*" +files = [ + {file = "pywin32-ctypes-0.2.0.tar.gz", hash = "sha256:24ffc3b341d457d48e8922352130cf2644024a4ff09762a2261fd34c36ee5942"}, + {file = "pywin32_ctypes-0.2.0-py2.py3-none-any.whl", hash = "sha256:9dc2d991b3479cc2df15930958b674a48a227d5361d413827a4cfd0b5876fc98"}, +] + +[[package]] +name = "pyyaml" +version = "6.0" +description = "YAML parser and emitter for Python" +optional = false +python-versions = ">=3.6" +files = [ + {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, + {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"}, + {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"}, + {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"}, + {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, + {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, + {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"}, + {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"}, + {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"}, + {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"}, + {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"}, + {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, + {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, + {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, + {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"}, + {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"}, + {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"}, + {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"}, + {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"}, + {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"}, + {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"}, + {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"}, + {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"}, + {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"}, + {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"}, + {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"}, + {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"}, + {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"}, + {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"}, + {file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"}, + {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"}, + {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"}, + {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"}, + {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"}, + {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"}, + {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"}, + {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, +] + +[[package]] +name = "pyyaml-env-tag" +version = "0.1" +description = "A custom YAML tag for referencing environment variables in YAML files. " +optional = false +python-versions = ">=3.6" +files = [ + {file = "pyyaml_env_tag-0.1-py3-none-any.whl", hash = "sha256:af31106dec8a4d68c60207c1886031cbf839b68aa7abccdb19868200532c2069"}, + {file = "pyyaml_env_tag-0.1.tar.gz", hash = "sha256:70092675bda14fdec33b31ba77e7543de9ddc88f2e5b99160396572d11525bdb"}, +] + +[package.dependencies] +pyyaml = "*" + +[[package]] +name = "readme-renderer" +version = "37.3" +description = "readme_renderer is a library for rendering \"readme\" descriptions for Warehouse" +optional = false +python-versions = ">=3.7" +files = [ + {file = "readme_renderer-37.3-py3-none-any.whl", hash = "sha256:f67a16caedfa71eef48a31b39708637a6f4664c4394801a7b0d6432d13907343"}, + {file = "readme_renderer-37.3.tar.gz", hash = "sha256:cd653186dfc73055656f090f227f5cb22a046d7f71a841dfa305f55c9a513273"}, +] + +[package.dependencies] +bleach = ">=2.1.0" +docutils = ">=0.13.1" +Pygments = ">=2.5.1" + +[package.extras] +md = ["cmarkgfm (>=0.8.0)"] + +[[package]] +name = "regex" +version = "2023.5.5" +description = "Alternative regular expression module, to replace re." +optional = false +python-versions = ">=3.6" +files = [ + {file = "regex-2023.5.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:48c9ec56579d4ba1c88f42302194b8ae2350265cb60c64b7b9a88dcb7fbde309"}, + {file = "regex-2023.5.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02f4541550459c08fdd6f97aa4e24c6f1932eec780d58a2faa2068253df7d6ff"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:53e22e4460f0245b468ee645156a4f84d0fc35a12d9ba79bd7d79bdcd2f9629d"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b870b6f632fc74941cadc2a0f3064ed8409e6f8ee226cdfd2a85ae50473aa94"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:171c52e320fe29260da550d81c6b99f6f8402450dc7777ef5ced2e848f3b6f8f"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aad5524c2aedaf9aa14ef1bc9327f8abd915699dea457d339bebbe2f0d218f86"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a0f874ee8c0bc820e649c900243c6d1e6dc435b81da1492046716f14f1a2a96"}, + {file = "regex-2023.5.5-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:e645c757183ee0e13f0bbe56508598e2d9cd42b8abc6c0599d53b0d0b8dd1479"}, + {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:a4c5da39bca4f7979eefcbb36efea04471cd68db2d38fcbb4ee2c6d440699833"}, + {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:5e3f4468b8c6fd2fd33c218bbd0a1559e6a6fcf185af8bb0cc43f3b5bfb7d636"}, + {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:59e4b729eae1a0919f9e4c0fc635fbcc9db59c74ad98d684f4877be3d2607dd6"}, + {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ba73a14e9c8f9ac409863543cde3290dba39098fc261f717dc337ea72d3ebad2"}, + {file = "regex-2023.5.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0bbd5dcb19603ab8d2781fac60114fb89aee8494f4505ae7ad141a3314abb1f9"}, + {file = "regex-2023.5.5-cp310-cp310-win32.whl", hash = "sha256:40005cbd383438aecf715a7b47fe1e3dcbc889a36461ed416bdec07e0ef1db66"}, + {file = "regex-2023.5.5-cp310-cp310-win_amd64.whl", hash = "sha256:59597cd6315d3439ed4b074febe84a439c33928dd34396941b4d377692eca810"}, + {file = "regex-2023.5.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8f08276466fedb9e36e5193a96cb944928301152879ec20c2d723d1031cd4ddd"}, + {file = "regex-2023.5.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cd46f30e758629c3ee91713529cfbe107ac50d27110fdcc326a42ce2acf4dafc"}, + {file = "regex-2023.5.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f2910502f718828cecc8beff004917dcf577fc5f8f5dd40ffb1ea7612124547b"}, + {file = "regex-2023.5.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:445d6f4fc3bd9fc2bf0416164454f90acab8858cd5a041403d7a11e3356980e8"}, + {file = "regex-2023.5.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18196c16a584619c7c1d843497c069955d7629ad4a3fdee240eb347f4a2c9dbe"}, + {file = "regex-2023.5.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33d430a23b661629661f1fe8395be2004006bc792bb9fc7c53911d661b69dd7e"}, + {file = "regex-2023.5.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:72a28979cc667e5f82ef433db009184e7ac277844eea0f7f4d254b789517941d"}, + {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:f764e4dfafa288e2eba21231f455d209f4709436baeebb05bdecfb5d8ddc3d35"}, + {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:23d86ad2121b3c4fc78c58f95e19173790e22ac05996df69b84e12da5816cb17"}, + {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:690a17db524ee6ac4a27efc5406530dd90e7a7a69d8360235323d0e5dafb8f5b"}, + {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:1ecf3dcff71f0c0fe3e555201cbe749fa66aae8d18f80d2cc4de8e66df37390a"}, + {file = "regex-2023.5.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:811040d7f3dd9c55eb0d8b00b5dcb7fd9ae1761c454f444fd9f37fe5ec57143a"}, + {file = "regex-2023.5.5-cp311-cp311-win32.whl", hash = "sha256:c8c143a65ce3ca42e54d8e6fcaf465b6b672ed1c6c90022794a802fb93105d22"}, + {file = "regex-2023.5.5-cp311-cp311-win_amd64.whl", hash = "sha256:586a011f77f8a2da4b888774174cd266e69e917a67ba072c7fc0e91878178a80"}, + {file = "regex-2023.5.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:b6365703e8cf1644b82104cdd05270d1a9f043119a168d66c55684b1b557d008"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a56c18f21ac98209da9c54ae3ebb3b6f6e772038681d6cb43b8d53da3b09ee81"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8b942d8b3ce765dbc3b1dad0a944712a89b5de290ce8f72681e22b3c55f3cc8"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:844671c9c1150fcdac46d43198364034b961bd520f2c4fdaabfc7c7d7138a2dd"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c2ce65bdeaf0a386bb3b533a28de3994e8e13b464ac15e1e67e4603dd88787fa"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fee0016cc35a8a91e8cc9312ab26a6fe638d484131a7afa79e1ce6165328a135"}, + {file = "regex-2023.5.5-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:18f05d14f14a812fe9723f13afafefe6b74ca042d99f8884e62dbd34dcccf3e2"}, + {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:941b3f1b2392f0bcd6abf1bc7a322787d6db4e7457be6d1ffd3a693426a755f2"}, + {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:921473a93bcea4d00295799ab929522fc650e85c6b9f27ae1e6bb32a790ea7d3"}, + {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:e2205a81f815b5bb17e46e74cc946c575b484e5f0acfcb805fb252d67e22938d"}, + {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:385992d5ecf1a93cb85adff2f73e0402dd9ac29b71b7006d342cc920816e6f32"}, + {file = "regex-2023.5.5-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:890a09cb0a62198bff92eda98b2b507305dd3abf974778bae3287f98b48907d3"}, + {file = "regex-2023.5.5-cp36-cp36m-win32.whl", hash = "sha256:821a88b878b6589c5068f4cc2cfeb2c64e343a196bc9d7ac68ea8c2a776acd46"}, + {file = "regex-2023.5.5-cp36-cp36m-win_amd64.whl", hash = "sha256:7918a1b83dd70dc04ab5ed24c78ae833ae8ea228cef84e08597c408286edc926"}, + {file = "regex-2023.5.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:338994d3d4ca4cf12f09822e025731a5bdd3a37aaa571fa52659e85ca793fb67"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a69cf0c00c4d4a929c6c7717fd918414cab0d6132a49a6d8fc3ded1988ed2ea"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8f5e06df94fff8c4c85f98c6487f6636848e1dc85ce17ab7d1931df4a081f657"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8906669b03c63266b6a7693d1f487b02647beb12adea20f8840c1a087e2dfb5"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fda3e50abad8d0f48df621cf75adc73c63f7243cbe0e3b2171392b445401550"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5ac2b7d341dc1bd102be849d6dd33b09701223a851105b2754339e390be0627a"}, + {file = "regex-2023.5.5-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:fb2b495dd94b02de8215625948132cc2ea360ae84fe6634cd19b6567709c8ae2"}, + {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:aa7d032c1d84726aa9edeb6accf079b4caa87151ca9fabacef31fa028186c66d"}, + {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:3d45864693351c15531f7e76f545ec35000d50848daa833cead96edae1665559"}, + {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21e90a288e6ba4bf44c25c6a946cb9b0f00b73044d74308b5e0afd190338297c"}, + {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:10250a093741ec7bf74bcd2039e697f519b028518f605ff2aa7ac1e9c9f97423"}, + {file = "regex-2023.5.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:6b8d0c153f07a953636b9cdb3011b733cadd4178123ef728ccc4d5969e67f3c2"}, + {file = "regex-2023.5.5-cp37-cp37m-win32.whl", hash = "sha256:10374c84ee58c44575b667310d5bbfa89fb2e64e52349720a0182c0017512f6c"}, + {file = "regex-2023.5.5-cp37-cp37m-win_amd64.whl", hash = "sha256:9b320677521aabf666cdd6e99baee4fb5ac3996349c3b7f8e7c4eee1c00dfe3a"}, + {file = "regex-2023.5.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:afb1c70ec1e594a547f38ad6bf5e3d60304ce7539e677c1429eebab115bce56e"}, + {file = "regex-2023.5.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cf123225945aa58b3057d0fba67e8061c62d14cc8a4202630f8057df70189051"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a99757ad7fe5c8a2bb44829fc57ced11253e10f462233c1255fe03888e06bc19"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a623564d810e7a953ff1357f7799c14bc9beeab699aacc8b7ab7822da1e952b8"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ced02e3bd55e16e89c08bbc8128cff0884d96e7f7a5633d3dc366b6d95fcd1d6"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d1cbe6b5be3b9b698d8cc4ee4dee7e017ad655e83361cd0ea8e653d65e469468"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a6e4b0e0531223f53bad07ddf733af490ba2b8367f62342b92b39b29f72735a"}, + {file = "regex-2023.5.5-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2e9c4f778514a560a9c9aa8e5538bee759b55f6c1dcd35613ad72523fd9175b8"}, + {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:256f7f4c6ba145f62f7a441a003c94b8b1af78cee2cccacfc1e835f93bc09426"}, + {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:bd7b68fd2e79d59d86dcbc1ccd6e2ca09c505343445daaa4e07f43c8a9cc34da"}, + {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4a5059bd585e9e9504ef9c07e4bc15b0a621ba20504388875d66b8b30a5c4d18"}, + {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:6893544e06bae009916a5658ce7207e26ed17385149f35a3125f5259951f1bbe"}, + {file = "regex-2023.5.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c64d5abe91a3dfe5ff250c6bb267ef00dbc01501518225b45a5f9def458f31fb"}, + {file = "regex-2023.5.5-cp38-cp38-win32.whl", hash = "sha256:7923470d6056a9590247ff729c05e8e0f06bbd4efa6569c916943cb2d9b68b91"}, + {file = "regex-2023.5.5-cp38-cp38-win_amd64.whl", hash = "sha256:4035d6945cb961c90c3e1c1ca2feb526175bcfed44dfb1cc77db4fdced060d3e"}, + {file = "regex-2023.5.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:50fd2d9b36938d4dcecbd684777dd12a407add4f9f934f235c66372e630772b0"}, + {file = "regex-2023.5.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:d19e57f888b00cd04fc38f5e18d0efbd91ccba2d45039453ab2236e6eec48d4d"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd966475e963122ee0a7118ec9024388c602d12ac72860f6eea119a3928be053"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db09e6c18977a33fea26fe67b7a842f706c67cf8bda1450974d0ae0dd63570df"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6164d4e2a82f9ebd7752a06bd6c504791bedc6418c0196cd0a23afb7f3e12b2d"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84397d3f750d153ebd7f958efaa92b45fea170200e2df5e0e1fd4d85b7e3f58a"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9c3efee9bb53cbe7b285760c81f28ac80dc15fa48b5fe7e58b52752e642553f1"}, + {file = "regex-2023.5.5-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:144b5b017646b5a9392a5554a1e5db0000ae637be4971c9747566775fc96e1b2"}, + {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:1189fbbb21e2c117fda5303653b61905aeeeea23de4a94d400b0487eb16d2d60"}, + {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f83fe9e10f9d0b6cf580564d4d23845b9d692e4c91bd8be57733958e4c602956"}, + {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:72aa4746993a28c841e05889f3f1b1e5d14df8d3daa157d6001a34c98102b393"}, + {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:de2f780c3242ea114dd01f84848655356af4dd561501896c751d7b885ea6d3a1"}, + {file = "regex-2023.5.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:290fd35219486dfbc00b0de72f455ecdd63e59b528991a6aec9fdfc0ce85672e"}, + {file = "regex-2023.5.5-cp39-cp39-win32.whl", hash = "sha256:732176f5427e72fa2325b05c58ad0b45af341c459910d766f814b0584ac1f9ac"}, + {file = "regex-2023.5.5-cp39-cp39-win_amd64.whl", hash = "sha256:1307aa4daa1cbb23823d8238e1f61292fd07e4e5d8d38a6efff00b67a7cdb764"}, + {file = "regex-2023.5.5.tar.gz", hash = "sha256:7d76a8a1fc9da08296462a18f16620ba73bcbf5909e42383b253ef34d9d5141e"}, +] + +[[package]] +name = "requests" +version = "2.30.0" +description = "Python HTTP for Humans." +optional = false +python-versions = ">=3.7" +files = [ + {file = "requests-2.30.0-py3-none-any.whl", hash = "sha256:10e94cc4f3121ee6da529d358cdaeaff2f1c409cd377dbc72b825852f2f7e294"}, + {file = "requests-2.30.0.tar.gz", hash = "sha256:239d7d4458afcb28a692cdd298d87542235f4ca8d36d03a15bfc128a6559a2f4"}, +] + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "requests-toolbelt" +version = "1.0.0" +description = "A utility belt for advanced users of python-requests" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, + {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, +] + +[package.dependencies] +requests = ">=2.0.1,<3.0.0" + +[[package]] +name = "rfc3986" +version = "2.0.0" +description = "Validating URI References per RFC 3986" +optional = false +python-versions = ">=3.7" +files = [ + {file = "rfc3986-2.0.0-py2.py3-none-any.whl", hash = "sha256:50b1502b60e289cb37883f3dfd34532b8873c7de9f49bb546641ce9cbd256ebd"}, + {file = "rfc3986-2.0.0.tar.gz", hash = "sha256:97aacf9dbd4bfd829baad6e6309fa6573aaf1be3f6fa735c8ab05e46cecb261c"}, +] + +[package.extras] +idna2008 = ["idna"] + +[[package]] +name = "rich" +version = "13.3.5" +description = "Render rich text, tables, progress bars, syntax highlighting, markdown and more to the terminal" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "rich-13.3.5-py3-none-any.whl", hash = "sha256:69cdf53799e63f38b95b9bf9c875f8c90e78dd62b2f00c13a911c7a3b9fa4704"}, + {file = "rich-13.3.5.tar.gz", hash = "sha256:2d11b9b8dd03868f09b4fffadc84a6a8cda574e40dc90821bd845720ebb8e89c"}, +] + +[package.dependencies] +markdown-it-py = ">=2.2.0,<3.0.0" +pygments = ">=2.13.0,<3.0.0" +typing-extensions = {version = ">=4.0.0,<5.0", markers = "python_version < \"3.9\""} + +[package.extras] +jupyter = ["ipywidgets (>=7.5.1,<9)"] + +[[package]] +name = "scikit-build" +version = "0.17.6" +description = "Improved build system generator for Python C/C++/Fortran/Cython extensions" +optional = false +python-versions = ">=3.7" +files = [ + {file = "scikit_build-0.17.6-py3-none-any.whl", hash = "sha256:18bd55e81841106eec93f30a297df4f301003791c41be46ef6428d58bd42d6b3"}, + {file = "scikit_build-0.17.6.tar.gz", hash = "sha256:b51a51a36b37c42650994b5047912f59b22e3210b23e321f287611f9ef6e5c9d"}, +] + +[package.dependencies] +distro = "*" +packaging = "*" +setuptools = ">=42.0.0" +tomli = {version = "*", markers = "python_version < \"3.11\""} +wheel = ">=0.32.0" + +[package.extras] +cov = ["coverage[toml] (>=4.2)", "pytest-cov (>=2.7.1)"] +docs = ["pygments", "sphinx (>=4)", "sphinx-issues", "sphinx-rtd-theme (>=1.0)", "sphinxcontrib-moderncmakedomain (>=3.19)"] +doctest = ["ubelt (>=0.8.2)", "xdoctest (>=0.10.0)"] +test = ["build (>=0.7)", "cython (>=0.25.1)", "importlib-metadata", "pytest (>=6.0.0)", "pytest-mock (>=1.10.4)", "requests", "virtualenv"] + +[[package]] +name = "secretstorage" +version = "3.3.3" +description = "Python bindings to FreeDesktop.org Secret Service API" +optional = false +python-versions = ">=3.6" +files = [ + {file = "SecretStorage-3.3.3-py3-none-any.whl", hash = "sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99"}, + {file = "SecretStorage-3.3.3.tar.gz", hash = "sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77"}, +] + +[package.dependencies] +cryptography = ">=2.0" +jeepney = ">=0.6" + +[[package]] +name = "setuptools" +version = "67.7.2" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +optional = false +python-versions = ">=3.7" +files = [ + {file = "setuptools-67.7.2-py3-none-any.whl", hash = "sha256:23aaf86b85ca52ceb801d32703f12d77517b2556af839621c641fca11287952b"}, + {file = "setuptools-67.7.2.tar.gz", hash = "sha256:f104fa03692a2602fa0fec6c6a9e63b6c8a968de13e17c026957dd1f53d80990"}, +] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-hoverxref (<2)", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (==0.8.3)", "sphinx-reredirects", "sphinxcontrib-towncrier"] +testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] +testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[[package]] +name = "sniffio" +version = "1.3.0" +description = "Sniff out which async library your code is running under" +optional = false +python-versions = ">=3.7" +files = [ + {file = "sniffio-1.3.0-py3-none-any.whl", hash = "sha256:eecefdce1e5bbfb7ad2eeaabf7c1eeb404d7757c379bd1f7e5cce9d8bf425384"}, + {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, +] + +[[package]] +name = "sse-starlette" +version = "1.6.1" +description = "\"SSE plugin for Starlette\"" +optional = true +python-versions = ">=3.8" +files = [ + {file = "sse-starlette-1.6.1.tar.gz", hash = "sha256:6208af2bd7d0887c92f1379da14bd1f4db56bd1274cc5d36670c683d2aa1de6a"}, + {file = "sse_starlette-1.6.1-py3-none-any.whl", hash = "sha256:d8f18f1c633e355afe61cc5e9c92eea85badcb8b2d56ec8cfb0a006994aa55da"}, +] + +[package.dependencies] +starlette = "*" + +[[package]] +name = "starlette" +version = "0.27.0" +description = "The little ASGI library that shines." +optional = true +python-versions = ">=3.7" +files = [ + {file = "starlette-0.27.0-py3-none-any.whl", hash = "sha256:918416370e846586541235ccd38a474c08b80443ed31c578a418e2209b3eef91"}, + {file = "starlette-0.27.0.tar.gz", hash = "sha256:6a6b0d042acb8d469a01eba54e9cda6cbd24ac602c4cd016723117d6a7e73b75"}, +] + +[package.dependencies] +anyio = ">=3.4.0,<5" +typing-extensions = {version = ">=3.10.0", markers = "python_version < \"3.10\""} + +[package.extras] +full = ["httpx (>=0.22.0)", "itsdangerous", "jinja2", "python-multipart", "pyyaml"] + +[[package]] +name = "tomli" +version = "2.0.1" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, + {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, +] + +[[package]] +name = "twine" +version = "4.0.2" +description = "Collection of utilities for publishing packages on PyPI" +optional = false +python-versions = ">=3.7" +files = [ + {file = "twine-4.0.2-py3-none-any.whl", hash = "sha256:929bc3c280033347a00f847236564d1c52a3e61b1ac2516c97c48f3ceab756d8"}, + {file = "twine-4.0.2.tar.gz", hash = "sha256:9e102ef5fdd5a20661eb88fad46338806c3bd32cf1db729603fe3697b1bc83c8"}, +] + +[package.dependencies] +importlib-metadata = ">=3.6" +keyring = ">=15.1" +pkginfo = ">=1.8.1" +readme-renderer = ">=35.0" +requests = ">=2.20" +requests-toolbelt = ">=0.8.0,<0.9.0 || >0.9.0" +rfc3986 = ">=1.4.0" +rich = ">=12.0.0" +urllib3 = ">=1.26.0" + +[[package]] +name = "typing-extensions" +version = "4.7.1" +description = "Backported and Experimental Type Hints for Python 3.7+" +optional = false +python-versions = ">=3.7" +files = [ + {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, + {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, +] + +[[package]] +name = "urllib3" +version = "2.0.2" +description = "HTTP library with thread-safe connection pooling, file post, and more." +optional = false +python-versions = ">=3.7" +files = [ + {file = "urllib3-2.0.2-py3-none-any.whl", hash = "sha256:d055c2f9d38dc53c808f6fdc8eab7360b6fdbbde02340ed25cfbcd817c62469e"}, + {file = "urllib3-2.0.2.tar.gz", hash = "sha256:61717a1095d7e155cdb737ac7bb2f4324a858a1e2e6466f6d03ff630ca68d3cc"}, +] + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17.1.0)", "urllib3-secure-extra"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + +[[package]] +name = "uvicorn" +version = "0.22.0" +description = "The lightning-fast ASGI server." +optional = true +python-versions = ">=3.7" +files = [ + {file = "uvicorn-0.22.0-py3-none-any.whl", hash = "sha256:e9434d3bbf05f310e762147f769c9f21235ee118ba2d2bf1155a7196448bd996"}, + {file = "uvicorn-0.22.0.tar.gz", hash = "sha256:79277ae03db57ce7d9aa0567830bbb51d7a612f54d6e1e3e92da3ef24c2c8ed8"}, +] + +[package.dependencies] +click = ">=7.0" +h11 = ">=0.8" + +[package.extras] +standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"] + +[[package]] +name = "watchdog" +version = "3.0.0" +description = "Filesystem events monitoring" +optional = false +python-versions = ">=3.7" +files = [ + {file = "watchdog-3.0.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:336adfc6f5cc4e037d52db31194f7581ff744b67382eb6021c868322e32eef41"}, + {file = "watchdog-3.0.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a70a8dcde91be523c35b2bf96196edc5730edb347e374c7de7cd20c43ed95397"}, + {file = "watchdog-3.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:adfdeab2da79ea2f76f87eb42a3ab1966a5313e5a69a0213a3cc06ef692b0e96"}, + {file = "watchdog-3.0.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2b57a1e730af3156d13b7fdddfc23dea6487fceca29fc75c5a868beed29177ae"}, + {file = "watchdog-3.0.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7ade88d0d778b1b222adebcc0927428f883db07017618a5e684fd03b83342bd9"}, + {file = "watchdog-3.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7e447d172af52ad204d19982739aa2346245cc5ba6f579d16dac4bfec226d2e7"}, + {file = "watchdog-3.0.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:9fac43a7466eb73e64a9940ac9ed6369baa39b3bf221ae23493a9ec4d0022674"}, + {file = "watchdog-3.0.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:8ae9cda41fa114e28faf86cb137d751a17ffd0316d1c34ccf2235e8a84365c7f"}, + {file = "watchdog-3.0.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:25f70b4aa53bd743729c7475d7ec41093a580528b100e9a8c5b5efe8899592fc"}, + {file = "watchdog-3.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4f94069eb16657d2c6faada4624c39464f65c05606af50bb7902e036e3219be3"}, + {file = "watchdog-3.0.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7c5f84b5194c24dd573fa6472685b2a27cc5a17fe5f7b6fd40345378ca6812e3"}, + {file = "watchdog-3.0.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3aa7f6a12e831ddfe78cdd4f8996af9cf334fd6346531b16cec61c3b3c0d8da0"}, + {file = "watchdog-3.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:233b5817932685d39a7896b1090353fc8efc1ef99c9c054e46c8002561252fb8"}, + {file = "watchdog-3.0.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:13bbbb462ee42ec3c5723e1205be8ced776f05b100e4737518c67c8325cf6100"}, + {file = "watchdog-3.0.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:8f3ceecd20d71067c7fd4c9e832d4e22584318983cabc013dbf3f70ea95de346"}, + {file = "watchdog-3.0.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:c9d8c8ec7efb887333cf71e328e39cffbf771d8f8f95d308ea4125bf5f90ba64"}, + {file = "watchdog-3.0.0-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0e06ab8858a76e1219e68c7573dfeba9dd1c0219476c5a44d5333b01d7e1743a"}, + {file = "watchdog-3.0.0-py3-none-manylinux2014_armv7l.whl", hash = "sha256:d00e6be486affb5781468457b21a6cbe848c33ef43f9ea4a73b4882e5f188a44"}, + {file = "watchdog-3.0.0-py3-none-manylinux2014_i686.whl", hash = "sha256:c07253088265c363d1ddf4b3cdb808d59a0468ecd017770ed716991620b8f77a"}, + {file = "watchdog-3.0.0-py3-none-manylinux2014_ppc64.whl", hash = "sha256:5113334cf8cf0ac8cd45e1f8309a603291b614191c9add34d33075727a967709"}, + {file = "watchdog-3.0.0-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:51f90f73b4697bac9c9a78394c3acbbd331ccd3655c11be1a15ae6fe289a8c83"}, + {file = "watchdog-3.0.0-py3-none-manylinux2014_s390x.whl", hash = "sha256:ba07e92756c97e3aca0912b5cbc4e5ad802f4557212788e72a72a47ff376950d"}, + {file = "watchdog-3.0.0-py3-none-manylinux2014_x86_64.whl", hash = "sha256:d429c2430c93b7903914e4db9a966c7f2b068dd2ebdd2fa9b9ce094c7d459f33"}, + {file = "watchdog-3.0.0-py3-none-win32.whl", hash = "sha256:3ed7c71a9dccfe838c2f0b6314ed0d9b22e77d268c67e015450a29036a81f60f"}, + {file = "watchdog-3.0.0-py3-none-win_amd64.whl", hash = "sha256:4c9956d27be0bb08fc5f30d9d0179a855436e655f046d288e2bcc11adfae893c"}, + {file = "watchdog-3.0.0-py3-none-win_ia64.whl", hash = "sha256:5d9f3a10e02d7371cd929b5d8f11e87d4bad890212ed3901f9b4d68767bee759"}, + {file = "watchdog-3.0.0.tar.gz", hash = "sha256:4d98a320595da7a7c5a18fc48cb633c2e73cda78f93cac2ef42d42bf609a33f9"}, +] + +[package.extras] +watchmedo = ["PyYAML (>=3.10)"] + +[[package]] +name = "webencodings" +version = "0.5.1" +description = "Character encoding aliases for legacy web content" +optional = false +python-versions = "*" +files = [ + {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, + {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, +] + +[[package]] +name = "wheel" +version = "0.40.0" +description = "A built-package format for Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "wheel-0.40.0-py3-none-any.whl", hash = "sha256:d236b20e7cb522daf2390fa84c55eea81c5c30190f90f29ae2ca1ad8355bf247"}, + {file = "wheel-0.40.0.tar.gz", hash = "sha256:cd1196f3faee2b31968d626e1731c94f99cbdb67cf5a46e4f5656cbee7738873"}, +] + +[package.extras] +test = ["pytest (>=6.0.0)"] + +[[package]] +name = "zipp" +version = "3.15.0" +description = "Backport of pathlib-compatible object wrapper for zip files" +optional = false +python-versions = ">=3.7" +files = [ + {file = "zipp-3.15.0-py3-none-any.whl", hash = "sha256:48904fc76a60e542af151aded95726c1a5c34ed43ab4134b597665c86d7ad556"}, + {file = "zipp-3.15.0.tar.gz", hash = "sha256:112929ad649da941c23de50f356a2b5570c954b65150642bccdd66bf194d224b"}, +] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] + +[extras] +server = ["fastapi", "sse-starlette", "uvicorn"] + +[metadata] +lock-version = "2.0" +python-versions = "^3.8.1" +content-hash = "ed454fad4bd4ea920624c1bcdf2beb74bdb8e9394c22156234c8bc0fde770bd8" diff --git a/poetry.toml b/poetry.toml new file mode 100644 index 000000000..be97f1ef2 --- /dev/null +++ b/poetry.toml @@ -0,0 +1,3 @@ +[virtualenvs] +in-project = true +prefer-active-python = true \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..196aaedcb --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,44 @@ +[tool.poetry] +name = "falcon_cpp_python" +version = "0.0.1" +description = "Python bindings for the ggllm.cpp library" +authors = ["Jonathan Levin "] +license = "MIT" +readme = "README.md" +homepage = "https://github.com/abetlen/falcon-cpp-python" +repository = "https://github.com/abetlen/falcon-cpp-python" +packages = [{include = "falcon_cpp"}] +include = [ + "LICENSE.md", +] + +[tool.poetry.dependencies] +python = "^3.8.1" +typing-extensions = "^4.7.1" +numpy = "^1.24.4" +diskcache = "^5.6.1" +uvicorn = { version = "^0.22.0", optional = true } +fastapi = { version = "^0.99.1", optional = true } +sse-starlette = { version = "^1.6.1", optional = true } + +[tool.poetry.group.dev.dependencies] +black = "^23.3.0" +twine = "^4.0.2" +mkdocs = "^1.4.3" +mkdocstrings = {extras = ["python"], version = "^0.22.0"} +mkdocs-material = "^9.1.17" +pytest = "^7.4.0" +httpx = "^0.24.1" +scikit-build = "0.17.6" + +[tool.poetry.extras] +server = ["uvicorn", "fastapi", "sse-starlette"] + +[build-system] +requires = [ + "setuptools>=42", + "scikit-build>=0.13", + "cmake>=3.18", + "ninja", +] +build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..4cc1ad765 --- /dev/null +++ b/setup.py @@ -0,0 +1,32 @@ +from skbuild import setup + +from pathlib import Path + +this_directory = Path(__file__).parent +long_description = (this_directory / "README.md").read_text(encoding="utf-8") + +setup( + name="falcon_cpp_python", + description="A Python wrapper for ggllm.cpp to run Falcon models", + long_description=long_description, + long_description_content_type="text/markdown", + version="0.0.1", + author="Siraj Levin", + author_email="sirajperson@gmail.com", + license="MIT", + package_dir={"falcon_cpp": "falcon_cpp", "falcon_cpp.server": "falcon_cpp/server"}, + packages=["falcon_cpp", "falcon_cpp.server"], + install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], + extras_require={ + "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], + }, + python_requires=">=3.7", + classifiers=[ + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + ], +) diff --git a/tests/test_falcon.py b/tests/test_falcon.py new file mode 100644 index 000000000..d162cc6d6 --- /dev/null +++ b/tests/test_falcon.py @@ -0,0 +1,171 @@ +import falcon_cpp + +MODEL = "./vendor/ggllm/models/ggml-vocab.bin" + + +def test_falcon(): + falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) + + assert falcon + assert falcon.ctx is not None + + text = b"Hello World" + + assert falcon.detokenize(falcon.tokenize(text)) == text + + +# @pytest.mark.skip(reason="need to update sample mocking") +def test_falcon_patch(monkeypatch): + falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) + n_vocab = falcon_cpp.falcon_n_vocab(falcon.ctx) + + ## Set up mock function + def mock_eval(*args, **kwargs): + return 0 + + def mock_get_logits(*args, **kwargs): + return (falcon_cpp.c_float * n_vocab)( + *[falcon_cpp.c_float(0) for _ in range(n_vocab)] + ) + + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_eval", mock_eval) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_get_logits", mock_get_logits) + + output_text = " jumps over the lazy dog." + output_tokens = falcon.tokenize(output_text.encode("utf-8")) + token_eos = falcon.token_eos() + n = 0 + + def mock_sample(*args, **kwargs): + nonlocal n + if n < len(output_tokens): + n += 1 + return output_tokens[n - 1] + else: + return token_eos + + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_cpp_sample_token", mock_sample) + + text = "The quick brown fox" + + ## Test basic completion until eos + n = 0 # reset + completion = falcon.create_completion(text, max_tokens=20) + assert completion["choices"][0]["text"] == output_text + assert completion["choices"][0]["finish_reason"] == "stop" + + ## Test streaming completion until eos + n = 0 # reset + chunks = falcon.create_completion(text, max_tokens=20, stream=True) + assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == output_text + assert completion["choices"][0]["finish_reason"] == "stop" + + ## Test basic completion until stop sequence + n = 0 # reset + completion = falcon.create_completion(text, max_tokens=20, stop=["lazy"]) + assert completion["choices"][0]["text"] == " jumps over the " + assert completion["choices"][0]["finish_reason"] == "stop" + + ## Test streaming completion until stop sequence + n = 0 # reset + chunks = falcon.create_completion(text, max_tokens=20, stream=True, stop=["lazy"]) + assert ( + "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps over the " + ) + assert completion["choices"][0]["finish_reason"] == "stop" + + ## Test basic completion until length + n = 0 # reset + completion = falcon.create_completion(text, max_tokens=2) + assert completion["choices"][0]["text"] == " j" + assert completion["choices"][0]["finish_reason"] == "length" + + ## Test streaming completion until length + n = 0 # reset + chunks = falcon.create_completion(text, max_tokens=2, stream=True) + assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == " j" + assert completion["choices"][0]["finish_reason"] == "length" + + +def test_falcon_pickle(): + import pickle + import tempfile + + fp = tempfile.TemporaryFile() + falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) + pickle.dump(falcon, fp) + fp.seek(0) + falcon = pickle.load(fp) + + assert falcon + assert falcon.ctx is not None + + text = b"Hello World" + + assert falcon.detokenize(falcon.tokenize(text)) == text + + +def test_utf8(monkeypatch): + falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) + n_vocab = falcon_cpp.falcon_n_vocab(falcon.ctx) + + ## Set up mock function + def mock_eval(*args, **kwargs): + return 0 + + def mock_get_logits(*args, **kwargs): + return (falcon_cpp.c_float * n_vocab)( + *[falcon_cpp.c_float(0) for _ in range(n_vocab)] + ) + + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_eval", mock_eval) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_get_logits", mock_get_logits) + + output_text = "😀" + output_tokens = falcon.tokenize(output_text.encode("utf-8")) + token_eos = falcon.token_eos() + n = 0 + + def mock_sample(*args, **kwargs): + nonlocal n + if n < len(output_tokens): + n += 1 + return output_tokens[n - 1] + else: + return token_eos + + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_sample_token", mock_sample) + + ## Test basic completion with utf8 multibyte + n = 0 # reset + completion = falcon.create_completion("", max_tokens=4) + assert completion["choices"][0]["text"] == output_text + + ## Test basic completion with incomplete utf8 multibyte + n = 0 # reset + completion = falcon.create_completion("", max_tokens=1) + assert completion["choices"][0]["text"] == "" + + +def test_falcon_server(): + from fastapi.testclient import TestClient + from falcon_cpp.server.app import create_app, Settings + + settings = Settings( + model=MODEL, + vocab_only=True, + ) + app = create_app(settings) + client = TestClient(app) + response = client.get("/v1/models") + assert response.json() == { + "object": "list", + "data": [ + { + "id": MODEL, + "object": "model", + "owned_by": "me", + "permissions": [], + } + ], + } diff --git a/vendor/ggllm.cpp b/vendor/ggllm.cpp new file mode 160000 index 000000000..1b55412ec --- /dev/null +++ b/vendor/ggllm.cpp @@ -0,0 +1 @@ +Subproject commit 1b55412eca2dfcceb3eb173e9734a104274c39a0 From bb3b70be902701a22259fe50aa734c04b425a7bf Mon Sep 17 00:00:00 2001 From: siraj Date: Tue, 18 Jul 2023 03:55:29 -0400 Subject: [PATCH 06/14] Update Build --- .gitmodules | 2 +- CMakeLists.txt | 50 ++--- falcon_cpp/falcon.py | 387 ++++++++++++++++++++------------------- falcon_cpp/falcon_cpp.py | 5 +- vendor/ggllm.cpp | 2 +- 5 files changed, 218 insertions(+), 228 deletions(-) diff --git a/.gitmodules b/.gitmodules index cdbef1424..eeadc3d38 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "ggllm.cpp"] path = ggllm.cpp - url = https://github.com/cmp-nct/ggllm.cpp + url = https://github.com/sirajperson/ggllm.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ddc8ba9b6..7e1faac42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,33 +2,23 @@ cmake_minimum_required(VERSION 3.4...3.22) project(falcon_cpp) -option(FORCE_CMAKE "Force CMake build of Python bindings" OFF) - -set(FORCE_CMAKE $ENV{FORCE_CMAKE}) - -if (UNIX AND NOT FORCE_CMAKE) - add_custom_command( - OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libllama.so - COMMAND make libllama.so - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp - ) - add_custom_target( - run ALL - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libllama.so - ) - install( - FILES ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libllama.so - DESTINATION llama_cpp - ) -else() - set(BUILD_SHARED_LIBS "On") - add_subdirectory(vendor/ggllm.cpp) - install( - TARGETS ggllm - LIBRARY DESTINATION falcon_cpp - RUNTIME DESTINATION falcon_cpp - ARCHIVE DESTINATION falcon_cpp - FRAMEWORK DESTINATION falcon_cpp - RESOURCE DESTINATION falcon_cpp - ) -endif() +# Build shared libraries using custom command +add_custom_command( + OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libllama.so + COMMAND cmake -DLLAMA_CUBLAS=1 -DCUDAToolkit_ROOT=/usr/local/cuda/ -DBUILD_SHARED_LIBS=on ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp + COMMAND make + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp +) +add_custom_target( + build_shared_libs ALL + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libllama.so +) +# Install shared libraries +install( + FILES + ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libcmpnct_unicode.so + ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libggml_shared.so + ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libfalcon.so + ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libllama.so + DESTINATION falcon_cpp +) diff --git a/falcon_cpp/falcon.py b/falcon_cpp/falcon.py index 40b662f23..010586cd9 100644 --- a/falcon_cpp/falcon.py +++ b/falcon_cpp/falcon.py @@ -39,8 +39,8 @@ def cache_size(self) -> int: raise NotImplementedError def _find_longest_prefix_key( - self, - key: Tuple[int, ...], + self, + key: Tuple[int, ...], ) -> Optional[Tuple[int, ...]]: pass @@ -70,8 +70,8 @@ def cache_size(self): return sum([state.falcon_state_size for state in self.cache_state.values()]) def _find_longest_prefix_key( - self, - key: Tuple[int, ...], + self, + key: Tuple[int, ...], ) -> Optional[Tuple[int, ...]]: min_len = 0 min_key = None @@ -113,7 +113,7 @@ class FalconDiskCache(BaseFalconCache): """Cache for a falcon.cpp model using disk.""" def __init__( - self, cache_dir: str = ".cache/falcon_cache", capacity_bytes: int = (2 << 30) + self, cache_dir: str = ".cache/falcon_cache", capacity_bytes: int = (2 << 30) ): super().__init__(capacity_bytes) self.cache = diskcache.Cache(cache_dir) @@ -123,8 +123,8 @@ def cache_size(self): return int(self.cache.volume()) # type: ignore def _find_longest_prefix_key( - self, - key: Tuple[int, ...], + self, + key: Tuple[int, ...], ) -> Optional[Tuple[int, ...]]: min_len = 0 min_key: Optional[Tuple[int, ...]] = None @@ -165,12 +165,12 @@ def __setitem__(self, key: Sequence[int], value: "FalconState"): class FalconState: def __init__( - self, - input_ids: npt.NDArray[np.intc], - scores: npt.NDArray[np.single], - n_tokens: int, - falcon_state: bytes, - falcon_state_size: int, + self, + input_ids: npt.NDArray[np.intc], + scores: npt.NDArray[np.single], + n_tokens: int, + falcon_state: bytes, + falcon_state_size: int, ): self.input_ids = input_ids self.scores = scores @@ -201,27 +201,27 @@ class Falcon: """High-level Python wrapper for a falcon.cpp model.""" def __init__( - self, - model_path: str, - # NOTE: These parameters are likely to change in the future. - n_ctx: int = 512, - n_parts: int = -1, - n_gpu_layers: int = 0, - seed: int = 1337, - f16_kv: bool = True, - logits_all: bool = False, - vocab_only: bool = False, - use_mmap: bool = True, - use_mlock: bool = False, - embedding: bool = False, - n_threads: Optional[int] = None, - n_batch: int = 512, - last_n_tokens_size: int = 64, - lora_base: Optional[str] = None, - lora_path: Optional[str] = None, - low_vram: bool = False, - verbose: bool = True, - ): + self, + model_path: str, + # NOTE: These parameters are likely to change in the future. + n_ctx: int = 512, + n_parts: int = -1, + n_gpu_layers: int = 0, + seed: int = 1337, + f16_kv: bool = True, + logits_all: bool = False, + vocab_only: bool = False, + use_mmap: bool = True, + use_mlock: bool = False, + embedding: bool = False, + n_threads: Optional[int] = None, + n_batch: int = 512, + last_n_tokens_size: int = 64, + lora_base: Optional[str] = None, + lora_path: Optional[str] = None, + low_vram: bool = False, + verbose: bool = True, + ): # TODO: Add the parameters for ''' @@ -242,7 +242,7 @@ def __init__( ''' """Load a Falcon model from `model_path`. - + Args: model_path: Path to the model. n_ctx: Maximum context size. @@ -260,10 +260,10 @@ def __init__( lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. lora_path: Path to a LoRA file to apply to the model. verbose: Print verbose output to stderr. - + Raises: ValueError: If the model path does not exist. - + Returns: A falcon instance. """ @@ -445,7 +445,7 @@ def eval(self, tokens: Sequence[int]): assert self.ctx is not None n_ctx = self._n_ctx for i in range(0, len(tokens), self.n_batch): - batch = tokens[i : min(len(tokens), i + self.n_batch)] + batch = tokens[i: min(len(tokens), i + self.n_batch)] n_past = min(n_ctx - len(batch), len(self._input_ids)) n_tokens = len(batch) return_code = falcon_cpp.falcon_eval( @@ -458,31 +458,32 @@ def eval(self, tokens: Sequence[int]): if return_code != 0: raise RuntimeError(f"falcon_eval returned {return_code}") # Save tokens - self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch + self.input_ids[self.n_tokens: self.n_tokens + n_tokens] = batch # Save logits rows = n_tokens if self.params.logits_all else 1 cols = self._n_vocab - offset = 0 if self.params.logits_all else n_tokens - 1 # NOTE: Only save the last token logits if logits_all is False - self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[:] = falcon_cpp.falcon_get_logits(self.ctx)[:rows * cols] + offset = 0 if self.params.logits_all else n_tokens - 1 # NOTE: Only save the last token logits if logits_all is False + self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[ + :] = falcon_cpp.falcon_get_logits(self.ctx)[:rows * cols] # Update n_tokens self.n_tokens += n_tokens def _sample( - self, - last_n_tokens_data, # type: falcon_cpp.Array[falcon_cpp.falcon_token] - last_n_tokens_size: falcon_cpp.c_int, - top_k: falcon_cpp.c_int, - top_p: falcon_cpp.c_float, - temp: falcon_cpp.c_float, - tfs_z: falcon_cpp.c_float, - repeat_penalty: falcon_cpp.c_float, - frequency_penalty: falcon_cpp.c_float, - presence_penalty: falcon_cpp.c_float, - mirostat_mode: falcon_cpp.c_int, - mirostat_tau: falcon_cpp.c_float, - mirostat_eta: falcon_cpp.c_float, - penalize_nl: bool = True, - logits_processor: Optional[LogitsProcessorList] = None, + self, + last_n_tokens_data, # type: falcon_cpp.Array[falcon_cpp.falcon_token] + last_n_tokens_size: falcon_cpp.c_int, + top_k: falcon_cpp.c_int, + top_p: falcon_cpp.c_float, + temp: falcon_cpp.c_float, + tfs_z: falcon_cpp.c_float, + repeat_penalty: falcon_cpp.c_float, + frequency_penalty: falcon_cpp.c_float, + presence_penalty: falcon_cpp.c_float, + mirostat_mode: falcon_cpp.c_int, + mirostat_tau: falcon_cpp.c_float, + mirostat_eta: falcon_cpp.c_float, + penalize_nl: bool = True, + logits_processor: Optional[LogitsProcessorList] = None, ): assert self.ctx is not None assert self.n_tokens > 0 @@ -600,19 +601,19 @@ def _sample( ) def sample( - self, - top_k: int = 40, - top_p: float = 0.95, - temp: float = 0.80, - repeat_penalty: float = 1.1, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_eta: float = 0.1, - mirostat_tau: float = 5.0, - penalize_nl: bool = True, - logits_processor: Optional[LogitsProcessorList] = None, + self, + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_eta: float = 0.1, + mirostat_tau: float = 5.0, + penalize_nl: bool = True, + logits_processor: Optional[LogitsProcessorList] = None, ): """Sample a token from the model. @@ -628,7 +629,7 @@ def sample( assert self.ctx is not None last_n_tokens_data = [falcon_cpp.falcon_token(0)] * max( 0, self.last_n_tokens_size - len(self._input_ids) - ) + self._input_ids[-self.last_n_tokens_size :].tolist() + ) + self._input_ids[-self.last_n_tokens_size:].tolist() return self._sample( last_n_tokens_data=(falcon_cpp.falcon_token * self.last_n_tokens_size)( *last_n_tokens_data @@ -649,21 +650,21 @@ def sample( ) def generate( - self, - tokens: Sequence[int], - top_k: int = 40, - top_p: float = 0.95, - temp: float = 0.80, - repeat_penalty: float = 1.1, - reset: bool = True, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, + self, + tokens: Sequence[int], + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, + reset: bool = True, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. @@ -719,7 +720,7 @@ def generate( logits_processor=logits_processor, ) if stopping_criteria is not None and stopping_criteria( - self._input_ids.tolist(), self._scores[-1, :].tolist() + self._input_ids.tolist(), self._scores[-1, :].tolist() ): return tokens_or_none = yield token @@ -728,7 +729,7 @@ def generate( tokens.extend(tokens_or_none) def create_embedding( - self, input: Union[str, List[str]], model: Optional[str] = None + self, input: Union[str, List[str]], model: Optional[str] = None ) -> Embedding: """Embed a string. @@ -763,8 +764,8 @@ def create_embedding( n_tokens = len(tokens) total_tokens += n_tokens embedding = falcon_cpp.falcon_get_embeddings(self.ctx)[ - : falcon_cpp.falcon_n_embd(self.ctx) - ] + : falcon_cpp.falcon_n_embd(self.ctx) + ] data.append( { @@ -798,27 +799,27 @@ def embed(self, input: str) -> List[float]: return list(map(float, self.create_embedding(input)["data"][0]["embedding"])) def _create_completion( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 16, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 16, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None @@ -879,19 +880,19 @@ def _create_completion( finish_reason = "length" multibyte_fix = 0 for token in self.generate( - prompt_tokens, - top_k=top_k, - top_p=top_p, - temp=temperature, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - repeat_penalty=repeat_penalty, - stopping_criteria=stopping_criteria, - logits_processor=logits_processor, + prompt_tokens, + top_k=top_k, + top_p=top_p, + temp=temperature, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, + repeat_penalty=repeat_penalty, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, ): if token == self._token_eos: text = self.detokenize(completion_tokens) @@ -943,7 +944,7 @@ def _create_completion( token_end_position += len(self.detokenize([token])) # Check if stop sequence is in the token if token_end_position >= ( - remaining_length - first_stop_position - 1 + remaining_length - first_stop_position - 1 ): break logprobs_or_none: Optional[CompletionLogprobs] = None @@ -1004,7 +1005,7 @@ def _create_completion( break if stopping_criteria is not None and stopping_criteria( - self._input_ids.tolist(), self._scores[-1, :].tolist() + self._input_ids.tolist(), self._scores[-1, :].tolist() ): text = self.detokenize(completion_tokens) finish_reason = "stop" @@ -1069,8 +1070,8 @@ def _create_completion( "choices": [ { "text": last_text[ - : len(last_text) - (token_end_position - end) - ].decode("utf-8", errors="ignore"), + : len(last_text) - (token_end_position - end) + ].decode("utf-8", errors="ignore"), "index": 0, "logprobs": logprobs_or_none, "finish_reason": finish_reason, @@ -1137,10 +1138,10 @@ def _create_completion( for token in all_tokens ] all_logprobs = [ - Falcon.logits_to_logprobs(row.tolist()) for row in self._scores - ][token_offset:] + Falcon.logits_to_logprobs(row.tolist()) for row in self._scores + ][token_offset:] for token, token_str, logprobs_token in zip( - all_tokens, all_token_strs, all_logprobs + all_tokens, all_token_strs, all_logprobs ): text_offsets.append(text_offset) text_offset += len(token_str) @@ -1191,27 +1192,27 @@ def _create_completion( } def create_completion( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 128, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 128, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -1264,27 +1265,27 @@ def create_completion( return completion def __call__( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 128, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 128, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -1332,7 +1333,7 @@ def __call__( ) def _convert_text_completion_to_chat( - self, completion: Completion + self, completion: Completion ) -> ChatCompletion: return { "id": "chat" + completion["id"], @@ -1353,8 +1354,8 @@ def _convert_text_completion_to_chat( } def _convert_text_completion_chunks_to_chat( - self, - chunks: Iterator[CompletionChunk], + self, + chunks: Iterator[CompletionChunk], ) -> Iterator[ChatCompletionChunk]: for i, chunk in enumerate(chunks): if i == 0: @@ -1390,23 +1391,23 @@ def _convert_text_completion_chunks_to_chat( } def create_chat_completion( - self, - messages: List[ChatCompletionMessage], - temperature: float = 0.2, - top_p: float = 0.95, - top_k: int = 40, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - max_tokens: int = 256, - presence_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + messages: List[ChatCompletionMessage], + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + max_tokens: int = 256, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. @@ -1619,4 +1620,4 @@ def decode(self, tokens: List[int]) -> str: @classmethod def from_ggml_file(cls, path: str) -> "FalconTokenizer": - return cls(Falcon(model_path=path, vocab_only=True)) + return cls(Falcon(model_path=path, vocab_only=True)) \ No newline at end of file diff --git a/falcon_cpp/falcon_cpp.py b/falcon_cpp/falcon_cpp.py index 78297adc4..121b98c96 100644 --- a/falcon_cpp/falcon_cpp.py +++ b/falcon_cpp/falcon_cpp.py @@ -71,7 +71,7 @@ def _load_shared_library(lib_base_name: str): # Specify the base name of the shared library to load -_lib_base_name = "llama" +_lib_base_name = "falcon" # Load the library _lib = _load_shared_library(_lib_base_name) @@ -132,7 +132,6 @@ class falcon_token_data(Structure): falcon_token_data_p = POINTER(falcon_token_data) - # typedef struct falcon_token_data_array { # falcon_token_data * data; # size_t size; @@ -1021,4 +1020,4 @@ def falcon_print_system_info() -> bytes: if not _falcon_initialized: falcon_init_backend(c_bool(False)) - _falcon_initialized = True + _falcon_initialized = True \ No newline at end of file diff --git a/vendor/ggllm.cpp b/vendor/ggllm.cpp index 1b55412ec..8c019b677 160000 --- a/vendor/ggllm.cpp +++ b/vendor/ggllm.cpp @@ -1 +1 @@ -Subproject commit 1b55412eca2dfcceb3eb173e9734a104274c39a0 +Subproject commit 8c019b67757538e7750cd30640fd00bbe8bc30de From ab2cab572f5d08cf6022c37174a7469cf161ae9f Mon Sep 17 00:00:00 2001 From: siraj Date: Tue, 18 Jul 2023 04:16:11 -0400 Subject: [PATCH 07/14] Update README.md --- README.md | 115 ++++++++++-------------------------------------------- 1 file changed, 21 insertions(+), 94 deletions(-) diff --git a/README.md b/README.md index fb652a925..5e7f44e97 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,7 @@ -# 🦙 Python Bindings for `llama.cpp` +# Python Bindings for `ggllm.cpp` -[![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest) -[![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml) -[![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library. +Simple Python bindings for [`ggllm.cpp`](https://github.com/cmp-nct/ggllm.cpp) library. This package provides: - Low-level access to C API via `ctypes` interface. @@ -15,64 +9,9 @@ This package provides: - OpenAI-like API - LangChain compatibility -Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest). +This project is currently in alpha development and is not yet completely functional. Any contributions are warmly welcomed. -## Installation from PyPI (recommended) - -Install from PyPI (requires a c compiler): - -```bash -pip install llama-cpp-python -``` - -The above command will attempt to install the package and build `llama.cpp` from source. -This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. - -If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly: - -```bash -pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir -``` - -Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example: -``` -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh -bash Miniforge3-MacOSX-arm64.sh -``` -Otherwise, while installing it will build the llama.ccp x86 version which will be 10x slower on Apple Silicon (M1) Mac. - -### Installation with OpenBLAS / cuBLAS / CLBlast / Metal - -`llama.cpp` supports multiple BLAS backends for faster processing. -Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend. - -To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python -``` - -To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python -``` - -To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python -``` - -To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python -``` - -Detailed MacOS Metal GPU install documentation is available at [docs/install/macos.md](docs/install/macos.md) - ## High-level API The high-level API provides a simple managed interface through the `Llama` class. @@ -80,8 +19,8 @@ The high-level API provides a simple managed interface through the `Llama` class Below is a short example demonstrating how to use the high-level API to generate text: ```python ->>> from llama_cpp import Llama ->>> llm = Llama(model_path="./models/7B/ggml-model.bin") +>>> from falcon_cpp import Falcon +>>> llm = Falcon(model_path="./models/7B/ggml-model.bin") >>> output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True) >>> print(output) { @@ -107,57 +46,45 @@ Below is a short example demonstrating how to use the high-level API to generate ## Web Server -`llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. -This allows you to use llama.cpp compatible models with any OpenAI compatible client (language libraries, services, etc). +`falcon-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. +This allows you to use ggllm.cpp to inference falcon models with any OpenAI compatible client (language libraries, services, etc). To install the server package and get started: ```bash -pip install llama-cpp-python[server] python3 -m llama_cpp.server --model models/7B/ggml-model.bin ``` Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. -## Docker image - -A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server: - -```bash -docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest -``` - ## Low-level API The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`. -The entire lowe-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). +The entire lowe-level API can be found in [falcon_cpp/falcon_cpp.py](https://github.com/sirajperson/falcon-cpp-python/blob/master/falcon_cpp/falcon_cpp.py) and directly mirrors the C API in [libfalcon.h](https://github.com/cmp-nct/ggllm.cpp/blob/master/libfalcon.h). Below is a short example demonstrating how to use the low-level API to tokenize a prompt: ```python ->>> import llama_cpp +>>> import falcon_cpp >>> import ctypes ->>> params = llama_cpp.llama_context_default_params() +>>> params = falcon_cpp.falcon_context_default_params() # use bytes for char * params ->>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params) +>>> ctx = falcon_cpp.falcon_init_backend("./models/7b/ggml-model.bin", params) >>> max_tokens = params.n_ctx # use ctypes arrays for array params ->>> tokens = (llama_cpp.llama_token * int(max_tokens))() ->>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True)) ->>> llama_cpp.llama_free(ctx) +>>> tokens = (falcon_cpp.falcon_token * int(max_tokens))() +>>> n_tokens = falcon_cpp.falcon_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True)) +>>> falcon_cpp.falcon_free(ctx) ``` Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API. - # Documentation - -Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). -If you find any issues with the documentation, please open an issue or submit a PR. +Coming soon... # Development -This package is under active development and I welcome any contributions. +Again, this package is under active development and I welcome any contributions. To get started, clone the repository and install the package in development mode: @@ -179,12 +106,12 @@ poetry install --all-extras python3 setup.py develop ``` -# How does this compare to other Python bindings of `llama.cpp`? - -I originally wrote this package for my own use with two goals in mind: +# This Project is a fork of llama-cpp-python -- Provide a simple process to install `llama.cpp` and access the full C API in `llama.h` from Python -- Provide a high-level Python API that can be used as a drop-in replacement for the OpenAI API so existing apps can be easily ported to use `llama.cpp` +This project was originally llama-cpp-python and owes an immense thanks to @abetlen. +This projects goal is to +- Provide a simple process to install `ggllm.cpp` and access the full C API in `libfalcon.h` from Python +- Provide a high-level Python API that can be used as a drop-in replacement for the OpenAI API so existing apps can be easily ported to use `ggllm.cpp` Any contributions and changes to this package will be made with these goals in mind. From bb711ba9ccb5d413faf74049c3c7e7241d49f635 Mon Sep 17 00:00:00 2001 From: siraj Date: Tue, 18 Jul 2023 04:17:38 -0400 Subject: [PATCH 08/14] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5e7f44e97..51d25954c 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the ## Low-level API -The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`. +The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `ggllm.cpp`. The entire lowe-level API can be found in [falcon_cpp/falcon_cpp.py](https://github.com/sirajperson/falcon-cpp-python/blob/master/falcon_cpp/falcon_cpp.py) and directly mirrors the C API in [libfalcon.h](https://github.com/cmp-nct/ggllm.cpp/blob/master/libfalcon.h). Below is a short example demonstrating how to use the low-level API to tokenize a prompt: From db30a2bfac7b0531a10ab0a3e33d70f8190e09b3 Mon Sep 17 00:00:00 2001 From: siraj Date: Tue, 18 Jul 2023 04:19:25 -0400 Subject: [PATCH 09/14] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 51d25954c..c01622032 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ This project is currently in alpha development and is not yet completely functio ## High-level API -The high-level API provides a simple managed interface through the `Llama` class. +The high-level API provides a simple managed interface through the `Falcon` class. Below is a short example demonstrating how to use the high-level API to generate text: @@ -89,8 +89,8 @@ Again, this package is under active development and I welcome any contributions. To get started, clone the repository and install the package in development mode: ```bash -git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git -cd llama-cpp-python +git clone --recurse-submodules git@github.com:sirajperson/falcon-cpp-python.git +cd falcon-cpp-python # Install with pip pip install -e . From 58cd6572dbebb2431e02cfaa7b7207c9717292a5 Mon Sep 17 00:00:00 2001 From: siraj Date: Tue, 18 Jul 2023 04:20:54 -0400 Subject: [PATCH 10/14] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c01622032..cf1e47eb8 100644 --- a/README.md +++ b/README.md @@ -52,7 +52,7 @@ This allows you to use ggllm.cpp to inference falcon models with any OpenAI comp To install the server package and get started: ```bash -python3 -m llama_cpp.server --model models/7B/ggml-model.bin +python3 -m falcon_cpp.server --model models/7B/ggml-model.bin ``` Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. From 09e051229e4b4bd17a8de8365849b5f2fbb1f3a9 Mon Sep 17 00:00:00 2001 From: siraj Date: Tue, 18 Jul 2023 04:24:25 -0400 Subject: [PATCH 11/14] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cf1e47eb8..7edf66a97 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ python3 setup.py develop # This Project is a fork of llama-cpp-python This project was originally llama-cpp-python and owes an immense thanks to @abetlen. -This projects goal is to +This project's goal is to: - Provide a simple process to install `ggllm.cpp` and access the full C API in `libfalcon.h` from Python - Provide a high-level Python API that can be used as a drop-in replacement for the OpenAI API so existing apps can be easily ported to use `ggllm.cpp` From 0d2f65db36c189c2fd110fd5eb4c8a83b663ce4d Mon Sep 17 00:00:00 2001 From: siraj Date: Tue, 18 Jul 2023 04:29:15 -0400 Subject: [PATCH 12/14] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7edf66a97..e7a220537 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ Below is a short example demonstrating how to use the low-level API to tokenize >>> max_tokens = params.n_ctx # use ctypes arrays for array params >>> tokens = (falcon_cpp.falcon_token * int(max_tokens))() ->>> n_tokens = falcon_cpp.falcon_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True)) +>>> n_tokens = falcon_cpp.falcon_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=falcon_cpp.c_bool(True)) >>> falcon_cpp.falcon_free(ctx) ``` @@ -102,7 +102,7 @@ pip install -e .[server] poetry install --all-extras . .venv/bin/activate -# Will need to be re-run any time vendor/llama.cpp is updated +# Will need to be re-run any time vendor/ggllm.cpp is updated python3 setup.py develop ``` From 51e6e3e601e6af7092d66188220159dd19c0b8ac Mon Sep 17 00:00:00 2001 From: siraj Date: Tue, 18 Jul 2023 04:31:27 -0400 Subject: [PATCH 13/14] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e7a220537..9a490f40d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Python Bindings for `ggllm.cpp` +# Python Bindings for `ggllm.cpp`, a library for loading and execution of inferences to falcon based models Simple Python bindings for [`ggllm.cpp`](https://github.com/cmp-nct/ggllm.cpp) library. From 07aa9cbefa48c83c80ca4f9f1b61822e8cb6c398 Mon Sep 17 00:00:00 2001 From: siraj Date: Tue, 18 Jul 2023 04:41:41 -0400 Subject: [PATCH 14/14] Update README.md --- .github/ISSUE_TEMPLATE/bug_report.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 5df12aaf5..a5e1a9cb5 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -12,17 +12,17 @@ assignees: '' Please answer the following questions for yourself before submitting an issue. - [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now. -- [ ] I carefully followed the [README.md](https://github.com/abetlen/llama-cpp-python/blob/main/README.md). +- [ ] I carefully followed the [README.md](https://github.com/sirajperson/falcon-cpp-python/blob/main/README.md). - [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed). -- [ ] I reviewed the [Discussions](https://github.com/abetlen/llama-cpp-python/discussions), and have a new bug or useful enhancement to share. +- [ ] I reviewed the [Discussions](https://github.com/sirajperson/falcon-cpp-python/discussions), and have a new bug or useful enhancement to share. # Expected Behavior -Please provide a detailed written description of what you were trying to do, and what you expected `llama-cpp-python` to do. +Please provide a detailed written description of what you were trying to do, and what you expected `falcon-cpp-python` to do. # Current Behavior -Please provide a detailed written description of what `llama-cpp-python` did, instead. +Please provide a detailed written description of what `falcon-cpp-python` did, instead. # Environment and Context @@ -61,13 +61,13 @@ Please provide detailed steps for reproducing the issue. We are not sitting in f Try the following: -1. `git clone https://github.com/abetlen/llama-cpp-python` -2. `cd llama-cpp-python` +1. `git clone https://github.com/sirajperson/falcon-cpp-python` +2. `cd falcon-cpp-python` 3. `rm -rf _skbuild/` # delete any old builds 4. `python setup.py develop` -5. `cd ./vendor/llama.cpp` -6. Follow [llama.cpp's instructions](https://github.com/ggerganov/llama.cpp#build) to `cmake` llama.cpp -7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggerganov/llama.cpp/issues) +5. `cd ./vendor/ggllm.cpp` +6. Follow [ggllm.cpp's instructions](https://github.com/cmp-nct/ggllm.cpp) section on how to compile with `cmake` +7. Run ggllm.cpp's `./falcon_main` with the same arguments you previously passed to falcon-cpp-python and see if you can reproduce the issue. If you can, [log an issue with ggllm.cpp](https://github.com/cmp-nct/ggllm.cpp/issues) # Failure Logs @@ -77,10 +77,10 @@ Also, please try to **avoid using screenshots** if at all possible. Instead, cop Example environment info: ``` -llama-cpp-python$ git log | head -1 +falcon-cpp-python$ git log | head -1 commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2 -llama-cpp-python$ python3 --version +falcon-cpp-python$ python3 --version Python 3.10.10 llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette|numpy" @@ -89,8 +89,8 @@ numpy 1.24.3 sse-starlette 1.3.3 uvicorn 0.21.1 -llama-cpp-python/vendor/llama.cpp$ git log | head -3 +falcon-cpp-python/vendor/llama.cpp$ git log | head -3 commit 66874d4fbcc7866377246efbcee938e8cc9c7d76 -Author: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> +Author: YupHippie <44031344+YupHippie@users.noreply.github.com> Date: Thu May 25 20:18:01 2023 -0600 ```