diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 5df12aaf5..a5e1a9cb5 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -12,17 +12,17 @@ assignees: '' Please answer the following questions for yourself before submitting an issue. - [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now. -- [ ] I carefully followed the [README.md](https://github.com/abetlen/llama-cpp-python/blob/main/README.md). +- [ ] I carefully followed the [README.md](https://github.com/sirajperson/falcon-cpp-python/blob/main/README.md). - [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed). -- [ ] I reviewed the [Discussions](https://github.com/abetlen/llama-cpp-python/discussions), and have a new bug or useful enhancement to share. +- [ ] I reviewed the [Discussions](https://github.com/sirajperson/falcon-cpp-python/discussions), and have a new bug or useful enhancement to share. # Expected Behavior -Please provide a detailed written description of what you were trying to do, and what you expected `llama-cpp-python` to do. +Please provide a detailed written description of what you were trying to do, and what you expected `falcon-cpp-python` to do. # Current Behavior -Please provide a detailed written description of what `llama-cpp-python` did, instead. +Please provide a detailed written description of what `falcon-cpp-python` did, instead. # Environment and Context @@ -61,13 +61,13 @@ Please provide detailed steps for reproducing the issue. We are not sitting in f Try the following: -1. `git clone https://github.com/abetlen/llama-cpp-python` -2. `cd llama-cpp-python` +1. `git clone https://github.com/sirajperson/falcon-cpp-python` +2. `cd falcon-cpp-python` 3. `rm -rf _skbuild/` # delete any old builds 4. `python setup.py develop` -5. `cd ./vendor/llama.cpp` -6. Follow [llama.cpp's instructions](https://github.com/ggerganov/llama.cpp#build) to `cmake` llama.cpp -7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggerganov/llama.cpp/issues) +5. `cd ./vendor/ggllm.cpp` +6. Follow [ggllm.cpp's instructions](https://github.com/cmp-nct/ggllm.cpp) section on how to compile with `cmake` +7. Run ggllm.cpp's `./falcon_main` with the same arguments you previously passed to falcon-cpp-python and see if you can reproduce the issue. If you can, [log an issue with ggllm.cpp](https://github.com/cmp-nct/ggllm.cpp/issues) # Failure Logs @@ -77,10 +77,10 @@ Also, please try to **avoid using screenshots** if at all possible. 
Instead, cop Example environment info: ``` -llama-cpp-python$ git log | head -1 +falcon-cpp-python$ git log | head -1 commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2 -llama-cpp-python$ python3 --version +falcon-cpp-python$ python3 --version Python 3.10.10 llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette|numpy" @@ -89,8 +89,8 @@ numpy 1.24.3 sse-starlette 1.3.3 uvicorn 0.21.1 -llama-cpp-python/vendor/llama.cpp$ git log | head -3 +falcon-cpp-python/vendor/llama.cpp$ git log | head -3 commit 66874d4fbcc7866377246efbcee938e8cc9c7d76 -Author: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> +Author: YupHippie <44031344+YupHippie@users.noreply.github.com> Date: Thu May 25 20:18:01 2023 -0600 ``` diff --git a/.gitmodules b/.gitmodules index 7edf0975d..eeadc3d38 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "vendor/llama.cpp"] - path = vendor/llama.cpp - url = https://github.com/ggerganov/llama.cpp.git +[submodule "ggllm.cpp"] + path = ggllm.cpp + url = https://github.com/sirajperson/ggllm.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 788402a56..7e1faac42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,34 +1,24 @@ cmake_minimum_required(VERSION 3.4...3.22) -project(llama_cpp) +project(falcon_cpp) -option(FORCE_CMAKE "Force CMake build of Python bindings" OFF) - -set(FORCE_CMAKE $ENV{FORCE_CMAKE}) - -if (UNIX AND NOT FORCE_CMAKE) - add_custom_command( - OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so - COMMAND make libllama.so - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp - ) - add_custom_target( - run ALL - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so - ) - install( - FILES ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so - DESTINATION llama_cpp - ) -else() - set(BUILD_SHARED_LIBS "On") - add_subdirectory(vendor/llama.cpp) - install( - TARGETS llama - LIBRARY DESTINATION llama_cpp - RUNTIME DESTINATION llama_cpp - ARCHIVE DESTINATION llama_cpp - FRAMEWORK DESTINATION llama_cpp - RESOURCE DESTINATION llama_cpp - ) -endif() +# Build shared libraries using custom command +add_custom_command( + OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libllama.so + COMMAND cmake -DLLAMA_CUBLAS=1 -DCUDAToolkit_ROOT=/usr/local/cuda/ -DBUILD_SHARED_LIBS=on ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp + COMMAND make + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp +) +add_custom_target( + build_shared_libs ALL + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libllama.so +) +# Install shared libraries +install( + FILES + ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libcmpnct_unicode.so + ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libggml_shared.so + ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libfalcon.so + ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libllama.so + DESTINATION falcon_cpp +) diff --git a/Makefile b/Makefile index 66d93f3a2..3301081d0 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ update: git submodule update --init --recursive update.vendor: - cd vendor/llama.cpp && git pull origin master + cd vendor/ggllm.cpp && git pull origin master build: python3 setup.py develop @@ -34,14 +34,14 @@ deploy.gh-docs: mkdocs gh-deploy clean: - - cd vendor/llama.cpp && make clean - - cd vendor/llama.cpp && rm libllama.so + - cd vendor/ggllm.cpp && make clean + - cd vendor/ggllm.cpp && rm llamacpp.so - rm -rf _skbuild - - rm llama_cpp/*.so - - rm llama_cpp/*.dylib - - rm llama_cpp/*.metal - - rm llama_cpp/*.dll - - rm llama_cpp/*.lib + - rm falcon_cpp/*.so 
+ - rm falcon_cpp/*.dylib + - rm falcon_cpp/*.metal + - rm falcon_cpp/*.dll + - rm falcon_cpp/*.lib .PHONY: \ update \ diff --git a/README.md b/README.md index fb652a925..9a490f40d 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,7 @@ -# 🦙 Python Bindings for `llama.cpp` +# Python Bindings for `ggllm.cpp`, a library for loading and execution of inferences to falcon based models -[![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest) -[![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml) -[![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library. +Simple Python bindings for [`ggllm.cpp`](https://github.com/cmp-nct/ggllm.cpp) library. This package provides: - Low-level access to C API via `ctypes` interface. @@ -15,73 +9,18 @@ This package provides: - OpenAI-like API - LangChain compatibility -Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest). +This project is currently in alpha development and is not yet completely functional. Any contributions are warmly welcomed. -## Installation from PyPI (recommended) - -Install from PyPI (requires a c compiler): - -```bash -pip install llama-cpp-python -``` - -The above command will attempt to install the package and build `llama.cpp` from source. -This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. - -If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly: - -```bash -pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir -``` - -Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example: -``` -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh -bash Miniforge3-MacOSX-arm64.sh -``` -Otherwise, while installing it will build the llama.ccp x86 version which will be 10x slower on Apple Silicon (M1) Mac. - -### Installation with OpenBLAS / cuBLAS / CLBlast / Metal - -`llama.cpp` supports multiple BLAS backends for faster processing. -Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend. 
- -To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python -``` - -To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python -``` - -To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python -``` - -To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python -``` - -Detailed MacOS Metal GPU install documentation is available at [docs/install/macos.md](docs/install/macos.md) - ## High-level API -The high-level API provides a simple managed interface through the `Llama` class. +The high-level API provides a simple managed interface through the `Falcon` class. Below is a short example demonstrating how to use the high-level API to generate text: ```python ->>> from llama_cpp import Llama ->>> llm = Llama(model_path="./models/7B/ggml-model.bin") +>>> from falcon_cpp import Falcon +>>> llm = Falcon(model_path="./models/7B/ggml-model.bin") >>> output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True) >>> print(output) { @@ -107,63 +46,51 @@ Below is a short example demonstrating how to use the high-level API to generate ## Web Server -`llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. -This allows you to use llama.cpp compatible models with any OpenAI compatible client (language libraries, services, etc). +`falcon-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. +This allows you to use ggllm.cpp to inference falcon models with any OpenAI compatible client (language libraries, services, etc). To install the server package and get started: ```bash -pip install llama-cpp-python[server] -python3 -m llama_cpp.server --model models/7B/ggml-model.bin +python3 -m falcon_cpp.server --model models/7B/ggml-model.bin ``` Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. -## Docker image - -A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server: - -```bash -docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest -``` - ## Low-level API -The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`. -The entire lowe-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). +The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `ggllm.cpp`. +The entire lowe-level API can be found in [falcon_cpp/falcon_cpp.py](https://github.com/sirajperson/falcon-cpp-python/blob/master/falcon_cpp/falcon_cpp.py) and directly mirrors the C API in [libfalcon.h](https://github.com/cmp-nct/ggllm.cpp/blob/master/libfalcon.h). 
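Conceptually, the binding layer does two things: it loads the shared libraries that the CMake rules earlier in this change install into the `falcon_cpp` package (for example `libfalcon.so`), and it declares `ctypes` signatures for the exported `falcon_*` functions. The snippet below is only a minimal sketch of that idea — the library name, path handling, and signature shown are assumptions for illustration, and the real loader in `falcon_cpp/falcon_cpp.py` may differ:

```python
# Minimal sketch: load the shared library installed into the falcon_cpp package
# and declare one ctypes signature. Names and paths are illustrative assumptions;
# the actual loader in falcon_cpp/falcon_cpp.py may resolve them differently.
import ctypes
import pathlib

# The CMake install() rule above copies libfalcon.so (among others) into falcon_cpp/.
_lib_path = pathlib.Path(__file__).resolve().parent / "libfalcon.so"
_lib = ctypes.CDLL(str(_lib_path))

# ctypes calls need explicit return/argument types before crossing into C.
# falcon_print_system_info() returns a C string, so it is read back as bytes.
_lib.falcon_print_system_info.restype = ctypes.c_char_p
print(_lib.falcon_print_system_info().decode("utf-8"))
```

Everything the higher-level `Falcon` wrapper does ultimately goes through a handle like this one.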
Below is a short example demonstrating how to use the low-level API to tokenize a prompt: ```python ->>> import llama_cpp +>>> import falcon_cpp >>> import ctypes ->>> params = llama_cpp.llama_context_default_params() +>>> params = falcon_cpp.falcon_context_default_params() # use bytes for char * params ->>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params) +>>> ctx = falcon_cpp.falcon_init_backend("./models/7b/ggml-model.bin", params) >>> max_tokens = params.n_ctx # use ctypes arrays for array params ->>> tokens = (llama_cpp.llama_token * int(max_tokens))() ->>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True)) ->>> llama_cpp.llama_free(ctx) +>>> tokens = (falcon_cpp.falcon_token * int(max_tokens))() +>>> n_tokens = falcon_cpp.falcon_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=falcon_cpp.c_bool(True)) +>>> falcon_cpp.falcon_free(ctx) ``` Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API. - # Documentation - -Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). -If you find any issues with the documentation, please open an issue or submit a PR. +Coming soon... # Development -This package is under active development and I welcome any contributions. +Again, this package is under active development and I welcome any contributions. To get started, clone the repository and install the package in development mode: ```bash -git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git -cd llama-cpp-python +git clone --recurse-submodules git@github.com:sirajperson/falcon-cpp-python.git +cd falcon-cpp-python # Install with pip pip install -e . @@ -175,16 +102,16 @@ pip install -e .[server] poetry install --all-extras . .venv/bin/activate -# Will need to be re-run any time vendor/llama.cpp is updated +# Will need to be re-run any time vendor/ggllm.cpp is updated python3 setup.py develop ``` -# How does this compare to other Python bindings of `llama.cpp`? - -I originally wrote this package for my own use with two goals in mind: +# This Project is a fork of llama-cpp-python -- Provide a simple process to install `llama.cpp` and access the full C API in `llama.h` from Python -- Provide a high-level Python API that can be used as a drop-in replacement for the OpenAI API so existing apps can be easily ported to use `llama.cpp` +This project was originally llama-cpp-python and owes an immense thanks to @abetlen. +This project's goal is to: +- Provide a simple process to install `ggllm.cpp` and access the full C API in `libfalcon.h` from Python +- Provide a high-level Python API that can be used as a drop-in replacement for the OpenAI API so existing apps can be easily ported to use `ggllm.cpp` Any contributions and changes to this package will be made with these goals in mind. diff --git a/docker/README.md b/docker/README.md deleted file mode 100644 index 053d311b4..000000000 --- a/docker/README.md +++ /dev/null @@ -1,66 +0,0 @@ -# Install Docker Server - -**Note #1:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR! 
- -[Install Docker Engine](https://docs.docker.com/engine/install) - -**Note #2:** NVidia GPU CuBLAS support requires a NVidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) - -# Simple Dockerfiles for building the llama-cpp-python server with external model bin files -## openblas_simple - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image -``` -cd ./openblas_simple -docker build -t openblas_simple . -docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t openblas_simple -``` -where `/` is the full path to the model file on the Docker host system. - -## cuda_simple - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image -``` -cd ./cuda_simple -docker build -t cuda_simple . -docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t cuda_simple -``` -where `/` is the full path to the model file on the Docker host system. - -# "Open-Llama-in-a-box" -## Download an Apache V2.0 licensed 3B paramter Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server -``` -$ cd ./open_llama -./build.sh -./start.sh -``` - -# Manually choose your own Llama model from Hugging Face -`python3 ./hug_model.py -a TheBloke -t llama` -You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. -``` -docker $ ls -lh *.bin --rw-rw-r-- 1 user user 4.8G May 23 18:30 q5_1.bin -lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> q5_1.bin -``` -**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least -**TWICE** as much disk space as the size of the model: - -| Model | Quantized size | -|------:|----------------:| -| 3B | 3 GB | -| 7B | 5 GB | -| 13B | 10 GB | -| 33B | 25 GB | -| 65B | 50 GB | - -**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` - -## Use OpenBLAS -Use if you don't have a NVidia GPU. Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS: -### Build: -`docker build -t openblas .` -### Run: -`docker run --cap-add SYS_RESOURCE -t openblas` - -## Use CuBLAS -### Build: -`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .` -### Run: -`docker run --cap-add SYS_RESOURCE -t cublas` diff --git a/docker/cuda_simple/Dockerfile b/docker/cuda_simple/Dockerfile deleted file mode 100644 index 24906d53a..000000000 --- a/docker/cuda_simple/Dockerfile +++ /dev/null @@ -1,16 +0,0 @@ -ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04" -FROM nvidia/cuda:${CUDA_IMAGE} - -# We need to set the host to 0.0.0.0 to allow outside access -ENV HOST 0.0.0.0 - -COPY . . 
- -# Install the package -RUN apt update && apt install -y python3 python3-pip -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette - -RUN LLAMA_CUBLAS=1 pip install llama-cpp-python - -# Run the server -CMD python3 -m llama_cpp.server diff --git a/docker/open_llama/Dockerfile b/docker/open_llama/Dockerfile deleted file mode 100644 index f0ef5f721..000000000 --- a/docker/open_llama/Dockerfile +++ /dev/null @@ -1,51 +0,0 @@ -# Define the image argument and provide a default value -ARG IMAGE=python:3-slim-bullseye - -# Use the image as specified -FROM ${IMAGE} - -# Re-declare the ARG after FROM -ARG IMAGE - -# Update and upgrade the existing packages -RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ - python3 \ - python3-pip \ - ninja-build \ - build-essential - -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette - -# Perform the conditional installations based on the image -RUN echo "Image: ${IMAGE}" && \ - if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \ - echo "OpenBLAS install:" && \ - apt-get install -y --no-install-recommends libopenblas-dev && \ - LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \ -else \ - echo "CuBLAS install:" && \ - LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \ -fi - -# Clean up apt cache -RUN rm -rf /var/lib/apt/lists/* - -# Set a working directory for better clarity -WORKDIR /app - -# Copy files to the app directory -RUN echo "Installing model...this can take some time..." -COPY ./model.bin /app/model.bin -COPY ./start_server.sh /app/start_server.sh - -# Make the server start script executable -RUN chmod +x /app/start_server.sh - -# Set environment variable for the host -ENV HOST=0.0.0.0 - -# Expose a port for the server -EXPOSE 8000 - -# Run the server start script -CMD ["/bin/sh", "/app/start_server.sh"] diff --git a/docker/open_llama/build.sh b/docker/open_llama/build.sh deleted file mode 100755 index 3a6457dcd..000000000 --- a/docker/open_llama/build.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh - -MODEL="open_llama_3b" -# Get open_llama_3b_ggml q5_1 quantization -python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1" -ls -lh *.bin - -# Build the default OpenBLAS image -docker build -t $MODEL . 
-docker images | egrep "^(REPOSITORY|$MODEL)" - -echo -echo "To start the docker container run:" -echo "docker run -t -p 8000:8000 $MODEL" diff --git a/docker/open_llama/hug_model.py b/docker/open_llama/hug_model.py deleted file mode 100644 index 13c5b6b0d..000000000 --- a/docker/open_llama/hug_model.py +++ /dev/null @@ -1,139 +0,0 @@ -import requests -import json -import os -import struct -import argparse - -def make_request(url, params=None): - print(f"Making request to {url}...") - response = requests.get(url, params=params) - if response.status_code == 200: - return json.loads(response.text) - else: - print(f"Request failed with status code {response.status_code}") - return None - -def check_magic_and_version(filename): - with open(filename, 'rb') as f: - # Read the first 6 bytes from the file - data = f.read(6) - - # Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int - # and the next 2 bytes as a little-endian unsigned short - magic, version = struct.unpack('= 10485760: # 10 MB - print('.', end='', flush=True) - total_downloaded = 0 - print("\nDownload complete.") - - # Creating a symbolic link from destination to "model.bin" - if os.path.isfile("model.bin"): - os.remove("model.bin") # remove the existing link if any - os.symlink(destination, "model.bin") - else: - print(f"Download failed with status code {response.status_code}") - -def get_user_choice(model_list): - # Print the enumerated list - print("\n") - for i, (model_id, rfilename) in enumerate(model_list): - print(f"{i+1}: Model ID: {model_id}, RFilename: {rfilename}") - - # Get user's choice - choice = input("Choose a model to download by entering the corresponding number: ") - try: - index = int(choice) - 1 - if 0 <= index < len(model_list): - # Return the chosen model - return model_list[index] - else: - print("Invalid choice.") - except ValueError: - print("Invalid input. Please enter a number corresponding to a model.") - except IndexError: - print("Invalid choice. 
Index out of range.") - - return None - -def main(): - # Create an argument parser - parser = argparse.ArgumentParser(description='Process some parameters.') - - # Arguments - parser.add_argument('-v', '--version', type=int, default=0x0003, - help='hexadecimal version number of ggml file') - parser.add_argument('-a', '--author', type=str, default='TheBloke', - help='HuggingFace author filter') - parser.add_argument('-t', '--tag', type=str, default='llama', - help='HuggingFace tag filter') - parser.add_argument('-s', '--search', type=str, default='', - help='HuggingFace search filter') - parser.add_argument('-f', '--filename', type=str, default='q5_1', - help='HuggingFace model repository filename substring match') - - # Parse the arguments - args = parser.parse_args() - - # Define the parameters - params = { - "author": args.author, - "tags": args.tag, - "search": args.search - } - - models = make_request('https://huggingface.co/api/models', params=params) - if models is None: - return - - model_list = [] - # Iterate over the models - for model in models: - model_id = model['id'] - model_info = make_request(f'https://huggingface.co/api/models/{model_id}') - if model_info is None: - continue - - for sibling in model_info.get('siblings', []): - rfilename = sibling.get('rfilename') - if rfilename and args.filename in rfilename: - model_list.append((model_id, rfilename)) - - # Choose the model - model_list.sort(key=lambda x: x[0]) - if len(model_list) == 0: - print("No models found") - exit(1) - elif len(model_list) == 1: - model_choice = model_list[0] - else: - model_choice = get_user_choice(model_list) - - if model_choice is not None: - model_id, rfilename = model_choice - url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}" - dest = f"{model_id.replace('/', '_')}_{rfilename}" - download_file(url, dest) - _, version = check_magic_and_version(dest) - if version != args.version: - print(f"Warning: Expected version {args.version}, but found different version in the file.") - else: - print("Error - model choice was None") - exit(2) - -if __name__ == '__main__': - main() diff --git a/docker/open_llama/start.sh b/docker/open_llama/start.sh deleted file mode 100755 index 7ee8f748e..000000000 --- a/docker/open_llama/start.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/sh - -MODEL="open_llama_3b" - -# Start Docker container -docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL & -sleep 10 -echo -docker ps | egrep "(^CONTAINER|$MODEL)" - -# Test the model works -echo -curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{ - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": [ - "\n", - "###" - ] -}' | grep Paris -if [ $? -eq 0 ] -then - echo - echo "$MODEL is working!!" -else - echo - echo "ERROR: $MODEL not replying." 
- exit 1 -fi diff --git a/docker/open_llama/start_server.sh b/docker/open_llama/start_server.sh deleted file mode 100755 index d3329eec3..000000000 --- a/docker/open_llama/start_server.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/sh - -# For mlock support -ulimit -l unlimited - -if [ "$IMAGE" = "python:3-slim-bullseye" ]; then - python3 -B -m llama_cpp.server --model /app/model.bin -else - # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM - python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000 -fi diff --git a/docker/openblas_simple/Dockerfile b/docker/openblas_simple/Dockerfile deleted file mode 100644 index 1a95caeda..000000000 --- a/docker/openblas_simple/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM python:3-slim-bullseye - -# We need to set the host to 0.0.0.0 to allow outside access -ENV HOST 0.0.0.0 - -COPY . . - -# Install the package -RUN apt update && apt install -y libopenblas-dev ninja-build build-essential -RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette - -RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose - -# Run the server -CMD python3 -m llama_cpp.server diff --git a/docs/api-reference.md b/docs/api-reference.md deleted file mode 100644 index 1290cad49..000000000 --- a/docs/api-reference.md +++ /dev/null @@ -1,53 +0,0 @@ ---- -title: API Reference ---- - -::: llama_cpp.Llama - options: - members: - - __init__ - - tokenize - - detokenize - - reset - - eval - - sample - - generate - - create_embedding - - embed - - create_completion - - __call__ - - create_chat_completion - - set_cache - - save_state - - load_state - - token_bos - - token_eos - show_root_heading: true - -::: llama_cpp.LlamaCache - options: - show_root_heading: true - -::: llama_cpp.LlamaState - options: - show_root_heading: true - -::: llama_cpp.LogitsProcessor - options: - show_root_heading: true - -::: llama_cpp.LogitsProcessorList - options: - show_root_heading: true - -::: llama_cpp.StoppingCriteria - options: - show_root_heading: true - -::: llama_cpp.StoppingCriteriaList - options: - show_root_heading: true - -::: llama_cpp.llama_cpp - options: - show_if_no_docstring: true \ No newline at end of file diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 7d5ccc314..000000000 --- a/docs/index.md +++ /dev/null @@ -1,92 +0,0 @@ -# Getting Started - -## 🦙 Python Bindings for `llama.cpp` - -[![Documentation](https://img.shields.io/badge/docs-passing-green.svg)](https://abetlen.github.io/llama-cpp-python) -[![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml) -[![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) - -Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library. -This package provides: - -- Low-level access to C API via `ctypes` interface. 
-- High-level Python API for text completion - - OpenAI-like API - - LangChain compatibility - -## Installation - -Install from PyPI: - -```bash -pip install llama-cpp-python -``` - -## High-level API - -```python ->>> from llama_cpp import Llama ->>> llm = Llama(model_path="./models/7B/ggml-model.bin") ->>> output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True) ->>> print(output) -{ - "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", - "object": "text_completion", - "created": 1679561337, - "model": "./models/7B/ggml-model.bin", - "choices": [ - { - "text": "Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.", - "index": 0, - "logprobs": None, - "finish_reason": "stop" - } - ], - "usage": { - "prompt_tokens": 14, - "completion_tokens": 28, - "total_tokens": 42 - } -} -``` - -## Web Server - -`llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. -This allows you to use llama.cpp compatible models with any OpenAI compatible client (language libraries, services, etc). - -To install the server package and get started: - -```bash -pip install llama-cpp-python[server] -export MODEL=./models/7B/ggml-model.bin -python3 -m llama_cpp.server -``` - -Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. - -## Low-level API - -The low-level API is a direct `ctypes` binding to the C API provided by `llama.cpp`. -The entire API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and should mirror [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). - - -## Development - -This package is under active development and I welcome any contributions. - -To get started, clone the repository and install the package in development mode: - -```bash -git clone git@github.com:abetlen/llama-cpp-python.git -git submodule update --init --recursive -# Will need to be re-run any time vendor/llama.cpp is updated -python3 setup.py develop -``` - -## License - -This project is licensed under the terms of the MIT license. \ No newline at end of file diff --git a/docs/install/macos.md b/docs/install/macos.md deleted file mode 100644 index 600469615..000000000 --- a/docs/install/macos.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -title: MacOS Install with Metal GPU ---- - -**(1) Make sure you have xcode installed... at least the command line parts** -``` -# check the path of your xcode install -xcode-select -p - -# xcode installed returns -# /Applications/Xcode-beta.app/Contents/Developer - -# if xcode is missing then install it... it takes ages; -xcode-select --install -``` - -**(2) Install the conda version for MacOS that supports Metal GPU** -``` -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh -bash Miniforge3-MacOSX-arm64.sh -``` - -**(3) Make a conda environment** -``` -conda create -n llama python=3.9.16 -conda activate llama -``` - -**(4) Install the LATEST llama-cpp-python.. 
which, as of just today, happily supports MacOS Metal GPU** - *(you needed xcode installed in order pip to build/compile the C++ code)* -``` -pip uninstall llama-cpp-python -y -CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir -pip install 'llama-cpp-python[server]' - -# you should now have llama-cpp-python v0.1.62 installed -llama-cpp-python         0.1.62      - -``` - -**(4) Download a v3 ggml model** - - **ggmlv3** - - file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0 - -https://huggingface.co/TheBloke/open-llama-7b-open-instruct-GGML - - -**(6) run the llama-cpp-python API server with MacOS Metal GPU support** -``` -# config your ggml model path -# make sure it is ggml v3 -# make sure it is q4_0 -export MODEL=[path to your llama.cpp ggml models]]/[ggml-model-name]]q4_0.bin -python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1 -``` - -***Note:** If you omit the `--n_gpu_layers 1` then CPU will be used* - - diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 199bd4ffb..000000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -mkdocs -mkdocs-material -mkdocstrings[python] \ No newline at end of file diff --git a/examples/high_level_api/high_level_api_inference.py b/examples/high_level_api/high_level_api_inference.py index e41f37577..e6f85e180 100644 --- a/examples/high_level_api/high_level_api_inference.py +++ b/examples/high_level_api/high_level_api_inference.py @@ -1,13 +1,13 @@ import json import argparse -from llama_cpp import Llama +from falcon_cpp import Falcon parser = argparse.ArgumentParser() -parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") +parser.add_argument("-m", "--model", type=str, default="../../models/tiiuae_falcon-7b/ggml-model-tiiuae_falcon-7b-f16.bin") args = parser.parse_args() -llm = Llama(model_path=args.model) +llm = Falcon(model_path=args.model) output = llm( "Question: What are the names of the planets in the solar system? Answer: ", diff --git a/falcon_cpp/__init__.py b/falcon_cpp/__init__.py new file mode 100644 index 000000000..e7d40876f --- /dev/null +++ b/falcon_cpp/__init__.py @@ -0,0 +1,2 @@ +from .falcon_cpp import * +from .falcon import * diff --git a/llama_cpp/llama.py b/falcon_cpp/falcon.py similarity index 65% rename from llama_cpp/llama.py rename to falcon_cpp/falcon.py index 688b2a74f..010586cd9 100644 --- a/llama_cpp/llama.py +++ b/falcon_cpp/falcon.py @@ -20,15 +20,15 @@ import diskcache -from . import llama_cpp -from .llama_types import * +from . 
import falcon_cpp +from .falcon_types import * import numpy as np import numpy.typing as npt -class BaseLlamaCache(ABC): - """Base cache class for a llama.cpp model.""" +class BaseFalconCache(ABC): + """Base cache class for a falcon.cpp model.""" def __init__(self, capacity_bytes: int = (2 << 30)): self.capacity_bytes = capacity_bytes @@ -39,13 +39,13 @@ def cache_size(self) -> int: raise NotImplementedError def _find_longest_prefix_key( - self, - key: Tuple[int, ...], + self, + key: Tuple[int, ...], ) -> Optional[Tuple[int, ...]]: pass @abstractmethod - def __getitem__(self, key: Sequence[int]) -> "LlamaState": + def __getitem__(self, key: Sequence[int]) -> "FalconState": raise NotImplementedError @abstractmethod @@ -53,30 +53,30 @@ def __contains__(self, key: Sequence[int]) -> bool: raise NotImplementedError @abstractmethod - def __setitem__(self, key: Sequence[int], value: "LlamaState") -> None: + def __setitem__(self, key: Sequence[int], value: "FalconState") -> None: raise NotImplementedError -class LlamaRAMCache(BaseLlamaCache): - """Cache for a llama.cpp model using RAM.""" +class FalconRAMCache(BaseFalconCache): + """Cache for a falcon.cpp model using RAM.""" def __init__(self, capacity_bytes: int = (2 << 30)): super().__init__(capacity_bytes) self.capacity_bytes = capacity_bytes - self.cache_state: OrderedDict[Tuple[int, ...], "LlamaState"] = OrderedDict() + self.cache_state: OrderedDict[Tuple[int, ...], "FalconState"] = OrderedDict() @property def cache_size(self): - return sum([state.llama_state_size for state in self.cache_state.values()]) + return sum([state.falcon_state_size for state in self.cache_state.values()]) def _find_longest_prefix_key( - self, - key: Tuple[int, ...], + self, + key: Tuple[int, ...], ) -> Optional[Tuple[int, ...]]: min_len = 0 min_key = None keys = ( - (k, Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys() + (k, Falcon.longest_token_prefix(k, key)) for k in self.cache_state.keys() ) for k, prefix_len in keys: if prefix_len > min_len: @@ -84,7 +84,7 @@ def _find_longest_prefix_key( min_key = k return min_key - def __getitem__(self, key: Sequence[int]) -> "LlamaState": + def __getitem__(self, key: Sequence[int]) -> "FalconState": key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: @@ -96,7 +96,7 @@ def __getitem__(self, key: Sequence[int]) -> "LlamaState": def __contains__(self, key: Sequence[int]) -> bool: return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[int], value: "LlamaState"): + def __setitem__(self, key: Sequence[int], value: "FalconState"): key = tuple(key) if key in self.cache_state: del self.cache_state[key] @@ -106,14 +106,14 @@ def __setitem__(self, key: Sequence[int], value: "LlamaState"): # Alias for backwards compatibility -LlamaCache = LlamaRAMCache +FalconCache = FalconRAMCache -class LlamaDiskCache(BaseLlamaCache): - """Cache for a llama.cpp model using disk.""" +class FalconDiskCache(BaseFalconCache): + """Cache for a falcon.cpp model using disk.""" def __init__( - self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30) + self, cache_dir: str = ".cache/falcon_cache", capacity_bytes: int = (2 << 30) ): super().__init__(capacity_bytes) self.cache = diskcache.Cache(cache_dir) @@ -123,60 +123,60 @@ def cache_size(self): return int(self.cache.volume()) # type: ignore def _find_longest_prefix_key( - self, - key: Tuple[int, ...], + self, + key: Tuple[int, ...], ) -> Optional[Tuple[int, ...]]: min_len = 0 min_key: 
Optional[Tuple[int, ...]] = None for k in self.cache.iterkeys(): # type: ignore - prefix_len = Llama.longest_token_prefix(k, key) + prefix_len = Falcon.longest_token_prefix(k, key) if prefix_len > min_len: min_len = prefix_len min_key = k # type: ignore return min_key - def __getitem__(self, key: Sequence[int]) -> "LlamaState": + def __getitem__(self, key: Sequence[int]) -> "FalconState": key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: raise KeyError("Key not found") - value: "LlamaState" = self.cache.pop(_key) # type: ignore + value: "FalconState" = self.cache.pop(_key) # type: ignore # NOTE: This puts an integer as key in cache, which breaks, - # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens + # Falcon.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens # self.cache.push(_key, side="front") # type: ignore return value def __contains__(self, key: Sequence[int]) -> bool: return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[int], value: "LlamaState"): - print("LlamaDiskCache.__setitem__: called", file=sys.stderr) + def __setitem__(self, key: Sequence[int], value: "FalconState"): + print("FalconDiskCache.__setitem__: called", file=sys.stderr) key = tuple(key) if key in self.cache: - print("LlamaDiskCache.__setitem__: delete", file=sys.stderr) + print("FalconDiskCache.__setitem__: delete", file=sys.stderr) del self.cache[key] self.cache[key] = value - print("LlamaDiskCache.__setitem__: set", file=sys.stderr) + print("FalconDiskCache.__setitem__: set", file=sys.stderr) while self.cache_size > self.capacity_bytes and len(self.cache) > 0: key_to_remove = next(iter(self.cache)) del self.cache[key_to_remove] - print("LlamaDiskCache.__setitem__: trim", file=sys.stderr) + print("FalconDiskCache.__setitem__: trim", file=sys.stderr) -class LlamaState: +class FalconState: def __init__( - self, - input_ids: npt.NDArray[np.intc], - scores: npt.NDArray[np.single], - n_tokens: int, - llama_state: bytes, - llama_state_size: int, + self, + input_ids: npt.NDArray[np.intc], + scores: npt.NDArray[np.single], + n_tokens: int, + falcon_state: bytes, + falcon_state_size: int, ): self.input_ids = input_ids self.scores = scores self.n_tokens = n_tokens - self.llama_state = llama_state - self.llama_state_size = llama_state_size + self.falcon_state = falcon_state + self.falcon_state_size = falcon_state_size LogitsProcessor = Callable[[List[int], List[float]], List[float]] @@ -197,61 +197,80 @@ def __call__(self, input_ids: List[int], logits: List[float]) -> bool: return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) -class Llama: - """High-level Python wrapper for a llama.cpp model.""" +class Falcon: + """High-level Python wrapper for a falcon.cpp model.""" def __init__( - self, - model_path: str, - # NOTE: These parameters are likely to change in the future. - n_ctx: int = 512, - n_parts: int = -1, - n_gpu_layers: int = 0, - seed: int = 1337, - f16_kv: bool = True, - logits_all: bool = False, - vocab_only: bool = False, - use_mmap: bool = True, - use_mlock: bool = False, - embedding: bool = False, - n_threads: Optional[int] = None, - n_batch: int = 512, - last_n_tokens_size: int = 64, - lora_base: Optional[str] = None, - lora_path: Optional[str] = None, - low_vram: bool = False, - verbose: bool = True, + self, + model_path: str, + # NOTE: These parameters are likely to change in the future. 
+ n_ctx: int = 512, + n_parts: int = -1, + n_gpu_layers: int = 0, + seed: int = 1337, + f16_kv: bool = True, + logits_all: bool = False, + vocab_only: bool = False, + use_mmap: bool = True, + use_mlock: bool = False, + embedding: bool = False, + n_threads: Optional[int] = None, + n_batch: int = 512, + last_n_tokens_size: int = 64, + lora_base: Optional[str] = None, + lora_path: Optional[str] = None, + low_vram: bool = False, + verbose: bool = True, ): - """Load a llama.cpp model from `model_path`. - Args: - model_path: Path to the model. - n_ctx: Maximum context size. - n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. - seed: Random seed. -1 for random. - f16_kv: Use half-precision for key/value cache. - logits_all: Return logits for all tokens, not just the last token. - vocab_only: Only load the vocabulary no weights. - use_mmap: Use mmap if possible. - use_mlock: Force the system to keep the model in RAM. - embedding: Embedding mode only. - n_threads: Number of threads to use. If None, the number of threads is automatically determined. - n_batch: Maximum number of prompt tokens to batch together when calling llama_eval. - last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. - lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. - lora_path: Path to a LoRA file to apply to the model. - verbose: Print verbose output to stderr. - - Raises: - ValueError: If the model path does not exist. - - Returns: - A Llama instance. - """ + # TODO: Add the parameters for + ''' + -ts SPLIT --tensor-split SPLIT + how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1 + -mg i, --main-gpu i the GPU to use for scratch and small tensors (0 = first) + --override-max-gpu N + limits the number of GPUs visible (allows to disable multi/single GPU processing) + --gpu-reserve-mb-main override reserved total VRAM MB (can be negative if your driver supports swapping into RAM) + --mtest compute maximum memory usage + --export export the computation graph to 'falcon.ggml' + --verbose-prompt print prompt before generation + -dt, --debug-timings print GGML_PERF debug output (requires GGML_PERF=1 for timings) + 1 = print first layer, 2 = print first and last layer, 3+ = all layers + --lora FNAME apply LoRA adapter (implies --no-mmap) + --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter + -m FNAME, --model FNAME + ''' + + """Load a Falcon model from `model_path`. + + Args: + model_path: Path to the model. + n_ctx: Maximum context size. + n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. + seed: Random seed. -1 for random. + f16_kv: Use half-precision for key/value cache. + logits_all: Return logits for all tokens, not just the last token. + vocab_only: Only load the vocabulary no weights. + use_mmap: Use mmap if possible. + use_mlock: Force the system to keep the model in RAM. + embedding: Embedding mode only. + n_threads: Number of threads to use. If None, the number of threads is automatically determined. + n_batch: Maximum number of prompt tokens to batch together when calling falcon_eval. + last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. + lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. 
+ lora_path: Path to a LoRA file to apply to the model. + verbose: Print verbose output to stderr. + + Raises: + ValueError: If the model path does not exist. + + Returns: + A falcon instance. + """ self.verbose = verbose self.model_path = model_path - self.params = llama_cpp.llama_context_default_params() + self.params = falcon_cpp.falcon_context_default_params() self.params.n_ctx = n_ctx self.params.n_gpu_layers = n_gpu_layers self.params.seed = seed @@ -266,7 +285,7 @@ def __init__( self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) - self.cache: Optional[BaseLlamaCache] = None + self.cache: Optional[BaseFalconCache] = None self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) @@ -280,35 +299,35 @@ def __init__( if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") - self.model = llama_cpp.llama_load_model_from_file( + self.model = falcon_cpp.falcon_load_model_from_file( self.model_path.encode("utf-8"), self.params ) assert self.model is not None - self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params) + self.ctx = falcon_cpp.falcon_new_context_with_model(self.model, self.params) assert self.ctx is not None if self.lora_path: - if llama_cpp.llama_model_apply_lora_from_file( - self.model, - llama_cpp.c_char_p(self.lora_path.encode("utf-8")), - llama_cpp.c_char_p(self.lora_base.encode("utf-8")) - if self.lora_base is not None - else llama_cpp.c_char_p(0), - llama_cpp.c_int(self.n_threads), + if falcon_cpp.falcon_model_apply_lora_from_file( + self.model, + falcon_cpp.c_char_p(self.lora_path.encode("utf-8")), + falcon_cpp.c_char_p(self.lora_base.encode("utf-8")) + if self.lora_base is not None + else falcon_cpp.c_char_p(0), + falcon_cpp.c_int(self.n_threads), ): raise RuntimeError( f"Failed to apply LoRA from lora path: {self.lora_path} to base path: {self.lora_base}" ) if self.verbose: - print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + print(falcon_cpp.falcon_print_system_info().decode("utf-8"), file=sys.stderr) self._n_vocab = self.n_vocab() self._n_ctx = self.n_ctx() - size = llama_cpp.c_size_t(self._n_vocab) - sorted = llama_cpp.c_bool(False) + size = falcon_cpp.c_size_t(self._n_vocab) + sorted = falcon_cpp.c_bool(False) self._candidates_data = np.array( [], dtype=np.dtype( @@ -316,14 +335,14 @@ def __init__( ), ) self._candidates_data.resize(3, self._n_vocab, refcheck=False) - candidates = llama_cpp.llama_token_data_array( - data=self._candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p), + candidates = falcon_cpp.falcon_token_data_array( + data=self._candidates_data.ctypes.data_as(falcon_cpp.falcon_token_data_p), size=size, sorted=sorted, ) self._candidates = candidates - self._token_nl = Llama.token_nl() - self._token_eos = Llama.token_eos() + self._token_nl = Falcon.token_nl() + self._token_eos = Falcon.token_eos() self.n_tokens = 0 self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) @@ -364,23 +383,23 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]: """ assert self.ctx is not None n_ctx = self._n_ctx - tokens = (llama_cpp.llama_token * n_ctx)() - n_tokens = llama_cpp.llama_tokenize( + tokens = (falcon_cpp.falcon_token * n_ctx)() + n_tokens = falcon_cpp.falcon_tokenize( self.ctx, text, tokens, - llama_cpp.c_int(n_ctx), - llama_cpp.c_bool(add_bos), + falcon_cpp.c_int(n_ctx), + falcon_cpp.c_bool(add_bos), ) if n_tokens < 0: n_tokens = abs(n_tokens) - tokens = (llama_cpp.llama_token * n_tokens)() - 
n_tokens = llama_cpp.llama_tokenize( + tokens = (falcon_cpp.falcon_token * n_tokens)() + n_tokens = falcon_cpp.falcon_tokenize( self.ctx, text, tokens, - llama_cpp.c_int(n_tokens), - llama_cpp.c_bool(add_bos), + falcon_cpp.c_int(n_tokens), + falcon_cpp.c_bool(add_bos), ) if n_tokens < 0: raise RuntimeError( @@ -400,12 +419,12 @@ def detokenize(self, tokens: List[int]) -> bytes: assert self.ctx is not None output = b"" for token in tokens: - output += llama_cpp.llama_token_to_str( - self.ctx, llama_cpp.llama_token(token) + output += falcon_cpp.falcon_token_to_str( + self.ctx, falcon_cpp.falcon_token(token) ) return output - def set_cache(self, cache: Optional[BaseLlamaCache]): + def set_cache(self, cache: Optional[BaseFalconCache]): """Set the cache. Args: @@ -426,52 +445,53 @@ def eval(self, tokens: Sequence[int]): assert self.ctx is not None n_ctx = self._n_ctx for i in range(0, len(tokens), self.n_batch): - batch = tokens[i : min(len(tokens), i + self.n_batch)] + batch = tokens[i: min(len(tokens), i + self.n_batch)] n_past = min(n_ctx - len(batch), len(self._input_ids)) n_tokens = len(batch) - return_code = llama_cpp.llama_eval( + return_code = falcon_cpp.falcon_eval( ctx=self.ctx, - tokens=(llama_cpp.llama_token * len(batch))(*batch), - n_tokens=llama_cpp.c_int(n_tokens), - n_past=llama_cpp.c_int(n_past), - n_threads=llama_cpp.c_int(self.n_threads), + tokens=(falcon_cpp.falcon_token * len(batch))(*batch), + n_tokens=falcon_cpp.c_int(n_tokens), + n_past=falcon_cpp.c_int(n_past), + n_threads=falcon_cpp.c_int(self.n_threads), ) if return_code != 0: - raise RuntimeError(f"llama_eval returned {return_code}") + raise RuntimeError(f"falcon_eval returned {return_code}") # Save tokens - self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch + self.input_ids[self.n_tokens: self.n_tokens + n_tokens] = batch # Save logits rows = n_tokens if self.params.logits_all else 1 cols = self._n_vocab - offset = 0 if self.params.logits_all else n_tokens - 1 # NOTE: Only save the last token logits if logits_all is False - self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[:] = llama_cpp.llama_get_logits(self.ctx)[:rows * cols] + offset = 0 if self.params.logits_all else n_tokens - 1 # NOTE: Only save the last token logits if logits_all is False + self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[ + :] = falcon_cpp.falcon_get_logits(self.ctx)[:rows * cols] # Update n_tokens self.n_tokens += n_tokens def _sample( - self, - last_n_tokens_data, # type: llama_cpp.Array[llama_cpp.llama_token] - last_n_tokens_size: llama_cpp.c_int, - top_k: llama_cpp.c_int, - top_p: llama_cpp.c_float, - temp: llama_cpp.c_float, - tfs_z: llama_cpp.c_float, - repeat_penalty: llama_cpp.c_float, - frequency_penalty: llama_cpp.c_float, - presence_penalty: llama_cpp.c_float, - mirostat_mode: llama_cpp.c_int, - mirostat_tau: llama_cpp.c_float, - mirostat_eta: llama_cpp.c_float, - penalize_nl: bool = True, - logits_processor: Optional[LogitsProcessorList] = None, + self, + last_n_tokens_data, # type: falcon_cpp.Array[falcon_cpp.falcon_token] + last_n_tokens_size: falcon_cpp.c_int, + top_k: falcon_cpp.c_int, + top_p: falcon_cpp.c_float, + temp: falcon_cpp.c_float, + tfs_z: falcon_cpp.c_float, + repeat_penalty: falcon_cpp.c_float, + frequency_penalty: falcon_cpp.c_float, + presence_penalty: falcon_cpp.c_float, + mirostat_mode: falcon_cpp.c_int, + mirostat_tau: falcon_cpp.c_float, + mirostat_eta: falcon_cpp.c_float, + penalize_nl: bool = True, + logits_processor: 
Optional[LogitsProcessorList] = None, ): assert self.ctx is not None assert self.n_tokens > 0 n_vocab = self._n_vocab n_ctx = self._n_ctx - top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k + top_k = falcon_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k last_n_tokens_size = ( - llama_cpp.c_int(n_ctx) + falcon_cpp.c_int(n_ctx) if last_n_tokens_size.value < 0 else last_n_tokens_size ) @@ -490,110 +510,110 @@ def _sample( candidates_data["id"] = np.arange(n_vocab, dtype=np.intc) # type: ignore candidates_data["logit"] = logits candidates_data["p"] = np.zeros(n_vocab, dtype=np.single) - candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) - candidates.sorted = llama_cpp.c_bool(False) - candidates.size = llama_cpp.c_size_t(n_vocab) - llama_cpp.llama_sample_repetition_penalty( + candidates.data = candidates_data.ctypes.data_as(falcon_cpp.falcon_token_data_p) + candidates.sorted = falcon_cpp.c_bool(False) + candidates.size = falcon_cpp.c_size_t(n_vocab) + falcon_cpp.falcon_sample_repetition_penalty( ctx=self.ctx, last_tokens_data=last_n_tokens_data, last_tokens_size=last_n_tokens_size, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore penalty=repeat_penalty, ) - llama_cpp.llama_sample_frequency_and_presence_penalties( + falcon_cpp.falcon_sample_frequency_and_presence_penalties( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore last_tokens_data=last_n_tokens_data, last_tokens_size=last_n_tokens_size, alpha_frequency=frequency_penalty, alpha_presence=presence_penalty, ) if not penalize_nl: - candidates.data[self._token_nl].logit = llama_cpp.c_float(nl_logit) + candidates.data[self._token_nl].logit = falcon_cpp.c_float(nl_logit) if temp.value == 0.0: - return llama_cpp.llama_sample_token_greedy( + return falcon_cpp.falcon_sample_token_greedy( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore ) elif mirostat_mode.value == 1: - mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) - mirostat_m = llama_cpp.c_int(100) - llama_cpp.llama_sample_temperature( + mirostat_mu = falcon_cpp.c_float(2.0 * mirostat_tau.value) + mirostat_m = falcon_cpp.c_int(100) + falcon_cpp.falcon_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore temp=temp, ) - return llama_cpp.llama_sample_token_mirostat( + return falcon_cpp.falcon_sample_token_mirostat( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore tau=mirostat_tau, eta=mirostat_eta, - mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore + mu=falcon_cpp.ctypes.byref(mirostat_mu), # type: ignore m=mirostat_m, ) elif mirostat_mode.value == 2: - mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) - llama_cpp.llama_sample_temperature( + mirostat_mu = falcon_cpp.c_float(2.0 * mirostat_tau.value) + falcon_cpp.falcon_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=falcon_cpp.ctypes.pointer(candidates), temp=temp, ) - return llama_cpp.llama_sample_token_mirostat_v2( + return falcon_cpp.falcon_sample_token_mirostat_v2( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + 
candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore tau=mirostat_tau, eta=mirostat_eta, - mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore + mu=falcon_cpp.ctypes.byref(mirostat_mu), # type: ignore ) else: - llama_cpp.llama_sample_top_k( + falcon_cpp.falcon_sample_top_k( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore k=top_k, - min_keep=llama_cpp.c_size_t(1), + min_keep=falcon_cpp.c_size_t(1), ) - llama_cpp.llama_sample_tail_free( + falcon_cpp.falcon_sample_tail_free( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore z=tfs_z, - min_keep=llama_cpp.c_size_t(1), + min_keep=falcon_cpp.c_size_t(1), ) - llama_cpp.llama_sample_typical( + falcon_cpp.falcon_sample_typical( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - p=llama_cpp.c_float(1.0), - min_keep=llama_cpp.c_size_t(1), + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore + p=falcon_cpp.c_float(1.0), + min_keep=falcon_cpp.c_size_t(1), ) - llama_cpp.llama_sample_top_p( + falcon_cpp.falcon_sample_top_p( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore p=top_p, - min_keep=llama_cpp.c_size_t(1), + min_keep=falcon_cpp.c_size_t(1), ) - llama_cpp.llama_sample_temperature( + falcon_cpp.falcon_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore temp=temp, ) - return llama_cpp.llama_sample_token( + return falcon_cpp.falcon_sample_token( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore ) def sample( - self, - top_k: int = 40, - top_p: float = 0.95, - temp: float = 0.80, - repeat_penalty: float = 1.1, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_eta: float = 0.1, - mirostat_tau: float = 5.0, - penalize_nl: bool = True, - logits_processor: Optional[LogitsProcessorList] = None, + self, + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_eta: float = 0.1, + mirostat_tau: float = 5.0, + penalize_nl: bool = True, + logits_processor: Optional[LogitsProcessorList] = None, ): """Sample a token from the model. @@ -607,52 +627,52 @@ def sample( The sampled token. 
""" assert self.ctx is not None - last_n_tokens_data = [llama_cpp.llama_token(0)] * max( + last_n_tokens_data = [falcon_cpp.falcon_token(0)] * max( 0, self.last_n_tokens_size - len(self._input_ids) - ) + self._input_ids[-self.last_n_tokens_size :].tolist() + ) + self._input_ids[-self.last_n_tokens_size:].tolist() return self._sample( - last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)( + last_n_tokens_data=(falcon_cpp.falcon_token * self.last_n_tokens_size)( *last_n_tokens_data ), - last_n_tokens_size=llama_cpp.c_int(self.last_n_tokens_size), - top_k=llama_cpp.c_int(top_k), - top_p=llama_cpp.c_float(top_p), - temp=llama_cpp.c_float(temp), - tfs_z=llama_cpp.c_float(tfs_z), - repeat_penalty=llama_cpp.c_float(repeat_penalty), - frequency_penalty=llama_cpp.c_float(frequency_penalty), - presence_penalty=llama_cpp.c_float(presence_penalty), - mirostat_mode=llama_cpp.c_int(mirostat_mode), - mirostat_tau=llama_cpp.c_float(mirostat_tau), - mirostat_eta=llama_cpp.c_float(mirostat_eta), + last_n_tokens_size=falcon_cpp.c_int(self.last_n_tokens_size), + top_k=falcon_cpp.c_int(top_k), + top_p=falcon_cpp.c_float(top_p), + temp=falcon_cpp.c_float(temp), + tfs_z=falcon_cpp.c_float(tfs_z), + repeat_penalty=falcon_cpp.c_float(repeat_penalty), + frequency_penalty=falcon_cpp.c_float(frequency_penalty), + presence_penalty=falcon_cpp.c_float(presence_penalty), + mirostat_mode=falcon_cpp.c_int(mirostat_mode), + mirostat_tau=falcon_cpp.c_float(mirostat_tau), + mirostat_eta=falcon_cpp.c_float(mirostat_eta), penalize_nl=penalize_nl, logits_processor=logits_processor, ) def generate( - self, - tokens: Sequence[int], - top_k: int = 40, - top_p: float = 0.95, - temp: float = 0.80, - repeat_penalty: float = 1.1, - reset: bool = True, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, + self, + tokens: Sequence[int], + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, + reset: bool = True, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. Examples: - >>> llama = Llama("models/ggml-7b.bin") - >>> tokens = llama.tokenize(b"Hello, world!") - >>> for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.1): - ... print(llama.detokenize([token])) + >>> falcon = Falcon("models/ggml-7b.bin") + >>> tokens = falcon.tokenize(b"Hello, world!") + >>> for token in falcon.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.1): + ... print(falcon.detokenize([token])) Args: tokens: The prompt tokens. 
@@ -676,7 +696,7 @@ def generate( break if longest_prefix > 0: if self.verbose: - print("Llama.generate: prefix-match hit", file=sys.stderr) + print("Falcon.generate: prefix-match hit", file=sys.stderr) reset = False tokens = tokens[longest_prefix:] self.n_tokens = longest_prefix @@ -700,7 +720,7 @@ def generate( logits_processor=logits_processor, ) if stopping_criteria is not None and stopping_criteria( - self._input_ids.tolist(), self._scores[-1, :].tolist() + self._input_ids.tolist(), self._scores[-1, :].tolist() ): return tokens_or_none = yield token @@ -709,7 +729,7 @@ def generate( tokens.extend(tokens_or_none) def create_embedding( - self, input: Union[str, List[str]], model: Optional[str] = None + self, input: Union[str, List[str]], model: Optional[str] = None ) -> Embedding: """Embed a string. @@ -724,11 +744,11 @@ def create_embedding( if self.params.embedding == False: raise RuntimeError( - "Llama model must be created with embedding=True to call this method" + "Falcon model must be created with embedding=True to call this method" ) if self.verbose: - llama_cpp.llama_reset_timings(self.ctx) + falcon_cpp.falcon_reset_timings(self.ctx) if isinstance(input, str): inputs = [input] @@ -743,9 +763,9 @@ def create_embedding( self.eval(tokens) n_tokens = len(tokens) total_tokens += n_tokens - embedding = llama_cpp.llama_get_embeddings(self.ctx)[ - : llama_cpp.llama_n_embd(self.ctx) - ] + embedding = falcon_cpp.falcon_get_embeddings(self.ctx)[ + : falcon_cpp.falcon_n_embd(self.ctx) + ] data.append( { @@ -755,7 +775,7 @@ def create_embedding( } ) if self.verbose: - llama_cpp.llama_print_timings(self.ctx) + falcon_cpp.falcon_print_timings(self.ctx) return { "object": "list", @@ -779,34 +799,34 @@ def embed(self, input: str) -> List[float]: return list(map(float, self.create_embedding(input)["data"][0]["embedding"])) def _create_completion( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 16, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 16, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None completion_id: str = f"cmpl-{str(uuid.uuid4())}" created: int = int(time.time()) completion_tokens: List[int] = [] - # Add blank space to start of prompt to match OG llama tokenizer + # Add blank space to start of prompt to match OG Falcon tokenizer prompt_tokens: List[int] = self.tokenize(b" " + prompt.encode("utf-8")) text: bytes = 
b"" returned_tokens: int = 0 @@ -816,7 +836,7 @@ def _create_completion( model_name: str = model if model is not None else self.model_path if self.verbose: - llama_cpp.llama_reset_timings(self.ctx) + falcon_cpp.falcon_reset_timings(self.ctx) if len(prompt_tokens) > self._n_ctx: raise ValueError( @@ -843,36 +863,36 @@ def _create_completion( if self.cache: try: cache_item = self.cache[prompt_tokens] - cache_prefix_len = Llama.longest_token_prefix( + cache_prefix_len = Falcon.longest_token_prefix( cache_item.input_ids.tolist(), prompt_tokens ) - eval_prefix_len = Llama.longest_token_prefix( + eval_prefix_len = Falcon.longest_token_prefix( self._input_ids.tolist(), prompt_tokens ) if cache_prefix_len > eval_prefix_len: self.load_state(cache_item) if self.verbose: - print("Llama._create_completion: cache hit", file=sys.stderr) + print("Falcon._create_completion: cache hit", file=sys.stderr) except KeyError: if self.verbose: - print("Llama._create_completion: cache miss", file=sys.stderr) + print("Falcon._create_completion: cache miss", file=sys.stderr) finish_reason = "length" multibyte_fix = 0 for token in self.generate( - prompt_tokens, - top_k=top_k, - top_p=top_p, - temp=temperature, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - repeat_penalty=repeat_penalty, - stopping_criteria=stopping_criteria, - logits_processor=logits_processor, + prompt_tokens, + top_k=top_k, + top_p=top_p, + temp=temperature, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, + repeat_penalty=repeat_penalty, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, ): if token == self._token_eos: text = self.detokenize(completion_tokens) @@ -924,7 +944,7 @@ def _create_completion( token_end_position += len(self.detokenize([token])) # Check if stop sequence is in the token if token_end_position >= ( - remaining_length - first_stop_position - 1 + remaining_length - first_stop_position - 1 ): break logprobs_or_none: Optional[CompletionLogprobs] = None @@ -937,7 +957,7 @@ def _create_completion( ) token_offset = len(prompt_tokens) + returned_tokens logits = self._scores[token_offset - 1, :].tolist() - current_logprobs = Llama.logits_to_logprobs(logits) + current_logprobs = Falcon.logits_to_logprobs(logits) sorted_logprobs = list( sorted( zip(current_logprobs, range(len(current_logprobs))), @@ -985,13 +1005,13 @@ def _create_completion( break if stopping_criteria is not None and stopping_criteria( - self._input_ids.tolist(), self._scores[-1, :].tolist() + self._input_ids.tolist(), self._scores[-1, :].tolist() ): text = self.detokenize(completion_tokens) finish_reason = "stop" if self.verbose: - llama_cpp.llama_print_timings(self.ctx) + falcon_cpp.falcon_print_timings(self.ctx) if stream: remaining_tokens = completion_tokens[returned_tokens:] @@ -1016,7 +1036,7 @@ def _create_completion( ) token_offset = len(prompt_tokens) + returned_tokens - 1 logits = self._scores[token_offset, :].tolist() - current_logprobs = Llama.logits_to_logprobs(logits) + current_logprobs = Falcon.logits_to_logprobs(logits) sorted_logprobs = list( sorted( zip(current_logprobs, range(len(current_logprobs))), @@ -1050,8 +1070,8 @@ def _create_completion( "choices": [ { "text": last_text[ - : len(last_text) - (token_end_position - end) - ].decode("utf-8", errors="ignore"), 
+ : len(last_text) - (token_end_position - end) + ].decode("utf-8", errors="ignore"), "index": 0, "logprobs": logprobs_or_none, "finish_reason": finish_reason, @@ -1080,14 +1100,14 @@ def _create_completion( } if self.cache: if self.verbose: - print("Llama._create_completion: cache save", file=sys.stderr) + print("Falcon._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() - print("Llama._create_completion: cache saved", file=sys.stderr) + print("Falcon._create_completion: cache saved", file=sys.stderr) return if self.cache: if self.verbose: - print("Llama._create_completion: cache save", file=sys.stderr) + print("Falcon._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() text_str = text.decode("utf-8", errors="ignore") @@ -1118,10 +1138,10 @@ def _create_completion( for token in all_tokens ] all_logprobs = [ - Llama.logits_to_logprobs(row.tolist()) for row in self._scores - ][token_offset:] + Falcon.logits_to_logprobs(row.tolist()) for row in self._scores + ][token_offset:] for token, token_str, logprobs_token in zip( - all_tokens, all_token_strs, all_logprobs + all_tokens, all_token_strs, all_logprobs ): text_offsets.append(text_offset) text_offset += len(token_str) @@ -1172,27 +1192,27 @@ def _create_completion( } def create_completion( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 128, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 128, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. 
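For orientation, a hypothetical usage sketch of the renamed high-level API (the model path below is illustrative, not shipped with the repo): `create_completion()` and `__call__()` take the same parameters and return an OpenAI-style `Completion` dict, or an iterator of `CompletionChunk` dicts when `stream=True`.

```python
import falcon_cpp

# Illustrative path: any ggllm.cpp-compatible Falcon GGML file should work here.
falcon = falcon_cpp.Falcon(model_path="./models/falcon-7b.q4_0.bin")

# Non-streaming: a single Completion dict.
out = falcon(
    "Q: Name the planets in the solar system. A:",
    max_tokens=64,
    stop=["Q:", "\n"],
)
print(out["choices"][0]["text"])

# Streaming: CompletionChunk dicts are yielded as tokens are generated.
for chunk in falcon("Write one sentence about falcons.", max_tokens=48, stream=True):
    print(chunk["choices"][0]["text"], end="", flush=True)
```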
@@ -1245,27 +1265,27 @@ def create_completion( return completion def __call__( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 128, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 128, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -1313,7 +1333,7 @@ def __call__( ) def _convert_text_completion_to_chat( - self, completion: Completion + self, completion: Completion ) -> ChatCompletion: return { "id": "chat" + completion["id"], @@ -1334,8 +1354,8 @@ def _convert_text_completion_to_chat( } def _convert_text_completion_chunks_to_chat( - self, - chunks: Iterator[CompletionChunk], + self, + chunks: Iterator[CompletionChunk], ) -> Iterator[ChatCompletionChunk]: for i, chunk in enumerate(chunks): if i == 0: @@ -1371,23 +1391,23 @@ def _convert_text_completion_chunks_to_chat( } def create_chat_completion( - self, - messages: List[ChatCompletionMessage], - temperature: float = 0.2, - top_p: float = 0.95, - top_k: int = 40, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - max_tokens: int = 256, - presence_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + messages: List[ChatCompletionMessage], + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + max_tokens: int = 256, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. 
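Similarly, a hedged sketch of the chat interface (model path and messages are illustrative only): `create_chat_completion()` wraps the text-completion path (see the `_convert_text_completion_*` helpers above), so it accepts the same sampling knobs and returns an OpenAI-style `ChatCompletion` dict, or a `ChatCompletionChunk` iterator when `stream=True`.

```python
import falcon_cpp

falcon = falcon_cpp.Falcon(model_path="./models/falcon-40b-instruct.q4_K_M.bin")  # illustrative path

response = falcon.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a concise assistant."},
        {"role": "user", "content": "Explain what repeat_penalty does."},
    ],
    temperature=0.2,
    max_tokens=128,
)
# The converted response mirrors _convert_text_completion_to_chat() above.
print(response["choices"][0]["message"]["content"])
```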
@@ -1440,10 +1460,10 @@ def create_chat_completion( def __del__(self): if self.model is not None: - llama_cpp.llama_free_model(self.model) + falcon_cpp.falcon_free_model(self.model) self.model = None if self.ctx is not None: - llama_cpp.llama_free(self.ctx) + falcon_cpp.falcon_free(self.ctx) self.ctx = None def __getstate__(self): @@ -1492,82 +1512,82 @@ def __setstate__(self, state): verbose=state["verbose"], ) - def save_state(self) -> LlamaState: + def save_state(self) -> FalconState: assert self.ctx is not None if self.verbose: - print("Llama.save_state: saving llama state", file=sys.stderr) - state_size = llama_cpp.llama_get_state_size(self.ctx) + print("Falcon.save_state: saving falcon state", file=sys.stderr) + state_size = falcon_cpp.falcon_get_state_size(self.ctx) if self.verbose: - print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr) - llama_state = (llama_cpp.c_uint8 * int(state_size))() + print(f"Falcon.save_state: got state size: {state_size}", file=sys.stderr) + falcon_state = (falcon_cpp.c_uint8 * int(state_size))() if self.verbose: - print("Llama.save_state: allocated state", file=sys.stderr) - n_bytes = llama_cpp.llama_copy_state_data(self.ctx, llama_state) + print("Falcon.save_state: allocated state", file=sys.stderr) + n_bytes = falcon_cpp.falcon_copy_state_data(self.ctx, falcon_state) if self.verbose: - print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr) + print(f"Falcon.save_state: copied falcon state: {n_bytes}", file=sys.stderr) if int(n_bytes) > int(state_size): - raise RuntimeError("Failed to copy llama state data") - llama_state_compact = (llama_cpp.c_uint8 * int(n_bytes))() - llama_cpp.ctypes.memmove(llama_state_compact, llama_state, int(n_bytes)) + raise RuntimeError("Failed to copy Falcon state data") + falcon_state_compact = (falcon_cpp.c_uint8 * int(n_bytes))() + falcon_cpp.ctypes.memmove(falcon_state_compact, falcon_state, int(n_bytes)) if self.verbose: print( - f"Llama.save_state: saving {n_bytes} bytes of llama state", + f"Falcon.save_state: saving {n_bytes} bytes of falcon state", file=sys.stderr, ) - return LlamaState( + return FalconState( scores=self.scores.copy(), input_ids=self.input_ids.copy(), n_tokens=self.n_tokens, - llama_state=bytes(llama_state_compact), - llama_state_size=n_bytes, + falcon_state=bytes(falcon_state_compact), + falcon_state_size=n_bytes, ) - def load_state(self, state: LlamaState) -> None: + def load_state(self, state: FalconState) -> None: assert self.ctx is not None self.scores = state.scores.copy() self.input_ids = state.input_ids.copy() self.n_tokens = state.n_tokens - state_size = state.llama_state_size - LLamaStateArrayType = llama_cpp.c_uint8 * state_size - llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state) + state_size = state.falcon_state_size + FalconStateArrayType = falcon_cpp.c_uint8 * state_size + falcon_state = FalconStateArrayType.from_buffer_copy(state.falcon_state) - if llama_cpp.llama_set_state_data(self.ctx, llama_state) != state_size: - raise RuntimeError("Failed to set llama state data") + if falcon_cpp.falcon_set_state_data(self.ctx, falcon_state) != state_size: + raise RuntimeError("Failed to set Falcon state data") def n_ctx(self) -> int: """Return the context window size.""" assert self.ctx is not None - return llama_cpp.llama_n_ctx(self.ctx) + return falcon_cpp.falcon_n_ctx(self.ctx) def n_embd(self) -> int: """Return the embedding size.""" assert self.ctx is not None - return llama_cpp.llama_n_embd(self.ctx) + return 
falcon_cpp.falcon_n_embd(self.ctx) def n_vocab(self) -> int: """Return the vocabulary size.""" assert self.ctx is not None - return llama_cpp.llama_n_vocab(self.ctx) + return falcon_cpp.falcon_n_vocab(self.ctx) - def tokenizer(self) -> "LlamaTokenizer": + def tokenizer(self) -> "FalconTokenizer": """Return the tokenizer for this model.""" assert self.ctx is not None - return LlamaTokenizer(self) + return FalconTokenizer(self) @staticmethod def token_eos() -> int: """Return the end-of-sequence token.""" - return llama_cpp.llama_token_eos() + return falcon_cpp.falcon_token_eos() @staticmethod def token_bos() -> int: """Return the beginning-of-sequence token.""" - return llama_cpp.llama_token_bos() + return falcon_cpp.falcon_token_bos() @staticmethod def token_nl() -> int: """Return the newline token.""" - return llama_cpp.llama_token_nl() + return falcon_cpp.falcon_token_nl() @staticmethod def logits_to_logprobs(logits: List[float]) -> List[float]: @@ -1586,18 +1606,18 @@ def longest_token_prefix(a: Sequence[int], b: Sequence[int]): return longest_prefix -class LlamaTokenizer: - def __init__(self, llama: Llama): - self.llama = llama +class FalconTokenizer: + def __init__(self, falcon: Falcon): + self.falcon = falcon def encode(self, text: str, add_bos: bool = True) -> List[int]: - return self.llama.tokenize( + return self.falcon.tokenize( text.encode("utf-8", errors="ignore"), add_bos=add_bos ) def decode(self, tokens: List[int]) -> str: - return self.llama.detokenize(tokens).decode("utf-8", errors="ignore") + return self.falcon.detokenize(tokens).decode("utf-8", errors="ignore") @classmethod - def from_ggml_file(cls, path: str) -> "LlamaTokenizer": - return cls(Llama(model_path=path, vocab_only=True)) + def from_ggml_file(cls, path: str) -> "FalconTokenizer": + return cls(Falcon(model_path=path, vocab_only=True)) \ No newline at end of file diff --git a/falcon_cpp/falcon_cpp.py b/falcon_cpp/falcon_cpp.py new file mode 100644 index 000000000..121b98c96 --- /dev/null +++ b/falcon_cpp/falcon_cpp.py @@ -0,0 +1,1023 @@ +import sys +import os +import ctypes +from ctypes import ( + c_int, + c_float, + c_char_p, + c_void_p, + c_bool, + POINTER, + _Pointer, # type: ignore + Structure, + Array, + c_uint8, + c_size_t, +) +import pathlib +from typing import List, Union + + +# Load the library +def _load_shared_library(lib_base_name: str): + # Construct the paths to the possible shared library names + _base_path = pathlib.Path(__file__).parent.resolve() + # Searching for the library in the current directory under the name "libFalcon" (default name + # for falconcpp) and "falcon" (default name for this repo) + _lib_paths: List[pathlib.Path] = [] + # Determine the file extension based on the platform + if sys.platform.startswith("linux"): + _lib_paths += [ + _base_path / f"lib{lib_base_name}.so", + ] + elif sys.platform == "darwin": + _lib_paths += [ + _base_path / f"lib{lib_base_name}.so", + _base_path / f"lib{lib_base_name}.dylib", + ] + elif sys.platform == "win32": + _lib_paths += [ + _base_path / f"{lib_base_name}.dll", + ] + else: + raise RuntimeError("Unsupported platform") + + if "FALCON_CPP_LIB" in os.environ: + lib_base_name = os.environ["FALCON_CPP_LIB"] + _lib = pathlib.Path(lib_base_name) + _base_path = _lib.parent.resolve() + _lib_paths = [_lib.resolve()] + + cdll_args = dict() # type: ignore + # Add the library directory to the DLL search path on Windows (if needed) + if sys.platform == "win32" and sys.version_info >= (3, 8): + os.add_dll_directory(str(_base_path)) + if "CUDA_PATH" in 
os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) + cdll_args["winmode"] = 0 + + # Try to load the shared library, handling potential errors + for _lib_path in _lib_paths: + if _lib_path.exists(): + try: + return ctypes.CDLL(str(_lib_path), **cdll_args) + except Exception as e: + raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") + + raise FileNotFoundError( + f"Shared library with base name '{lib_base_name}' not found" + ) + + +# Specify the base name of the shared library to load +_lib_base_name = "falcon" + +# Load the library +_lib = _load_shared_library(_lib_base_name) + +# Misc +c_float_p = POINTER(c_float) +c_uint8_p = POINTER(c_uint8) +c_size_t_p = POINTER(c_size_t) + +# falcon.h bindings + +GGML_USE_CUBLAS = hasattr(_lib, "ggml_init_cublas") +GGML_CUDA_MAX_DEVICES = ctypes.c_int(16) +FALCON_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else ctypes.c_int(1) + +# #define FALCON_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' +FALCON_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74) +# #define FALCON_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' +FALCON_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61) +# #define FALCON_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' +FALCON_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66) +# #define FLACON_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' +FALCON_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C) +# #define FALCON_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' +FALCON_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E) + +# #define FALCON_FILE_VERSION 3 +FALCON_FILE_VERSION = c_int(3) +FALCON_FILE_MAGIC = FALCON_FILE_MAGIC_GGJT +FALCON_FILE_MAGIC_UNVERSIONED = FALCON_FILE_MAGIC_GGML +FALCON_SESSION_MAGIC = FALCON_FILE_MAGIC_GGSN +FALCON_SESSION_VERSION = c_int(1) + +# struct falcon_model; +falcon_model_p = c_void_p + +# struct falcon_context; +falcon_context_p = c_void_p + + +# typedef int falcon_token; +falcon_token = c_int +falcon_token_p = POINTER(falcon_token) + + +# typedef struct falcon_token_data { +# falcon_token id; // token id +# float logit; // log-odds of the token +# float p; // probability of the token +# } falcon_token_data; +class falcon_token_data(Structure): + _fields_ = [ + ("id", falcon_token), + ("logit", c_float), + ("p", c_float), + ] + + +falcon_token_data_p = POINTER(falcon_token_data) + +# typedef struct falcon_token_data_array { +# falcon_token_data * data; +# size_t size; +# bool sorted; +# } falcon_token_data_array; +class falcon_token_data_array(Structure): + _fields_ = [ + ("data", falcon_token_data_p), + ("size", c_size_t), + ("sorted", c_bool), + ] + + +falcon_token_data_array_p = POINTER(falcon_token_data_array) + +# typedef void (*falcon_progress_callback)(float progress, void *ctx); +falcon_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) + + +# struct falcon_context_params { +# int seed; // RNG seed, -1 for random +# int n_ctx; // text context +# int n_batch; // prompt processing batch size +# int n_gpu_layers; // number of layers to store in VRAM +# int main_gpu; // the GPU that is used for scratch and small tensors +# float tensor_split[FALCON_MAX_DEVICES]; // how to split layers across multiple GPUs +# // called with a progress value between 0 and 1, pass NULL to disable +# falcon_progress_callback progress_callback; +# // context pointer passed to the progress callback +# void * progress_callback_user_data; + + +# // Keep the booleans together to avoid misalignment during copy-by-value. 
+# bool low_vram; // if true, reduce VRAM usage at the cost of performance +# bool f16_kv; // use fp16 for KV cache +# bool logits_all; // the falcon_eval() call computes all logits, not just the last one +# bool vocab_only; // only load the vocabulary, no weights +# bool use_mmap; // use mmap if possible +# bool use_mlock; // force system to keep model in RAM +# bool embedding; // embedding mode only +# }; +class ggllm_context_params(Structure): + _fields_ = [ + ("seed", c_int), + ("n_ctx", c_int), + ("n_batch", c_int), + ("n_gpu_layers", c_int), + ("main_gpu", c_int), + ("tensor_split", c_float * FALCON_MAX_DEVICES.value), + ("progress_callback", falcon_progress_callback), + ("progress_callback_user_data", c_void_p), + ("low_vram", c_bool), + ("f16_kv", c_bool), + ("logits_all", c_bool), + ("vocab_only", c_bool), + ("use_mmap", c_bool), + ("use_mlock", c_bool), + ("embedding", c_bool), + ] + + +falcon_context_params_p = POINTER(ggllm_context_params) + +# enum falcon_ftype { +# FALCON_FTYPE_ALL_F32 = 0, +# FALCON_FTYPE_MOSTLY_F16 = 1, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 +# // FALCON_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed +# // FALCON_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed +# FALCON_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors +# }; +FALCON_FTYPE_ALL_F32 = c_int(0) +FALCON_FTYPE_MOSTLY_F16 = c_int(1) +FALCON_FTYPE_MOSTLY_Q4_0 = c_int(2) +FALCON_FTYPE_MOSTLY_Q4_1 = c_int(3) +FALCON_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4) +FALCON_FTYPE_MOSTLY_Q8_0 = c_int(7) +FALCON_FTYPE_MOSTLY_Q5_0 = c_int(8) +FALCON_FTYPE_MOSTLY_Q5_1 = c_int(9) +FALCON_FTYPE_MOSTLY_Q2_K = c_int(10) +FALCON_FTYPE_MOSTLY_Q3_K_S = c_int(11) +FALCON_FTYPE_MOSTLY_Q3_K_M = c_int(12) +FALCON_FTYPE_MOSTLY_Q3_K_L = c_int(13) +FALCON_FTYPE_MOSTLY_Q4_K_S = c_int(14) +FALCON_FTYPE_MOSTLY_Q4_K_M = c_int(15) +FALCON_FTYPE_MOSTLY_Q5_K_S = c_int(16) +FALCON_FTYPE_MOSTLY_Q5_K_M = c_int(17) +FALCON_FTYPE_MOSTLY_Q6_K = c_int(18) + + +# // model quantization parameters +# typedef struct falcon_model_quantize_params { +# int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() +# enum falcon_ftype ftype; // quantize to this falcon_ftype +# bool allow_requantize; // allow quantizing non-f32/f16 tensors +# bool quantize_output_tensor; // quantize output.weight +# } falcon_model_quantize_params; +class falcon_model_quantize_params(Structure): + _fields_ = [ + ("nthread", c_int), + ("ftype", c_int), + ("allow_requantize", c_bool), + ("quantize_output_tensor", c_bool), + ] + + +# FALCON_API struct falcon_context_params falcon_context_default_params(); +def falcon_context_default_params() -> ggllm_context_params: + return _lib.ggllm_context_default_params() + + 
+_lib.ggllm_context_default_params.argtypes = [] +_lib.ggllm_context_default_params.restype = ggllm_context_params + + +# FALCON_API struct falcon_model_quantize_params falcon_model_quantize_default_params(); +def falcon_model_quantize_default_params() -> falcon_model_quantize_params: + return _lib.ggllm_model_quantize_default_params() + + +_lib.ggllm_model_quantize_default_params.argtypes = [] +_lib.ggllm_model_quantize_default_params.restype = falcon_model_quantize_params + + +# FALCON_API bool falcon_mmap_supported(); +def falcon_mmap_supported() -> bool: + return _lib.ggllm_mmap_supported() + + +_lib.ggllm_mmap_supported.argtypes = [] +_lib.ggllm_mmap_supported.restype = c_bool + + +# FALCON_API bool falcon_mlock_supported(); +def falcon_mlock_supported() -> bool: + return _lib.ggllm_mlock_supported() + + +_lib.ggllm_mlock_supported.argtypes = [] +_lib.ggllm_mlock_supported.restype = c_bool + + +# // TODO: not great API - very likely to change +# // Initialize the falcon + ggml backend +# // If numa is true, use NUMA optimizations +# // Call once at the start of the program +# FLACON_API void falcon_init_backend(bool numa); +def falcon_init_backend(numa: c_bool): + return _lib.ggllm_init_backend(numa) + + +_lib.ggllm_init_backend.argtypes = [c_bool] +_lib.ggllm_init_backend.restype = None + + +# FALCON_API struct falcon_model * falcon_load_model_from_file( +# const char * path_model, +# struct falcon_context_params params); +def falcon_load_model_from_file( + path_model: bytes, params: ggllm_context_params +) -> falcon_model_p: + return _lib.ggllm_load_model_from_file(path_model, params) + + +_lib.ggllm_load_model_from_file.argtypes = [c_char_p, ggllm_context_params] +_lib.ggllm_load_model_from_file.restype = falcon_model_p + + +# FALCON_API void falcon_free_model(struct falcon_model * model); +def falcon_free_model(model: falcon_model_p): + return _lib.ggllm_free_model(model) + + +_lib.ggllm_free_model.argtypes = [falcon_model_p] +_lib.ggllm_free_model.restype = None + + +# FALCON_API struct falcon_context * falcon_new_context_with_model( +# struct falcon_model * model, +# struct falcon_context_params params); +def falcon_new_context_with_model( + model: falcon_model_p, params: ggllm_context_params +) -> falcon_context_p: + return _lib.ggllm_new_context_with_model(model, params) + + +_lib.ggllm_new_context_with_model.argtypes = [falcon_model_p, ggllm_context_params] +_lib.ggllm_new_context_with_model.restype = falcon_context_p + + +# FALCON_API int64_t ggllm_time_us(); +def ggllm_time_us() -> int: + return _lib.ggllm_time_us() + + +_lib.ggllm_time_us.argtypes = [] +_lib.ggllm_time_us.restype = ctypes.c_int64 + + +# // Various functions for loading a ggml falcon model. +# // Allocate (almost) all memory needed for the model. 
+# // Return NULL on failure +# FALCON_API struct falcon_context * falcon_init_from_file( +# const char * path_model, +# struct falcon_context_params params); +def ggllm_init_from_file( + path_model: bytes, params: ggllm_context_params +) -> falcon_context_p: + return _lib.ggllm_init_from_file(path_model, params) + + +_lib.ggllm_init_from_file.argtypes = [c_char_p, ggllm_context_params] +_lib.ggllm_init_from_file.restype = falcon_context_p + + +# Frees all allocated memory +# FALCON_API void falcon_free(struct falcon_context * ctx); +def falcon_free(ctx: falcon_context_p): + return _lib.ggllm_free(ctx) + + +_lib.ggllm_free.argtypes = [falcon_context_p] +_lib.ggllm_free.restype = None + + +# // Returns 0 on success +# FALCON_API int ggllm_model_quantize( +# const char * fname_inp, +# const char * fname_out, +# const falcon_model_quantize_params * params); +def ggllm_model_quantize( + fname_inp: bytes, + fname_out: bytes, + params, # type: POINTER(falcon_model_quantize_params) # type: ignore +) -> int: + return _lib.ggllm_model_quantize(fname_inp, fname_out, params) + + +_lib.ggllm_model_quantize.argtypes = [ + c_char_p, + c_char_p, + POINTER(falcon_model_quantize_params), +] +_lib.ggllm_model_quantize.restype = c_int + + +# Apply a LoRA adapter to a loaded model +# path_base_model is the path to a higher quality model to use as a base for +# the layers modified by the adapter. Can be NULL to use the current loaded model. +# The model needs to be reloaded before applying a new adapter, otherwise the adapter +# will be applied on top of the previous one +# Returns 0 on success +# FALCON_API int falcon_apply_lora_from_file( +# struct falcon_context * ctx, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); +def ggllm_apply_lora_from_file( + ctx: falcon_context_p, + path_lora: c_char_p, + path_base_model: c_char_p, + n_threads: c_int, +) -> int: + return _lib.ggllm_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) + + +_lib.ggllm_apply_lora_from_file.argtypes = [falcon_context_p, c_char_p, c_char_p, c_int] +_lib.ggllm_apply_lora_from_file.restype = c_int + + +# FALCON_API int ggllm_model_apply_lora_from_file( +# const struct ggllm_model * model, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); +def falcon_model_apply_lora_from_file( + model: falcon_model_p, + path_lora: Union[c_char_p, bytes], + path_base_model: Union[c_char_p, bytes], + n_threads: c_int, +) -> int: + return _lib.ggllm_model_apply_lora_from_file( + model, path_lora, path_base_model, n_threads + ) + + +_lib.ggllm_model_apply_lora_from_file.argtypes = [ + falcon_model_p, + c_char_p, + c_char_p, + c_int, +] +_lib.ggllm_model_apply_lora_from_file.restype = c_int + + +# Returns the number of tokens in the KV cache +# FALCON_API int falcon_get_kv_cache_token_count(const struct falcon_context * ctx); +def ggllm_get_kv_cache_token_count(ctx: falcon_context_p) -> int: + return _lib.ggllm_get_kv_cache_token_count(ctx) + + +_lib.ggllm_get_kv_cache_token_count.argtypes = [falcon_context_p] +_lib.ggllm_get_kv_cache_token_count.restype = c_int + + +# Sets the current rng seed. 
+# FALCON_API void falcon_set_rng_seed(struct falcon_context * ctx, int seed); +def falcon_set_rng_seed(ctx: falcon_context_p, seed: c_int): + return _lib.ggllm_set_rng_seed(ctx, seed) + + +_lib.ggllm_set_rng_seed.argtypes = [falcon_context_p, c_int] +_lib.ggllm_set_rng_seed.restype = None + + +# Returns the maximum size in bytes of the state (rng, logits, embedding +# and kv_cache) - will often be smaller after compacting tokens +# FALCON_API size_t falcon_get_state_size(const struct falcon_context * ctx); +def falcon_get_state_size(ctx: falcon_context_p) -> int: + return _lib.ggllm_get_state_size(ctx) + + +_lib.ggllm_get_state_size.argtypes = [falcon_context_p] +_lib.ggllm_get_state_size.restype = c_size_t + + +# Copies the state to the specified destination address. +# Destination needs to have allocated enough memory. +# Returns the number of bytes copied +# FALCON_API size_t falcon_copy_state_data(struct falcon_context * ctx, uint8_t * dst); +def falcon_copy_state_data( + ctx: falcon_context_p, dst # type: Array[c_uint8] +) -> int: + return _lib.ggllm_copy_state_data(ctx, dst) + + +_lib.ggllm_copy_state_data.argtypes = [falcon_context_p, c_uint8_p] +_lib.ggllm_copy_state_data.restype = c_size_t + + +# Set the state reading from the specified address +# Returns the number of bytes read +# FALCON_API size_t falcon_set_state_data(struct falcon_context * ctx, uint8_t * src); +def falcon_set_state_data( + ctx: falcon_context_p, src # type: Array[c_uint8] +) -> int: + return _lib.ggllm_set_state_data(ctx, src) + + +_lib.ggllm_set_state_data.argtypes = [falcon_context_p, c_uint8_p] +_lib.ggllm_set_state_data.restype = c_size_t + + +# Save/load session file +# GGLLM_API bool falcon_load_session_file(struct falcon_context * ctx, const char * path_session, falcon_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); +def ggllm_load_session_file( + ctx: falcon_context_p, + path_session: bytes, + tokens_out, # type: Array[falcon_token] + n_token_capacity: c_size_t, + n_token_count_out, # type: _Pointer[c_size_t] +) -> int: + return _lib.ggllm_load_session_file( + ctx, path_session, tokens_out, n_token_capacity, n_token_count_out + ) + + +_lib.ggllm_load_session_file.argtypes = [ + falcon_context_p, + c_char_p, + falcon_token_p, + c_size_t, + c_size_t_p, +] +_lib.ggllm_load_session_file.restype = c_size_t + + +# FALCON_API bool falcon_save_session_file(struct falcon_context * ctx, const char * path_session, const falcon_token * tokens, size_t n_token_count); +def ggllm_save_session_file( + ctx: falcon_context_p, + path_session: bytes, + tokens, # type: Array[falcon_token] + n_token_count: c_size_t, +) -> int: + return _lib.ggllm_save_session_file(ctx, path_session, tokens, n_token_count) + + +_lib.ggllm_save_session_file.argtypes = [ + falcon_context_p, + c_char_p, + falcon_token_p, + c_size_t, +] +_lib.ggllm_save_session_file.restype = c_size_t + + +# Run the falcon inference to obtain the logits and probabilities for the next token. 
+# tokens + n_tokens is the provided batch of new tokens to process +# n_past is the number of tokens to use from previous eval calls +# Returns 0 on success +# GGLLM_API int falcon_eval( +# struct falcon_context * ctx, +# const falcon_token * tokens, +# int n_tokens, +# int n_past, +# int n_threads); +def falcon_eval( + ctx: falcon_context_p, + tokens, # type: Array[falcon_token] + n_tokens: c_int, + n_past: c_int, + n_threads: c_int, +) -> int: + return _lib.ggllm_eval(ctx, tokens, n_tokens, n_past, n_threads) + + +_lib.ggllm_eval.argtypes = [falcon_context_p, falcon_token_p, c_int, c_int, c_int] +_lib.ggllm_eval.restype = c_int + + +# // Same as falcon_eval, but use float matrix input directly. +# FALCON_API int falcon_eval_embd( +# struct falcon_context * ctx, +# const float * embd, +# int n_tokens, +# int n_past, +# int n_threads); +def ggllm_eval_embd( + ctx: falcon_context_p, + embd, # type: Array[c_float] + n_tokens: c_int, + n_past: c_int, + n_threads: c_int, +) -> int: + return _lib.ggllm_eval_embd(ctx, embd, n_tokens, n_past, n_threads) + + +_lib.ggllm_eval_embd.argtypes = [falcon_context_p, c_float_p, c_int, c_int, c_int] +_lib.ggllm_eval_embd.restype = c_int + + +# Convert the provided text into tokens. +# The tokens pointer must be large enough to hold the resulting tokens. +# Returns the number of tokens on success, no more than n_max_tokens +# Returns a negative number on failure - the number of tokens that would have been returned +# TODO: not sure if correct +# FALCON_API int ggllm_tokenize( +# struct falcon_context * ctx, +# const char * text, +# falcon_token * tokens, +# int n_max_tokens, +# bool add_bos); +def falcon_tokenize( + ctx: falcon_context_p, + text: bytes, + tokens, # type: Array[falcon_token] + n_max_tokens: c_int, + add_bos: c_bool, +) -> int: + return _lib.ggllm_tokenize(ctx, text, tokens, n_max_tokens, add_bos) + + +_lib.ggllm_tokenize.argtypes = [falcon_context_p, c_char_p, falcon_token_p, c_int, c_bool] +_lib.ggllm_tokenize.restype = c_int + + +# GGLLM_API int ggllm_n_vocab(const struct falcon_context * ctx); +def falcon_n_vocab(ctx: falcon_context_p) -> int: + return _lib.ggllm_n_vocab(ctx) + + +_lib.ggllm_n_vocab.argtypes = [falcon_context_p] +_lib.ggllm_n_vocab.restype = c_int + + +# FALCON_API int falcon_n_ctx (const struct falcon_context * ctx); +def falcon_n_ctx(ctx: falcon_context_p) -> int: + return _lib.ggllm_n_ctx(ctx) + + +_lib.ggllm_n_ctx.argtypes = [falcon_context_p] +_lib.ggllm_n_ctx.restype = c_int + + +# FALCON_API int falcon_n_embd (const struct falcon_context * ctx); +def falcon_n_embd(ctx: falcon_context_p) -> int: + return _lib.ggllm_n_embd(ctx) + + +_lib.ggllm_n_embd.argtypes = [falcon_context_p] +_lib.ggllm_n_embd.restype = c_int + + +# // Get the vocabulary as output parameters. +# // Returns number of results. 
+# FALCON_API int falcon_get_vocab( +# const struct falcon_context * ctx, +# const char * * strings, +# float * scores, +# int capacity); +def falcon_get_vocab( + ctx: falcon_context_p, + strings, # type: Array[c_char_p] # type: ignore + scores, # type: Array[c_float] # type: ignore + capacity: c_int, +) -> int: + return _lib.ggllm_get_vocab(ctx, strings, scores, capacity) + + +_lib.ggllm_get_vocab.argtypes = [falcon_context_p, c_char_p, c_float, c_int] +_lib.ggllm_get_vocab.restype = c_int + + +# Token logits obtained from the last call to falcon_eval() +# The logits for the last token are stored in the last row +# Can be mutated in order to change the probabilities of the next token +# Rows: n_tokens +# Cols: n_vocab +# FALCON_API float * falcon_get_logits(struct falcon_context * ctx); +def falcon_get_logits( + ctx: falcon_context_p, +): # type: (...) -> Array[float] # type: ignore + return _lib.ggllm_get_logits(ctx) + + +_lib.ggllm_get_logits.argtypes = [falcon_context_p] +_lib.ggllm_get_logits.restype = c_float_p + + +# Get the embeddings for the input +# shape: [n_embd] (1-dimensional) +# FALCON_API float * falcon_get_embeddings(struct falcon_context * ctx); +def falcon_get_embeddings( + ctx: falcon_context_p, +): # type: (...) -> Array[float] # type: ignore + return _lib.ggllm_get_embeddings(ctx) + + +_lib.ggllm_get_embeddings.argtypes = [falcon_context_p] +_lib.ggllm_get_embeddings.restype = c_float_p + + +# Token Id -> String. Uses the vocabulary in the provided context +# FLACON_API const char * falcon_token_to_str(const struct falcon_context * ctx, falcon_token token); +def falcon_token_to_str(ctx: falcon_context_p, token: falcon_token) -> bytes: + return _lib.ggllm_token_to_str(ctx, token) + + +_lib.ggllm_token_to_str.argtypes = [falcon_context_p, falcon_token] +_lib.ggllm_token_to_str.restype = c_char_p + +# Special tokens + + +# FALCON_API falcon_token falcon_token_bos(); // beginning-of-sentence +def falcon_token_bos() -> int: + return _lib.ggllm_token_bos() + + +_lib.ggllm_token_bos.argtypes = [] +_lib.ggllm_token_bos.restype = falcon_token + + +# FALCON_API falcon_token falcon_token_eos(); // end-of-sentence +def falcon_token_eos() -> int: + return _lib.ggllm_token_eos() + + +_lib.ggllm_token_eos.argtypes = [] +_lib.ggllm_token_eos.restype = falcon_token + + +# FALCON_API falcon_token falcon_token_nl(); // next-line +def falcon_token_nl() -> int: + return _lib.ggllm_token_nl() + + +_lib.ggllm_token_nl.argtypes = [] +_lib.ggllm_token_nl.restype = falcon_token + + +# Sampling functions + + +# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. +# FALCON_API void falcon_sample_repetition_penalty(struct falcon_context * ctx, falcon_token_data_array * candidates, const falcon_token * last_tokens, size_t last_tokens_size, float penalty); +def falcon_sample_repetition_penalty( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + last_tokens_data, # type: Array[falcon_token] + last_tokens_size: c_int, + penalty: c_float, +): + return _lib.ggllm_sample_repetition_penalty( + ctx, candidates, last_tokens_data, last_tokens_size, penalty + ) + + +_lib.ggllm_sample_repetition_penalty.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + falcon_token_p, + c_int, + c_float, +] +_lib.ggllm_sample_repetition_penalty.restype = None + + +# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. 
+# FALCON_API void falcon_sample_frequency_and_presence_penalties(struct falcon_context * ctx, falcon_token_data_array * candidates, const falcon_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); +def falcon_sample_frequency_and_presence_penalties( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + last_tokens_data, # type: Array[falcon_token] + last_tokens_size: c_int, + alpha_frequency: c_float, + alpha_presence: c_float, +): + return _lib.ggllm_sample_frequency_and_presence_penalties( + ctx, + candidates, + last_tokens_data, + last_tokens_size, + alpha_frequency, + alpha_presence, + ) + + +_lib.ggllm_sample_frequency_and_presence_penalties.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + falcon_token_p, + c_int, + c_float, + c_float, +] +_lib.ggllm_sample_frequency_and_presence_penalties.restype = None + + +# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. +# FALCON_API void falcon_sample_softmax(struct falcon_context * ctx, falcon_token_data_array * candidates); +def falcon_sample_softmax( + ctx: falcon_context_p, candidates # type: _Pointer[falcon_token_data] +): + return _lib.ggllm_sample_softmax(ctx, candidates) + + +_lib.ggllm_sample_softmax.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, +] +_lib.ggllm_sample_softmax.restype = None + + +# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# FALCON_API void falcon_sample_top_k(struct falcon_context * ctx, falcon_token_data_array * candidates, int k, size_t min_keep); +def falcon_sample_top_k( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + k: c_int, + min_keep: c_size_t, +): + return _lib.ggllm_sample_top_k(ctx, candidates, k, min_keep) + + +_lib.ggllm_sample_top_k.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_int, + c_size_t, +] +_lib.ggllm_sample_top_k.restype = None + + +# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# FALCON_API void falcon_sample_top_p(struct falcon_context * ctx, falcon_token_data_array * candidates, float p, size_t min_keep); +def falcon_sample_top_p( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + p: c_float, + min_keep: c_size_t, +): + return _lib.ggllm_sample_top_p(ctx, candidates, p, min_keep) + + +_lib.ggllm_sample_top_p.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_size_t, +] +_lib.ggllm_sample_top_p.restype = None + + +# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. +# FALCON_API void falcon_sample_tail_free(struct falcon_context * ctx, falcon_token_data_array * candidates, float z, size_t min_keep); +def falcon_sample_tail_free( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + z: c_float, + min_keep: c_size_t, +): + return _lib.ggllm_sample_tail_free(ctx, candidates, z, min_keep) + + +_lib.ggllm_sample_tail_free.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_size_t, +] +_lib.ggllm_sample_tail_free.restype = None + + +# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. 
+# FALCON_API void falcon_sample_typical(struct falcon_context * ctx, falcon_token_data_array * candidates, float p, size_t min_keep); +def falcon_sample_typical( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + p: c_float, + min_keep: c_size_t, +): + return _lib.ggllm_sample_typical(ctx, candidates, p, min_keep) + + +_lib.ggllm_sample_typical.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_size_t, +] +_lib.ggllm_sample_typical.restype = None + + +# FALCON_API void falcon_sample_temperature(struct falcon_context * ctx, falcon_token_data_array * candidates, float temp); +def falcon_sample_temperature( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + temp: c_float, +): + return _lib.ggllm_sample_temperature(ctx, candidates, temp) + + +_lib.ggllm_sample_temperature.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, +] +_lib.ggllm_sample_temperature.restype = None + + +# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `falcon_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. +# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# FALCON_API falcon_token falcon_sample_token_mirostat(struct falcon_context * ctx, falcon_token_data_array * candidates, float tau, float eta, int m, float * mu); +def falcon_sample_token_mirostat( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + tau: c_float, + eta: c_float, + m: c_int, + mu, # type: _Pointer[c_float] +) -> int: + return _lib.ggllm_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) + + +_lib.ggllm_sample_token_mirostat.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_float, + c_int, + c_float_p, +] +_lib.ggllm_sample_token_mirostat.restype = falcon_token + + +# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `falcon_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. 
+# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# FALCON_API falcon_token falcon_sample_token_mirostat_v2(struct falcon_context * ctx, falcon_token_data_array * candidates, float tau, float eta, float * mu); +def falcon_sample_token_mirostat_v2( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + tau: c_float, + eta: c_float, + mu, # type: _Pointer[c_float] +) -> int: + return _lib.ggllm_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) + + +_lib.ggllm_sample_token_mirostat_v2.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_float, + c_float_p, +] +_lib.ggllm_sample_token_mirostat_v2.restype = falcon_token + + +# @details Selects the token with the highest probability. +# FALCON_API falcon_token falcon_sample_token_greedy(struct falcon_context * ctx, falcon_token_data_array * candidates); +def falcon_sample_token_greedy( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] +) -> int: + return _lib.ggllm_sample_token_greedy(ctx, candidates) + + +_lib.ggllm_sample_token_greedy.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, +] +_lib.ggllm_sample_token_greedy.restype = falcon_token + + +# @details Randomly selects a token from the candidates based on their probabilities. +# FALCON_API falcon_token falcon_sample_token(struct falcon_context * ctx, falcon_token_data_array * candidates); +def falcon_sample_token( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] +) -> int: + return _lib.ggllm_sample_token(ctx, candidates) + + +_lib.ggllm_sample_token.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, +] +_lib.ggllm_sample_token.restype = falcon_token + + +# Performance information + + +# FALCON_API void falcon_print_timings(struct falcon_context * ctx); +def falcon_print_timings(ctx: falcon_context_p): + _lib.ggllm_print_timings(ctx) + + +_lib.ggllm_print_timings.argtypes = [falcon_context_p] +_lib.ggllm_print_timings.restype = None + + +# FALCON_API void falcon_reset_timings(struct falcon_context * ctx); +def falcon_reset_timings(ctx: falcon_context_p): + _lib.ggllm_reset_timings(ctx) + + +_lib.ggllm_reset_timings.argtypes = [falcon_context_p] +_lib.ggllm_reset_timings.restype = None + + +# Print system information +# FALCON_API const char * falcon_print_system_info(void); +def falcon_print_system_info() -> bytes: + return _lib.ggllm_print_system_info() + + +_lib.ggllm_print_system_info.argtypes = [] +_lib.ggllm_print_system_info.restype = c_char_p + +################################################################################################### + + +_falcon_initialized = False + +if not _falcon_initialized: + falcon_init_backend(c_bool(False)) + _falcon_initialized = True \ No newline at end of file diff --git a/llama_cpp/llama_types.py b/falcon_cpp/falcon_types.py similarity index 100% rename from llama_cpp/llama_types.py rename to falcon_cpp/falcon_types.py diff --git a/llama_cpp/server/__init__.py b/falcon_cpp/server/__init__.py similarity index 100% rename from llama_cpp/server/__init__.py rename to falcon_cpp/server/__init__.py 
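Before the server changes below, a hypothetical sketch of driving the new low-level bindings module directly (the model path is an assumption): each Python-facing `falcon_*` wrapper in `falcon_cpp/falcon_cpp.py` dispatches to a `ggllm_*` symbol exported by the ggllm.cpp shared library.

```python
import falcon_cpp.falcon_cpp as C  # the raw ctypes bindings added above

params = C.falcon_context_default_params()
params.n_ctx = 2048

# Illustrative path; any ggllm.cpp-compatible Falcon GGML file should work.
model = C.falcon_load_model_from_file(b"./models/falcon-7b.q4_0.bin", params)
ctx = C.falcon_new_context_with_model(model, params)

# Tokenize with the raw API: the caller pre-allocates the token buffer,
# and a negative return value means the buffer was too small.
buf = (C.falcon_token * params.n_ctx)()
n = C.falcon_tokenize(ctx, b" Hello, falcon", buf, C.c_int(params.n_ctx), C.c_bool(True))
print("token ids:", [buf[i] for i in range(n)])

C.falcon_free(ctx)
C.falcon_free_model(model)
```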
diff --git a/llama_cpp/server/__main__.py b/falcon_cpp/server/__main__.py similarity index 100% rename from llama_cpp/server/__main__.py rename to falcon_cpp/server/__main__.py diff --git a/llama_cpp/server/app.py b/falcon_cpp/server/app.py similarity index 88% rename from llama_cpp/server/app.py rename to falcon_cpp/server/app.py index ef319c7e0..2e0972ea6 100644 --- a/llama_cpp/server/app.py +++ b/falcon_cpp/server/app.py @@ -5,7 +5,7 @@ from typing import Iterator, List, Optional, Union, Dict from typing_extensions import TypedDict, Literal -import llama_cpp +import falcon_cpp import anyio from anyio.streams.memory import MemoryObjectSendStream @@ -24,7 +24,7 @@ class Settings(BaseSettings): default=None, description="The alias of the model to use for generating completions.", ) - n_ctx: int = Field(default=2048, ge=1, description="The context size.") + n_ctx: int = Field(default=8192, ge=1, description="The context size.") n_gpu_layers: int = Field( default=0, ge=0, @@ -43,11 +43,11 @@ class Settings(BaseSettings): ) f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") use_mlock: bool = Field( - default=llama_cpp.llama_mlock_supported(), + default=falcon_cpp.falcon_mlock_supported(), description="Use mlock.", ) use_mmap: bool = Field( - default=llama_cpp.llama_mmap_supported(), + default=falcon_cpp.falcon_mmap_supported(), description="Use mmap.", ) embedding: bool = Field(default=True, description="Whether to use embeddings.") @@ -90,14 +90,14 @@ class Settings(BaseSettings): router = APIRouter() settings: Optional[Settings] = None -llama: Optional[llama_cpp.Llama] = None +falcon: Optional[falcon_cpp.Falcon] = None def create_app(settings: Optional[Settings] = None): if settings is None: settings = Settings() app = FastAPI( - title="🦙 llama.cpp Python API", + title="🦙 falcon.cpp Python API", version="0.0.1", ) app.add_middleware( @@ -108,8 +108,8 @@ def create_app(settings: Optional[Settings] = None): allow_headers=["*"], ) app.include_router(router) - global llama - llama = llama_cpp.Llama( + global falcon + falcon = falcon_cpp.Falcon( model_path=settings.model, n_gpu_layers=settings.n_gpu_layers, seed=settings.seed, @@ -129,14 +129,14 @@ def create_app(settings: Optional[Settings] = None): if settings.cache_type == "disk": if settings.verbose: print(f"Using disk cache with size {settings.cache_size}") - cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) + cache = falcon_cpp.FalconDiskCache(capacity_bytes=settings.cache_size) else: if settings.verbose: print(f"Using ram cache with size {settings.cache_size}") - cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) + cache = falcon_cpp.FalconRAMCache(capacity_bytes=settings.cache_size) - cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) - llama.set_cache(cache) + cache = falcon_cpp.FalconCache(capacity_bytes=settings.cache_size) + falcon.set_cache(cache) def set_settings(_settings: Settings): global settings @@ -146,12 +146,12 @@ def set_settings(_settings: Settings): return app -llama_lock = Lock() +falcon_lock = Lock() -def get_llama(): - with llama_lock: - yield llama +def get_falcon(): + with falcon_lock: + yield falcon def get_settings(): @@ -276,7 +276,7 @@ class CreateCompletionRequest(BaseModel): best_of: Optional[int] = 1 user: Optional[str] = Field(None) - # llama.cpp specific parameters + # falcon.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] =
Field(None) @@ -290,11 +290,11 @@ class Config: } -CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) +CreateCompletionResponse = create_model_from_typeddict(falcon_cpp.Completion) def make_logit_bias_processor( - llama: llama_cpp.Llama, + falcon: falcon_cpp.Falcon, logit_bias: Dict[str, float], logit_bias_type: Optional[Literal["input_ids", "tokens"]], ): @@ -310,7 +310,7 @@ def make_logit_bias_processor( elif logit_bias_type == "tokens": for token, score in logit_bias.items(): token = token.encode('utf-8') - for input_id in llama.tokenize(token, add_bos=False): + for input_id in falcon.tokenize(token, add_bos=False): to_bias[input_id] = score def logit_bias_processor( @@ -333,7 +333,7 @@ def logit_bias_processor( async def create_completion( request: Request, body: CreateCompletionRequest, - llama: llama_cpp.Llama = Depends(get_llama), + falcon: falcon_cpp.Falcon = Depends(get_falcon), ): if isinstance(body.prompt, list): assert len(body.prompt) <= 1 @@ -349,8 +349,8 @@ async def create_completion( kwargs = body.dict(exclude=exclude) if body.logit_bias is not None: - kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ - make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type), + kwargs['logits_processor'] = falcon_cpp.LogitsProcessorList([ + make_logit_bias_processor(falcon, body.logit_bias, body.logit_bias_type), ]) if body.stream: @@ -359,7 +359,7 @@ async def create_completion( async def event_publisher(inner_send_chan: MemoryObjectSendStream): async with inner_send_chan: try: - iterator: Iterator[llama_cpp.CompletionChunk] = await run_in_threadpool(llama, **kwargs) # type: ignore + iterator: Iterator[falcon_cpp.CompletionChunk] = await run_in_threadpool(falcon, **kwargs) # type: ignore async for chunk in iterate_in_threadpool(iterator): await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): @@ -378,7 +378,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): recv_chan, data_sender_callable=partial(event_publisher, send_chan) ) else: - completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs) # type: ignore + completion: falcon_cpp.Completion = await run_in_threadpool(falcon, **kwargs) # type: ignore return completion @@ -395,7 +395,7 @@ class Config: } -CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) +CreateEmbeddingResponse = create_model_from_typeddict(falcon_cpp.Embedding) @router.post( @@ -403,10 +403,10 @@ class Config: response_model=CreateEmbeddingResponse, ) async def create_embedding( - request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) + request: CreateEmbeddingRequest, falcon: falcon_cpp.Falcon = Depends(get_falcon) ): return await run_in_threadpool( - llama.create_embedding, **request.dict(exclude={"user"}) + falcon.create_embedding, **request.dict(exclude={"user"}) ) @@ -438,7 +438,7 @@ class CreateChatCompletionRequest(BaseModel): n: Optional[int] = 1 user: Optional[str] = Field(None) - # llama.cpp specific parameters + # falcon.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) @@ -458,7 +458,7 @@ class Config: } -CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) +CreateChatCompletionResponse = create_model_from_typeddict(falcon_cpp.ChatCompletion) @router.post( @@ -468,8 +468,8 @@ class Config: async def create_chat_completion( request: Request, 
body: CreateChatCompletionRequest, - llama: llama_cpp.Llama = Depends(get_llama), -) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: + falcon: falcon_cpp.Falcon = Depends(get_falcon), +) -> Union[falcon_cpp.ChatCompletion, EventSourceResponse]: exclude = { "n", "logit_bias", @@ -479,8 +479,8 @@ async def create_chat_completion( kwargs = body.dict(exclude=exclude) if body.logit_bias is not None: - kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ - make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type), + kwargs['logits_processor'] = falcon_cpp.LogitsProcessorList([ + make_logit_bias_processor(falcon, body.logit_bias, body.logit_bias_type), ]) if body.stream: @@ -489,7 +489,7 @@ async def create_chat_completion( async def event_publisher(inner_send_chan: MemoryObjectSendStream): async with inner_send_chan: try: - iterator: Iterator[llama_cpp.ChatCompletionChunk] = await run_in_threadpool(llama.create_chat_completion, **kwargs) # type: ignore + iterator: Iterator[falcon_cpp.ChatCompletionChunk] = await run_in_threadpool(falcon.create_chat_completion, **kwargs) # type: ignore async for chat_chunk in iterate_in_threadpool(iterator): await inner_send_chan.send(dict(data=json.dumps(chat_chunk))) if await request.is_disconnected(): @@ -509,8 +509,8 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): data_sender_callable=partial(event_publisher, send_chan), ) else: - completion: llama_cpp.ChatCompletion = await run_in_threadpool( - llama.create_chat_completion, **kwargs # type: ignore + completion: falcon_cpp.ChatCompletion = await run_in_threadpool( + falcon.create_chat_completion, **kwargs # type: ignore ) return completion @@ -533,7 +533,7 @@ class ModelList(TypedDict): @router.get("/v1/models", response_model=GetModelResponse) async def get_models( settings: Settings = Depends(get_settings), - llama: llama_cpp.Llama = Depends(get_llama), + falcon: falcon_cpp.Falcon = Depends(get_falcon), ) -> ModelList: return { "object": "list", @@ -541,7 +541,7 @@ async def get_models( { "id": settings.model_alias if settings.model_alias is not None - else llama.model_path, + else falcon.model_path, "object": "model", "owned_by": "me", "permissions": [], diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py deleted file mode 100644 index dce1764f6..000000000 --- a/llama_cpp/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .llama_cpp import * -from .llama import * diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py deleted file mode 100644 index 52fc14e1d..000000000 --- a/llama_cpp/llama_cpp.py +++ /dev/null @@ -1,1024 +0,0 @@ -import sys -import os -import ctypes -from ctypes import ( - c_int, - c_float, - c_char_p, - c_void_p, - c_bool, - POINTER, - _Pointer, # type: ignore - Structure, - Array, - c_uint8, - c_size_t, -) -import pathlib -from typing import List, Union - - -# Load the library -def _load_shared_library(lib_base_name: str): - # Construct the paths to the possible shared library names - _base_path = pathlib.Path(__file__).parent.resolve() - # Searching for the library in the current directory under the name "libllama" (default name - # for llamacpp) and "llama" (default name for this repo) - _lib_paths: List[pathlib.Path] = [] - # Determine the file extension based on the platform - if sys.platform.startswith("linux"): - _lib_paths += [ - _base_path / f"lib{lib_base_name}.so", - ] - elif sys.platform == "darwin": - _lib_paths += [ - _base_path / f"lib{lib_base_name}.so", - _base_path / f"lib{lib_base_name}.dylib", - ] - elif 
sys.platform == "win32": - _lib_paths += [ - _base_path / f"{lib_base_name}.dll", - ] - else: - raise RuntimeError("Unsupported platform") - - if "LLAMA_CPP_LIB" in os.environ: - lib_base_name = os.environ["LLAMA_CPP_LIB"] - _lib = pathlib.Path(lib_base_name) - _base_path = _lib.parent.resolve() - _lib_paths = [_lib.resolve()] - - cdll_args = dict() # type: ignore - # Add the library directory to the DLL search path on Windows (if needed) - if sys.platform == "win32" and sys.version_info >= (3, 8): - os.add_dll_directory(str(_base_path)) - if "CUDA_PATH" in os.environ: - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) - cdll_args["winmode"] = 0 - - # Try to load the shared library, handling potential errors - for _lib_path in _lib_paths: - if _lib_path.exists(): - try: - return ctypes.CDLL(str(_lib_path), **cdll_args) - except Exception as e: - raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") - - raise FileNotFoundError( - f"Shared library with base name '{lib_base_name}' not found" - ) - - -# Specify the base name of the shared library to load -_lib_base_name = "llama" - -# Load the library -_lib = _load_shared_library(_lib_base_name) - -# Misc -c_float_p = POINTER(c_float) -c_uint8_p = POINTER(c_uint8) -c_size_t_p = POINTER(c_size_t) - -# llama.h bindings - -GGML_USE_CUBLAS = hasattr(_lib, "ggml_init_cublas") -GGML_CUDA_MAX_DEVICES = ctypes.c_int(16) -LLAMA_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else ctypes.c_int(1) - -# #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' -LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74) -# #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' -LLAMA_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61) -# #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' -LLAMA_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66) -# #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' -LLAMA_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C) -# #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' -LLAMA_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E) - -# #define LLAMA_FILE_VERSION 3 -LLAMA_FILE_VERSION = c_int(3) -LLAMA_FILE_MAGIC = LLAMA_FILE_MAGIC_GGJT -LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML -LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN -LLAMA_SESSION_VERSION = c_int(1) - -# struct llama_model; -llama_model_p = c_void_p - -# struct llama_context; -llama_context_p = c_void_p - - -# typedef int llama_token; -llama_token = c_int -llama_token_p = POINTER(llama_token) - - -# typedef struct llama_token_data { -# llama_token id; // token id -# float logit; // log-odds of the token -# float p; // probability of the token -# } llama_token_data; -class llama_token_data(Structure): - _fields_ = [ - ("id", llama_token), - ("logit", c_float), - ("p", c_float), - ] - - -llama_token_data_p = POINTER(llama_token_data) - - -# typedef struct llama_token_data_array { -# llama_token_data * data; -# size_t size; -# bool sorted; -# } llama_token_data_array; -class llama_token_data_array(Structure): - _fields_ = [ - ("data", llama_token_data_p), - ("size", c_size_t), - ("sorted", c_bool), - ] - - -llama_token_data_array_p = POINTER(llama_token_data_array) - -# typedef void (*llama_progress_callback)(float progress, void *ctx); -llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) - - -# struct llama_context_params { -# int seed; // RNG seed, -1 for random -# int n_ctx; // text context -# int n_batch; // prompt processing batch size -# int n_gpu_layers; // number of 
layers to store in VRAM -# int main_gpu; // the GPU that is used for scratch and small tensors -# float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs -# // called with a progress value between 0 and 1, pass NULL to disable -# llama_progress_callback progress_callback; -# // context pointer passed to the progress callback -# void * progress_callback_user_data; - - -# // Keep the booleans together to avoid misalignment during copy-by-value. -# bool low_vram; // if true, reduce VRAM usage at the cost of performance -# bool f16_kv; // use fp16 for KV cache -# bool logits_all; // the llama_eval() call computes all logits, not just the last one -# bool vocab_only; // only load the vocabulary, no weights -# bool use_mmap; // use mmap if possible -# bool use_mlock; // force system to keep model in RAM -# bool embedding; // embedding mode only -# }; -class llama_context_params(Structure): - _fields_ = [ - ("seed", c_int), - ("n_ctx", c_int), - ("n_batch", c_int), - ("n_gpu_layers", c_int), - ("main_gpu", c_int), - ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), - ("progress_callback", llama_progress_callback), - ("progress_callback_user_data", c_void_p), - ("low_vram", c_bool), - ("f16_kv", c_bool), - ("logits_all", c_bool), - ("vocab_only", c_bool), - ("use_mmap", c_bool), - ("use_mlock", c_bool), - ("embedding", c_bool), - ] - - -llama_context_params_p = POINTER(llama_context_params) - -# enum llama_ftype { -# LLAMA_FTYPE_ALL_F32 = 0, -# LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 -# // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed -# // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed -# LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors -# }; -LLAMA_FTYPE_ALL_F32 = c_int(0) -LLAMA_FTYPE_MOSTLY_F16 = c_int(1) -LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) -LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) -LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4) -LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) -LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) -LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) -LLAMA_FTYPE_MOSTLY_Q2_K = c_int(10) -LLAMA_FTYPE_MOSTLY_Q3_K_S = c_int(11) -LLAMA_FTYPE_MOSTLY_Q3_K_M = c_int(12) -LLAMA_FTYPE_MOSTLY_Q3_K_L = c_int(13) -LLAMA_FTYPE_MOSTLY_Q4_K_S = c_int(14) -LLAMA_FTYPE_MOSTLY_Q4_K_M = c_int(15) -LLAMA_FTYPE_MOSTLY_Q5_K_S = c_int(16) -LLAMA_FTYPE_MOSTLY_Q5_K_M = c_int(17) -LLAMA_FTYPE_MOSTLY_Q6_K = c_int(18) - - -# // model quantization parameters -# typedef struct llama_model_quantize_params { -# int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() -# enum llama_ftype ftype; // quantize to this llama_ftype -# bool allow_requantize; // allow quantizing non-f32/f16 tensors -# bool quantize_output_tensor; // quantize output.weight -# } 
llama_model_quantize_params; -class llama_model_quantize_params(Structure): - _fields_ = [ - ("nthread", c_int), - ("ftype", c_int), - ("allow_requantize", c_bool), - ("quantize_output_tensor", c_bool), - ] - - -# LLAMA_API struct llama_context_params llama_context_default_params(); -def llama_context_default_params() -> llama_context_params: - return _lib.llama_context_default_params() - - -_lib.llama_context_default_params.argtypes = [] -_lib.llama_context_default_params.restype = llama_context_params - - -# LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(); -def llama_model_quantize_default_params() -> llama_model_quantize_params: - return _lib.llama_model_quantize_default_params() - - -_lib.llama_model_quantize_default_params.argtypes = [] -_lib.llama_model_quantize_default_params.restype = llama_model_quantize_params - - -# LLAMA_API bool llama_mmap_supported(); -def llama_mmap_supported() -> bool: - return _lib.llama_mmap_supported() - - -_lib.llama_mmap_supported.argtypes = [] -_lib.llama_mmap_supported.restype = c_bool - - -# LLAMA_API bool llama_mlock_supported(); -def llama_mlock_supported() -> bool: - return _lib.llama_mlock_supported() - - -_lib.llama_mlock_supported.argtypes = [] -_lib.llama_mlock_supported.restype = c_bool - - -# // TODO: not great API - very likely to change -# // Initialize the llama + ggml backend -# // If numa is true, use NUMA optimizations -# // Call once at the start of the program -# LLAMA_API void llama_init_backend(bool numa); -def llama_init_backend(numa: c_bool): - return _lib.llama_init_backend(numa) - - -_lib.llama_init_backend.argtypes = [c_bool] -_lib.llama_init_backend.restype = None - - -# LLAMA_API struct llama_model * llama_load_model_from_file( -# const char * path_model, -# struct llama_context_params params); -def llama_load_model_from_file( - path_model: bytes, params: llama_context_params -) -> llama_model_p: - return _lib.llama_load_model_from_file(path_model, params) - - -_lib.llama_load_model_from_file.argtypes = [c_char_p, llama_context_params] -_lib.llama_load_model_from_file.restype = llama_model_p - - -# LLAMA_API void llama_free_model(struct llama_model * model); -def llama_free_model(model: llama_model_p): - return _lib.llama_free_model(model) - - -_lib.llama_free_model.argtypes = [llama_model_p] -_lib.llama_free_model.restype = None - - -# LLAMA_API struct llama_context * llama_new_context_with_model( -# struct llama_model * model, -# struct llama_context_params params); -def llama_new_context_with_model( - model: llama_model_p, params: llama_context_params -) -> llama_context_p: - return _lib.llama_new_context_with_model(model, params) - - -_lib.llama_new_context_with_model.argtypes = [llama_model_p, llama_context_params] -_lib.llama_new_context_with_model.restype = llama_context_p - - -# LLAMA_API int64_t llama_time_us(); -def llama_time_us() -> int: - return _lib.llama_time_us() - - -_lib.llama_time_us.argtypes = [] -_lib.llama_time_us.restype = ctypes.c_int64 - - -# // Various functions for loading a ggml llama model. -# // Allocate (almost) all memory needed for the model. 
-# // Return NULL on failure -# LLAMA_API struct llama_context * llama_init_from_file( -# const char * path_model, -# struct llama_context_params params); -def llama_init_from_file( - path_model: bytes, params: llama_context_params -) -> llama_context_p: - return _lib.llama_init_from_file(path_model, params) - - -_lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params] -_lib.llama_init_from_file.restype = llama_context_p - - -# Frees all allocated memory -# LLAMA_API void llama_free(struct llama_context * ctx); -def llama_free(ctx: llama_context_p): - return _lib.llama_free(ctx) - - -_lib.llama_free.argtypes = [llama_context_p] -_lib.llama_free.restype = None - - -# // Returns 0 on success -# LLAMA_API int llama_model_quantize( -# const char * fname_inp, -# const char * fname_out, -# const llama_model_quantize_params * params); -def llama_model_quantize( - fname_inp: bytes, - fname_out: bytes, - params, # type: POINTER(llama_model_quantize_params) # type: ignore -) -> int: - return _lib.llama_model_quantize(fname_inp, fname_out, params) - - -_lib.llama_model_quantize.argtypes = [ - c_char_p, - c_char_p, - POINTER(llama_model_quantize_params), -] -_lib.llama_model_quantize.restype = c_int - - -# Apply a LoRA adapter to a loaded model -# path_base_model is the path to a higher quality model to use as a base for -# the layers modified by the adapter. Can be NULL to use the current loaded model. -# The model needs to be reloaded before applying a new adapter, otherwise the adapter -# will be applied on top of the previous one -# Returns 0 on success -# LLAMA_API int llama_apply_lora_from_file( -# struct llama_context * ctx, -# const char * path_lora, -# const char * path_base_model, -# int n_threads); -def llama_apply_lora_from_file( - ctx: llama_context_p, - path_lora: c_char_p, - path_base_model: c_char_p, - n_threads: c_int, -) -> int: - return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) - - -_lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, c_int] -_lib.llama_apply_lora_from_file.restype = c_int - - -# LLAMA_API int llama_model_apply_lora_from_file( -# const struct llama_model * model, -# const char * path_lora, -# const char * path_base_model, -# int n_threads); -def llama_model_apply_lora_from_file( - model: llama_model_p, - path_lora: Union[c_char_p, bytes], - path_base_model: Union[c_char_p, bytes], - n_threads: c_int, -) -> int: - return _lib.llama_model_apply_lora_from_file( - model, path_lora, path_base_model, n_threads - ) - - -_lib.llama_model_apply_lora_from_file.argtypes = [ - llama_model_p, - c_char_p, - c_char_p, - c_int, -] -_lib.llama_model_apply_lora_from_file.restype = c_int - - -# Returns the number of tokens in the KV cache -# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); -def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: - return _lib.llama_get_kv_cache_token_count(ctx) - - -_lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] -_lib.llama_get_kv_cache_token_count.restype = c_int - - -# Sets the current rng seed. 
-# LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); -def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): - return _lib.llama_set_rng_seed(ctx, seed) - - -_lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] -_lib.llama_set_rng_seed.restype = None - - -# Returns the maximum size in bytes of the state (rng, logits, embedding -# and kv_cache) - will often be smaller after compacting tokens -# LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); -def llama_get_state_size(ctx: llama_context_p) -> int: - return _lib.llama_get_state_size(ctx) - - -_lib.llama_get_state_size.argtypes = [llama_context_p] -_lib.llama_get_state_size.restype = c_size_t - - -# Copies the state to the specified destination address. -# Destination needs to have allocated enough memory. -# Returns the number of bytes copied -# LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst); -def llama_copy_state_data( - ctx: llama_context_p, dst # type: Array[c_uint8] -) -> int: - return _lib.llama_copy_state_data(ctx, dst) - - -_lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] -_lib.llama_copy_state_data.restype = c_size_t - - -# Set the state reading from the specified address -# Returns the number of bytes read -# LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src); -def llama_set_state_data( - ctx: llama_context_p, src # type: Array[c_uint8] -) -> int: - return _lib.llama_set_state_data(ctx, src) - - -_lib.llama_set_state_data.argtypes = [llama_context_p, c_uint8_p] -_lib.llama_set_state_data.restype = c_size_t - - -# Save/load session file -# LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); -def llama_load_session_file( - ctx: llama_context_p, - path_session: bytes, - tokens_out, # type: Array[llama_token] - n_token_capacity: c_size_t, - n_token_count_out, # type: _Pointer[c_size_t] -) -> int: - return _lib.llama_load_session_file( - ctx, path_session, tokens_out, n_token_capacity, n_token_count_out - ) - - -_lib.llama_load_session_file.argtypes = [ - llama_context_p, - c_char_p, - llama_token_p, - c_size_t, - c_size_t_p, -] -_lib.llama_load_session_file.restype = c_size_t - - -# LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); -def llama_save_session_file( - ctx: llama_context_p, - path_session: bytes, - tokens, # type: Array[llama_token] - n_token_count: c_size_t, -) -> int: - return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) - - -_lib.llama_save_session_file.argtypes = [ - llama_context_p, - c_char_p, - llama_token_p, - c_size_t, -] -_lib.llama_save_session_file.restype = c_size_t - - -# Run the llama inference to obtain the logits and probabilities for the next token. 
-# tokens + n_tokens is the provided batch of new tokens to process -# n_past is the number of tokens to use from previous eval calls -# Returns 0 on success -# LLAMA_API int llama_eval( -# struct llama_context * ctx, -# const llama_token * tokens, -# int n_tokens, -# int n_past, -# int n_threads); -def llama_eval( - ctx: llama_context_p, - tokens, # type: Array[llama_token] - n_tokens: c_int, - n_past: c_int, - n_threads: c_int, -) -> int: - return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) - - -_lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] -_lib.llama_eval.restype = c_int - - -# // Same as llama_eval, but use float matrix input directly. -# LLAMA_API int llama_eval_embd( -# struct llama_context * ctx, -# const float * embd, -# int n_tokens, -# int n_past, -# int n_threads); -def llama_eval_embd( - ctx: llama_context_p, - embd, # type: Array[c_float] - n_tokens: c_int, - n_past: c_int, - n_threads: c_int, -) -> int: - return _lib.llama_eval_embd(ctx, embd, n_tokens, n_past, n_threads) - - -_lib.llama_eval_embd.argtypes = [llama_context_p, c_float_p, c_int, c_int, c_int] -_lib.llama_eval_embd.restype = c_int - - -# Convert the provided text into tokens. -# The tokens pointer must be large enough to hold the resulting tokens. -# Returns the number of tokens on success, no more than n_max_tokens -# Returns a negative number on failure - the number of tokens that would have been returned -# TODO: not sure if correct -# LLAMA_API int llama_tokenize( -# struct llama_context * ctx, -# const char * text, -# llama_token * tokens, -# int n_max_tokens, -# bool add_bos); -def llama_tokenize( - ctx: llama_context_p, - text: bytes, - tokens, # type: Array[llama_token] - n_max_tokens: c_int, - add_bos: c_bool, -) -> int: - return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos) - - -_lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool] -_lib.llama_tokenize.restype = c_int - - -# LLAMA_API int llama_n_vocab(const struct llama_context * ctx); -def llama_n_vocab(ctx: llama_context_p) -> int: - return _lib.llama_n_vocab(ctx) - - -_lib.llama_n_vocab.argtypes = [llama_context_p] -_lib.llama_n_vocab.restype = c_int - - -# LLAMA_API int llama_n_ctx (const struct llama_context * ctx); -def llama_n_ctx(ctx: llama_context_p) -> int: - return _lib.llama_n_ctx(ctx) - - -_lib.llama_n_ctx.argtypes = [llama_context_p] -_lib.llama_n_ctx.restype = c_int - - -# LLAMA_API int llama_n_embd (const struct llama_context * ctx); -def llama_n_embd(ctx: llama_context_p) -> int: - return _lib.llama_n_embd(ctx) - - -_lib.llama_n_embd.argtypes = [llama_context_p] -_lib.llama_n_embd.restype = c_int - - -# // Get the vocabulary as output parameters. -# // Returns number of results. 
-# LLAMA_API int llama_get_vocab( -# const struct llama_context * ctx, -# const char * * strings, -# float * scores, -# int capacity); -def llama_get_vocab( - ctx: llama_context_p, - strings, # type: Array[c_char_p] # type: ignore - scores, # type: Array[c_float] # type: ignore - capacity: c_int, -) -> int: - return _lib.llama_get_vocab(ctx, strings, scores, capacity) - - -_lib.llama_get_vocab.argtypes = [llama_context_p, c_char_p, c_float, c_int] -_lib.llama_get_vocab.restype = c_int - - -# Token logits obtained from the last call to llama_eval() -# The logits for the last token are stored in the last row -# Can be mutated in order to change the probabilities of the next token -# Rows: n_tokens -# Cols: n_vocab -# LLAMA_API float * llama_get_logits(struct llama_context * ctx); -def llama_get_logits( - ctx: llama_context_p, -): # type: (...) -> Array[float] # type: ignore - return _lib.llama_get_logits(ctx) - - -_lib.llama_get_logits.argtypes = [llama_context_p] -_lib.llama_get_logits.restype = c_float_p - - -# Get the embeddings for the input -# shape: [n_embd] (1-dimensional) -# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); -def llama_get_embeddings( - ctx: llama_context_p, -): # type: (...) -> Array[float] # type: ignore - return _lib.llama_get_embeddings(ctx) - - -_lib.llama_get_embeddings.argtypes = [llama_context_p] -_lib.llama_get_embeddings.restype = c_float_p - - -# Token Id -> String. Uses the vocabulary in the provided context -# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); -def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: - return _lib.llama_token_to_str(ctx, token) - - -_lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] -_lib.llama_token_to_str.restype = c_char_p - -# Special tokens - - -# LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence -def llama_token_bos() -> int: - return _lib.llama_token_bos() - - -_lib.llama_token_bos.argtypes = [] -_lib.llama_token_bos.restype = llama_token - - -# LLAMA_API llama_token llama_token_eos(); // end-of-sentence -def llama_token_eos() -> int: - return _lib.llama_token_eos() - - -_lib.llama_token_eos.argtypes = [] -_lib.llama_token_eos.restype = llama_token - - -# LLAMA_API llama_token llama_token_nl(); // next-line -def llama_token_nl() -> int: - return _lib.llama_token_nl() - - -_lib.llama_token_nl.argtypes = [] -_lib.llama_token_nl.restype = llama_token - - -# Sampling functions - - -# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. -# LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); -def llama_sample_repetition_penalty( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - last_tokens_data, # type: Array[llama_token] - last_tokens_size: c_int, - penalty: c_float, -): - return _lib.llama_sample_repetition_penalty( - ctx, candidates, last_tokens_data, last_tokens_size, penalty - ) - - -_lib.llama_sample_repetition_penalty.argtypes = [ - llama_context_p, - llama_token_data_array_p, - llama_token_p, - c_int, - c_float, -] -_lib.llama_sample_repetition_penalty.restype = None - - -# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. 
-# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); -def llama_sample_frequency_and_presence_penalties( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - last_tokens_data, # type: Array[llama_token] - last_tokens_size: c_int, - alpha_frequency: c_float, - alpha_presence: c_float, -): - return _lib.llama_sample_frequency_and_presence_penalties( - ctx, - candidates, - last_tokens_data, - last_tokens_size, - alpha_frequency, - alpha_presence, - ) - - -_lib.llama_sample_frequency_and_presence_penalties.argtypes = [ - llama_context_p, - llama_token_data_array_p, - llama_token_p, - c_int, - c_float, - c_float, -] -_lib.llama_sample_frequency_and_presence_penalties.restype = None - - -# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. -# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); -def llama_sample_softmax( - ctx: llama_context_p, candidates # type: _Pointer[llama_token_data] -): - return _lib.llama_sample_softmax(ctx, candidates) - - -_lib.llama_sample_softmax.argtypes = [ - llama_context_p, - llama_token_data_array_p, -] -_lib.llama_sample_softmax.restype = None - - -# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 -# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep); -def llama_sample_top_k( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - k: c_int, - min_keep: c_size_t, -): - return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) - - -_lib.llama_sample_top_k.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_int, - c_size_t, -] -_lib.llama_sample_top_k.restype = None - - -# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 -# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); -def llama_sample_top_p( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - p: c_float, - min_keep: c_size_t, -): - return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) - - -_lib.llama_sample_top_p.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, - c_size_t, -] -_lib.llama_sample_top_p.restype = None - - -# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. -# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep); -def llama_sample_tail_free( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - z: c_float, - min_keep: c_size_t, -): - return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) - - -_lib.llama_sample_tail_free.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, - c_size_t, -] -_lib.llama_sample_tail_free.restype = None - - -# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. 
-# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); -def llama_sample_typical( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - p: c_float, - min_keep: c_size_t, -): - return _lib.llama_sample_typical(ctx, candidates, p, min_keep) - - -_lib.llama_sample_typical.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, - c_size_t, -] -_lib.llama_sample_typical.restype = None - - -# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); -def llama_sample_temperature( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - temp: c_float, -): - return _lib.llama_sample_temperature(ctx, candidates, temp) - - -_lib.llama_sample_temperature.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, -] -_lib.llama_sample_temperature.restype = None - - -# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. -# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. -# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. -# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. -# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. -# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. -# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); -def llama_sample_token_mirostat( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - tau: c_float, - eta: c_float, - m: c_int, - mu, # type: _Pointer[c_float] -) -> int: - return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) - - -_lib.llama_sample_token_mirostat.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, - c_float, - c_int, - c_float_p, -] -_lib.llama_sample_token_mirostat.restype = llama_token - - -# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. -# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. -# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. 
-# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. -# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. -# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); -def llama_sample_token_mirostat_v2( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - tau: c_float, - eta: c_float, - mu, # type: _Pointer[c_float] -) -> int: - return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) - - -_lib.llama_sample_token_mirostat_v2.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, - c_float, - c_float_p, -] -_lib.llama_sample_token_mirostat_v2.restype = llama_token - - -# @details Selects the token with the highest probability. -# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); -def llama_sample_token_greedy( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] -) -> int: - return _lib.llama_sample_token_greedy(ctx, candidates) - - -_lib.llama_sample_token_greedy.argtypes = [ - llama_context_p, - llama_token_data_array_p, -] -_lib.llama_sample_token_greedy.restype = llama_token - - -# @details Randomly selects a token from the candidates based on their probabilities. -# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); -def llama_sample_token( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] -) -> int: - return _lib.llama_sample_token(ctx, candidates) - - -_lib.llama_sample_token.argtypes = [ - llama_context_p, - llama_token_data_array_p, -] -_lib.llama_sample_token.restype = llama_token - - -# Performance information - - -# LLAMA_API void llama_print_timings(struct llama_context * ctx); -def llama_print_timings(ctx: llama_context_p): - _lib.llama_print_timings(ctx) - - -_lib.llama_print_timings.argtypes = [llama_context_p] -_lib.llama_print_timings.restype = None - - -# LLAMA_API void llama_reset_timings(struct llama_context * ctx); -def llama_reset_timings(ctx: llama_context_p): - _lib.llama_reset_timings(ctx) - - -_lib.llama_reset_timings.argtypes = [llama_context_p] -_lib.llama_reset_timings.restype = None - - -# Print system information -# LLAMA_API const char * llama_print_system_info(void); -def llama_print_system_info() -> bytes: - return _lib.llama_print_system_info() - - -_lib.llama_print_system_info.argtypes = [] -_lib.llama_print_system_info.restype = c_char_p - -################################################################################################### - - -_llama_initialized = False - -if not _llama_initialized: - llama_init_backend(c_bool(False)) - _llama_initialized = True diff --git a/mkdocs.yml b/mkdocs.yml index 286581176..e4147790b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,5 +1,5 @@ -site_name: llama-cpp-python -repo_url: https://github.com/abetlen/llama-cpp-python +site_name: falcon-cpp-python +repo_url: https://github.com/sirajperson/falcon-cpp-python theme: name: "material" @@ -9,7 +9,7 @@ plugins: - search watch: - - llama_cpp + - falcon_cpp markdown_extensions: - 
pymdownx.highlight: diff --git a/pyproject.toml b/pyproject.toml index e79d72eef..196aaedcb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,13 @@ [tool.poetry] -name = "llama_cpp_python" -version = "0.1.67" -description = "Python bindings for the llama.cpp library" -authors = ["Andrei Betlen "] +name = "falcon_cpp_python" +version = "0.0.1" +description = "Python bindings for the ggllm.cpp library" +authors = ["Jonathan Levin "] license = "MIT" readme = "README.md" -homepage = "https://github.com/abetlen/llama-cpp-python" -repository = "https://github.com/abetlen/llama-cpp-python" -packages = [{include = "llama_cpp"}] +homepage = "https://github.com/sirajperson/falcon-cpp-python" +repository = "https://github.com/sirajperson/falcon-cpp-python" +packages = [{include = "falcon_cpp"}] include = [ "LICENSE.md", ] @@ -41,4 +41,4 @@ requires = [ "cmake>=3.18", "ninja", ] -build-backend = "setuptools.build_meta" \ No newline at end of file +build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py index 95593415a..4cc1ad765 100644 --- a/setup.py +++ b/setup.py @@ -6,16 +6,16 @@ long_description = (this_directory / "README.md").read_text(encoding="utf-8") setup( - name="llama_cpp_python", - description="A Python wrapper for llama.cpp", + name="falcon_cpp_python", + description="A Python wrapper for ggllm.cpp to run Falcon models", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.67", - author="Andrei Betlen", - author_email="abetlen@gmail.com", + version="0.0.1", + author="Siraj Levin", + author_email="sirajperson@gmail.com", license="MIT", - package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"}, - packages=["llama_cpp", "llama_cpp.server"], + package_dir={"falcon_cpp": "falcon_cpp", "falcon_cpp.server": "falcon_cpp/server"}, + packages=["falcon_cpp", "falcon_cpp.server"], install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], extras_require={ "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], diff --git a/tests/test_llama.py b/tests/test_falcon.py similarity index 56% rename from tests/test_llama.py rename to tests/test_falcon.py index 941287de6..d162cc6d6 100644 --- a/tests/test_llama.py +++ b/tests/test_falcon.py @@ -1,39 +1,39 @@ -import llama_cpp +import falcon_cpp -MODEL = "./vendor/llama.cpp/models/ggml-vocab.bin" +MODEL = "./vendor/ggllm.cpp/models/ggml-vocab.bin" -def test_llama(): - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) +def test_falcon(): + falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) - assert llama - assert llama.ctx is not None + assert falcon + assert falcon.ctx is not None text = b"Hello World" - assert llama.detokenize(llama.tokenize(text)) == text + assert falcon.detokenize(falcon.tokenize(text)) == text # @pytest.mark.skip(reason="need to update sample mocking") -def test_llama_patch(monkeypatch): - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) - n_vocab = llama_cpp.llama_n_vocab(llama.ctx) +def test_falcon_patch(monkeypatch): + falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) + n_vocab = falcon_cpp.falcon_n_vocab(falcon.ctx) ## Set up mock function def mock_eval(*args, **kwargs): return 0 def mock_get_logits(*args, **kwargs): - return (llama_cpp.c_float * n_vocab)( - *[llama_cpp.c_float(0) for _ in range(n_vocab)] + return (falcon_cpp.c_float * n_vocab)( + *[falcon_cpp.c_float(0) for _ in range(n_vocab)] ) - monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval) -
monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_eval", mock_eval) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_get_logits", mock_get_logits) output_text = " jumps over the lazy dog." - output_tokens = llama.tokenize(output_text.encode("utf-8")) - token_eos = llama.token_eos() + output_tokens = falcon.tokenize(output_text.encode("utf-8")) + token_eos = falcon.token_eos() n = 0 def mock_sample(*args, **kwargs): @@ -44,31 +44,31 @@ def mock_sample(*args, **kwargs): else: return token_eos - monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_token", mock_sample) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_cpp_sample_token", mock_sample) text = "The quick brown fox" ## Test basic completion until eos n = 0 # reset - completion = llama.create_completion(text, max_tokens=20) + completion = falcon.create_completion(text, max_tokens=20) assert completion["choices"][0]["text"] == output_text assert completion["choices"][0]["finish_reason"] == "stop" ## Test streaming completion until eos n = 0 # reset - chunks = llama.create_completion(text, max_tokens=20, stream=True) + chunks = falcon.create_completion(text, max_tokens=20, stream=True) assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == output_text assert completion["choices"][0]["finish_reason"] == "stop" ## Test basic completion until stop sequence n = 0 # reset - completion = llama.create_completion(text, max_tokens=20, stop=["lazy"]) + completion = falcon.create_completion(text, max_tokens=20, stop=["lazy"]) assert completion["choices"][0]["text"] == " jumps over the " assert completion["choices"][0]["finish_reason"] == "stop" ## Test streaming completion until stop sequence n = 0 # reset - chunks = llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"]) + chunks = falcon.create_completion(text, max_tokens=20, stream=True, stop=["lazy"]) assert ( "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps over the " ) @@ -76,54 +76,54 @@ def mock_sample(*args, **kwargs): ## Test basic completion until length n = 0 # reset - completion = llama.create_completion(text, max_tokens=2) + completion = falcon.create_completion(text, max_tokens=2) assert completion["choices"][0]["text"] == " j" assert completion["choices"][0]["finish_reason"] == "length" ## Test streaming completion until length n = 0 # reset - chunks = llama.create_completion(text, max_tokens=2, stream=True) + chunks = falcon.create_completion(text, max_tokens=2, stream=True) assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == " j" assert completion["choices"][0]["finish_reason"] == "length" -def test_llama_pickle(): +def test_falcon_pickle(): import pickle import tempfile fp = tempfile.TemporaryFile() - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) - pickle.dump(llama, fp) + falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) + pickle.dump(falcon, fp) fp.seek(0) - llama = pickle.load(fp) + falcon = pickle.load(fp) - assert llama - assert llama.ctx is not None + assert falcon + assert falcon.ctx is not None text = b"Hello World" - assert llama.detokenize(llama.tokenize(text)) == text + assert falcon.detokenize(falcon.tokenize(text)) == text def test_utf8(monkeypatch): - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) - n_vocab = llama_cpp.llama_n_vocab(llama.ctx) + falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) + n_vocab = falcon_cpp.falcon_n_vocab(falcon.ctx) ## Set up mock function 
def mock_eval(*args, **kwargs): return 0 def mock_get_logits(*args, **kwargs): - return (llama_cpp.c_float * n_vocab)( - *[llama_cpp.c_float(0) for _ in range(n_vocab)] + return (falcon_cpp.c_float * n_vocab)( + *[falcon_cpp.c_float(0) for _ in range(n_vocab)] ) - monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval) - monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_eval", mock_eval) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_get_logits", mock_get_logits) output_text = "😀" - output_tokens = llama.tokenize(output_text.encode("utf-8")) - token_eos = llama.token_eos() + output_tokens = falcon.tokenize(output_text.encode("utf-8")) + token_eos = falcon.token_eos() n = 0 def mock_sample(*args, **kwargs): @@ -134,22 +134,22 @@ def mock_sample(*args, **kwargs): else: return token_eos - monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_token", mock_sample) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_sample_token", mock_sample) ## Test basic completion with utf8 multibyte n = 0 # reset - completion = llama.create_completion("", max_tokens=4) + completion = falcon.create_completion("", max_tokens=4) assert completion["choices"][0]["text"] == output_text ## Test basic completion with incomplete utf8 multibyte n = 0 # reset - completion = llama.create_completion("", max_tokens=1) + completion = falcon.create_completion("", max_tokens=1) assert completion["choices"][0]["text"] == "" -def test_llama_server(): +def test_falcon_server(): from fastapi.testclient import TestClient - from llama_cpp.server.app import create_app, Settings + from falcon_cpp.server.app import create_app, Settings settings = Settings( model=MODEL, diff --git a/vendor/ggllm.cpp b/vendor/ggllm.cpp new file mode 160000 index 000000000..8c019b677 --- /dev/null +++ b/vendor/ggllm.cpp @@ -0,0 +1 @@ +Subproject commit 8c019b67757538e7750cd30640fd00bbe8bc30de diff --git a/vendor/llama.cpp b/vendor/llama.cpp deleted file mode 160000 index 96a712ca1..000000000 --- a/vendor/llama.cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 96a712ca1b7f427e3bd7ffc0c70b2105cfc7fbf1
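For downstream code being adapted to the rename, the test changes above also double as a usage reference for the server package: the FastAPI app now comes from `falcon_cpp.server.app`. Below is a minimal sketch, assuming `Settings` still accepts `vocab_only` as in the original test and that a vocab-only GGML file exists at the illustrative path shown.

```python
from fastapi.testclient import TestClient

from falcon_cpp.server.app import Settings, create_app

# Illustrative path; any vocab-only GGML file for a Falcon model would do.
MODEL = "./vendor/ggllm.cpp/models/ggml-vocab.bin"

settings = Settings(model=MODEL, vocab_only=True)  # vocab_only assumed, as in the original test
app = create_app(settings=settings)
client = TestClient(app)

response = client.get("/v1/models")
assert response.status_code == 200
# With no model_alias configured, the id falls back to the model path.
print(response.json()["data"][0]["id"])
```

The same `app` object is a standard ASGI application, so it can also be served with uvicorn when testing by hand.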