diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 5df12aaf5..a5e1a9cb5 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -12,17 +12,17 @@ assignees: '' Please answer the following questions for yourself before submitting an issue. - [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now. -- [ ] I carefully followed the [README.md](https://github.com/abetlen/llama-cpp-python/blob/main/README.md). +- [ ] I carefully followed the [README.md](https://github.com/sirajperson/falcon-cpp-python/blob/main/README.md). - [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed). -- [ ] I reviewed the [Discussions](https://github.com/abetlen/llama-cpp-python/discussions), and have a new bug or useful enhancement to share. +- [ ] I reviewed the [Discussions](https://github.com/sirajperson/falcon-cpp-python/discussions), and have a new bug or useful enhancement to share. # Expected Behavior -Please provide a detailed written description of what you were trying to do, and what you expected `llama-cpp-python` to do. +Please provide a detailed written description of what you were trying to do, and what you expected `falcon-cpp-python` to do. # Current Behavior -Please provide a detailed written description of what `llama-cpp-python` did, instead. +Please provide a detailed written description of what `falcon-cpp-python` did, instead. # Environment and Context @@ -61,13 +61,13 @@ Please provide detailed steps for reproducing the issue. We are not sitting in f Try the following: -1. `git clone https://github.com/abetlen/llama-cpp-python` -2. `cd llama-cpp-python` +1. `git clone https://github.com/sirajperson/falcon-cpp-python` +2. `cd falcon-cpp-python` 3. `rm -rf _skbuild/` # delete any old builds 4. `python setup.py develop` -5. `cd ./vendor/llama.cpp` -6. Follow [llama.cpp's instructions](https://github.com/ggerganov/llama.cpp#build) to `cmake` llama.cpp -7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggerganov/llama.cpp/issues) +5. `cd ./vendor/ggllm.cpp` +6. Follow [ggllm.cpp's instructions](https://github.com/cmp-nct/ggllm.cpp) section on how to compile with `cmake` +7. Run ggllm.cpp's `./falcon_main` with the same arguments you previously passed to falcon-cpp-python and see if you can reproduce the issue. If you can, [log an issue with ggllm.cpp](https://github.com/cmp-nct/ggllm.cpp/issues) # Failure Logs @@ -77,10 +77,10 @@ Also, please try to **avoid using screenshots** if at all possible. 
Instead, cop Example environment info: ``` -llama-cpp-python$ git log | head -1 +falcon-cpp-python$ git log | head -1 commit 47b0aa6e957b93dbe2c29d53af16fbae2dd628f2 -llama-cpp-python$ python3 --version +falcon-cpp-python$ python3 --version Python 3.10.10 llama-cpp-python$ pip list | egrep "uvicorn|fastapi|sse-starlette|numpy" @@ -89,8 +89,8 @@ numpy 1.24.3 sse-starlette 1.3.3 uvicorn 0.21.1 -llama-cpp-python/vendor/llama.cpp$ git log | head -3 +falcon-cpp-python/vendor/llama.cpp$ git log | head -3 commit 66874d4fbcc7866377246efbcee938e8cc9c7d76 -Author: Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> +Author: YupHippie <44031344+YupHippie@users.noreply.github.com> Date: Thu May 25 20:18:01 2023 -0600 ``` diff --git a/.gitmodules b/.gitmodules index 7edf0975d..eeadc3d38 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "vendor/llama.cpp"] - path = vendor/llama.cpp - url = https://github.com/ggerganov/llama.cpp.git +[submodule "ggllm.cpp"] + path = ggllm.cpp + url = https://github.com/sirajperson/ggllm.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 788402a56..7e1faac42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,34 +1,24 @@ cmake_minimum_required(VERSION 3.4...3.22) -project(llama_cpp) +project(falcon_cpp) -option(FORCE_CMAKE "Force CMake build of Python bindings" OFF) - -set(FORCE_CMAKE $ENV{FORCE_CMAKE}) - -if (UNIX AND NOT FORCE_CMAKE) - add_custom_command( - OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so - COMMAND make libllama.so - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp - ) - add_custom_target( - run ALL - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so - ) - install( - FILES ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/libllama.so - DESTINATION llama_cpp - ) -else() - set(BUILD_SHARED_LIBS "On") - add_subdirectory(vendor/llama.cpp) - install( - TARGETS llama - LIBRARY DESTINATION llama_cpp - RUNTIME DESTINATION llama_cpp - ARCHIVE DESTINATION llama_cpp - FRAMEWORK DESTINATION llama_cpp - RESOURCE DESTINATION llama_cpp - ) -endif() +# Build shared libraries using custom command +add_custom_command( + OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libllama.so + COMMAND cmake -DLLAMA_CUBLAS=1 -DCUDAToolkit_ROOT=/usr/local/cuda/ -DBUILD_SHARED_LIBS=on ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp + COMMAND make + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp +) +add_custom_target( + build_shared_libs ALL + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libllama.so +) +# Install shared libraries +install( + FILES + ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libcmpnct_unicode.so + ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libggml_shared.so + ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libfalcon.so + ${CMAKE_CURRENT_SOURCE_DIR}/vendor/ggllm.cpp/libllama.so + DESTINATION falcon_cpp +) diff --git a/Makefile b/Makefile index 66d93f3a2..3301081d0 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ update: git submodule update --init --recursive update.vendor: - cd vendor/llama.cpp && git pull origin master + cd vendor/ggllm.cpp && git pull origin master build: python3 setup.py develop @@ -34,14 +34,14 @@ deploy.gh-docs: mkdocs gh-deploy clean: - - cd vendor/llama.cpp && make clean - - cd vendor/llama.cpp && rm libllama.so + - cd vendor/ggllm.cpp && make clean + - cd vendor/ggllm.cpp && rm llamacpp.so - rm -rf _skbuild - - rm llama_cpp/*.so - - rm llama_cpp/*.dylib - - rm llama_cpp/*.metal - - rm llama_cpp/*.dll - - rm llama_cpp/*.lib + - rm falcon_cpp/*.so 
+ - rm falcon_cpp/*.dylib + - rm falcon_cpp/*.metal + - rm falcon_cpp/*.dll + - rm falcon_cpp/*.lib .PHONY: \ update \ diff --git a/README.md b/README.md index fb652a925..9a490f40d 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,7 @@ -# 🦙 Python Bindings for `llama.cpp` +# Python Bindings for `ggllm.cpp`, a library for loading and execution of inferences to falcon based models -[![Documentation Status](https://readthedocs.org/projects/llama-cpp-python/badge/?version=latest)](https://llama-cpp-python.readthedocs.io/en/latest/?badge=latest) -[![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml) -[![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library. +Simple Python bindings for [`ggllm.cpp`](https://github.com/cmp-nct/ggllm.cpp) library. This package provides: - Low-level access to C API via `ctypes` interface. @@ -15,73 +9,18 @@ This package provides: - OpenAI-like API - LangChain compatibility -Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest). +This project is currently in alpha development and is not yet completely functional. Any contributions are warmly welcomed. -## Installation from PyPI (recommended) - -Install from PyPI (requires a c compiler): - -```bash -pip install llama-cpp-python -``` - -The above command will attempt to install the package and build `llama.cpp` from source. -This is the recommended installation method as it ensures that `llama.cpp` is built with the available optimizations for your system. - -If you have previously installed `llama-cpp-python` through pip and want to upgrade your version or rebuild the package with different compiler options, please add the following flags to ensure that the package is rebuilt correctly: - -```bash -pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir -``` - -Note: If you are using Apple Silicon (M1) Mac, make sure you have installed a version of Python that supports arm64 architecture. For example: -``` -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh -bash Miniforge3-MacOSX-arm64.sh -``` -Otherwise, while installing it will build the llama.ccp x86 version which will be 10x slower on Apple Silicon (M1) Mac. - -### Installation with OpenBLAS / cuBLAS / CLBlast / Metal - -`llama.cpp` supports multiple BLAS backends for faster processing. -Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and install the pip package for the desired BLAS backend. 
- -To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python -``` - -To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python -``` - -To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python -``` - -To install with Metal (MPS), set the `LLAMA_METAL=on` environment variable before installing: - -```bash -CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install llama-cpp-python -``` - -Detailed MacOS Metal GPU install documentation is available at [docs/install/macos.md](docs/install/macos.md) - ## High-level API -The high-level API provides a simple managed interface through the `Llama` class. +The high-level API provides a simple managed interface through the `Falcon` class. Below is a short example demonstrating how to use the high-level API to generate text: ```python ->>> from llama_cpp import Llama ->>> llm = Llama(model_path="./models/7B/ggml-model.bin") +>>> from falcon_cpp import Falcon +>>> llm = Falcon(model_path="./models/7B/ggml-model.bin") >>> output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True) >>> print(output) { @@ -107,63 +46,51 @@ Below is a short example demonstrating how to use the high-level API to generate ## Web Server -`llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. -This allows you to use llama.cpp compatible models with any OpenAI compatible client (language libraries, services, etc). +`falcon-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. +This allows you to use ggllm.cpp to inference falcon models with any OpenAI compatible client (language libraries, services, etc). To install the server package and get started: ```bash -pip install llama-cpp-python[server] -python3 -m llama_cpp.server --model models/7B/ggml-model.bin +python3 -m falcon_cpp.server --model models/7B/ggml-model.bin ``` Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. -## Docker image - -A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server: - -```bash -docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest -``` - ## Low-level API -The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `llama.cpp`. -The entire lowe-level API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and directly mirrors the C API in [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). +The low-level API is a direct [`ctypes`](https://docs.python.org/3/library/ctypes.html) binding to the C API provided by `ggllm.cpp`. +The entire lowe-level API can be found in [falcon_cpp/falcon_cpp.py](https://github.com/sirajperson/falcon-cpp-python/blob/master/falcon_cpp/falcon_cpp.py) and directly mirrors the C API in [libfalcon.h](https://github.com/cmp-nct/ggllm.cpp/blob/master/libfalcon.h). 
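Conceptually, the binding layer does two things: it loads the shared libraries that the CMake rules earlier in this change install into the `falcon_cpp` package (for example `libfalcon.so`), and it declares `ctypes` signatures for the exported `falcon_*` functions. The snippet below is only a minimal sketch of that idea — the library name, path handling, and signature shown are assumptions for illustration, and the real loader in `falcon_cpp/falcon_cpp.py` may differ:

```python
# Minimal sketch: load the shared library installed into the falcon_cpp package
# and declare one ctypes signature. Names and paths are illustrative assumptions;
# the actual loader in falcon_cpp/falcon_cpp.py may resolve them differently.
import ctypes
import pathlib

# The CMake install() rule above copies libfalcon.so (among others) into falcon_cpp/.
_lib_path = pathlib.Path(__file__).resolve().parent / "libfalcon.so"
_lib = ctypes.CDLL(str(_lib_path))

# ctypes calls need explicit return/argument types before crossing into C.
# falcon_print_system_info() returns a C string, so it is read back as bytes.
_lib.falcon_print_system_info.restype = ctypes.c_char_p
print(_lib.falcon_print_system_info().decode("utf-8"))
```

Everything the higher-level `Falcon` wrapper does ultimately goes through a handle like this one.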
Below is a short example demonstrating how to use the low-level API to tokenize a prompt: ```python ->>> import llama_cpp +>>> import falcon_cpp >>> import ctypes ->>> params = llama_cpp.llama_context_default_params() +>>> params = falcon_cpp.falcon_context_default_params() # use bytes for char * params ->>> ctx = llama_cpp.llama_init_from_file(b"./models/7b/ggml-model.bin", params) +>>> ctx = falcon_cpp.falcon_init_backend("./models/7b/ggml-model.bin", params) >>> max_tokens = params.n_ctx # use ctypes arrays for array params ->>> tokens = (llama_cpp.llama_token * int(max_tokens))() ->>> n_tokens = llama_cpp.llama_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=llama_cpp.c_bool(True)) ->>> llama_cpp.llama_free(ctx) +>>> tokens = (falcon_cpp.falcon_token * int(max_tokens))() +>>> n_tokens = falcon_cpp.falcon_tokenize(ctx, b"Q: Name the planets in the solar system? A: ", tokens, max_tokens, add_bos=falcon_cpp.c_bool(True)) +>>> falcon_cpp.falcon_free(ctx) ``` Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API. - # Documentation - -Documentation is available at [https://abetlen.github.io/llama-cpp-python](https://abetlen.github.io/llama-cpp-python). -If you find any issues with the documentation, please open an issue or submit a PR. +Coming soon... # Development -This package is under active development and I welcome any contributions. +Again, this package is under active development and I welcome any contributions. To get started, clone the repository and install the package in development mode: ```bash -git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git -cd llama-cpp-python +git clone --recurse-submodules git@github.com:sirajperson/falcon-cpp-python.git +cd falcon-cpp-python # Install with pip pip install -e . @@ -175,16 +102,16 @@ pip install -e .[server] poetry install --all-extras . .venv/bin/activate -# Will need to be re-run any time vendor/llama.cpp is updated +# Will need to be re-run any time vendor/ggllm.cpp is updated python3 setup.py develop ``` -# How does this compare to other Python bindings of `llama.cpp`? - -I originally wrote this package for my own use with two goals in mind: +# This Project is a fork of llama-cpp-python -- Provide a simple process to install `llama.cpp` and access the full C API in `llama.h` from Python -- Provide a high-level Python API that can be used as a drop-in replacement for the OpenAI API so existing apps can be easily ported to use `llama.cpp` +This project was originally llama-cpp-python and owes an immense thanks to @abetlen. +This project's goal is to: +- Provide a simple process to install `ggllm.cpp` and access the full C API in `libfalcon.h` from Python +- Provide a high-level Python API that can be used as a drop-in replacement for the OpenAI API so existing apps can be easily ported to use `ggllm.cpp` Any contributions and changes to this package will be made with these goals in mind. diff --git a/docker/README.md b/docker/README.md deleted file mode 100644 index 053d311b4..000000000 --- a/docker/README.md +++ /dev/null @@ -1,66 +0,0 @@ -# Install Docker Server - -**Note #1:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR! 
- -[Install Docker Engine](https://docs.docker.com/engine/install) - -**Note #2:** NVidia GPU CuBLAS support requires a NVidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) - -# Simple Dockerfiles for building the llama-cpp-python server with external model bin files -## openblas_simple - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image -``` -cd ./openblas_simple -docker build -t openblas_simple . -docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t openblas_simple -``` -where `/` is the full path to the model file on the Docker host system. - -## cuda_simple - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image -``` -cd ./cuda_simple -docker build -t cuda_simple . -docker run -e USE_MLOCK=0 -e MODEL=/var/model/ -v :/var/model -t cuda_simple -``` -where `/` is the full path to the model file on the Docker host system. - -# "Open-Llama-in-a-box" -## Download an Apache V2.0 licensed 3B paramter Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server -``` -$ cd ./open_llama -./build.sh -./start.sh -``` - -# Manually choose your own Llama model from Hugging Face -`python3 ./hug_model.py -a TheBloke -t llama` -You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g. -``` -docker $ ls -lh *.bin --rw-rw-r-- 1 user user 4.8G May 23 18:30 q5_1.bin -lrwxrwxrwx 1 user user 24 May 23 18:30 model.bin -> q5_1.bin -``` -**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least -**TWICE** as much disk space as the size of the model: - -| Model | Quantized size | -|------:|----------------:| -| 3B | 3 GB | -| 7B | 5 GB | -| 13B | 10 GB | -| 33B | 25 GB | -| 65B | 50 GB | - -**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...` - -## Use OpenBLAS -Use if you don't have a NVidia GPU. Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS: -### Build: -`docker build -t openblas .` -### Run: -`docker run --cap-add SYS_RESOURCE -t openblas` - -## Use CuBLAS -### Build: -`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .` -### Run: -`docker run --cap-add SYS_RESOURCE -t cublas` diff --git a/docker/cuda_simple/Dockerfile b/docker/cuda_simple/Dockerfile deleted file mode 100644 index 24906d53a..000000000 --- a/docker/cuda_simple/Dockerfile +++ /dev/null @@ -1,16 +0,0 @@ -ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04" -FROM nvidia/cuda:${CUDA_IMAGE} - -# We need to set the host to 0.0.0.0 to allow outside access -ENV HOST 0.0.0.0 - -COPY . . 
- -# Install the package -RUN apt update && apt install -y python3 python3-pip -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette - -RUN LLAMA_CUBLAS=1 pip install llama-cpp-python - -# Run the server -CMD python3 -m llama_cpp.server diff --git a/docker/open_llama/Dockerfile b/docker/open_llama/Dockerfile deleted file mode 100644 index f0ef5f721..000000000 --- a/docker/open_llama/Dockerfile +++ /dev/null @@ -1,51 +0,0 @@ -# Define the image argument and provide a default value -ARG IMAGE=python:3-slim-bullseye - -# Use the image as specified -FROM ${IMAGE} - -# Re-declare the ARG after FROM -ARG IMAGE - -# Update and upgrade the existing packages -RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ - python3 \ - python3-pip \ - ninja-build \ - build-essential - -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette - -# Perform the conditional installations based on the image -RUN echo "Image: ${IMAGE}" && \ - if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \ - echo "OpenBLAS install:" && \ - apt-get install -y --no-install-recommends libopenblas-dev && \ - LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \ -else \ - echo "CuBLAS install:" && \ - LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \ -fi - -# Clean up apt cache -RUN rm -rf /var/lib/apt/lists/* - -# Set a working directory for better clarity -WORKDIR /app - -# Copy files to the app directory -RUN echo "Installing model...this can take some time..." -COPY ./model.bin /app/model.bin -COPY ./start_server.sh /app/start_server.sh - -# Make the server start script executable -RUN chmod +x /app/start_server.sh - -# Set environment variable for the host -ENV HOST=0.0.0.0 - -# Expose a port for the server -EXPOSE 8000 - -# Run the server start script -CMD ["/bin/sh", "/app/start_server.sh"] diff --git a/docker/open_llama/build.sh b/docker/open_llama/build.sh deleted file mode 100755 index 3a6457dcd..000000000 --- a/docker/open_llama/build.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/sh - -MODEL="open_llama_3b" -# Get open_llama_3b_ggml q5_1 quantization -python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1" -ls -lh *.bin - -# Build the default OpenBLAS image -docker build -t $MODEL . 
-docker images | egrep "^(REPOSITORY|$MODEL)" - -echo -echo "To start the docker container run:" -echo "docker run -t -p 8000:8000 $MODEL" diff --git a/docker/open_llama/hug_model.py b/docker/open_llama/hug_model.py deleted file mode 100644 index 13c5b6b0d..000000000 --- a/docker/open_llama/hug_model.py +++ /dev/null @@ -1,139 +0,0 @@ -import requests -import json -import os -import struct -import argparse - -def make_request(url, params=None): - print(f"Making request to {url}...") - response = requests.get(url, params=params) - if response.status_code == 200: - return json.loads(response.text) - else: - print(f"Request failed with status code {response.status_code}") - return None - -def check_magic_and_version(filename): - with open(filename, 'rb') as f: - # Read the first 6 bytes from the file - data = f.read(6) - - # Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int - # and the next 2 bytes as a little-endian unsigned short - magic, version = struct.unpack('= 10485760: # 10 MB - print('.', end='', flush=True) - total_downloaded = 0 - print("\nDownload complete.") - - # Creating a symbolic link from destination to "model.bin" - if os.path.isfile("model.bin"): - os.remove("model.bin") # remove the existing link if any - os.symlink(destination, "model.bin") - else: - print(f"Download failed with status code {response.status_code}") - -def get_user_choice(model_list): - # Print the enumerated list - print("\n") - for i, (model_id, rfilename) in enumerate(model_list): - print(f"{i+1}: Model ID: {model_id}, RFilename: {rfilename}") - - # Get user's choice - choice = input("Choose a model to download by entering the corresponding number: ") - try: - index = int(choice) - 1 - if 0 <= index < len(model_list): - # Return the chosen model - return model_list[index] - else: - print("Invalid choice.") - except ValueError: - print("Invalid input. Please enter a number corresponding to a model.") - except IndexError: - print("Invalid choice. 
Index out of range.") - - return None - -def main(): - # Create an argument parser - parser = argparse.ArgumentParser(description='Process some parameters.') - - # Arguments - parser.add_argument('-v', '--version', type=int, default=0x0003, - help='hexadecimal version number of ggml file') - parser.add_argument('-a', '--author', type=str, default='TheBloke', - help='HuggingFace author filter') - parser.add_argument('-t', '--tag', type=str, default='llama', - help='HuggingFace tag filter') - parser.add_argument('-s', '--search', type=str, default='', - help='HuggingFace search filter') - parser.add_argument('-f', '--filename', type=str, default='q5_1', - help='HuggingFace model repository filename substring match') - - # Parse the arguments - args = parser.parse_args() - - # Define the parameters - params = { - "author": args.author, - "tags": args.tag, - "search": args.search - } - - models = make_request('https://huggingface.co/api/models', params=params) - if models is None: - return - - model_list = [] - # Iterate over the models - for model in models: - model_id = model['id'] - model_info = make_request(f'https://huggingface.co/api/models/{model_id}') - if model_info is None: - continue - - for sibling in model_info.get('siblings', []): - rfilename = sibling.get('rfilename') - if rfilename and args.filename in rfilename: - model_list.append((model_id, rfilename)) - - # Choose the model - model_list.sort(key=lambda x: x[0]) - if len(model_list) == 0: - print("No models found") - exit(1) - elif len(model_list) == 1: - model_choice = model_list[0] - else: - model_choice = get_user_choice(model_list) - - if model_choice is not None: - model_id, rfilename = model_choice - url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}" - dest = f"{model_id.replace('/', '_')}_{rfilename}" - download_file(url, dest) - _, version = check_magic_and_version(dest) - if version != args.version: - print(f"Warning: Expected version {args.version}, but found different version in the file.") - else: - print("Error - model choice was None") - exit(2) - -if __name__ == '__main__': - main() diff --git a/docker/open_llama/start.sh b/docker/open_llama/start.sh deleted file mode 100755 index 7ee8f748e..000000000 --- a/docker/open_llama/start.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/bin/sh - -MODEL="open_llama_3b" - -# Start Docker container -docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL & -sleep 10 -echo -docker ps | egrep "(^CONTAINER|$MODEL)" - -# Test the model works -echo -curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{ - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": [ - "\n", - "###" - ] -}' | grep Paris -if [ $? -eq 0 ] -then - echo - echo "$MODEL is working!!" -else - echo - echo "ERROR: $MODEL not replying." 
- exit 1 -fi diff --git a/docker/open_llama/start_server.sh b/docker/open_llama/start_server.sh deleted file mode 100755 index d3329eec3..000000000 --- a/docker/open_llama/start_server.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/sh - -# For mlock support -ulimit -l unlimited - -if [ "$IMAGE" = "python:3-slim-bullseye" ]; then - python3 -B -m llama_cpp.server --model /app/model.bin -else - # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM - python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000 -fi diff --git a/docker/openblas_simple/Dockerfile b/docker/openblas_simple/Dockerfile deleted file mode 100644 index 1a95caeda..000000000 --- a/docker/openblas_simple/Dockerfile +++ /dev/null @@ -1,15 +0,0 @@ -FROM python:3-slim-bullseye - -# We need to set the host to 0.0.0.0 to allow outside access -ENV HOST 0.0.0.0 - -COPY . . - -# Install the package -RUN apt update && apt install -y libopenblas-dev ninja-build build-essential -RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette - -RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose - -# Run the server -CMD python3 -m llama_cpp.server diff --git a/docs/api-reference.md b/docs/api-reference.md deleted file mode 100644 index 1290cad49..000000000 --- a/docs/api-reference.md +++ /dev/null @@ -1,53 +0,0 @@ ---- -title: API Reference ---- - -::: llama_cpp.Llama - options: - members: - - __init__ - - tokenize - - detokenize - - reset - - eval - - sample - - generate - - create_embedding - - embed - - create_completion - - __call__ - - create_chat_completion - - set_cache - - save_state - - load_state - - token_bos - - token_eos - show_root_heading: true - -::: llama_cpp.LlamaCache - options: - show_root_heading: true - -::: llama_cpp.LlamaState - options: - show_root_heading: true - -::: llama_cpp.LogitsProcessor - options: - show_root_heading: true - -::: llama_cpp.LogitsProcessorList - options: - show_root_heading: true - -::: llama_cpp.StoppingCriteria - options: - show_root_heading: true - -::: llama_cpp.StoppingCriteriaList - options: - show_root_heading: true - -::: llama_cpp.llama_cpp - options: - show_if_no_docstring: true \ No newline at end of file diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index 7d5ccc314..000000000 --- a/docs/index.md +++ /dev/null @@ -1,92 +0,0 @@ -# Getting Started - -## 🦙 Python Bindings for `llama.cpp` - -[![Documentation](https://img.shields.io/badge/docs-passing-green.svg)](https://abetlen.github.io/llama-cpp-python) -[![Tests](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml/badge.svg?branch=main)](https://github.com/abetlen/llama-cpp-python/actions/workflows/test.yaml) -[![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) -[![PyPI - Downloads](https://img.shields.io/pypi/dm/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/) - -Simple Python bindings for **@ggerganov's** [`llama.cpp`](https://github.com/ggerganov/llama.cpp) library. -This package provides: - -- Low-level access to C API via `ctypes` interface. 
-- High-level Python API for text completion - - OpenAI-like API - - LangChain compatibility - -## Installation - -Install from PyPI: - -```bash -pip install llama-cpp-python -``` - -## High-level API - -```python ->>> from llama_cpp import Llama ->>> llm = Llama(model_path="./models/7B/ggml-model.bin") ->>> output = llm("Q: Name the planets in the solar system? A: ", max_tokens=32, stop=["Q:", "\n"], echo=True) ->>> print(output) -{ - "id": "cmpl-xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx", - "object": "text_completion", - "created": 1679561337, - "model": "./models/7B/ggml-model.bin", - "choices": [ - { - "text": "Q: Name the planets in the solar system? A: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune and Pluto.", - "index": 0, - "logprobs": None, - "finish_reason": "stop" - } - ], - "usage": { - "prompt_tokens": 14, - "completion_tokens": 28, - "total_tokens": 42 - } -} -``` - -## Web Server - -`llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. -This allows you to use llama.cpp compatible models with any OpenAI compatible client (language libraries, services, etc). - -To install the server package and get started: - -```bash -pip install llama-cpp-python[server] -export MODEL=./models/7B/ggml-model.bin -python3 -m llama_cpp.server -``` - -Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the OpenAPI documentation. - -## Low-level API - -The low-level API is a direct `ctypes` binding to the C API provided by `llama.cpp`. -The entire API can be found in [llama_cpp/llama_cpp.py](https://github.com/abetlen/llama-cpp-python/blob/master/llama_cpp/llama_cpp.py) and should mirror [llama.h](https://github.com/ggerganov/llama.cpp/blob/master/llama.h). - - -## Development - -This package is under active development and I welcome any contributions. - -To get started, clone the repository and install the package in development mode: - -```bash -git clone git@github.com:abetlen/llama-cpp-python.git -git submodule update --init --recursive -# Will need to be re-run any time vendor/llama.cpp is updated -python3 setup.py develop -``` - -## License - -This project is licensed under the terms of the MIT license. \ No newline at end of file diff --git a/docs/install/macos.md b/docs/install/macos.md deleted file mode 100644 index 600469615..000000000 --- a/docs/install/macos.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -title: MacOS Install with Metal GPU ---- - -**(1) Make sure you have xcode installed... at least the command line parts** -``` -# check the path of your xcode install -xcode-select -p - -# xcode installed returns -# /Applications/Xcode-beta.app/Contents/Developer - -# if xcode is missing then install it... it takes ages; -xcode-select --install -``` - -**(2) Install the conda version for MacOS that supports Metal GPU** -``` -wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh -bash Miniforge3-MacOSX-arm64.sh -``` - -**(3) Make a conda environment** -``` -conda create -n llama python=3.9.16 -conda activate llama -``` - -**(4) Install the LATEST llama-cpp-python.. 
which, as of just today, happily supports MacOS Metal GPU** - *(you needed xcode installed in order pip to build/compile the C++ code)* -``` -pip uninstall llama-cpp-python -y -CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir -pip install 'llama-cpp-python[server]' - -# you should now have llama-cpp-python v0.1.62 installed -llama-cpp-python         0.1.62      - -``` - -**(4) Download a v3 ggml model** - - **ggmlv3** - - file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0 - -https://huggingface.co/TheBloke/open-llama-7b-open-instruct-GGML - - -**(6) run the llama-cpp-python API server with MacOS Metal GPU support** -``` -# config your ggml model path -# make sure it is ggml v3 -# make sure it is q4_0 -export MODEL=[path to your llama.cpp ggml models]]/[ggml-model-name]]q4_0.bin -python3 -m llama_cpp.server --model $MODEL --n_gpu_layers 1 -``` - -***Note:** If you omit the `--n_gpu_layers 1` then CPU will be used* - - diff --git a/docs/requirements.txt b/docs/requirements.txt deleted file mode 100644 index 199bd4ffb..000000000 --- a/docs/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -mkdocs -mkdocs-material -mkdocstrings[python] \ No newline at end of file diff --git a/examples/high_level_api/high_level_api_inference.py b/examples/high_level_api/high_level_api_inference.py index e41f37577..e6f85e180 100644 --- a/examples/high_level_api/high_level_api_inference.py +++ b/examples/high_level_api/high_level_api_inference.py @@ -1,13 +1,13 @@ import json import argparse -from llama_cpp import Llama +from falcon_cpp import Falcon parser = argparse.ArgumentParser() -parser.add_argument("-m", "--model", type=str, default="../models/7B/ggml-models.bin") +parser.add_argument("-m", "--model", type=str, default="../../models/tiiuae_falcon-7b/ggml-model-tiiuae_falcon-7b-f16.bin") args = parser.parse_args() -llm = Llama(model_path=args.model) +llm = Falcon(model_path=args.model) output = llm( "Question: What are the names of the planets in the solar system? Answer: ", diff --git a/falcon_cpp/__init__.py b/falcon_cpp/__init__.py new file mode 100644 index 000000000..e7d40876f --- /dev/null +++ b/falcon_cpp/__init__.py @@ -0,0 +1,2 @@ +from .falcon_cpp import * +from .falcon import * diff --git a/llama_cpp/llama.py b/falcon_cpp/falcon.py similarity index 65% rename from llama_cpp/llama.py rename to falcon_cpp/falcon.py index 688b2a74f..010586cd9 100644 --- a/llama_cpp/llama.py +++ b/falcon_cpp/falcon.py @@ -20,15 +20,15 @@ import diskcache -from . import llama_cpp -from .llama_types import * +from . 
import falcon_cpp +from .falcon_types import * import numpy as np import numpy.typing as npt -class BaseLlamaCache(ABC): - """Base cache class for a llama.cpp model.""" +class BaseFalconCache(ABC): + """Base cache class for a falcon.cpp model.""" def __init__(self, capacity_bytes: int = (2 << 30)): self.capacity_bytes = capacity_bytes @@ -39,13 +39,13 @@ def cache_size(self) -> int: raise NotImplementedError def _find_longest_prefix_key( - self, - key: Tuple[int, ...], + self, + key: Tuple[int, ...], ) -> Optional[Tuple[int, ...]]: pass @abstractmethod - def __getitem__(self, key: Sequence[int]) -> "LlamaState": + def __getitem__(self, key: Sequence[int]) -> "FalconState": raise NotImplementedError @abstractmethod @@ -53,30 +53,30 @@ def __contains__(self, key: Sequence[int]) -> bool: raise NotImplementedError @abstractmethod - def __setitem__(self, key: Sequence[int], value: "LlamaState") -> None: + def __setitem__(self, key: Sequence[int], value: "FalconState") -> None: raise NotImplementedError -class LlamaRAMCache(BaseLlamaCache): - """Cache for a llama.cpp model using RAM.""" +class FalconRAMCache(BaseFalconCache): + """Cache for a falcon.cpp model using RAM.""" def __init__(self, capacity_bytes: int = (2 << 30)): super().__init__(capacity_bytes) self.capacity_bytes = capacity_bytes - self.cache_state: OrderedDict[Tuple[int, ...], "LlamaState"] = OrderedDict() + self.cache_state: OrderedDict[Tuple[int, ...], "FalconState"] = OrderedDict() @property def cache_size(self): - return sum([state.llama_state_size for state in self.cache_state.values()]) + return sum([state.falcon_state_size for state in self.cache_state.values()]) def _find_longest_prefix_key( - self, - key: Tuple[int, ...], + self, + key: Tuple[int, ...], ) -> Optional[Tuple[int, ...]]: min_len = 0 min_key = None keys = ( - (k, Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys() + (k, Falcon.longest_token_prefix(k, key)) for k in self.cache_state.keys() ) for k, prefix_len in keys: if prefix_len > min_len: @@ -84,7 +84,7 @@ def _find_longest_prefix_key( min_key = k return min_key - def __getitem__(self, key: Sequence[int]) -> "LlamaState": + def __getitem__(self, key: Sequence[int]) -> "FalconState": key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: @@ -96,7 +96,7 @@ def __getitem__(self, key: Sequence[int]) -> "LlamaState": def __contains__(self, key: Sequence[int]) -> bool: return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[int], value: "LlamaState"): + def __setitem__(self, key: Sequence[int], value: "FalconState"): key = tuple(key) if key in self.cache_state: del self.cache_state[key] @@ -106,14 +106,14 @@ def __setitem__(self, key: Sequence[int], value: "LlamaState"): # Alias for backwards compatibility -LlamaCache = LlamaRAMCache +FalconCache = FalconRAMCache -class LlamaDiskCache(BaseLlamaCache): - """Cache for a llama.cpp model using disk.""" +class FalconDiskCache(BaseFalconCache): + """Cache for a falcon.cpp model using disk.""" def __init__( - self, cache_dir: str = ".cache/llama_cache", capacity_bytes: int = (2 << 30) + self, cache_dir: str = ".cache/falcon_cache", capacity_bytes: int = (2 << 30) ): super().__init__(capacity_bytes) self.cache = diskcache.Cache(cache_dir) @@ -123,60 +123,60 @@ def cache_size(self): return int(self.cache.volume()) # type: ignore def _find_longest_prefix_key( - self, - key: Tuple[int, ...], + self, + key: Tuple[int, ...], ) -> Optional[Tuple[int, ...]]: min_len = 0 min_key: 
Optional[Tuple[int, ...]] = None for k in self.cache.iterkeys(): # type: ignore - prefix_len = Llama.longest_token_prefix(k, key) + prefix_len = Falcon.longest_token_prefix(k, key) if prefix_len > min_len: min_len = prefix_len min_key = k # type: ignore return min_key - def __getitem__(self, key: Sequence[int]) -> "LlamaState": + def __getitem__(self, key: Sequence[int]) -> "FalconState": key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: raise KeyError("Key not found") - value: "LlamaState" = self.cache.pop(_key) # type: ignore + value: "FalconState" = self.cache.pop(_key) # type: ignore # NOTE: This puts an integer as key in cache, which breaks, - # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens + # Falcon.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens # self.cache.push(_key, side="front") # type: ignore return value def __contains__(self, key: Sequence[int]) -> bool: return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[int], value: "LlamaState"): - print("LlamaDiskCache.__setitem__: called", file=sys.stderr) + def __setitem__(self, key: Sequence[int], value: "FalconState"): + print("FalconDiskCache.__setitem__: called", file=sys.stderr) key = tuple(key) if key in self.cache: - print("LlamaDiskCache.__setitem__: delete", file=sys.stderr) + print("FalconDiskCache.__setitem__: delete", file=sys.stderr) del self.cache[key] self.cache[key] = value - print("LlamaDiskCache.__setitem__: set", file=sys.stderr) + print("FalconDiskCache.__setitem__: set", file=sys.stderr) while self.cache_size > self.capacity_bytes and len(self.cache) > 0: key_to_remove = next(iter(self.cache)) del self.cache[key_to_remove] - print("LlamaDiskCache.__setitem__: trim", file=sys.stderr) + print("FalconDiskCache.__setitem__: trim", file=sys.stderr) -class LlamaState: +class FalconState: def __init__( - self, - input_ids: npt.NDArray[np.intc], - scores: npt.NDArray[np.single], - n_tokens: int, - llama_state: bytes, - llama_state_size: int, + self, + input_ids: npt.NDArray[np.intc], + scores: npt.NDArray[np.single], + n_tokens: int, + falcon_state: bytes, + falcon_state_size: int, ): self.input_ids = input_ids self.scores = scores self.n_tokens = n_tokens - self.llama_state = llama_state - self.llama_state_size = llama_state_size + self.falcon_state = falcon_state + self.falcon_state_size = falcon_state_size LogitsProcessor = Callable[[List[int], List[float]], List[float]] @@ -197,61 +197,80 @@ def __call__(self, input_ids: List[int], logits: List[float]) -> bool: return any([stopping_criteria(input_ids, logits) for stopping_criteria in self]) -class Llama: - """High-level Python wrapper for a llama.cpp model.""" +class Falcon: + """High-level Python wrapper for a falcon.cpp model.""" def __init__( - self, - model_path: str, - # NOTE: These parameters are likely to change in the future. - n_ctx: int = 512, - n_parts: int = -1, - n_gpu_layers: int = 0, - seed: int = 1337, - f16_kv: bool = True, - logits_all: bool = False, - vocab_only: bool = False, - use_mmap: bool = True, - use_mlock: bool = False, - embedding: bool = False, - n_threads: Optional[int] = None, - n_batch: int = 512, - last_n_tokens_size: int = 64, - lora_base: Optional[str] = None, - lora_path: Optional[str] = None, - low_vram: bool = False, - verbose: bool = True, + self, + model_path: str, + # NOTE: These parameters are likely to change in the future. 
+ n_ctx: int = 512, + n_parts: int = -1, + n_gpu_layers: int = 0, + seed: int = 1337, + f16_kv: bool = True, + logits_all: bool = False, + vocab_only: bool = False, + use_mmap: bool = True, + use_mlock: bool = False, + embedding: bool = False, + n_threads: Optional[int] = None, + n_batch: int = 512, + last_n_tokens_size: int = 64, + lora_base: Optional[str] = None, + lora_path: Optional[str] = None, + low_vram: bool = False, + verbose: bool = True, ): - """Load a llama.cpp model from `model_path`. - Args: - model_path: Path to the model. - n_ctx: Maximum context size. - n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. - seed: Random seed. -1 for random. - f16_kv: Use half-precision for key/value cache. - logits_all: Return logits for all tokens, not just the last token. - vocab_only: Only load the vocabulary no weights. - use_mmap: Use mmap if possible. - use_mlock: Force the system to keep the model in RAM. - embedding: Embedding mode only. - n_threads: Number of threads to use. If None, the number of threads is automatically determined. - n_batch: Maximum number of prompt tokens to batch together when calling llama_eval. - last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. - lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. - lora_path: Path to a LoRA file to apply to the model. - verbose: Print verbose output to stderr. - - Raises: - ValueError: If the model path does not exist. - - Returns: - A Llama instance. - """ + # TODO: Add the parameters for + ''' + -ts SPLIT --tensor-split SPLIT + how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1 + -mg i, --main-gpu i the GPU to use for scratch and small tensors (0 = first) + --override-max-gpu N + limits the number of GPUs visible (allows to disable multi/single GPU processing) + --gpu-reserve-mb-main override reserved total VRAM MB (can be negative if your driver supports swapping into RAM) + --mtest compute maximum memory usage + --export export the computation graph to 'falcon.ggml' + --verbose-prompt print prompt before generation + -dt, --debug-timings print GGML_PERF debug output (requires GGML_PERF=1 for timings) + 1 = print first layer, 2 = print first and last layer, 3+ = all layers + --lora FNAME apply LoRA adapter (implies --no-mmap) + --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter + -m FNAME, --model FNAME + ''' + + """Load a Falcon model from `model_path`. + + Args: + model_path: Path to the model. + n_ctx: Maximum context size. + n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. + seed: Random seed. -1 for random. + f16_kv: Use half-precision for key/value cache. + logits_all: Return logits for all tokens, not just the last token. + vocab_only: Only load the vocabulary no weights. + use_mmap: Use mmap if possible. + use_mlock: Force the system to keep the model in RAM. + embedding: Embedding mode only. + n_threads: Number of threads to use. If None, the number of threads is automatically determined. + n_batch: Maximum number of prompt tokens to batch together when calling falcon_eval. + last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. + lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. 
+ lora_path: Path to a LoRA file to apply to the model. + verbose: Print verbose output to stderr. + + Raises: + ValueError: If the model path does not exist. + + Returns: + A falcon instance. + """ self.verbose = verbose self.model_path = model_path - self.params = llama_cpp.llama_context_default_params() + self.params = falcon_cpp.falcon_context_default_params() self.params.n_ctx = n_ctx self.params.n_gpu_layers = n_gpu_layers self.params.seed = seed @@ -266,7 +285,7 @@ def __init__( self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) - self.cache: Optional[BaseLlamaCache] = None + self.cache: Optional[BaseFalconCache] = None self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) @@ -280,35 +299,35 @@ def __init__( if not os.path.exists(model_path): raise ValueError(f"Model path does not exist: {model_path}") - self.model = llama_cpp.llama_load_model_from_file( + self.model = falcon_cpp.falcon_load_model_from_file( self.model_path.encode("utf-8"), self.params ) assert self.model is not None - self.ctx = llama_cpp.llama_new_context_with_model(self.model, self.params) + self.ctx = falcon_cpp.falcon_new_context_with_model(self.model, self.params) assert self.ctx is not None if self.lora_path: - if llama_cpp.llama_model_apply_lora_from_file( - self.model, - llama_cpp.c_char_p(self.lora_path.encode("utf-8")), - llama_cpp.c_char_p(self.lora_base.encode("utf-8")) - if self.lora_base is not None - else llama_cpp.c_char_p(0), - llama_cpp.c_int(self.n_threads), + if falcon_cpp.falcon_model_apply_lora_from_file( + self.model, + falcon_cpp.c_char_p(self.lora_path.encode("utf-8")), + falcon_cpp.c_char_p(self.lora_base.encode("utf-8")) + if self.lora_base is not None + else falcon_cpp.c_char_p(0), + falcon_cpp.c_int(self.n_threads), ): raise RuntimeError( f"Failed to apply LoRA from lora path: {self.lora_path} to base path: {self.lora_base}" ) if self.verbose: - print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr) + print(falcon_cpp.falcon_print_system_info().decode("utf-8"), file=sys.stderr) self._n_vocab = self.n_vocab() self._n_ctx = self.n_ctx() - size = llama_cpp.c_size_t(self._n_vocab) - sorted = llama_cpp.c_bool(False) + size = falcon_cpp.c_size_t(self._n_vocab) + sorted = falcon_cpp.c_bool(False) self._candidates_data = np.array( [], dtype=np.dtype( @@ -316,14 +335,14 @@ def __init__( ), ) self._candidates_data.resize(3, self._n_vocab, refcheck=False) - candidates = llama_cpp.llama_token_data_array( - data=self._candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p), + candidates = falcon_cpp.falcon_token_data_array( + data=self._candidates_data.ctypes.data_as(falcon_cpp.falcon_token_data_p), size=size, sorted=sorted, ) self._candidates = candidates - self._token_nl = Llama.token_nl() - self._token_eos = Llama.token_eos() + self._token_nl = Falcon.token_nl() + self._token_eos = Falcon.token_eos() self.n_tokens = 0 self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) @@ -364,23 +383,23 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]: """ assert self.ctx is not None n_ctx = self._n_ctx - tokens = (llama_cpp.llama_token * n_ctx)() - n_tokens = llama_cpp.llama_tokenize( + tokens = (falcon_cpp.falcon_token * n_ctx)() + n_tokens = falcon_cpp.falcon_tokenize( self.ctx, text, tokens, - llama_cpp.c_int(n_ctx), - llama_cpp.c_bool(add_bos), + falcon_cpp.c_int(n_ctx), + falcon_cpp.c_bool(add_bos), ) if n_tokens < 0: n_tokens = abs(n_tokens) - tokens = (llama_cpp.llama_token * n_tokens)() - 
n_tokens = llama_cpp.llama_tokenize( + tokens = (falcon_cpp.falcon_token * n_tokens)() + n_tokens = falcon_cpp.falcon_tokenize( self.ctx, text, tokens, - llama_cpp.c_int(n_tokens), - llama_cpp.c_bool(add_bos), + falcon_cpp.c_int(n_tokens), + falcon_cpp.c_bool(add_bos), ) if n_tokens < 0: raise RuntimeError( @@ -400,12 +419,12 @@ def detokenize(self, tokens: List[int]) -> bytes: assert self.ctx is not None output = b"" for token in tokens: - output += llama_cpp.llama_token_to_str( - self.ctx, llama_cpp.llama_token(token) + output += falcon_cpp.falcon_token_to_str( + self.ctx, falcon_cpp.falcon_token(token) ) return output - def set_cache(self, cache: Optional[BaseLlamaCache]): + def set_cache(self, cache: Optional[BaseFalconCache]): """Set the cache. Args: @@ -426,52 +445,53 @@ def eval(self, tokens: Sequence[int]): assert self.ctx is not None n_ctx = self._n_ctx for i in range(0, len(tokens), self.n_batch): - batch = tokens[i : min(len(tokens), i + self.n_batch)] + batch = tokens[i: min(len(tokens), i + self.n_batch)] n_past = min(n_ctx - len(batch), len(self._input_ids)) n_tokens = len(batch) - return_code = llama_cpp.llama_eval( + return_code = falcon_cpp.falcon_eval( ctx=self.ctx, - tokens=(llama_cpp.llama_token * len(batch))(*batch), - n_tokens=llama_cpp.c_int(n_tokens), - n_past=llama_cpp.c_int(n_past), - n_threads=llama_cpp.c_int(self.n_threads), + tokens=(falcon_cpp.falcon_token * len(batch))(*batch), + n_tokens=falcon_cpp.c_int(n_tokens), + n_past=falcon_cpp.c_int(n_past), + n_threads=falcon_cpp.c_int(self.n_threads), ) if return_code != 0: - raise RuntimeError(f"llama_eval returned {return_code}") + raise RuntimeError(f"falcon_eval returned {return_code}") # Save tokens - self.input_ids[self.n_tokens : self.n_tokens + n_tokens] = batch + self.input_ids[self.n_tokens: self.n_tokens + n_tokens] = batch # Save logits rows = n_tokens if self.params.logits_all else 1 cols = self._n_vocab - offset = 0 if self.params.logits_all else n_tokens - 1 # NOTE: Only save the last token logits if logits_all is False - self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[:] = llama_cpp.llama_get_logits(self.ctx)[:rows * cols] + offset = 0 if self.params.logits_all else n_tokens - 1 # NOTE: Only save the last token logits if logits_all is False + self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[ + :] = falcon_cpp.falcon_get_logits(self.ctx)[:rows * cols] # Update n_tokens self.n_tokens += n_tokens def _sample( - self, - last_n_tokens_data, # type: llama_cpp.Array[llama_cpp.llama_token] - last_n_tokens_size: llama_cpp.c_int, - top_k: llama_cpp.c_int, - top_p: llama_cpp.c_float, - temp: llama_cpp.c_float, - tfs_z: llama_cpp.c_float, - repeat_penalty: llama_cpp.c_float, - frequency_penalty: llama_cpp.c_float, - presence_penalty: llama_cpp.c_float, - mirostat_mode: llama_cpp.c_int, - mirostat_tau: llama_cpp.c_float, - mirostat_eta: llama_cpp.c_float, - penalize_nl: bool = True, - logits_processor: Optional[LogitsProcessorList] = None, + self, + last_n_tokens_data, # type: falcon_cpp.Array[falcon_cpp.falcon_token] + last_n_tokens_size: falcon_cpp.c_int, + top_k: falcon_cpp.c_int, + top_p: falcon_cpp.c_float, + temp: falcon_cpp.c_float, + tfs_z: falcon_cpp.c_float, + repeat_penalty: falcon_cpp.c_float, + frequency_penalty: falcon_cpp.c_float, + presence_penalty: falcon_cpp.c_float, + mirostat_mode: falcon_cpp.c_int, + mirostat_tau: falcon_cpp.c_float, + mirostat_eta: falcon_cpp.c_float, + penalize_nl: bool = True, + logits_processor: 
Optional[LogitsProcessorList] = None, ): assert self.ctx is not None assert self.n_tokens > 0 n_vocab = self._n_vocab n_ctx = self._n_ctx - top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k + top_k = falcon_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k last_n_tokens_size = ( - llama_cpp.c_int(n_ctx) + falcon_cpp.c_int(n_ctx) if last_n_tokens_size.value < 0 else last_n_tokens_size ) @@ -490,110 +510,110 @@ def _sample( candidates_data["id"] = np.arange(n_vocab, dtype=np.intc) # type: ignore candidates_data["logit"] = logits candidates_data["p"] = np.zeros(n_vocab, dtype=np.single) - candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) - candidates.sorted = llama_cpp.c_bool(False) - candidates.size = llama_cpp.c_size_t(n_vocab) - llama_cpp.llama_sample_repetition_penalty( + candidates.data = candidates_data.ctypes.data_as(falcon_cpp.falcon_token_data_p) + candidates.sorted = falcon_cpp.c_bool(False) + candidates.size = falcon_cpp.c_size_t(n_vocab) + falcon_cpp.falcon_sample_repetition_penalty( ctx=self.ctx, last_tokens_data=last_n_tokens_data, last_tokens_size=last_n_tokens_size, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore penalty=repeat_penalty, ) - llama_cpp.llama_sample_frequency_and_presence_penalties( + falcon_cpp.falcon_sample_frequency_and_presence_penalties( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore last_tokens_data=last_n_tokens_data, last_tokens_size=last_n_tokens_size, alpha_frequency=frequency_penalty, alpha_presence=presence_penalty, ) if not penalize_nl: - candidates.data[self._token_nl].logit = llama_cpp.c_float(nl_logit) + candidates.data[self._token_nl].logit = falcon_cpp.c_float(nl_logit) if temp.value == 0.0: - return llama_cpp.llama_sample_token_greedy( + return falcon_cpp.falcon_sample_token_greedy( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore ) elif mirostat_mode.value == 1: - mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) - mirostat_m = llama_cpp.c_int(100) - llama_cpp.llama_sample_temperature( + mirostat_mu = falcon_cpp.c_float(2.0 * mirostat_tau.value) + mirostat_m = falcon_cpp.c_int(100) + falcon_cpp.falcon_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore temp=temp, ) - return llama_cpp.llama_sample_token_mirostat( + return falcon_cpp.falcon_sample_token_mirostat( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore tau=mirostat_tau, eta=mirostat_eta, - mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore + mu=falcon_cpp.ctypes.byref(mirostat_mu), # type: ignore m=mirostat_m, ) elif mirostat_mode.value == 2: - mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) - llama_cpp.llama_sample_temperature( + mirostat_mu = falcon_cpp.c_float(2.0 * mirostat_tau.value) + falcon_cpp.falcon_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=falcon_cpp.ctypes.pointer(candidates), temp=temp, ) - return llama_cpp.llama_sample_token_mirostat_v2( + return falcon_cpp.falcon_sample_token_mirostat_v2( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + 
candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore tau=mirostat_tau, eta=mirostat_eta, - mu=llama_cpp.ctypes.byref(mirostat_mu), # type: ignore + mu=falcon_cpp.ctypes.byref(mirostat_mu), # type: ignore ) else: - llama_cpp.llama_sample_top_k( + falcon_cpp.falcon_sample_top_k( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore k=top_k, - min_keep=llama_cpp.c_size_t(1), + min_keep=falcon_cpp.c_size_t(1), ) - llama_cpp.llama_sample_tail_free( + falcon_cpp.falcon_sample_tail_free( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore z=tfs_z, - min_keep=llama_cpp.c_size_t(1), + min_keep=falcon_cpp.c_size_t(1), ) - llama_cpp.llama_sample_typical( + falcon_cpp.falcon_sample_typical( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore - p=llama_cpp.c_float(1.0), - min_keep=llama_cpp.c_size_t(1), + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore + p=falcon_cpp.c_float(1.0), + min_keep=falcon_cpp.c_size_t(1), ) - llama_cpp.llama_sample_top_p( + falcon_cpp.falcon_sample_top_p( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore p=top_p, - min_keep=llama_cpp.c_size_t(1), + min_keep=falcon_cpp.c_size_t(1), ) - llama_cpp.llama_sample_temperature( + falcon_cpp.falcon_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore temp=temp, ) - return llama_cpp.llama_sample_token( + return falcon_cpp.falcon_sample_token( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=falcon_cpp.ctypes.byref(candidates), # type: ignore ) def sample( - self, - top_k: int = 40, - top_p: float = 0.95, - temp: float = 0.80, - repeat_penalty: float = 1.1, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_eta: float = 0.1, - mirostat_tau: float = 5.0, - penalize_nl: bool = True, - logits_processor: Optional[LogitsProcessorList] = None, + self, + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_eta: float = 0.1, + mirostat_tau: float = 5.0, + penalize_nl: bool = True, + logits_processor: Optional[LogitsProcessorList] = None, ): """Sample a token from the model. @@ -607,52 +627,52 @@ def sample( The sampled token. 
""" assert self.ctx is not None - last_n_tokens_data = [llama_cpp.llama_token(0)] * max( + last_n_tokens_data = [falcon_cpp.falcon_token(0)] * max( 0, self.last_n_tokens_size - len(self._input_ids) - ) + self._input_ids[-self.last_n_tokens_size :].tolist() + ) + self._input_ids[-self.last_n_tokens_size:].tolist() return self._sample( - last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)( + last_n_tokens_data=(falcon_cpp.falcon_token * self.last_n_tokens_size)( *last_n_tokens_data ), - last_n_tokens_size=llama_cpp.c_int(self.last_n_tokens_size), - top_k=llama_cpp.c_int(top_k), - top_p=llama_cpp.c_float(top_p), - temp=llama_cpp.c_float(temp), - tfs_z=llama_cpp.c_float(tfs_z), - repeat_penalty=llama_cpp.c_float(repeat_penalty), - frequency_penalty=llama_cpp.c_float(frequency_penalty), - presence_penalty=llama_cpp.c_float(presence_penalty), - mirostat_mode=llama_cpp.c_int(mirostat_mode), - mirostat_tau=llama_cpp.c_float(mirostat_tau), - mirostat_eta=llama_cpp.c_float(mirostat_eta), + last_n_tokens_size=falcon_cpp.c_int(self.last_n_tokens_size), + top_k=falcon_cpp.c_int(top_k), + top_p=falcon_cpp.c_float(top_p), + temp=falcon_cpp.c_float(temp), + tfs_z=falcon_cpp.c_float(tfs_z), + repeat_penalty=falcon_cpp.c_float(repeat_penalty), + frequency_penalty=falcon_cpp.c_float(frequency_penalty), + presence_penalty=falcon_cpp.c_float(presence_penalty), + mirostat_mode=falcon_cpp.c_int(mirostat_mode), + mirostat_tau=falcon_cpp.c_float(mirostat_tau), + mirostat_eta=falcon_cpp.c_float(mirostat_eta), penalize_nl=penalize_nl, logits_processor=logits_processor, ) def generate( - self, - tokens: Sequence[int], - top_k: int = 40, - top_p: float = 0.95, - temp: float = 0.80, - repeat_penalty: float = 1.1, - reset: bool = True, - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - logits_processor: Optional[LogitsProcessorList] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, + self, + tokens: Sequence[int], + top_k: int = 40, + top_p: float = 0.95, + temp: float = 0.80, + repeat_penalty: float = 1.1, + reset: bool = True, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, ) -> Generator[int, Optional[Sequence[int]], None]: """Create a generator of tokens from a prompt. Examples: - >>> llama = Llama("models/ggml-7b.bin") - >>> tokens = llama.tokenize(b"Hello, world!") - >>> for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.1): - ... print(llama.detokenize([token])) + >>> falcon = Falcon("models/ggml-7b.bin") + >>> tokens = falcon.tokenize(b"Hello, world!") + >>> for token in falcon.generate(tokens, top_k=40, top_p=0.95, temp=1.0, repeat_penalty=1.1): + ... print(falcon.detokenize([token])) Args: tokens: The prompt tokens. 
@@ -676,7 +696,7 @@ def generate( break if longest_prefix > 0: if self.verbose: - print("Llama.generate: prefix-match hit", file=sys.stderr) + print("Falcon.generate: prefix-match hit", file=sys.stderr) reset = False tokens = tokens[longest_prefix:] self.n_tokens = longest_prefix @@ -700,7 +720,7 @@ def generate( logits_processor=logits_processor, ) if stopping_criteria is not None and stopping_criteria( - self._input_ids.tolist(), self._scores[-1, :].tolist() + self._input_ids.tolist(), self._scores[-1, :].tolist() ): return tokens_or_none = yield token @@ -709,7 +729,7 @@ def generate( tokens.extend(tokens_or_none) def create_embedding( - self, input: Union[str, List[str]], model: Optional[str] = None + self, input: Union[str, List[str]], model: Optional[str] = None ) -> Embedding: """Embed a string. @@ -724,11 +744,11 @@ def create_embedding( if self.params.embedding == False: raise RuntimeError( - "Llama model must be created with embedding=True to call this method" + "Falcon model must be created with embedding=True to call this method" ) if self.verbose: - llama_cpp.llama_reset_timings(self.ctx) + falcon_cpp.falcon_reset_timings(self.ctx) if isinstance(input, str): inputs = [input] @@ -743,9 +763,9 @@ def create_embedding( self.eval(tokens) n_tokens = len(tokens) total_tokens += n_tokens - embedding = llama_cpp.llama_get_embeddings(self.ctx)[ - : llama_cpp.llama_n_embd(self.ctx) - ] + embedding = falcon_cpp.falcon_get_embeddings(self.ctx)[ + : falcon_cpp.falcon_n_embd(self.ctx) + ] data.append( { @@ -755,7 +775,7 @@ def create_embedding( } ) if self.verbose: - llama_cpp.llama_print_timings(self.ctx) + falcon_cpp.falcon_print_timings(self.ctx) return { "object": "list", @@ -779,34 +799,34 @@ def embed(self, input: str) -> List[float]: return list(map(float, self.create_embedding(input)["data"][0]["embedding"])) def _create_completion( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 16, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 16, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]: assert self.ctx is not None completion_id: str = f"cmpl-{str(uuid.uuid4())}" created: int = int(time.time()) completion_tokens: List[int] = [] - # Add blank space to start of prompt to match OG llama tokenizer + # Add blank space to start of prompt to match OG Falcon tokenizer prompt_tokens: List[int] = self.tokenize(b" " + prompt.encode("utf-8")) text: bytes = 
b"" returned_tokens: int = 0 @@ -816,7 +836,7 @@ def _create_completion( model_name: str = model if model is not None else self.model_path if self.verbose: - llama_cpp.llama_reset_timings(self.ctx) + falcon_cpp.falcon_reset_timings(self.ctx) if len(prompt_tokens) > self._n_ctx: raise ValueError( @@ -843,36 +863,36 @@ def _create_completion( if self.cache: try: cache_item = self.cache[prompt_tokens] - cache_prefix_len = Llama.longest_token_prefix( + cache_prefix_len = Falcon.longest_token_prefix( cache_item.input_ids.tolist(), prompt_tokens ) - eval_prefix_len = Llama.longest_token_prefix( + eval_prefix_len = Falcon.longest_token_prefix( self._input_ids.tolist(), prompt_tokens ) if cache_prefix_len > eval_prefix_len: self.load_state(cache_item) if self.verbose: - print("Llama._create_completion: cache hit", file=sys.stderr) + print("Falcon._create_completion: cache hit", file=sys.stderr) except KeyError: if self.verbose: - print("Llama._create_completion: cache miss", file=sys.stderr) + print("Falcon._create_completion: cache miss", file=sys.stderr) finish_reason = "length" multibyte_fix = 0 for token in self.generate( - prompt_tokens, - top_k=top_k, - top_p=top_p, - temp=temperature, - tfs_z=tfs_z, - mirostat_mode=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - frequency_penalty=frequency_penalty, - presence_penalty=presence_penalty, - repeat_penalty=repeat_penalty, - stopping_criteria=stopping_criteria, - logits_processor=logits_processor, + prompt_tokens, + top_k=top_k, + top_p=top_p, + temp=temperature, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, + repeat_penalty=repeat_penalty, + stopping_criteria=stopping_criteria, + logits_processor=logits_processor, ): if token == self._token_eos: text = self.detokenize(completion_tokens) @@ -924,7 +944,7 @@ def _create_completion( token_end_position += len(self.detokenize([token])) # Check if stop sequence is in the token if token_end_position >= ( - remaining_length - first_stop_position - 1 + remaining_length - first_stop_position - 1 ): break logprobs_or_none: Optional[CompletionLogprobs] = None @@ -937,7 +957,7 @@ def _create_completion( ) token_offset = len(prompt_tokens) + returned_tokens logits = self._scores[token_offset - 1, :].tolist() - current_logprobs = Llama.logits_to_logprobs(logits) + current_logprobs = Falcon.logits_to_logprobs(logits) sorted_logprobs = list( sorted( zip(current_logprobs, range(len(current_logprobs))), @@ -985,13 +1005,13 @@ def _create_completion( break if stopping_criteria is not None and stopping_criteria( - self._input_ids.tolist(), self._scores[-1, :].tolist() + self._input_ids.tolist(), self._scores[-1, :].tolist() ): text = self.detokenize(completion_tokens) finish_reason = "stop" if self.verbose: - llama_cpp.llama_print_timings(self.ctx) + falcon_cpp.falcon_print_timings(self.ctx) if stream: remaining_tokens = completion_tokens[returned_tokens:] @@ -1016,7 +1036,7 @@ def _create_completion( ) token_offset = len(prompt_tokens) + returned_tokens - 1 logits = self._scores[token_offset, :].tolist() - current_logprobs = Llama.logits_to_logprobs(logits) + current_logprobs = Falcon.logits_to_logprobs(logits) sorted_logprobs = list( sorted( zip(current_logprobs, range(len(current_logprobs))), @@ -1050,8 +1070,8 @@ def _create_completion( "choices": [ { "text": last_text[ - : len(last_text) - (token_end_position - end) - ].decode("utf-8", errors="ignore"), 
+ : len(last_text) - (token_end_position - end) + ].decode("utf-8", errors="ignore"), "index": 0, "logprobs": logprobs_or_none, "finish_reason": finish_reason, @@ -1080,14 +1100,14 @@ def _create_completion( } if self.cache: if self.verbose: - print("Llama._create_completion: cache save", file=sys.stderr) + print("Falcon._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() - print("Llama._create_completion: cache saved", file=sys.stderr) + print("Falcon._create_completion: cache saved", file=sys.stderr) return if self.cache: if self.verbose: - print("Llama._create_completion: cache save", file=sys.stderr) + print("Falcon._create_completion: cache save", file=sys.stderr) self.cache[prompt_tokens + completion_tokens] = self.save_state() text_str = text.decode("utf-8", errors="ignore") @@ -1118,10 +1138,10 @@ def _create_completion( for token in all_tokens ] all_logprobs = [ - Llama.logits_to_logprobs(row.tolist()) for row in self._scores - ][token_offset:] + Falcon.logits_to_logprobs(row.tolist()) for row in self._scores + ][token_offset:] for token, token_str, logprobs_token in zip( - all_tokens, all_token_strs, all_logprobs + all_tokens, all_token_strs, all_logprobs ): text_offsets.append(text_offset) text_offset += len(token_str) @@ -1172,27 +1192,27 @@ def _create_completion( } def create_completion( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 128, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 128, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. 
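For orientation, a hypothetical usage sketch of the renamed high-level API (the model path below is illustrative, not shipped with the repo): `create_completion()` and `__call__()` take the same parameters and return an OpenAI-style `Completion` dict, or an iterator of `CompletionChunk` dicts when `stream=True`.

```python
import falcon_cpp

# Illustrative path: any ggllm.cpp-compatible Falcon GGML file should work here.
falcon = falcon_cpp.Falcon(model_path="./models/falcon-7b.q4_0.bin")

# Non-streaming: a single Completion dict.
out = falcon(
    "Q: Name the planets in the solar system. A:",
    max_tokens=64,
    stop=["Q:", "\n"],
)
print(out["choices"][0]["text"])

# Streaming: CompletionChunk dicts are yielded as tokens are generated.
for chunk in falcon("Write one sentence about falcons.", max_tokens=48, stream=True):
    print(chunk["choices"][0]["text"], end="", flush=True)
```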
@@ -1245,27 +1265,27 @@ def create_completion( return completion def __call__( - self, - prompt: str, - suffix: Optional[str] = None, - max_tokens: int = 128, - temperature: float = 0.8, - top_p: float = 0.95, - logprobs: Optional[int] = None, - echo: bool = False, - stop: Optional[Union[str, List[str]]] = [], - frequency_penalty: float = 0.0, - presence_penalty: float = 0.0, - repeat_penalty: float = 1.1, - top_k: int = 40, - stream: bool = False, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - stopping_criteria: Optional[StoppingCriteriaList] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + prompt: str, + suffix: Optional[str] = None, + max_tokens: int = 128, + temperature: float = 0.8, + top_p: float = 0.95, + logprobs: Optional[int] = None, + echo: bool = False, + stop: Optional[Union[str, List[str]]] = [], + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + repeat_penalty: float = 1.1, + top_k: int = 40, + stream: bool = False, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[Completion, Iterator[CompletionChunk]]: """Generate text from a prompt. @@ -1313,7 +1333,7 @@ def __call__( ) def _convert_text_completion_to_chat( - self, completion: Completion + self, completion: Completion ) -> ChatCompletion: return { "id": "chat" + completion["id"], @@ -1334,8 +1354,8 @@ def _convert_text_completion_to_chat( } def _convert_text_completion_chunks_to_chat( - self, - chunks: Iterator[CompletionChunk], + self, + chunks: Iterator[CompletionChunk], ) -> Iterator[ChatCompletionChunk]: for i, chunk in enumerate(chunks): if i == 0: @@ -1371,23 +1391,23 @@ def _convert_text_completion_chunks_to_chat( } def create_chat_completion( - self, - messages: List[ChatCompletionMessage], - temperature: float = 0.2, - top_p: float = 0.95, - top_k: int = 40, - stream: bool = False, - stop: Optional[Union[str, List[str]]] = [], - max_tokens: int = 256, - presence_penalty: float = 0.0, - frequency_penalty: float = 0.0, - repeat_penalty: float = 1.1, - tfs_z: float = 1.0, - mirostat_mode: int = 0, - mirostat_tau: float = 5.0, - mirostat_eta: float = 0.1, - model: Optional[str] = None, - logits_processor: Optional[LogitsProcessorList] = None, + self, + messages: List[ChatCompletionMessage], + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + stream: bool = False, + stop: Optional[Union[str, List[str]]] = [], + max_tokens: int = 256, + presence_penalty: float = 0.0, + frequency_penalty: float = 0.0, + repeat_penalty: float = 1.1, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_tau: float = 5.0, + mirostat_eta: float = 0.1, + model: Optional[str] = None, + logits_processor: Optional[LogitsProcessorList] = None, ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]: """Generate a chat completion from a list of messages. 
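Similarly, a hedged sketch of the chat interface (model path and messages are illustrative only): `create_chat_completion()` wraps the text-completion path (see the `_convert_text_completion_*` helpers above), so it accepts the same sampling knobs and returns an OpenAI-style `ChatCompletion` dict, or a `ChatCompletionChunk` iterator when `stream=True`.

```python
import falcon_cpp

falcon = falcon_cpp.Falcon(model_path="./models/falcon-40b-instruct.q4_K_M.bin")  # illustrative path

response = falcon.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are a concise assistant."},
        {"role": "user", "content": "Explain what repeat_penalty does."},
    ],
    temperature=0.2,
    max_tokens=128,
)
# The converted response mirrors _convert_text_completion_to_chat() above.
print(response["choices"][0]["message"]["content"])
```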
@@ -1440,10 +1460,10 @@ def create_chat_completion( def __del__(self): if self.model is not None: - llama_cpp.llama_free_model(self.model) + falcon_cpp.falcon_free_model(self.model) self.model = None if self.ctx is not None: - llama_cpp.llama_free(self.ctx) + falcon_cpp.falcon_free(self.ctx) self.ctx = None def __getstate__(self): @@ -1492,82 +1512,82 @@ def __setstate__(self, state): verbose=state["verbose"], ) - def save_state(self) -> LlamaState: + def save_state(self) -> FalconState: assert self.ctx is not None if self.verbose: - print("Llama.save_state: saving llama state", file=sys.stderr) - state_size = llama_cpp.llama_get_state_size(self.ctx) + print("Falcon.save_state: saving falcon state", file=sys.stderr) + state_size = falcon_cpp.falcon_get_state_size(self.ctx) if self.verbose: - print(f"Llama.save_state: got state size: {state_size}", file=sys.stderr) - llama_state = (llama_cpp.c_uint8 * int(state_size))() + print(f"Falcon.save_state: got state size: {state_size}", file=sys.stderr) + falcon_state = (falcon_cpp.c_uint8 * int(state_size))() if self.verbose: - print("Llama.save_state: allocated state", file=sys.stderr) - n_bytes = llama_cpp.llama_copy_state_data(self.ctx, llama_state) + print("Falcon.save_state: allocated state", file=sys.stderr) + n_bytes = falcon_cpp.falcon_copy_state_data(self.ctx, falcon_state) if self.verbose: - print(f"Llama.save_state: copied llama state: {n_bytes}", file=sys.stderr) + print(f"Falcon.save_state: copied falcon state: {n_bytes}", file=sys.stderr) if int(n_bytes) > int(state_size): - raise RuntimeError("Failed to copy llama state data") - llama_state_compact = (llama_cpp.c_uint8 * int(n_bytes))() - llama_cpp.ctypes.memmove(llama_state_compact, llama_state, int(n_bytes)) + raise RuntimeError("Failed to copy Falcon state data") + falcon_state_compact = (falcon_cpp.c_uint8 * int(n_bytes))() + falcon_cpp.ctypes.memmove(falcon_state_compact, falcon_state, int(n_bytes)) if self.verbose: print( - f"Llama.save_state: saving {n_bytes} bytes of llama state", + f"Falcon.save_state: saving {n_bytes} bytes of falcon state", file=sys.stderr, ) - return LlamaState( + return FalconState( scores=self.scores.copy(), input_ids=self.input_ids.copy(), n_tokens=self.n_tokens, - llama_state=bytes(llama_state_compact), - llama_state_size=n_bytes, + falcon_state=bytes(falcon_state_compact), + falcon_state_size=n_bytes, ) - def load_state(self, state: LlamaState) -> None: + def load_state(self, state: FalconState) -> None: assert self.ctx is not None self.scores = state.scores.copy() self.input_ids = state.input_ids.copy() self.n_tokens = state.n_tokens - state_size = state.llama_state_size - LLamaStateArrayType = llama_cpp.c_uint8 * state_size - llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state) + state_size = state.falcon_state_size + FalconStateArrayType = falcon_cpp.c_uint8 * state_size + falcon_state = FalconStateArrayType.from_buffer_copy(state.falcon_state) - if llama_cpp.llama_set_state_data(self.ctx, llama_state) != state_size: - raise RuntimeError("Failed to set llama state data") + if falcon_cpp.falcon_set_state_data(self.ctx, falcon_state) != state_size: + raise RuntimeError("Failed to set Falcon state data") def n_ctx(self) -> int: """Return the context window size.""" assert self.ctx is not None - return llama_cpp.llama_n_ctx(self.ctx) + return falcon_cpp.falcon_n_ctx(self.ctx) def n_embd(self) -> int: """Return the embedding size.""" assert self.ctx is not None - return llama_cpp.llama_n_embd(self.ctx) + return 
falcon_cpp.falcon_n_embd(self.ctx) def n_vocab(self) -> int: """Return the vocabulary size.""" assert self.ctx is not None - return llama_cpp.llama_n_vocab(self.ctx) + return falcon_cpp.falcon_n_vocab(self.ctx) - def tokenizer(self) -> "LlamaTokenizer": + def tokenizer(self) -> "FalconTokenizer": """Return the tokenizer for this model.""" assert self.ctx is not None - return LlamaTokenizer(self) + return FalconTokenizer(self) @staticmethod def token_eos() -> int: """Return the end-of-sequence token.""" - return llama_cpp.llama_token_eos() + return falcon_cpp.falcon_token_eos() @staticmethod def token_bos() -> int: """Return the beginning-of-sequence token.""" - return llama_cpp.llama_token_bos() + return falcon_cpp.falcon_token_bos() @staticmethod def token_nl() -> int: """Return the newline token.""" - return llama_cpp.llama_token_nl() + return falcon_cpp.falcon_token_nl() @staticmethod def logits_to_logprobs(logits: List[float]) -> List[float]: @@ -1586,18 +1606,18 @@ def longest_token_prefix(a: Sequence[int], b: Sequence[int]): return longest_prefix -class LlamaTokenizer: - def __init__(self, llama: Llama): - self.llama = llama +class FalconTokenizer: + def __init__(self, falcon: Falcon): + self.falcon = falcon def encode(self, text: str, add_bos: bool = True) -> List[int]: - return self.llama.tokenize( + return self.falcon.tokenize( text.encode("utf-8", errors="ignore"), add_bos=add_bos ) def decode(self, tokens: List[int]) -> str: - return self.llama.detokenize(tokens).decode("utf-8", errors="ignore") + return self.falcon.detokenize(tokens).decode("utf-8", errors="ignore") @classmethod - def from_ggml_file(cls, path: str) -> "LlamaTokenizer": - return cls(Llama(model_path=path, vocab_only=True)) + def from_ggml_file(cls, path: str) -> "FalconTokenizer": + return cls(Falcon(model_path=path, vocab_only=True)) \ No newline at end of file diff --git a/falcon_cpp/falcon_cpp.py b/falcon_cpp/falcon_cpp.py new file mode 100644 index 000000000..121b98c96 --- /dev/null +++ b/falcon_cpp/falcon_cpp.py @@ -0,0 +1,1023 @@ +import sys +import os +import ctypes +from ctypes import ( + c_int, + c_float, + c_char_p, + c_void_p, + c_bool, + POINTER, + _Pointer, # type: ignore + Structure, + Array, + c_uint8, + c_size_t, +) +import pathlib +from typing import List, Union + + +# Load the library +def _load_shared_library(lib_base_name: str): + # Construct the paths to the possible shared library names + _base_path = pathlib.Path(__file__).parent.resolve() + # Searching for the library in the current directory under the name "libFalcon" (default name + # for falconcpp) and "falcon" (default name for this repo) + _lib_paths: List[pathlib.Path] = [] + # Determine the file extension based on the platform + if sys.platform.startswith("linux"): + _lib_paths += [ + _base_path / f"lib{lib_base_name}.so", + ] + elif sys.platform == "darwin": + _lib_paths += [ + _base_path / f"lib{lib_base_name}.so", + _base_path / f"lib{lib_base_name}.dylib", + ] + elif sys.platform == "win32": + _lib_paths += [ + _base_path / f"{lib_base_name}.dll", + ] + else: + raise RuntimeError("Unsupported platform") + + if "FALCON_CPP_LIB" in os.environ: + lib_base_name = os.environ["FALCON_CPP_LIB"] + _lib = pathlib.Path(lib_base_name) + _base_path = _lib.parent.resolve() + _lib_paths = [_lib.resolve()] + + cdll_args = dict() # type: ignore + # Add the library directory to the DLL search path on Windows (if needed) + if sys.platform == "win32" and sys.version_info >= (3, 8): + os.add_dll_directory(str(_base_path)) + if "CUDA_PATH" in 
os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) + cdll_args["winmode"] = 0 + + # Try to load the shared library, handling potential errors + for _lib_path in _lib_paths: + if _lib_path.exists(): + try: + return ctypes.CDLL(str(_lib_path), **cdll_args) + except Exception as e: + raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") + + raise FileNotFoundError( + f"Shared library with base name '{lib_base_name}' not found" + ) + + +# Specify the base name of the shared library to load +_lib_base_name = "falcon" + +# Load the library +_lib = _load_shared_library(_lib_base_name) + +# Misc +c_float_p = POINTER(c_float) +c_uint8_p = POINTER(c_uint8) +c_size_t_p = POINTER(c_size_t) + +# falcon.h bindings + +GGML_USE_CUBLAS = hasattr(_lib, "ggml_init_cublas") +GGML_CUDA_MAX_DEVICES = ctypes.c_int(16) +FALCON_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else ctypes.c_int(1) + +# #define FALCON_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' +FALCON_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74) +# #define FALCON_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' +FALCON_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61) +# #define FALCON_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' +FALCON_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66) +# #define FLACON_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' +FALCON_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C) +# #define FALCON_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' +FALCON_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E) + +# #define FALCON_FILE_VERSION 3 +FALCON_FILE_VERSION = c_int(3) +FALCON_FILE_MAGIC = FALCON_FILE_MAGIC_GGJT +FALCON_FILE_MAGIC_UNVERSIONED = FALCON_FILE_MAGIC_GGML +FALCON_SESSION_MAGIC = FALCON_FILE_MAGIC_GGSN +FALCON_SESSION_VERSION = c_int(1) + +# struct falcon_model; +falcon_model_p = c_void_p + +# struct falcon_context; +falcon_context_p = c_void_p + + +# typedef int falcon_token; +falcon_token = c_int +falcon_token_p = POINTER(falcon_token) + + +# typedef struct falcon_token_data { +# falcon_token id; // token id +# float logit; // log-odds of the token +# float p; // probability of the token +# } falcon_token_data; +class falcon_token_data(Structure): + _fields_ = [ + ("id", falcon_token), + ("logit", c_float), + ("p", c_float), + ] + + +falcon_token_data_p = POINTER(falcon_token_data) + +# typedef struct falcon_token_data_array { +# falcon_token_data * data; +# size_t size; +# bool sorted; +# } falcon_token_data_array; +class falcon_token_data_array(Structure): + _fields_ = [ + ("data", falcon_token_data_p), + ("size", c_size_t), + ("sorted", c_bool), + ] + + +falcon_token_data_array_p = POINTER(falcon_token_data_array) + +# typedef void (*falcon_progress_callback)(float progress, void *ctx); +falcon_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) + + +# struct falcon_context_params { +# int seed; // RNG seed, -1 for random +# int n_ctx; // text context +# int n_batch; // prompt processing batch size +# int n_gpu_layers; // number of layers to store in VRAM +# int main_gpu; // the GPU that is used for scratch and small tensors +# float tensor_split[FALCON_MAX_DEVICES]; // how to split layers across multiple GPUs +# // called with a progress value between 0 and 1, pass NULL to disable +# falcon_progress_callback progress_callback; +# // context pointer passed to the progress callback +# void * progress_callback_user_data; + + +# // Keep the booleans together to avoid misalignment during copy-by-value. 
+# bool low_vram; // if true, reduce VRAM usage at the cost of performance +# bool f16_kv; // use fp16 for KV cache +# bool logits_all; // the falcon_eval() call computes all logits, not just the last one +# bool vocab_only; // only load the vocabulary, no weights +# bool use_mmap; // use mmap if possible +# bool use_mlock; // force system to keep model in RAM +# bool embedding; // embedding mode only +# }; +class ggllm_context_params(Structure): + _fields_ = [ + ("seed", c_int), + ("n_ctx", c_int), + ("n_batch", c_int), + ("n_gpu_layers", c_int), + ("main_gpu", c_int), + ("tensor_split", c_float * FALCON_MAX_DEVICES.value), + ("progress_callback", falcon_progress_callback), + ("progress_callback_user_data", c_void_p), + ("low_vram", c_bool), + ("f16_kv", c_bool), + ("logits_all", c_bool), + ("vocab_only", c_bool), + ("use_mmap", c_bool), + ("use_mlock", c_bool), + ("embedding", c_bool), + ] + + +falcon_context_params_p = POINTER(ggllm_context_params) + +# enum falcon_ftype { +# FALCON_FTYPE_ALL_F32 = 0, +# FALCON_FTYPE_MOSTLY_F16 = 1, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 +# // FALCON_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed +# // FALCON_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed +# FALCON_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors +# FALCON_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors +# FALCON_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors +# }; +FALCON_FTYPE_ALL_F32 = c_int(0) +FALCON_FTYPE_MOSTLY_F16 = c_int(1) +FALCON_FTYPE_MOSTLY_Q4_0 = c_int(2) +FALCON_FTYPE_MOSTLY_Q4_1 = c_int(3) +FALCON_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4) +FALCON_FTYPE_MOSTLY_Q8_0 = c_int(7) +FALCON_FTYPE_MOSTLY_Q5_0 = c_int(8) +FALCON_FTYPE_MOSTLY_Q5_1 = c_int(9) +FALCON_FTYPE_MOSTLY_Q2_K = c_int(10) +FALCON_FTYPE_MOSTLY_Q3_K_S = c_int(11) +FALCON_FTYPE_MOSTLY_Q3_K_M = c_int(12) +FALCON_FTYPE_MOSTLY_Q3_K_L = c_int(13) +FALCON_FTYPE_MOSTLY_Q4_K_S = c_int(14) +FALCON_FTYPE_MOSTLY_Q4_K_M = c_int(15) +FALCON_FTYPE_MOSTLY_Q5_K_S = c_int(16) +FALCON_FTYPE_MOSTLY_Q5_K_M = c_int(17) +FALCON_FTYPE_MOSTLY_Q6_K = c_int(18) + + +# // model quantization parameters +# typedef struct falcon_model_quantize_params { +# int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() +# enum falcon_ftype ftype; // quantize to this falcon_ftype +# bool allow_requantize; // allow quantizing non-f32/f16 tensors +# bool quantize_output_tensor; // quantize output.weight +# } falcon_model_quantize_params; +class falcon_model_quantize_params(Structure): + _fields_ = [ + ("nthread", c_int), + ("ftype", c_int), + ("allow_requantize", c_bool), + ("quantize_output_tensor", c_bool), + ] + + +# FALCON_API struct falcon_context_params falcon_context_default_params(); +def falcon_context_default_params() -> ggllm_context_params: + return _lib.ggllm_context_default_params() + + 
+_lib.ggllm_context_default_params.argtypes = [] +_lib.ggllm_context_default_params.restype = ggllm_context_params + + +# FALCON_API struct falcon_model_quantize_params falcon_model_quantize_default_params(); +def falcon_model_quantize_default_params() -> falcon_model_quantize_params: + return _lib.ggllm_model_quantize_default_params() + + +_lib.ggllm_model_quantize_default_params.argtypes = [] +_lib.ggllm_model_quantize_default_params.restype = falcon_model_quantize_params + + +# FALCON_API bool falcon_mmap_supported(); +def falcon_mmap_supported() -> bool: + return _lib.ggllm_mmap_supported() + + +_lib.ggllm_mmap_supported.argtypes = [] +_lib.ggllm_mmap_supported.restype = c_bool + + +# FALCON_API bool falcon_mlock_supported(); +def falcon_mlock_supported() -> bool: + return _lib.ggllm_mlock_supported() + + +_lib.ggllm_mlock_supported.argtypes = [] +_lib.ggllm_mlock_supported.restype = c_bool + + +# // TODO: not great API - very likely to change +# // Initialize the falcon + ggml backend +# // If numa is true, use NUMA optimizations +# // Call once at the start of the program +# FLACON_API void falcon_init_backend(bool numa); +def falcon_init_backend(numa: c_bool): + return _lib.ggllm_init_backend(numa) + + +_lib.ggllm_init_backend.argtypes = [c_bool] +_lib.ggllm_init_backend.restype = None + + +# FALCON_API struct falcon_model * falcon_load_model_from_file( +# const char * path_model, +# struct falcon_context_params params); +def falcon_load_model_from_file( + path_model: bytes, params: ggllm_context_params +) -> falcon_model_p: + return _lib.ggllm_load_model_from_file(path_model, params) + + +_lib.ggllm_load_model_from_file.argtypes = [c_char_p, ggllm_context_params] +_lib.ggllm_load_model_from_file.restype = falcon_model_p + + +# FALCON_API void falcon_free_model(struct falcon_model * model); +def falcon_free_model(model: falcon_model_p): + return _lib.ggllm_free_model(model) + + +_lib.ggllm_free_model.argtypes = [falcon_model_p] +_lib.ggllm_free_model.restype = None + + +# FALCON_API struct falcon_context * falcon_new_context_with_model( +# struct falcon_model * model, +# struct falcon_context_params params); +def falcon_new_context_with_model( + model: falcon_model_p, params: ggllm_context_params +) -> falcon_context_p: + return _lib.ggllm_new_context_with_model(model, params) + + +_lib.ggllm_new_context_with_model.argtypes = [falcon_model_p, ggllm_context_params] +_lib.ggllm_new_context_with_model.restype = falcon_context_p + + +# FALCON_API int64_t ggllm_time_us(); +def ggllm_time_us() -> int: + return _lib.ggllm_time_us() + + +_lib.ggllm_time_us.argtypes = [] +_lib.ggllm_time_us.restype = ctypes.c_int64 + + +# // Various functions for loading a ggml falcon model. +# // Allocate (almost) all memory needed for the model. 
+# // Return NULL on failure +# FALCON_API struct falcon_context * falcon_init_from_file( +# const char * path_model, +# struct falcon_context_params params); +def ggllm_init_from_file( + path_model: bytes, params: ggllm_context_params +) -> falcon_context_p: + return _lib.ggllm_init_from_file(path_model, params) + + +_lib.ggllm_init_from_file.argtypes = [c_char_p, ggllm_context_params] +_lib.ggllm_init_from_file.restype = falcon_context_p + + +# Frees all allocated memory +# FALCON_API void falcon_free(struct falcon_context * ctx); +def falcon_free(ctx: falcon_context_p): + return _lib.ggllm_free(ctx) + + +_lib.ggllm_free.argtypes = [falcon_context_p] +_lib.ggllm_free.restype = None + + +# // Returns 0 on success +# FALCON_API int ggllm_model_quantize( +# const char * fname_inp, +# const char * fname_out, +# const falcon_model_quantize_params * params); +def ggllm_model_quantize( + fname_inp: bytes, + fname_out: bytes, + params, # type: POINTER(falcon_model_quantize_params) # type: ignore +) -> int: + return _lib.ggllm_model_quantize(fname_inp, fname_out, params) + + +_lib.ggllm_model_quantize.argtypes = [ + c_char_p, + c_char_p, + POINTER(falcon_model_quantize_params), +] +_lib.ggllm_model_quantize.restype = c_int + + +# Apply a LoRA adapter to a loaded model +# path_base_model is the path to a higher quality model to use as a base for +# the layers modified by the adapter. Can be NULL to use the current loaded model. +# The model needs to be reloaded before applying a new adapter, otherwise the adapter +# will be applied on top of the previous one +# Returns 0 on success +# FALCON_API int falcon_apply_lora_from_file( +# struct falcon_context * ctx, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); +def ggllm_apply_lora_from_file( + ctx: falcon_context_p, + path_lora: c_char_p, + path_base_model: c_char_p, + n_threads: c_int, +) -> int: + return _lib.ggllm_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) + + +_lib.ggllm_apply_lora_from_file.argtypes = [falcon_context_p, c_char_p, c_char_p, c_int] +_lib.ggllm_apply_lora_from_file.restype = c_int + + +# FALCON_API int ggllm_model_apply_lora_from_file( +# const struct ggllm_model * model, +# const char * path_lora, +# const char * path_base_model, +# int n_threads); +def falcon_model_apply_lora_from_file( + model: falcon_model_p, + path_lora: Union[c_char_p, bytes], + path_base_model: Union[c_char_p, bytes], + n_threads: c_int, +) -> int: + return _lib.ggllm_model_apply_lora_from_file( + model, path_lora, path_base_model, n_threads + ) + + +_lib.ggllm_model_apply_lora_from_file.argtypes = [ + falcon_model_p, + c_char_p, + c_char_p, + c_int, +] +_lib.ggllm_model_apply_lora_from_file.restype = c_int + + +# Returns the number of tokens in the KV cache +# FALCON_API int falcon_get_kv_cache_token_count(const struct falcon_context * ctx); +def ggllm_get_kv_cache_token_count(ctx: falcon_context_p) -> int: + return _lib.ggllm_get_kv_cache_token_count(ctx) + + +_lib.ggllm_get_kv_cache_token_count.argtypes = [falcon_context_p] +_lib.ggllm_get_kv_cache_token_count.restype = c_int + + +# Sets the current rng seed. 
+# FALCON_API void falcon_set_rng_seed(struct falcon_context * ctx, int seed); +def falcon_set_rng_seed(ctx: falcon_context_p, seed: c_int): + return _lib.ggllm_set_rng_seed(ctx, seed) + + +_lib.ggllm_set_rng_seed.argtypes = [falcon_context_p, c_int] +_lib.ggllm_set_rng_seed.restype = None + + +# Returns the maximum size in bytes of the state (rng, logits, embedding +# and kv_cache) - will often be smaller after compacting tokens +# FALCON_API size_t falcon_get_state_size(const struct falcon_context * ctx); +def falcon_get_state_size(ctx: falcon_context_p) -> int: + return _lib.ggllm_get_state_size(ctx) + + +_lib.ggllm_get_state_size.argtypes = [falcon_context_p] +_lib.ggllm_get_state_size.restype = c_size_t + + +# Copies the state to the specified destination address. +# Destination needs to have allocated enough memory. +# Returns the number of bytes copied +# FALCON_API size_t falcon_copy_state_data(struct falcon_context * ctx, uint8_t * dst); +def falcon_copy_state_data( + ctx: falcon_context_p, dst # type: Array[c_uint8] +) -> int: + return _lib.ggllm_copy_state_data(ctx, dst) + + +_lib.ggllm_copy_state_data.argtypes = [falcon_context_p, c_uint8_p] +_lib.ggllm_copy_state_data.restype = c_size_t + + +# Set the state reading from the specified address +# Returns the number of bytes read +# FALCON_API size_t falcon_set_state_data(struct falcon_context * ctx, uint8_t * src); +def falcon_set_state_data( + ctx: falcon_context_p, src # type: Array[c_uint8] +) -> int: + return _lib.ggllm_set_state_data(ctx, src) + + +_lib.ggllm_set_state_data.argtypes = [falcon_context_p, c_uint8_p] +_lib.ggllm_set_state_data.restype = c_size_t + + +# Save/load session file +# GGLLM_API bool falcon_load_session_file(struct falcon_context * ctx, const char * path_session, falcon_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); +def ggllm_load_session_file( + ctx: falcon_context_p, + path_session: bytes, + tokens_out, # type: Array[falcon_token] + n_token_capacity: c_size_t, + n_token_count_out, # type: _Pointer[c_size_t] +) -> int: + return _lib.ggllm_load_session_file( + ctx, path_session, tokens_out, n_token_capacity, n_token_count_out + ) + + +_lib.ggllm_load_session_file.argtypes = [ + falcon_context_p, + c_char_p, + falcon_token_p, + c_size_t, + c_size_t_p, +] +_lib.ggllm_load_session_file.restype = c_size_t + + +# FALCON_API bool falcon_save_session_file(struct falcon_context * ctx, const char * path_session, const falcon_token * tokens, size_t n_token_count); +def ggllm_save_session_file( + ctx: falcon_context_p, + path_session: bytes, + tokens, # type: Array[falcon_token] + n_token_count: c_size_t, +) -> int: + return _lib.ggllm_save_session_file(ctx, path_session, tokens, n_token_count) + + +_lib.ggllm_save_session_file.argtypes = [ + falcon_context_p, + c_char_p, + falcon_token_p, + c_size_t, +] +_lib.ggllm_save_session_file.restype = c_size_t + + +# Run the falcon inference to obtain the logits and probabilities for the next token. 
+# tokens + n_tokens is the provided batch of new tokens to process +# n_past is the number of tokens to use from previous eval calls +# Returns 0 on success +# GGLLM_API int falcon_eval( +# struct falcon_context * ctx, +# const falcon_token * tokens, +# int n_tokens, +# int n_past, +# int n_threads); +def falcon_eval( + ctx: falcon_context_p, + tokens, # type: Array[falcon_token] + n_tokens: c_int, + n_past: c_int, + n_threads: c_int, +) -> int: + return _lib.ggllm_eval(ctx, tokens, n_tokens, n_past, n_threads) + + +_lib.ggllm_eval.argtypes = [falcon_context_p, falcon_token_p, c_int, c_int, c_int] +_lib.ggllm_eval.restype = c_int + + +# // Same as falcon_eval, but use float matrix input directly. +# FALCON_API int falcon_eval_embd( +# struct falcon_context * ctx, +# const float * embd, +# int n_tokens, +# int n_past, +# int n_threads); +def ggllm_eval_embd( + ctx: falcon_context_p, + embd, # type: Array[c_float] + n_tokens: c_int, + n_past: c_int, + n_threads: c_int, +) -> int: + return _lib.ggllm_eval_embd(ctx, embd, n_tokens, n_past, n_threads) + + +_lib.ggllm_eval_embd.argtypes = [falcon_context_p, c_float_p, c_int, c_int, c_int] +_lib.ggllm_eval_embd.restype = c_int + + +# Convert the provided text into tokens. +# The tokens pointer must be large enough to hold the resulting tokens. +# Returns the number of tokens on success, no more than n_max_tokens +# Returns a negative number on failure - the number of tokens that would have been returned +# TODO: not sure if correct +# FALCON_API int ggllm_tokenize( +# struct falcon_context * ctx, +# const char * text, +# falcon_token * tokens, +# int n_max_tokens, +# bool add_bos); +def falcon_tokenize( + ctx: falcon_context_p, + text: bytes, + tokens, # type: Array[falcon_token] + n_max_tokens: c_int, + add_bos: c_bool, +) -> int: + return _lib.ggllm_tokenize(ctx, text, tokens, n_max_tokens, add_bos) + + +_lib.ggllm_tokenize.argtypes = [falcon_context_p, c_char_p, falcon_token_p, c_int, c_bool] +_lib.ggllm_tokenize.restype = c_int + + +# GGLLM_API int ggllm_n_vocab(const struct falcon_context * ctx); +def falcon_n_vocab(ctx: falcon_context_p) -> int: + return _lib.ggllm_n_vocab(ctx) + + +_lib.ggllm_n_vocab.argtypes = [falcon_context_p] +_lib.ggllm_n_vocab.restype = c_int + + +# FALCON_API int falcon_n_ctx (const struct falcon_context * ctx); +def falcon_n_ctx(ctx: falcon_context_p) -> int: + return _lib.ggllm_n_ctx(ctx) + + +_lib.ggllm_n_ctx.argtypes = [falcon_context_p] +_lib.ggllm_n_ctx.restype = c_int + + +# FALCON_API int falcon_n_embd (const struct falcon_context * ctx); +def falcon_n_embd(ctx: falcon_context_p) -> int: + return _lib.ggllm_n_embd(ctx) + + +_lib.ggllm_n_embd.argtypes = [falcon_context_p] +_lib.ggllm_n_embd.restype = c_int + + +# // Get the vocabulary as output parameters. +# // Returns number of results. 
+# FALCON_API int falcon_get_vocab( +# const struct falcon_context * ctx, +# const char * * strings, +# float * scores, +# int capacity); +def falcon_get_vocab( + ctx: falcon_context_p, + strings, # type: Array[c_char_p] # type: ignore + scores, # type: Array[c_float] # type: ignore + capacity: c_int, +) -> int: + return _lib.ggllm_get_vocab(ctx, strings, scores, capacity) + + +_lib.ggllm_get_vocab.argtypes = [falcon_context_p, c_char_p, c_float, c_int] +_lib.ggllm_get_vocab.restype = c_int + + +# Token logits obtained from the last call to falcon_eval() +# The logits for the last token are stored in the last row +# Can be mutated in order to change the probabilities of the next token +# Rows: n_tokens +# Cols: n_vocab +# FALCON_API float * falcon_get_logits(struct falcon_context * ctx); +def falcon_get_logits( + ctx: falcon_context_p, +): # type: (...) -> Array[float] # type: ignore + return _lib.ggllm_get_logits(ctx) + + +_lib.ggllm_get_logits.argtypes = [falcon_context_p] +_lib.ggllm_get_logits.restype = c_float_p + + +# Get the embeddings for the input +# shape: [n_embd] (1-dimensional) +# FALCON_API float * falcon_get_embeddings(struct falcon_context * ctx); +def falcon_get_embeddings( + ctx: falcon_context_p, +): # type: (...) -> Array[float] # type: ignore + return _lib.ggllm_get_embeddings(ctx) + + +_lib.ggllm_get_embeddings.argtypes = [falcon_context_p] +_lib.ggllm_get_embeddings.restype = c_float_p + + +# Token Id -> String. Uses the vocabulary in the provided context +# FLACON_API const char * falcon_token_to_str(const struct falcon_context * ctx, falcon_token token); +def falcon_token_to_str(ctx: falcon_context_p, token: falcon_token) -> bytes: + return _lib.ggllm_token_to_str(ctx, token) + + +_lib.ggllm_token_to_str.argtypes = [falcon_context_p, falcon_token] +_lib.ggllm_token_to_str.restype = c_char_p + +# Special tokens + + +# FALCON_API falcon_token falcon_token_bos(); // beginning-of-sentence +def falcon_token_bos() -> int: + return _lib.ggllm_token_bos() + + +_lib.ggllm_token_bos.argtypes = [] +_lib.ggllm_token_bos.restype = falcon_token + + +# FALCON_API falcon_token falcon_token_eos(); // end-of-sentence +def falcon_token_eos() -> int: + return _lib.ggllm_token_eos() + + +_lib.ggllm_token_eos.argtypes = [] +_lib.ggllm_token_eos.restype = falcon_token + + +# FALCON_API falcon_token falcon_token_nl(); // next-line +def falcon_token_nl() -> int: + return _lib.ggllm_token_nl() + + +_lib.ggllm_token_nl.argtypes = [] +_lib.ggllm_token_nl.restype = falcon_token + + +# Sampling functions + + +# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. +# FALCON_API void falcon_sample_repetition_penalty(struct falcon_context * ctx, falcon_token_data_array * candidates, const falcon_token * last_tokens, size_t last_tokens_size, float penalty); +def falcon_sample_repetition_penalty( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + last_tokens_data, # type: Array[falcon_token] + last_tokens_size: c_int, + penalty: c_float, +): + return _lib.ggllm_sample_repetition_penalty( + ctx, candidates, last_tokens_data, last_tokens_size, penalty + ) + + +_lib.ggllm_sample_repetition_penalty.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + falcon_token_p, + c_int, + c_float, +] +_lib.ggllm_sample_repetition_penalty.restype = None + + +# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. 
+# FALCON_API void falcon_sample_frequency_and_presence_penalties(struct falcon_context * ctx, falcon_token_data_array * candidates, const falcon_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); +def falcon_sample_frequency_and_presence_penalties( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + last_tokens_data, # type: Array[falcon_token] + last_tokens_size: c_int, + alpha_frequency: c_float, + alpha_presence: c_float, +): + return _lib.ggllm_sample_frequency_and_presence_penalties( + ctx, + candidates, + last_tokens_data, + last_tokens_size, + alpha_frequency, + alpha_presence, + ) + + +_lib.ggllm_sample_frequency_and_presence_penalties.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + falcon_token_p, + c_int, + c_float, + c_float, +] +_lib.ggllm_sample_frequency_and_presence_penalties.restype = None + + +# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. +# FALCON_API void falcon_sample_softmax(struct falcon_context * ctx, falcon_token_data_array * candidates); +def falcon_sample_softmax( + ctx: falcon_context_p, candidates # type: _Pointer[falcon_token_data] +): + return _lib.ggllm_sample_softmax(ctx, candidates) + + +_lib.ggllm_sample_softmax.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, +] +_lib.ggllm_sample_softmax.restype = None + + +# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# FALCON_API void falcon_sample_top_k(struct falcon_context * ctx, falcon_token_data_array * candidates, int k, size_t min_keep); +def falcon_sample_top_k( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + k: c_int, + min_keep: c_size_t, +): + return _lib.ggllm_sample_top_k(ctx, candidates, k, min_keep) + + +_lib.ggllm_sample_top_k.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_int, + c_size_t, +] +_lib.ggllm_sample_top_k.restype = None + + +# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# FALCON_API void falcon_sample_top_p(struct falcon_context * ctx, falcon_token_data_array * candidates, float p, size_t min_keep); +def falcon_sample_top_p( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + p: c_float, + min_keep: c_size_t, +): + return _lib.ggllm_sample_top_p(ctx, candidates, p, min_keep) + + +_lib.ggllm_sample_top_p.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_size_t, +] +_lib.ggllm_sample_top_p.restype = None + + +# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. +# FALCON_API void falcon_sample_tail_free(struct falcon_context * ctx, falcon_token_data_array * candidates, float z, size_t min_keep); +def falcon_sample_tail_free( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + z: c_float, + min_keep: c_size_t, +): + return _lib.ggllm_sample_tail_free(ctx, candidates, z, min_keep) + + +_lib.ggllm_sample_tail_free.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_size_t, +] +_lib.ggllm_sample_tail_free.restype = None + + +# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. 
+# FALCON_API void falcon_sample_typical(struct falcon_context * ctx, falcon_token_data_array * candidates, float p, size_t min_keep); +def falcon_sample_typical( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + p: c_float, + min_keep: c_size_t, +): + return _lib.ggllm_sample_typical(ctx, candidates, p, min_keep) + + +_lib.ggllm_sample_typical.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_size_t, +] +_lib.ggllm_sample_typical.restype = None + + +# FALCON_API void falcon_sample_temperature(struct falcon_context * ctx, falcon_token_data_array * candidates, float temp); +def falcon_sample_temperature( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + temp: c_float, +): + return _lib.ggllm_sample_temperature(ctx, candidates, temp) + + +_lib.ggllm_sample_temperature.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, +] +_lib.ggllm_sample_temperature.restype = None + + +# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `falcon_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. +# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# FALCON_API falcon_token falcon_sample_token_mirostat(struct falcon_context * ctx, falcon_token_data_array * candidates, float tau, float eta, int m, float * mu); +def falcon_sample_token_mirostat( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + tau: c_float, + eta: c_float, + m: c_int, + mu, # type: _Pointer[c_float] +) -> int: + return _lib.ggllm_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) + + +_lib.ggllm_sample_token_mirostat.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_float, + c_int, + c_float_p, +] +_lib.ggllm_sample_token_mirostat.restype = falcon_token + + +# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +# @param candidates A vector of `falcon_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. +# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. 
+# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. +# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. +# FALCON_API falcon_token falcon_sample_token_mirostat_v2(struct falcon_context * ctx, falcon_token_data_array * candidates, float tau, float eta, float * mu); +def falcon_sample_token_mirostat_v2( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] + tau: c_float, + eta: c_float, + mu, # type: _Pointer[c_float] +) -> int: + return _lib.ggllm_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) + + +_lib.ggllm_sample_token_mirostat_v2.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, + c_float, + c_float, + c_float_p, +] +_lib.ggllm_sample_token_mirostat_v2.restype = falcon_token + + +# @details Selects the token with the highest probability. +# FALCON_API falcon_token falcon_sample_token_greedy(struct falcon_context * ctx, falcon_token_data_array * candidates); +def falcon_sample_token_greedy( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] +) -> int: + return _lib.ggllm_sample_token_greedy(ctx, candidates) + + +_lib.ggllm_sample_token_greedy.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, +] +_lib.ggllm_sample_token_greedy.restype = falcon_token + + +# @details Randomly selects a token from the candidates based on their probabilities. +# FALCON_API falcon_token falcon_sample_token(struct falcon_context * ctx, falcon_token_data_array * candidates); +def falcon_sample_token( + ctx: falcon_context_p, + candidates, # type: _Pointer[falcon_token_data_array] +) -> int: + return _lib.ggllm_sample_token(ctx, candidates) + + +_lib.ggllm_sample_token.argtypes = [ + falcon_context_p, + falcon_token_data_array_p, +] +_lib.ggllm_sample_token.restype = falcon_token + + +# Performance information + + +# FALCON_API void falcon_print_timings(struct falcon_context * ctx); +def falcon_print_timings(ctx: falcon_context_p): + _lib.ggllm_print_timings(ctx) + + +_lib.ggllm_print_timings.argtypes = [falcon_context_p] +_lib.ggllm_print_timings.restype = None + + +# FALCON_API void falcon_reset_timings(struct falcon_context * ctx); +def falcon_reset_timings(ctx: falcon_context_p): + _lib.ggllm_reset_timings(ctx) + + +_lib.ggllm_reset_timings.argtypes = [falcon_context_p] +_lib.ggllm_reset_timings.restype = None + + +# Print system information +# FALCON_API const char * falcon_print_system_info(void); +def falcon_print_system_info() -> bytes: + return _lib.ggllm_print_system_info() + + +_lib.ggllm_print_system_info.argtypes = [] +_lib.ggllm_print_system_info.restype = c_char_p + +################################################################################################### + + +_falcon_initialized = False + +if not _falcon_initialized: + falcon_init_backend(c_bool(False)) + _falcon_initialized = True \ No newline at end of file diff --git a/llama_cpp/llama_types.py b/falcon_cpp/falcon_types.py similarity index 100% rename from llama_cpp/llama_types.py rename to falcon_cpp/falcon_types.py diff --git a/llama_cpp/server/__init__.py b/falcon_cpp/server/__init__.py similarity index 100% rename from llama_cpp/server/__init__.py rename to falcon_cpp/server/__init__.py 
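Before the server changes below, a hypothetical sketch of driving the new low-level bindings module directly (the model path is an assumption): each Python-facing `falcon_*` wrapper in `falcon_cpp/falcon_cpp.py` dispatches to a `ggllm_*` symbol exported by the ggllm.cpp shared library.

```python
import falcon_cpp.falcon_cpp as C  # the raw ctypes bindings added above

params = C.falcon_context_default_params()
params.n_ctx = 2048

# Illustrative path; any ggllm.cpp-compatible Falcon GGML file should work.
model = C.falcon_load_model_from_file(b"./models/falcon-7b.q4_0.bin", params)
ctx = C.falcon_new_context_with_model(model, params)

# Tokenize with the raw API: the caller pre-allocates the token buffer,
# and a negative return value means the buffer was too small.
buf = (C.falcon_token * params.n_ctx)()
n = C.falcon_tokenize(ctx, b" Hello, falcon", buf, C.c_int(params.n_ctx), C.c_bool(True))
print("token ids:", [buf[i] for i in range(n)])

C.falcon_free(ctx)
C.falcon_free_model(model)
```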
diff --git a/llama_cpp/server/__main__.py b/falcon_cpp/server/__main__.py similarity index 100% rename from llama_cpp/server/__main__.py rename to falcon_cpp/server/__main__.py diff --git a/llama_cpp/server/app.py b/falcon_cpp/server/app.py similarity index 88% rename from llama_cpp/server/app.py rename to falcon_cpp/server/app.py index ef319c7e0..2e0972ea6 100644 --- a/llama_cpp/server/app.py +++ b/falcon_cpp/server/app.py @@ -5,7 +5,7 @@ from typing import Iterator, List, Optional, Union, Dict from typing_extensions import TypedDict, Literal -import llama_cpp +import falcon_cpp import anyio from anyio.streams.memory import MemoryObjectSendStream @@ -24,7 +24,7 @@ class Settings(BaseSettings): default=None, description="The alias of the model to use for generating completions.", ) - n_ctx: int = Field(default=2048, ge=1, description="The context size.") + n_ctx: int = Field(default=8192, ge=1, description="The context size.") n_gpu_layers: int = Field( default=0, ge=0, @@ -43,11 +43,11 @@ class Settings(BaseSettings): ) f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.") use_mlock: bool = Field( - default=llama_cpp.llama_mlock_supported(), + default=falcon_cpp.falcon_mlock_supported(), description="Use mlock.", ) use_mmap: bool = Field( - default=llama_cpp.llama_mmap_supported(), + default=falcon_cpp.falcon_mmap_supported(), description="Use mmap.", ) embedding: bool = Field(default=True, description="Whether to use embeddings.") @@ -90,14 +90,14 @@ class Settings(BaseSettings): router = APIRouter() settings: Optional[Settings] = None -llama: Optional[llama_cpp.Llama] = None +falcon: Optional[falcon_cpp.Falcon] = None def create_app(settings: Optional[Settings] = None): if settings is None: settings = Settings() app = FastAPI( - title="🦙 llama.cpp Python API", + title="🦙 falcon.cpp Python API", version="0.0.1", ) app.add_middleware( @@ -108,8 +108,8 @@ def create_app(settings: Optional[Settings] = None): allow_headers=["*"], ) app.include_router(router) - global llama - llama = llama_cpp.Llama( + global falcon + falcon = falcon_cpp.Falcon( model_path=settings.model, n_gpu_layers=settings.n_gpu_layers, seed=settings.seed, @@ -129,14 +129,14 @@ def create_app(settings: Optional[Settings] = None): if settings.cache_type == "disk": if settings.verbose: print(f"Using disk cache with size {settings.cache_size}") - cache = llama_cpp.LlamaDiskCache(capacity_bytes=settings.cache_size) + cache = falcon_cpp.FalconDiskCache(capacity_bytes=settings.cache_size) else: if settings.verbose: print(f"Using ram cache with size {settings.cache_size}") - cache = llama_cpp.LlamaRAMCache(capacity_bytes=settings.cache_size) + cache = falcon_cpp.FalconRAMCache(capacity_bytes=settings.cache_size) - cache = llama_cpp.LlamaCache(capacity_bytes=settings.cache_size) - llama.set_cache(cache) + cache = falcon_cpp.FalconCache(capacity_bytes=settings.cache_size) + falcon.set_cache(cache) def set_settings(_settings: Settings): global settings @@ -146,12 +146,12 @@ def set_settings(_settings: Settings): return app -llama_lock = Lock() +falcon_lock = Lock() -def get_llama(): - with llama_lock: - yield llama +def get_falcon(): + with falcon_lock: + yield falcon def get_settings(): @@ -276,7 +276,7 @@ class CreateCompletionRequest(BaseModel): best_of: Optional[int] = 1 user: Optional[str] = Field(None) - # llama.cpp specific parameters + # falcon.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] =
Field(None) @@ -290,11 +290,11 @@ class Config: } -CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) +CreateCompletionResponse = create_model_from_typeddict(falcon_cpp.Completion) def make_logit_bias_processor( - llama: llama_cpp.Llama, + falcon: falcon_cpp.Falcon, logit_bias: Dict[str, float], logit_bias_type: Optional[Literal["input_ids", "tokens"]], ): @@ -310,7 +310,7 @@ def make_logit_bias_processor( elif logit_bias_type == "tokens": for token, score in logit_bias.items(): token = token.encode('utf-8') - for input_id in llama.tokenize(token, add_bos=False): + for input_id in falcon.tokenize(token, add_bos=False): to_bias[input_id] = score def logit_bias_processor( @@ -333,7 +333,7 @@ def logit_bias_processor( async def create_completion( request: Request, body: CreateCompletionRequest, - llama: llama_cpp.Llama = Depends(get_llama), + falcon: falcon_cpp.Falcon = Depends(get_falcon), ): if isinstance(body.prompt, list): assert len(body.prompt) <= 1 @@ -349,8 +349,8 @@ async def create_completion( kwargs = body.dict(exclude=exclude) if body.logit_bias is not None: - kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ - make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type), + kwargs['logits_processor'] = falcon_cpp.LogitsProcessorList([ + make_logit_bias_processor(falcon, body.logit_bias, body.logit_bias_type), ]) if body.stream: @@ -359,7 +359,7 @@ async def create_completion( async def event_publisher(inner_send_chan: MemoryObjectSendStream): async with inner_send_chan: try: - iterator: Iterator[llama_cpp.CompletionChunk] = await run_in_threadpool(llama, **kwargs) # type: ignore + iterator: Iterator[falcon_cpp.CompletionChunk] = await run_in_threadpool(falcon, **kwargs) # type: ignore async for chunk in iterate_in_threadpool(iterator): await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): @@ -378,7 +378,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): recv_chan, data_sender_callable=partial(event_publisher, send_chan) ) else: - completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs) # type: ignore + completion: falcon_cpp.Completion = await run_in_threadpool(falcon, **kwargs) # type: ignore return completion @@ -395,7 +395,7 @@ class Config: } -CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) +CreateEmbeddingResponse = create_model_from_typeddict(falcon_cpp.Embedding) @router.post( @@ -403,10 +403,10 @@ class Config: response_model=CreateEmbeddingResponse, ) async def create_embedding( - request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) + request: CreateEmbeddingRequest, falcon: falcon_cpp.Falcon = Depends(get_falcon) ): return await run_in_threadpool( - llama.create_embedding, **request.dict(exclude={"user"}) + falcon.create_embedding, **request.dict(exclude={"user"}) ) @@ -438,7 +438,7 @@ class CreateChatCompletionRequest(BaseModel): n: Optional[int] = 1 user: Optional[str] = Field(None) - # llama.cpp specific parameters + # falcon.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) @@ -458,7 +458,7 @@ class Config: } -CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) +CreateChatCompletionResponse = create_model_from_typeddict(falcon_cpp.ChatCompletion) @router.post( @@ -468,8 +468,8 @@ class Config: async def create_chat_completion( request: Request, 
body: CreateChatCompletionRequest, - llama: llama_cpp.Llama = Depends(get_llama), -) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: + falcon: falcon_cpp.Falcon = Depends(get_falcon), +) -> Union[falcon_cpp.ChatCompletion, EventSourceResponse]: exclude = { "n", "logit_bias", @@ -479,8 +479,8 @@ async def create_chat_completion( kwargs = body.dict(exclude=exclude) if body.logit_bias is not None: - kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ - make_logit_bias_processor(llama, body.logit_bias, body.logit_bias_type), + kwargs['logits_processor'] = falcon_cpp.LogitsProcessorList([ + make_logit_bias_processor(falcon, body.logit_bias, body.logit_bias_type), ]) if body.stream: @@ -489,7 +489,7 @@ async def create_chat_completion( async def event_publisher(inner_send_chan: MemoryObjectSendStream): async with inner_send_chan: try: - iterator: Iterator[llama_cpp.ChatCompletionChunk] = await run_in_threadpool(llama.create_chat_completion, **kwargs) # type: ignore + iterator: Iterator[falcon_cpp.ChatCompletionChunk] = await run_in_threadpool(falcon.create_chat_completion, **kwargs) # type: ignore async for chat_chunk in iterate_in_threadpool(iterator): await inner_send_chan.send(dict(data=json.dumps(chat_chunk))) if await request.is_disconnected(): @@ -509,8 +509,8 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): data_sender_callable=partial(event_publisher, send_chan), ) else: - completion: llama_cpp.ChatCompletion = await run_in_threadpool( - llama.create_chat_completion, **kwargs # type: ignore + completion: falcon_cpp.ChatCompletion = await run_in_threadpool( + falcon.create_chat_completion, **kwargs # type: ignore ) return completion @@ -533,7 +533,7 @@ class ModelList(TypedDict): @router.get("/v1/models", response_model=GetModelResponse) async def get_models( settings: Settings = Depends(get_settings), - llama: llama_cpp.Llama = Depends(get_llama), + falcon: falcon_cpp.Falcon = Depends(get_falcon), ) -> ModelList: return { "object": "list", @@ -541,7 +541,7 @@ async def get_models( { "id": settings.model_alias if settings.model_alias is not None - else llama.model_path, + else falcon.model_path, "object": "model", "owned_by": "me", "permissions": [], diff --git a/llama_cpp/__init__.py b/llama_cpp/__init__.py deleted file mode 100644 index dce1764f6..000000000 --- a/llama_cpp/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .llama_cpp import * -from .llama import * diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py deleted file mode 100644 index 52fc14e1d..000000000 --- a/llama_cpp/llama_cpp.py +++ /dev/null @@ -1,1024 +0,0 @@ -import sys -import os -import ctypes -from ctypes import ( - c_int, - c_float, - c_char_p, - c_void_p, - c_bool, - POINTER, - _Pointer, # type: ignore - Structure, - Array, - c_uint8, - c_size_t, -) -import pathlib -from typing import List, Union - - -# Load the library -def _load_shared_library(lib_base_name: str): - # Construct the paths to the possible shared library names - _base_path = pathlib.Path(__file__).parent.resolve() - # Searching for the library in the current directory under the name "libllama" (default name - # for llamacpp) and "llama" (default name for this repo) - _lib_paths: List[pathlib.Path] = [] - # Determine the file extension based on the platform - if sys.platform.startswith("linux"): - _lib_paths += [ - _base_path / f"lib{lib_base_name}.so", - ] - elif sys.platform == "darwin": - _lib_paths += [ - _base_path / f"lib{lib_base_name}.so", - _base_path / f"lib{lib_base_name}.dylib", - ] - elif 
sys.platform == "win32": - _lib_paths += [ - _base_path / f"{lib_base_name}.dll", - ] - else: - raise RuntimeError("Unsupported platform") - - if "LLAMA_CPP_LIB" in os.environ: - lib_base_name = os.environ["LLAMA_CPP_LIB"] - _lib = pathlib.Path(lib_base_name) - _base_path = _lib.parent.resolve() - _lib_paths = [_lib.resolve()] - - cdll_args = dict() # type: ignore - # Add the library directory to the DLL search path on Windows (if needed) - if sys.platform == "win32" and sys.version_info >= (3, 8): - os.add_dll_directory(str(_base_path)) - if "CUDA_PATH" in os.environ: - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) - cdll_args["winmode"] = 0 - - # Try to load the shared library, handling potential errors - for _lib_path in _lib_paths: - if _lib_path.exists(): - try: - return ctypes.CDLL(str(_lib_path), **cdll_args) - except Exception as e: - raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}") - - raise FileNotFoundError( - f"Shared library with base name '{lib_base_name}' not found" - ) - - -# Specify the base name of the shared library to load -_lib_base_name = "llama" - -# Load the library -_lib = _load_shared_library(_lib_base_name) - -# Misc -c_float_p = POINTER(c_float) -c_uint8_p = POINTER(c_uint8) -c_size_t_p = POINTER(c_size_t) - -# llama.h bindings - -GGML_USE_CUBLAS = hasattr(_lib, "ggml_init_cublas") -GGML_CUDA_MAX_DEVICES = ctypes.c_int(16) -LLAMA_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else ctypes.c_int(1) - -# #define LLAMA_FILE_MAGIC_GGJT 0x67676a74u // 'ggjt' -LLAMA_FILE_MAGIC_GGJT = ctypes.c_uint(0x67676A74) -# #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' -LLAMA_FILE_MAGIC_GGLA = ctypes.c_uint(0x67676C61) -# #define LLAMA_FILE_MAGIC_GGMF 0x67676d66u // 'ggmf' -LLAMA_FILE_MAGIC_GGMF = ctypes.c_uint(0x67676D66) -# #define LLAMA_FILE_MAGIC_GGML 0x67676d6cu // 'ggml' -LLAMA_FILE_MAGIC_GGML = ctypes.c_uint(0x67676D6C) -# #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' -LLAMA_FILE_MAGIC_GGSN = ctypes.c_uint(0x6767736E) - -# #define LLAMA_FILE_VERSION 3 -LLAMA_FILE_VERSION = c_int(3) -LLAMA_FILE_MAGIC = LLAMA_FILE_MAGIC_GGJT -LLAMA_FILE_MAGIC_UNVERSIONED = LLAMA_FILE_MAGIC_GGML -LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN -LLAMA_SESSION_VERSION = c_int(1) - -# struct llama_model; -llama_model_p = c_void_p - -# struct llama_context; -llama_context_p = c_void_p - - -# typedef int llama_token; -llama_token = c_int -llama_token_p = POINTER(llama_token) - - -# typedef struct llama_token_data { -# llama_token id; // token id -# float logit; // log-odds of the token -# float p; // probability of the token -# } llama_token_data; -class llama_token_data(Structure): - _fields_ = [ - ("id", llama_token), - ("logit", c_float), - ("p", c_float), - ] - - -llama_token_data_p = POINTER(llama_token_data) - - -# typedef struct llama_token_data_array { -# llama_token_data * data; -# size_t size; -# bool sorted; -# } llama_token_data_array; -class llama_token_data_array(Structure): - _fields_ = [ - ("data", llama_token_data_p), - ("size", c_size_t), - ("sorted", c_bool), - ] - - -llama_token_data_array_p = POINTER(llama_token_data_array) - -# typedef void (*llama_progress_callback)(float progress, void *ctx); -llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p) - - -# struct llama_context_params { -# int seed; // RNG seed, -1 for random -# int n_ctx; // text context -# int n_batch; // prompt processing batch size -# int n_gpu_layers; // number of 
layers to store in VRAM -# int main_gpu; // the GPU that is used for scratch and small tensors -# float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs -# // called with a progress value between 0 and 1, pass NULL to disable -# llama_progress_callback progress_callback; -# // context pointer passed to the progress callback -# void * progress_callback_user_data; - - -# // Keep the booleans together to avoid misalignment during copy-by-value. -# bool low_vram; // if true, reduce VRAM usage at the cost of performance -# bool f16_kv; // use fp16 for KV cache -# bool logits_all; // the llama_eval() call computes all logits, not just the last one -# bool vocab_only; // only load the vocabulary, no weights -# bool use_mmap; // use mmap if possible -# bool use_mlock; // force system to keep model in RAM -# bool embedding; // embedding mode only -# }; -class llama_context_params(Structure): - _fields_ = [ - ("seed", c_int), - ("n_ctx", c_int), - ("n_batch", c_int), - ("n_gpu_layers", c_int), - ("main_gpu", c_int), - ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), - ("progress_callback", llama_progress_callback), - ("progress_callback_user_data", c_void_p), - ("low_vram", c_bool), - ("f16_kv", c_bool), - ("logits_all", c_bool), - ("vocab_only", c_bool), - ("use_mmap", c_bool), - ("use_mlock", c_bool), - ("embedding", c_bool), - ] - - -llama_context_params_p = POINTER(llama_context_params) - -# enum llama_ftype { -# LLAMA_FTYPE_ALL_F32 = 0, -# LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 -# // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed -# // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed -# LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q2_K = 10,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q3_K_S = 11,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q3_K_M = 12,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q3_K_L = 13,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_K_S = 14,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q4_K_M = 15,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q5_K_S = 16,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q5_K_M = 17,// except 1d tensors -# LLAMA_FTYPE_MOSTLY_Q6_K = 18,// except 1d tensors -# }; -LLAMA_FTYPE_ALL_F32 = c_int(0) -LLAMA_FTYPE_MOSTLY_F16 = c_int(1) -LLAMA_FTYPE_MOSTLY_Q4_0 = c_int(2) -LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3) -LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(4) -LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7) -LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8) -LLAMA_FTYPE_MOSTLY_Q5_1 = c_int(9) -LLAMA_FTYPE_MOSTLY_Q2_K = c_int(10) -LLAMA_FTYPE_MOSTLY_Q3_K_S = c_int(11) -LLAMA_FTYPE_MOSTLY_Q3_K_M = c_int(12) -LLAMA_FTYPE_MOSTLY_Q3_K_L = c_int(13) -LLAMA_FTYPE_MOSTLY_Q4_K_S = c_int(14) -LLAMA_FTYPE_MOSTLY_Q4_K_M = c_int(15) -LLAMA_FTYPE_MOSTLY_Q5_K_S = c_int(16) -LLAMA_FTYPE_MOSTLY_Q5_K_M = c_int(17) -LLAMA_FTYPE_MOSTLY_Q6_K = c_int(18) - - -# // model quantization parameters -# typedef struct llama_model_quantize_params { -# int nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() -# enum llama_ftype ftype; // quantize to this llama_ftype -# bool allow_requantize; // allow quantizing non-f32/f16 tensors -# bool quantize_output_tensor; // quantize output.weight -# } 
llama_model_quantize_params; -class llama_model_quantize_params(Structure): - _fields_ = [ - ("nthread", c_int), - ("ftype", c_int), - ("allow_requantize", c_bool), - ("quantize_output_tensor", c_bool), - ] - - -# LLAMA_API struct llama_context_params llama_context_default_params(); -def llama_context_default_params() -> llama_context_params: - return _lib.llama_context_default_params() - - -_lib.llama_context_default_params.argtypes = [] -_lib.llama_context_default_params.restype = llama_context_params - - -# LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(); -def llama_model_quantize_default_params() -> llama_model_quantize_params: - return _lib.llama_model_quantize_default_params() - - -_lib.llama_model_quantize_default_params.argtypes = [] -_lib.llama_model_quantize_default_params.restype = llama_model_quantize_params - - -# LLAMA_API bool llama_mmap_supported(); -def llama_mmap_supported() -> bool: - return _lib.llama_mmap_supported() - - -_lib.llama_mmap_supported.argtypes = [] -_lib.llama_mmap_supported.restype = c_bool - - -# LLAMA_API bool llama_mlock_supported(); -def llama_mlock_supported() -> bool: - return _lib.llama_mlock_supported() - - -_lib.llama_mlock_supported.argtypes = [] -_lib.llama_mlock_supported.restype = c_bool - - -# // TODO: not great API - very likely to change -# // Initialize the llama + ggml backend -# // If numa is true, use NUMA optimizations -# // Call once at the start of the program -# LLAMA_API void llama_init_backend(bool numa); -def llama_init_backend(numa: c_bool): - return _lib.llama_init_backend(numa) - - -_lib.llama_init_backend.argtypes = [c_bool] -_lib.llama_init_backend.restype = None - - -# LLAMA_API struct llama_model * llama_load_model_from_file( -# const char * path_model, -# struct llama_context_params params); -def llama_load_model_from_file( - path_model: bytes, params: llama_context_params -) -> llama_model_p: - return _lib.llama_load_model_from_file(path_model, params) - - -_lib.llama_load_model_from_file.argtypes = [c_char_p, llama_context_params] -_lib.llama_load_model_from_file.restype = llama_model_p - - -# LLAMA_API void llama_free_model(struct llama_model * model); -def llama_free_model(model: llama_model_p): - return _lib.llama_free_model(model) - - -_lib.llama_free_model.argtypes = [llama_model_p] -_lib.llama_free_model.restype = None - - -# LLAMA_API struct llama_context * llama_new_context_with_model( -# struct llama_model * model, -# struct llama_context_params params); -def llama_new_context_with_model( - model: llama_model_p, params: llama_context_params -) -> llama_context_p: - return _lib.llama_new_context_with_model(model, params) - - -_lib.llama_new_context_with_model.argtypes = [llama_model_p, llama_context_params] -_lib.llama_new_context_with_model.restype = llama_context_p - - -# LLAMA_API int64_t llama_time_us(); -def llama_time_us() -> int: - return _lib.llama_time_us() - - -_lib.llama_time_us.argtypes = [] -_lib.llama_time_us.restype = ctypes.c_int64 - - -# // Various functions for loading a ggml llama model. -# // Allocate (almost) all memory needed for the model. 
-# // Return NULL on failure -# LLAMA_API struct llama_context * llama_init_from_file( -# const char * path_model, -# struct llama_context_params params); -def llama_init_from_file( - path_model: bytes, params: llama_context_params -) -> llama_context_p: - return _lib.llama_init_from_file(path_model, params) - - -_lib.llama_init_from_file.argtypes = [c_char_p, llama_context_params] -_lib.llama_init_from_file.restype = llama_context_p - - -# Frees all allocated memory -# LLAMA_API void llama_free(struct llama_context * ctx); -def llama_free(ctx: llama_context_p): - return _lib.llama_free(ctx) - - -_lib.llama_free.argtypes = [llama_context_p] -_lib.llama_free.restype = None - - -# // Returns 0 on success -# LLAMA_API int llama_model_quantize( -# const char * fname_inp, -# const char * fname_out, -# const llama_model_quantize_params * params); -def llama_model_quantize( - fname_inp: bytes, - fname_out: bytes, - params, # type: POINTER(llama_model_quantize_params) # type: ignore -) -> int: - return _lib.llama_model_quantize(fname_inp, fname_out, params) - - -_lib.llama_model_quantize.argtypes = [ - c_char_p, - c_char_p, - POINTER(llama_model_quantize_params), -] -_lib.llama_model_quantize.restype = c_int - - -# Apply a LoRA adapter to a loaded model -# path_base_model is the path to a higher quality model to use as a base for -# the layers modified by the adapter. Can be NULL to use the current loaded model. -# The model needs to be reloaded before applying a new adapter, otherwise the adapter -# will be applied on top of the previous one -# Returns 0 on success -# LLAMA_API int llama_apply_lora_from_file( -# struct llama_context * ctx, -# const char * path_lora, -# const char * path_base_model, -# int n_threads); -def llama_apply_lora_from_file( - ctx: llama_context_p, - path_lora: c_char_p, - path_base_model: c_char_p, - n_threads: c_int, -) -> int: - return _lib.llama_apply_lora_from_file(ctx, path_lora, path_base_model, n_threads) - - -_lib.llama_apply_lora_from_file.argtypes = [llama_context_p, c_char_p, c_char_p, c_int] -_lib.llama_apply_lora_from_file.restype = c_int - - -# LLAMA_API int llama_model_apply_lora_from_file( -# const struct llama_model * model, -# const char * path_lora, -# const char * path_base_model, -# int n_threads); -def llama_model_apply_lora_from_file( - model: llama_model_p, - path_lora: Union[c_char_p, bytes], - path_base_model: Union[c_char_p, bytes], - n_threads: c_int, -) -> int: - return _lib.llama_model_apply_lora_from_file( - model, path_lora, path_base_model, n_threads - ) - - -_lib.llama_model_apply_lora_from_file.argtypes = [ - llama_model_p, - c_char_p, - c_char_p, - c_int, -] -_lib.llama_model_apply_lora_from_file.restype = c_int - - -# Returns the number of tokens in the KV cache -# LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx); -def llama_get_kv_cache_token_count(ctx: llama_context_p) -> int: - return _lib.llama_get_kv_cache_token_count(ctx) - - -_lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p] -_lib.llama_get_kv_cache_token_count.restype = c_int - - -# Sets the current rng seed. 
-# LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed); -def llama_set_rng_seed(ctx: llama_context_p, seed: c_int): - return _lib.llama_set_rng_seed(ctx, seed) - - -_lib.llama_set_rng_seed.argtypes = [llama_context_p, c_int] -_lib.llama_set_rng_seed.restype = None - - -# Returns the maximum size in bytes of the state (rng, logits, embedding -# and kv_cache) - will often be smaller after compacting tokens -# LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); -def llama_get_state_size(ctx: llama_context_p) -> int: - return _lib.llama_get_state_size(ctx) - - -_lib.llama_get_state_size.argtypes = [llama_context_p] -_lib.llama_get_state_size.restype = c_size_t - - -# Copies the state to the specified destination address. -# Destination needs to have allocated enough memory. -# Returns the number of bytes copied -# LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst); -def llama_copy_state_data( - ctx: llama_context_p, dst # type: Array[c_uint8] -) -> int: - return _lib.llama_copy_state_data(ctx, dst) - - -_lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p] -_lib.llama_copy_state_data.restype = c_size_t - - -# Set the state reading from the specified address -# Returns the number of bytes read -# LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src); -def llama_set_state_data( - ctx: llama_context_p, src # type: Array[c_uint8] -) -> int: - return _lib.llama_set_state_data(ctx, src) - - -_lib.llama_set_state_data.argtypes = [llama_context_p, c_uint8_p] -_lib.llama_set_state_data.restype = c_size_t - - -# Save/load session file -# LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); -def llama_load_session_file( - ctx: llama_context_p, - path_session: bytes, - tokens_out, # type: Array[llama_token] - n_token_capacity: c_size_t, - n_token_count_out, # type: _Pointer[c_size_t] -) -> int: - return _lib.llama_load_session_file( - ctx, path_session, tokens_out, n_token_capacity, n_token_count_out - ) - - -_lib.llama_load_session_file.argtypes = [ - llama_context_p, - c_char_p, - llama_token_p, - c_size_t, - c_size_t_p, -] -_lib.llama_load_session_file.restype = c_size_t - - -# LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); -def llama_save_session_file( - ctx: llama_context_p, - path_session: bytes, - tokens, # type: Array[llama_token] - n_token_count: c_size_t, -) -> int: - return _lib.llama_save_session_file(ctx, path_session, tokens, n_token_count) - - -_lib.llama_save_session_file.argtypes = [ - llama_context_p, - c_char_p, - llama_token_p, - c_size_t, -] -_lib.llama_save_session_file.restype = c_size_t - - -# Run the llama inference to obtain the logits and probabilities for the next token. 
-# tokens + n_tokens is the provided batch of new tokens to process -# n_past is the number of tokens to use from previous eval calls -# Returns 0 on success -# LLAMA_API int llama_eval( -# struct llama_context * ctx, -# const llama_token * tokens, -# int n_tokens, -# int n_past, -# int n_threads); -def llama_eval( - ctx: llama_context_p, - tokens, # type: Array[llama_token] - n_tokens: c_int, - n_past: c_int, - n_threads: c_int, -) -> int: - return _lib.llama_eval(ctx, tokens, n_tokens, n_past, n_threads) - - -_lib.llama_eval.argtypes = [llama_context_p, llama_token_p, c_int, c_int, c_int] -_lib.llama_eval.restype = c_int - - -# // Same as llama_eval, but use float matrix input directly. -# LLAMA_API int llama_eval_embd( -# struct llama_context * ctx, -# const float * embd, -# int n_tokens, -# int n_past, -# int n_threads); -def llama_eval_embd( - ctx: llama_context_p, - embd, # type: Array[c_float] - n_tokens: c_int, - n_past: c_int, - n_threads: c_int, -) -> int: - return _lib.llama_eval_embd(ctx, embd, n_tokens, n_past, n_threads) - - -_lib.llama_eval_embd.argtypes = [llama_context_p, c_float_p, c_int, c_int, c_int] -_lib.llama_eval_embd.restype = c_int - - -# Convert the provided text into tokens. -# The tokens pointer must be large enough to hold the resulting tokens. -# Returns the number of tokens on success, no more than n_max_tokens -# Returns a negative number on failure - the number of tokens that would have been returned -# TODO: not sure if correct -# LLAMA_API int llama_tokenize( -# struct llama_context * ctx, -# const char * text, -# llama_token * tokens, -# int n_max_tokens, -# bool add_bos); -def llama_tokenize( - ctx: llama_context_p, - text: bytes, - tokens, # type: Array[llama_token] - n_max_tokens: c_int, - add_bos: c_bool, -) -> int: - return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos) - - -_lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool] -_lib.llama_tokenize.restype = c_int - - -# LLAMA_API int llama_n_vocab(const struct llama_context * ctx); -def llama_n_vocab(ctx: llama_context_p) -> int: - return _lib.llama_n_vocab(ctx) - - -_lib.llama_n_vocab.argtypes = [llama_context_p] -_lib.llama_n_vocab.restype = c_int - - -# LLAMA_API int llama_n_ctx (const struct llama_context * ctx); -def llama_n_ctx(ctx: llama_context_p) -> int: - return _lib.llama_n_ctx(ctx) - - -_lib.llama_n_ctx.argtypes = [llama_context_p] -_lib.llama_n_ctx.restype = c_int - - -# LLAMA_API int llama_n_embd (const struct llama_context * ctx); -def llama_n_embd(ctx: llama_context_p) -> int: - return _lib.llama_n_embd(ctx) - - -_lib.llama_n_embd.argtypes = [llama_context_p] -_lib.llama_n_embd.restype = c_int - - -# // Get the vocabulary as output parameters. -# // Returns number of results. 
-# LLAMA_API int llama_get_vocab( -# const struct llama_context * ctx, -# const char * * strings, -# float * scores, -# int capacity); -def llama_get_vocab( - ctx: llama_context_p, - strings, # type: Array[c_char_p] # type: ignore - scores, # type: Array[c_float] # type: ignore - capacity: c_int, -) -> int: - return _lib.llama_get_vocab(ctx, strings, scores, capacity) - - -_lib.llama_get_vocab.argtypes = [llama_context_p, c_char_p, c_float, c_int] -_lib.llama_get_vocab.restype = c_int - - -# Token logits obtained from the last call to llama_eval() -# The logits for the last token are stored in the last row -# Can be mutated in order to change the probabilities of the next token -# Rows: n_tokens -# Cols: n_vocab -# LLAMA_API float * llama_get_logits(struct llama_context * ctx); -def llama_get_logits( - ctx: llama_context_p, -): # type: (...) -> Array[float] # type: ignore - return _lib.llama_get_logits(ctx) - - -_lib.llama_get_logits.argtypes = [llama_context_p] -_lib.llama_get_logits.restype = c_float_p - - -# Get the embeddings for the input -# shape: [n_embd] (1-dimensional) -# LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); -def llama_get_embeddings( - ctx: llama_context_p, -): # type: (...) -> Array[float] # type: ignore - return _lib.llama_get_embeddings(ctx) - - -_lib.llama_get_embeddings.argtypes = [llama_context_p] -_lib.llama_get_embeddings.restype = c_float_p - - -# Token Id -> String. Uses the vocabulary in the provided context -# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); -def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: - return _lib.llama_token_to_str(ctx, token) - - -_lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] -_lib.llama_token_to_str.restype = c_char_p - -# Special tokens - - -# LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence -def llama_token_bos() -> int: - return _lib.llama_token_bos() - - -_lib.llama_token_bos.argtypes = [] -_lib.llama_token_bos.restype = llama_token - - -# LLAMA_API llama_token llama_token_eos(); // end-of-sentence -def llama_token_eos() -> int: - return _lib.llama_token_eos() - - -_lib.llama_token_eos.argtypes = [] -_lib.llama_token_eos.restype = llama_token - - -# LLAMA_API llama_token llama_token_nl(); // next-line -def llama_token_nl() -> int: - return _lib.llama_token_nl() - - -_lib.llama_token_nl.argtypes = [] -_lib.llama_token_nl.restype = llama_token - - -# Sampling functions - - -# @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. -# LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty); -def llama_sample_repetition_penalty( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - last_tokens_data, # type: Array[llama_token] - last_tokens_size: c_int, - penalty: c_float, -): - return _lib.llama_sample_repetition_penalty( - ctx, candidates, last_tokens_data, last_tokens_size, penalty - ) - - -_lib.llama_sample_repetition_penalty.argtypes = [ - llama_context_p, - llama_token_data_array_p, - llama_token_p, - c_int, - c_float, -] -_lib.llama_sample_repetition_penalty.restype = None - - -# @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. 
-# LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence); -def llama_sample_frequency_and_presence_penalties( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - last_tokens_data, # type: Array[llama_token] - last_tokens_size: c_int, - alpha_frequency: c_float, - alpha_presence: c_float, -): - return _lib.llama_sample_frequency_and_presence_penalties( - ctx, - candidates, - last_tokens_data, - last_tokens_size, - alpha_frequency, - alpha_presence, - ) - - -_lib.llama_sample_frequency_and_presence_penalties.argtypes = [ - llama_context_p, - llama_token_data_array_p, - llama_token_p, - c_int, - c_float, - c_float, -] -_lib.llama_sample_frequency_and_presence_penalties.restype = None - - -# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. -# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); -def llama_sample_softmax( - ctx: llama_context_p, candidates # type: _Pointer[llama_token_data] -): - return _lib.llama_sample_softmax(ctx, candidates) - - -_lib.llama_sample_softmax.argtypes = [ - llama_context_p, - llama_token_data_array_p, -] -_lib.llama_sample_softmax.restype = None - - -# @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 -# LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep); -def llama_sample_top_k( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - k: c_int, - min_keep: c_size_t, -): - return _lib.llama_sample_top_k(ctx, candidates, k, min_keep) - - -_lib.llama_sample_top_k.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_int, - c_size_t, -] -_lib.llama_sample_top_k.restype = None - - -# @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 -# LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); -def llama_sample_top_p( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - p: c_float, - min_keep: c_size_t, -): - return _lib.llama_sample_top_p(ctx, candidates, p, min_keep) - - -_lib.llama_sample_top_p.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, - c_size_t, -] -_lib.llama_sample_top_p.restype = None - - -# @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. -# LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep); -def llama_sample_tail_free( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - z: c_float, - min_keep: c_size_t, -): - return _lib.llama_sample_tail_free(ctx, candidates, z, min_keep) - - -_lib.llama_sample_tail_free.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, - c_size_t, -] -_lib.llama_sample_tail_free.restype = None - - -# @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. 
-# LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep); -def llama_sample_typical( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - p: c_float, - min_keep: c_size_t, -): - return _lib.llama_sample_typical(ctx, candidates, p, min_keep) - - -_lib.llama_sample_typical.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, - c_size_t, -] -_lib.llama_sample_typical.restype = None - - -# LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp); -def llama_sample_temperature( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - temp: c_float, -): - return _lib.llama_sample_temperature(ctx, candidates, temp) - - -_lib.llama_sample_temperature.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, -] -_lib.llama_sample_temperature.restype = None - - -# @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. -# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. -# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. -# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. -# @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. -# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. -# LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu); -def llama_sample_token_mirostat( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - tau: c_float, - eta: c_float, - m: c_int, - mu, # type: _Pointer[c_float] -) -> int: - return _lib.llama_sample_token_mirostat(ctx, candidates, tau, eta, m, mu) - - -_lib.llama_sample_token_mirostat.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, - c_float, - c_int, - c_float_p, -] -_lib.llama_sample_token_mirostat.restype = llama_token - - -# @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. -# @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. -# @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. 
-# @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. -# @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. -# LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu); -def llama_sample_token_mirostat_v2( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] - tau: c_float, - eta: c_float, - mu, # type: _Pointer[c_float] -) -> int: - return _lib.llama_sample_token_mirostat_v2(ctx, candidates, tau, eta, mu) - - -_lib.llama_sample_token_mirostat_v2.argtypes = [ - llama_context_p, - llama_token_data_array_p, - c_float, - c_float, - c_float_p, -] -_lib.llama_sample_token_mirostat_v2.restype = llama_token - - -# @details Selects the token with the highest probability. -# LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates); -def llama_sample_token_greedy( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] -) -> int: - return _lib.llama_sample_token_greedy(ctx, candidates) - - -_lib.llama_sample_token_greedy.argtypes = [ - llama_context_p, - llama_token_data_array_p, -] -_lib.llama_sample_token_greedy.restype = llama_token - - -# @details Randomly selects a token from the candidates based on their probabilities. -# LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates); -def llama_sample_token( - ctx: llama_context_p, - candidates, # type: _Pointer[llama_token_data_array] -) -> int: - return _lib.llama_sample_token(ctx, candidates) - - -_lib.llama_sample_token.argtypes = [ - llama_context_p, - llama_token_data_array_p, -] -_lib.llama_sample_token.restype = llama_token - - -# Performance information - - -# LLAMA_API void llama_print_timings(struct llama_context * ctx); -def llama_print_timings(ctx: llama_context_p): - _lib.llama_print_timings(ctx) - - -_lib.llama_print_timings.argtypes = [llama_context_p] -_lib.llama_print_timings.restype = None - - -# LLAMA_API void llama_reset_timings(struct llama_context * ctx); -def llama_reset_timings(ctx: llama_context_p): - _lib.llama_reset_timings(ctx) - - -_lib.llama_reset_timings.argtypes = [llama_context_p] -_lib.llama_reset_timings.restype = None - - -# Print system information -# LLAMA_API const char * llama_print_system_info(void); -def llama_print_system_info() -> bytes: - return _lib.llama_print_system_info() - - -_lib.llama_print_system_info.argtypes = [] -_lib.llama_print_system_info.restype = c_char_p - -################################################################################################### - - -_llama_initialized = False - -if not _llama_initialized: - llama_init_backend(c_bool(False)) - _llama_initialized = True diff --git a/mkdocs.yml b/mkdocs.yml index 286581176..e4147790b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,5 +1,5 @@ -site_name: llama-cpp-python -repo_url: https://github.com/abetlen/llama-cpp-python +site_name: falcon-cpp-python +repo_url: https://github.com/sirajperson/falcon-cpp-python theme: name: "material" @@ -9,7 +9,7 @@ plugins: - search watch: - - llama_cpp + - falcon_cpp markdown_extensions: - 
pymdownx.highlight: diff --git a/pyproject.toml b/pyproject.toml index e79d72eef..196aaedcb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,13 @@ [tool.poetry] -name = "llama_cpp_python" -version = "0.1.67" -description = "Python bindings for the llama.cpp library" -authors = ["Andrei Betlen "] +name = "falcon_cpp_python" +version = "0.0.1" +description = "Python bindings for the ggllm.cpp library" +authors = ["Jonathan Levin "] license = "MIT" readme = "README.md" -homepage = "https://github.com/abetlen/llama-cpp-python" -repository = "https://github.com/abetlen/llama-cpp-python" -packages = [{include = "llama_cpp"}] +homepage = "https://github.com/sirajperson/falcon-cpp-python" +repository = "https://github.com/sirajperson/falcon-cpp-python" +packages = [{include = "falcon_cpp"}] include = [ "LICENSE.md", ] @@ -41,4 +41,4 @@ requires = [ "cmake>=3.18", "ninja", ] -build-backend = "setuptools.build_meta" \ No newline at end of file +build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py index 95593415a..4cc1ad765 100644 --- a/setup.py +++ b/setup.py @@ -6,16 +6,16 @@ long_description = (this_directory / "README.md").read_text(encoding="utf-8") setup( - name="llama_cpp_python", - description="A Python wrapper for llama.cpp", + name="falcon_cpp_python", + description="A Python wrapper for ggllm.cpp to run Falcon models", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.67", - author="Andrei Betlen", - author_email="abetlen@gmail.com", + version="0.0.1", + author="Siraj Levin", + author_email="sirajperson@gmail.com", license="MIT", - package_dir={"llama_cpp": "llama_cpp", "llama_cpp.server": "llama_cpp/server"}, - packages=["llama_cpp", "llama_cpp.server"], + package_dir={"falcon_cpp": "falcon_cpp", "falcon_cpp.server": "falcon_cpp/server"}, + packages=["falcon_cpp", "falcon_cpp.server"], install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], extras_require={ "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], diff --git a/tests/test_llama.py b/tests/test_falcon.py similarity index 56% rename from tests/test_llama.py rename to tests/test_falcon.py index 941287de6..d162cc6d6 100644 --- a/tests/test_llama.py +++ b/tests/test_falcon.py @@ -1,39 +1,39 @@ -import llama_cpp +import falcon_cpp -MODEL = "./vendor/llama.cpp/models/ggml-vocab.bin" +MODEL = "./vendor/ggllm.cpp/models/ggml-vocab.bin" -def test_llama(): - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) +def test_falcon(): + falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) - assert llama - assert llama.ctx is not None + assert falcon + assert falcon.ctx is not None text = b"Hello World" - assert llama.detokenize(llama.tokenize(text)) == text + assert falcon.detokenize(falcon.tokenize(text)) == text # @pytest.mark.skip(reason="need to update sample mocking") -def test_llama_patch(monkeypatch): - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) - n_vocab = llama_cpp.llama_n_vocab(llama.ctx) +def test_falcon_patch(monkeypatch): + falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) + n_vocab = falcon_cpp.falcon_n_vocab(falcon.ctx) ## Set up mock function def mock_eval(*args, **kwargs): return 0 def mock_get_logits(*args, **kwargs): - return (llama_cpp.c_float * n_vocab)( - *[llama_cpp.c_float(0) for _ in range(n_vocab)] + return (falcon_cpp.c_float * n_vocab)( + *[falcon_cpp.c_float(0) for _ in range(n_vocab)] ) - monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval) -
monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_eval", mock_eval) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_get_logits", mock_get_logits) output_text = " jumps over the lazy dog." - output_tokens = llama.tokenize(output_text.encode("utf-8")) - token_eos = llama.token_eos() + output_tokens = falcon.tokenize(output_text.encode("utf-8")) + token_eos = falcon.token_eos() n = 0 def mock_sample(*args, **kwargs): @@ -44,31 +44,31 @@ def mock_sample(*args, **kwargs): else: return token_eos - monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_token", mock_sample) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_cpp_sample_token", mock_sample) text = "The quick brown fox" ## Test basic completion until eos n = 0 # reset - completion = llama.create_completion(text, max_tokens=20) + completion = falcon.create_completion(text, max_tokens=20) assert completion["choices"][0]["text"] == output_text assert completion["choices"][0]["finish_reason"] == "stop" ## Test streaming completion until eos n = 0 # reset - chunks = llama.create_completion(text, max_tokens=20, stream=True) + chunks = falcon.create_completion(text, max_tokens=20, stream=True) assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == output_text assert completion["choices"][0]["finish_reason"] == "stop" ## Test basic completion until stop sequence n = 0 # reset - completion = llama.create_completion(text, max_tokens=20, stop=["lazy"]) + completion = falcon.create_completion(text, max_tokens=20, stop=["lazy"]) assert completion["choices"][0]["text"] == " jumps over the " assert completion["choices"][0]["finish_reason"] == "stop" ## Test streaming completion until stop sequence n = 0 # reset - chunks = llama.create_completion(text, max_tokens=20, stream=True, stop=["lazy"]) + chunks = falcon.create_completion(text, max_tokens=20, stream=True, stop=["lazy"]) assert ( "".join(chunk["choices"][0]["text"] for chunk in chunks) == " jumps over the " ) @@ -76,54 +76,54 @@ def mock_sample(*args, **kwargs): ## Test basic completion until length n = 0 # reset - completion = llama.create_completion(text, max_tokens=2) + completion = falcon.create_completion(text, max_tokens=2) assert completion["choices"][0]["text"] == " j" assert completion["choices"][0]["finish_reason"] == "length" ## Test streaming completion until length n = 0 # reset - chunks = llama.create_completion(text, max_tokens=2, stream=True) + chunks = falcon.create_completion(text, max_tokens=2, stream=True) assert "".join(chunk["choices"][0]["text"] for chunk in chunks) == " j" assert completion["choices"][0]["finish_reason"] == "length" -def test_llama_pickle(): +def test_falcon_pickle(): import pickle import tempfile fp = tempfile.TemporaryFile() - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) - pickle.dump(llama, fp) + falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) + pickle.dump(falcon, fp) fp.seek(0) - llama = pickle.load(fp) + falcon = pickle.load(fp) - assert llama - assert llama.ctx is not None + assert falcon + assert falcon.ctx is not None text = b"Hello World" - assert llama.detokenize(llama.tokenize(text)) == text + assert falcon.detokenize(falcon.tokenize(text)) == text def test_utf8(monkeypatch): - llama = llama_cpp.Llama(model_path=MODEL, vocab_only=True) - n_vocab = llama_cpp.llama_n_vocab(llama.ctx) + falcon = falcon_cpp.Falcon(model_path=MODEL, vocab_only=True) + n_vocab = falcon_cpp.falcon_n_vocab(falcon.ctx) ## Set up mock function 
def mock_eval(*args, **kwargs): return 0 def mock_get_logits(*args, **kwargs): - return (llama_cpp.c_float * n_vocab)( - *[llama_cpp.c_float(0) for _ in range(n_vocab)] + return (falcon_cpp.c_float * n_vocab)( + *[falcon_cpp.c_float(0) for _ in range(n_vocab)] ) - monkeypatch.setattr("llama_cpp.llama_cpp.llama_eval", mock_eval) - monkeypatch.setattr("llama_cpp.llama_cpp.llama_get_logits", mock_get_logits) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_eval", mock_eval) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_get_logits", mock_get_logits) output_text = "😀" - output_tokens = llama.tokenize(output_text.encode("utf-8")) - token_eos = llama.token_eos() + output_tokens = falcon.tokenize(output_text.encode("utf-8")) + token_eos = falcon.token_eos() n = 0 def mock_sample(*args, **kwargs): @@ -134,22 +134,22 @@ def mock_sample(*args, **kwargs): else: return token_eos - monkeypatch.setattr("llama_cpp.llama_cpp.llama_sample_token", mock_sample) + monkeypatch.setattr("falcon_cpp.falcon_cpp.falcon_sample_token", mock_sample) ## Test basic completion with utf8 multibyte n = 0 # reset - completion = llama.create_completion("", max_tokens=4) + completion = falcon.create_completion("", max_tokens=4) assert completion["choices"][0]["text"] == output_text ## Test basic completion with incomplete utf8 multibyte n = 0 # reset - completion = llama.create_completion("", max_tokens=1) + completion = falcon.create_completion("", max_tokens=1) assert completion["choices"][0]["text"] == "" -def test_llama_server(): +def test_falcon_server(): from fastapi.testclient import TestClient - from llama_cpp.server.app import create_app, Settings + from falcon_cpp.server.app import create_app, Settings settings = Settings( model=MODEL, diff --git a/vendor/ggllm.cpp b/vendor/ggllm.cpp new file mode 160000 index 000000000..8c019b677 --- /dev/null +++ b/vendor/ggllm.cpp @@ -0,0 +1 @@ +Subproject commit 8c019b67757538e7750cd30640fd00bbe8bc30de diff --git a/vendor/llama.cpp b/vendor/llama.cpp deleted file mode 160000 index 96a712ca1..000000000 --- a/vendor/llama.cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 96a712ca1b7f427e3bd7ffc0c70b2105cfc7fbf1
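For downstream code being adapted to the rename, the test changes above also double as a usage reference for the server package: the FastAPI app now comes from `falcon_cpp.server.app`. Below is a minimal sketch, assuming `Settings` still accepts `vocab_only` as in the original test and that a vocab-only GGML file exists at the illustrative path shown.

```python
from fastapi.testclient import TestClient

from falcon_cpp.server.app import Settings, create_app

# Illustrative path; any vocab-only GGML file for a Falcon model would do.
MODEL = "./vendor/ggllm.cpp/models/ggml-vocab.bin"

settings = Settings(model=MODEL, vocab_only=True)  # vocab_only assumed, as in the original test
app = create_app(settings=settings)
client = TestClient(app)

response = client.get("/v1/models")
assert response.status_code == 200
# With no model_alias configured, the id falls back to the model path.
print(response.json()["data"][0]["id"])
```

The same `app` object is a standard ASGI application, so it can also be served with uvicorn when testing by hand.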