
Commit 6689d71

Merge branch 'main' into configurable-chat-templates

2 parents: 45a4188 + 6e167a2

6 files changed, +27 -7 lines


CHANGELOG.md  (+4)

@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+- Update llama.cpp to 8781013ef654270cbead3e0011e33a6d690fb168
+
+## [0.2.6]
+
 - Update llama.cpp to 80291a1d02a07f7f66666fb576c5b1e75aa48b46
 
 ## [0.2.5]

CMakeLists.txt  (+10 -1)

@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.4...3.22)
+cmake_minimum_required(VERSION 3.21)
 
 project(llama_cpp)
 
@@ -33,4 +33,13 @@ if (LLAMA_BUILD)
         FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
         RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
     )
+    # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563
+    install(
+        FILES $<TARGET_RUNTIME_DLLS:llama>
+        DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp
+    )
+    install(
+        FILES $<TARGET_RUNTIME_DLLS:llama>
+        DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp
+    )
 endif()

llama_cpp/llama.py  (+2)

@@ -437,6 +437,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         n_tokens = llama_cpp.llama_tokenize_with_model(
             self.model,
             text,
+            len(text),
             tokens,
             n_ctx,
             add_bos,
@@ -447,6 +448,7 @@ def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
             n_tokens = llama_cpp.llama_tokenize_with_model(
                 self.model,
                 text,
+                len(text),
                 tokens,
                 n_tokens,
                 add_bos,
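A minimal usage sketch of the high-level API whose internals changed above; the model path is a placeholder. Llama.tokenize() still takes bytes and returns a list of token ids, and only the internal llama_tokenize_with_model call gains the explicit len(text) argument.

from llama_cpp import Llama

# Hypothetical model path; tokenize() accepts bytes and returns List[int].
llm = Llama(model_path="./models/7B/model.gguf")
tokens = llm.tokenize(b"Hello, world!")
print(tokens)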

llama_cpp/llama_cpp.py  (+9 -4)

@@ -58,7 +58,7 @@ def _load_shared_library(lib_base_name: str):
         if "CUDA_PATH" in os.environ:
             os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
             os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
-        cdll_args["winmode"] = 0
+        cdll_args["winmode"] = ctypes.RTLD_GLOBAL
 
     # Try to load the shared library, handling potential errors
     for _lib_path in _lib_paths:
@@ -950,42 +950,47 @@ def llama_token_nl(ctx: llama_context_p) -> llama_token:
 # LLAMA_API int llama_tokenize(
 #     struct llama_context * ctx,
 #     const char * text,
+#     int text_len,
 #     llama_token * tokens,
 #     int n_max_tokens,
 #     bool add_bos);
 def llama_tokenize(
     ctx: llama_context_p,
     text: bytes,
+    text_len: Union[c_int, int],
     tokens,  # type: Array[llama_token]
     n_max_tokens: Union[c_int, int],
     add_bos: Union[c_bool, int],
 ) -> int:
-    return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos)
+    return _lib.llama_tokenize(ctx, text, text_len, tokens, n_max_tokens, add_bos)
 
 
-_lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, llama_token_p, c_int, c_bool]
+_lib.llama_tokenize.argtypes = [llama_context_p, c_char_p, c_int, llama_token_p, c_int, c_bool]
 _lib.llama_tokenize.restype = c_int
 
 
 # LLAMA_API int llama_tokenize_with_model(
 #     const struct llama_model * model,
 #     const char * text,
+#     int text_len,
 #     llama_token * tokens,
 #     int n_max_tokens,
 #     bool add_bos);
 def llama_tokenize_with_model(
     model: llama_model_p,
     text: bytes,
+    text_len: Union[c_int, int],
     tokens,  # type: Array[llama_token]
     n_max_tokens: Union[c_int, int],
     add_bos: Union[c_bool, bool],
 ) -> int:
-    return _lib.llama_tokenize_with_model(model, text, tokens, n_max_tokens, add_bos)
+    return _lib.llama_tokenize_with_model(model, text, text_len, tokens, n_max_tokens, add_bos)
 
 
 _lib.llama_tokenize_with_model.argtypes = [
     llama_model_p,
     c_char_p,
+    c_int,
     llama_token_p,
     c_int,
     c_bool,
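A sketch of calling the updated low-level binding directly, assuming an already-initialized ctx (a llama_context_p created elsewhere): the new text_len argument sits between text and the token buffer, matching the argtypes above.

import llama_cpp

# Sketch only: ctx is assumed to be a valid llama_context_p created elsewhere.
text = b"Hello, world!"
n_max_tokens = 64
tokens = (llama_cpp.llama_token * n_max_tokens)()  # ctypes array used as the output buffer
n = llama_cpp.llama_tokenize(ctx, text, len(text), tokens, n_max_tokens, True)
# llama.cpp reports a negative count when the buffer is too small.
if n < 0:
    raise RuntimeError(f"token buffer too small, need {-n} slots")
token_ids = list(tokens[:n])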

pyproject.toml  (+1 -1)

@@ -54,7 +54,7 @@ all = [
 [tool.scikit-build]
 wheel.packages = ["llama_cpp"]
 cmake.verbose = true
-cmake.minimum-version = "3.12"
+cmake.minimum-version = "3.21"
 minimum-version = "0.5"
 sdist.exclude = [".git", "vendor/llama.cpp/.git"]
 

vendor/llama.cpp  (submodule updated)
