diff --git a/Package.swift b/Package.swift index 18d610d6941d2..051765e5edb54 100644 --- a/Package.swift +++ b/Package.swift @@ -1,23 +1,22 @@ -// swift-tools-version:5.5 +// swift-tools-version:5.3 import PackageDescription let package = Package( name: "llama", - platforms: [ - .macOS(.v12), - .iOS(.v14), - .watchOS(.v4), - .tvOS(.v14) + platforms: [.macOS(.v11), + .iOS(.v14), + .watchOS(.v4), + .tvOS(.v14) ], products: [ .library(name: "llama", targets: ["llama"]), + .library(name: "Bert", targets: ["Bert"]) ], targets: [ .target( name: "llama", path: ".", - exclude: [], sources: [ "ggml.c", "llama.cpp", @@ -25,26 +24,79 @@ let package = Package( "ggml-backend.c", "ggml-quants.c", "ggml-metal.m", + "common/common.cpp", + "common/grammar-parser.cpp", + "common/sampling.cpp" ], resources: [ .process("ggml-metal.metal") ], publicHeadersPath: "spm-headers", cSettings: [ - .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), + .unsafeFlags(["-Wno-shorten-64-to-32", + "-Ofast", + "-DNDEBUG"]), + .define("GGML_USE_K_QUANTS"), .define("GGML_USE_ACCELERATE"), + .define("NDEBUG"), + .define("_XOPEN_SOURCE", to: "600"), + .define("_DARWIN_C_SOURCE"), .unsafeFlags(["-fno-objc-arc"]), + .define("GGML_SWIFT"), + .define("GGML_USE_METAL") + ], + cxxSettings: [ + .unsafeFlags(["-Wno-shorten-64-to-32", + "-Ofast"]), + .define("GGML_USE_K_QUANTS"), + .define("GGML_USE_ACCELERATE"), + .unsafeFlags(["-fno-objc-arc"]), + .define("GGML_SWIFT"), .define("GGML_USE_METAL"), - // NOTE: NEW_LAPACK will required iOS version 16.4+ - // We should consider add this in the future when we drop support for iOS 14 - // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) - // .define("ACCELERATE_NEW_LAPACK"), - // .define("ACCELERATE_LAPACK_ILP64") + .define("NDEBUG"), + .define("_XOPEN_SOURCE", to: "600"), + .define("_DARWIN_C_SOURCE") + ], + linkerSettings: [ + .linkedFramework("Accelerate"), + .linkedFramework("Foundation"), + 
.linkedFramework("Metal"), + .linkedFramework("MetalKit") + ] + ), + .target( + name: "Bert", + dependencies: [ "llama" ], + resources: [ + .process("Resources") + ], + publicHeadersPath: "include", + cSettings: [ + .define("GGML_USE_ACCELERATE"), + .unsafeFlags([ + "-Ofast", "-DNDEBUG", "-std=gnu11" + ]) + ], + cxxSettings: [ + .define("GGML_USE_ACCELERATE"), + .unsafeFlags([ + "-Ofast", "-DNDEBUG", "-std=gnu++20" + ]) ], linkerSettings: [ - .linkedFramework("Accelerate") + .linkedFramework("Accelerate"), + .linkedFramework("Foundation"), + .linkedFramework("NaturalLanguage") ] + ), + .testTarget(name: "BertTests", + dependencies: ["Bert"], + resources: [ + .process("resources") + ] ) ], + cLanguageStandard: .c11, cxxLanguageStandard: .cxx11 ) + diff --git a/common/common.cpp b/common/common.cpp index b3425ab09eaf8..d76ed526a21a9 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -78,6 +78,13 @@ int32_t get_num_physical_cores() { return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; } +// Assuming these are in global_variables.cpp or a similarly named file + +int LLAMA_BUILD_NUMBER = 1; // Replace with your actual build number +char const *LLAMA_COMMIT = ""; // Replace with your actual commit hash +char const *LLAMA_COMPILER = ""; // Replace with your actual compiler info +char const *LLAMA_BUILD_TARGET = ""; // Replace with your actual build target + void process_escapes(std::string& input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; diff --git a/sources/Bert/BERTEmbeddingsData.m b/sources/Bert/BERTEmbeddingsData.m new file mode 100644 index 0000000000000..d4803d1ad62d6 --- /dev/null +++ b/sources/Bert/BERTEmbeddingsData.m @@ -0,0 +1,31 @@ +// +// EmbeddingsData.m +// +// +// Created by Marc Terns on 9/23/23. 
+// + +#import "BERTEmbeddingsData.h" + +@implementation BERTEmbeddingsData + +- (instancetype)initWithResourceURL:(NSURL *)resourceURL embeddings:(NSArray *> *)embeddings { + self = [super init]; + if (self) { + _resourceURL = resourceURL; + _embeddings = embeddings; + } + return self; +} + +- (instancetype)initWithFileContents:(NSString *)fileContents embeddings:(NSArray *> *)embeddings { + self = [super init]; + if (self) { + _fileContents = fileContents; + _embeddings = embeddings; + } + return self; +} + +@end + diff --git a/sources/Bert/BertEncoder.mm b/sources/Bert/BertEncoder.mm new file mode 100644 index 0000000000000..6580cae66f589 --- /dev/null +++ b/sources/Bert/BertEncoder.mm @@ -0,0 +1,348 @@ +// +// BertEncoder.mm +// Bert +// +// Created by Marc Terns on 9/10/23. +// + +#include "BertEncoder.h" +#include "bert.h" +#include +#include +#include +#import +#import +#import "NSArray+Vector.h" + +@interface BertEncoder () +@property (nonatomic, assign, nullable) struct bert_ctx *bctx; +@property (nonatomic, assign) int n_embd; +@property (nonatomic, strong) NSURL *modelURL; +@property (nonatomic, assign) int n_threads; +@property (nonatomic, strong) NSLock *lock; +@property (nonatomic, assign) BOOL isRunning; +@end + +@implementation BertEncoder + +- (instancetype)init { + NSBundle *bundle = SWIFTPM_MODULE_BUNDLE; + NSURL *resourceURL = [bundle URLForResource:@"ggml-model-f32" withExtension:@"bin"]; + return [self initWithModelURL:resourceURL]; +} + +- (instancetype)initWithModelURL:(NSURL *)modelURL { + if (self = [super init]) { + self.modelURL = modelURL; + unsigned int threads = std::thread::hardware_concurrency(); + self.n_threads = threads > 0 ? (threads <= 4 ? 
threads : threads / 2) : 4; + self.lock = [[NSLock alloc] init]; + } + return self; +} + +- (void)start { + [self.lock lock]; + if (!self.isRunning) { + self.isRunning = YES; + struct bert_hparams bertParams; + self.bctx = bert_load_from_file(self.modelURL.path.UTF8String, bertParams); + self.n_embd = bert_n_embd(self.bctx); + } + [self.lock unlock]; +} + +- (void)stop { + [self.lock lock]; + if (self.isRunning) { + self.isRunning = NO; + bert_free(self.bctx); + } + [self.lock unlock]; +} + +- (std::vector>)embeddingsFromResourceURL:(NSURL *)resourceURL { + NSArray *texts = [self sentencesFromResourceURL:resourceURL]; + return [self embeddingsFromSentences:texts]; +} + +- (std::vector>)embeddingsFromSentences:(NSArray *)sentences { + std::vector> allEmbeddings; + [self.lock lock]; + for (NSString *sentence in sentences) { + std::vector embeddings(self.n_embd); + const char *input_str = [sentence UTF8String]; + bert_encode(self.bctx, self.n_threads, input_str, embeddings.data()); + allEmbeddings.push_back(embeddings); + } + [self.lock unlock]; + return allEmbeddings; +} + +- (std::vector)embeddingsForSentence:(NSString *)sentence { + [self.lock lock]; + std::vector inputEmbedding = std::vector(self.n_embd); + bert_encode(self.bctx, self.n_threads, sentence.UTF8String, inputEmbedding.data()); + [self.lock unlock]; + return inputEmbedding; +} + +// Function to find the N most similar texts +-(std::vector>)findTopNSimilarInputVector:(const std::vector&)inputVector textVectors:(const std::vector>&)textVectors N:(size_t)N { + std::vector> similarities; + + for (size_t i = 0; i < textVectors.size(); ++i) { + float similarity = cosineSimilarity(inputVector, textVectors[i]); + similarities.emplace_back(similarity, i); + } + + // Sort the similarities in descending order + std::sort(similarities.begin(), similarities.end(), std::greater>()); + + // Get the top N similar texts + std::vector> topNSimilarities(similarities.begin(), similarities.begin() + N); + + return 
topNSimilarities; +} + +- (std::vector)calculateMean:(const std::vector>&)sentenceEmbeddings { + size_t numSentences = sentenceEmbeddings.size(); + if (numSentences == 0) { + // We might want to handle this case at the application level. + return std::vector(); + } + + // Determine the maximum embedding size among all sentence embeddings. + // We will handle embeddings with different dimensions. + // Alternitevly, we could check if the size of the sentence embedding + // matches the expected size. + size_t maxEmbeddingSize = 0; + for (const auto& sentenceEmbedding : sentenceEmbeddings) { + maxEmbeddingSize = std::max(maxEmbeddingSize, sentenceEmbedding.size()); + } + + std::vector documentEmbedding(maxEmbeddingSize, 0.0); + for (const auto& sentenceEmbedding : sentenceEmbeddings) { + for (size_t i = 0; i < sentenceEmbedding.size(); ++i) { + documentEmbedding[i] += sentenceEmbedding[i]; + } + } + + // Calculate the mean by dividing by the number of sentences + for (size_t i = 0; i < maxEmbeddingSize; ++i) { + documentEmbedding[i] /= numSentences; + } + + return documentEmbedding; +} + +- (NSArray *)findClosestTextForSentence:(NSString *)sentence inResourceURL:(NSURL *)resourceURL topN:(NSInteger)topN { + std::vector> allEmbeddings = [self embeddingsFromResourceURL:resourceURL]; + std::vector sentenceEmbedding = [self embeddingsForSentence:sentence]; + if (allEmbeddings.size() <= 0) { + return [NSArray new]; + } + std::vector> topNSimilarities = [self findTopNSimilarInputVector:sentenceEmbedding textVectors:allEmbeddings N:topN]; + NSArray *sentenceArray = [self sentencesFromResourceURL:resourceURL]; + + NSMutableArray *result = [NSMutableArray new]; + for (const auto& similarity : topNSimilarities) { + size_t index = similarity.second; + // Check if the index is within the bounds of the 'texts' array + if (index < [sentenceArray count]) { + NSString *similarText = sentenceArray[index]; + [result addObject:similarText]; + } + } + return [result copy]; +} + +- 
(NSArray *)sentencesFromResourceURL:(NSURL *)resourceURL { + NSError *error; + NSString *fileContents = [NSString stringWithContentsOfFile:resourceURL.path encoding:NSUTF8StringEncoding error:&error]; + + if (error) { + NSLog(@"Error reading file: %@", error.localizedDescription); + return @[]; + } + + return [self sentencesFromFileContent:fileContents]; +} + +-(NSArray *)sentencesFromFileContent:(NSString *)fileContent { + NLTokenizer *tokenizer = [[NLTokenizer alloc] initWithUnit:NLTokenUnitSentence]; + [tokenizer setString:fileContent]; + + NSMutableArray *sentenceArray = [NSMutableArray array]; + + [tokenizer enumerateTokensInRange:NSMakeRange(0, [fileContent length]) + usingBlock:^(NSRange tokenRange, NLTokenizerAttributes attributes, BOOL *stop) { + NSString *sentence = [fileContent substringWithRange:tokenRange]; + // Check if the sentence is not empty or only consists of whitespace + if ([[sentence stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]] length] > 0) { + [sentenceArray addObject:sentence]; + } + }]; + return [sentenceArray copy]; +} + +-(std::vector)selectSentences:(const std::vector>&)sentenceEmbeddings documentEmbedding:(const std::vector&)documentEmbedding threshold:(double)threshold sentences:(NSArray *)sentences { + std::vector selectedSentences; + + for (size_t i = 0; i < sentenceEmbeddings.size(); ++i) { + double similarity = cosineSimilarity(sentenceEmbeddings[i], documentEmbedding); + // Check if the similarity score is above the threshold. + // Small modifications on threshold can have a huge impact on similarity. + // 0.7 vs 0.77 can make a big difference in sumarization. + if (similarity >= threshold) { + if (i < [sentences count]) { + // We extract the actual text sentence from the original document + // given its score. 
+ selectedSentences.push_back([sentences objectAtIndex:i].UTF8String); + } + } + } + + return selectedSentences; +} + +- (NSString *)summarizeFromResourceURL:(NSURL *)resourceURL threshold:(double)threshold { + std::vector> sentenceEmbeddings = [self embeddingsFromResourceURL:resourceURL]; + std::vector documentEmbedding = [self calculateMean:sentenceEmbeddings]; + NSArray *sentences = [self sentencesFromResourceURL:resourceURL]; + + // Select sentences based on the threshold + std::vector selectedSentences = [self selectSentences:sentenceEmbeddings documentEmbedding:documentEmbedding threshold:threshold sentences:sentences]; + std::string summary; + + for (const auto& sentence : selectedSentences) { + summary += sentence; + } + + return [NSString stringWithUTF8String:summary.c_str()]; +} + +- (NSString *)summarizeFileContents:(NSString *)fileContents threshold:(double)threshold { + NSArray *sentences = [self sentencesFromFileContent:fileContents]; + std::vector> sentenceEmbeddings = [self embeddingsFromSentences:sentences]; + std::vector documentEmbedding = [self calculateMean:sentenceEmbeddings]; + + // Select sentences based on the threshold + std::vector selectedSentences = [self selectSentences:sentenceEmbeddings documentEmbedding:documentEmbedding threshold:threshold sentences:sentences]; + std::string summary; + + for (const auto& sentence : selectedSentences) { + summary += sentence; + } + + return [NSString stringWithUTF8String:summary.c_str()]; +} + +- (BERTEmbeddingsData *)embeddingsForResourceURL:(NSURL *)resourceURL { + std::vector> allEmbeddings = [self embeddingsFromResourceURL:resourceURL]; + NSMutableArray *> *result = [NSMutableArray array]; + for (const auto& innerVector : allEmbeddings) { + NSMutableArray *innerArray = [NSMutableArray array]; + for (float floatValue : innerVector) { + [innerArray addObject:@(floatValue)]; + } + [result addObject:innerArray]; + } + BERTEmbeddingsData *data = [[BERTEmbeddingsData alloc] 
initWithResourceURL:resourceURL embeddings:result]; + return data; +} + +- (BERTEmbeddingsData *)embeddingsForFileContent:(NSString *)fileContent { + NSArray *sentences = [self sentencesFromFileContent:fileContent]; + std::vector> allEmbeddings = [self embeddingsFromSentences:sentences]; + NSMutableArray *> *result = [NSMutableArray array]; + for (const auto& innerVector : allEmbeddings) { + NSMutableArray *innerArray = [NSMutableArray array]; + for (float floatValue : innerVector) { + [innerArray addObject:@(floatValue)]; + } + [result addObject:innerArray]; + } + BERTEmbeddingsData *data = [[BERTEmbeddingsData alloc] initWithFileContents:fileContent embeddings:result]; + return data; +} + +- (NSArray *)findClosestTextForSentence:(NSString *)sentence embeddingsData:(NSArray *)embeddingsData topN:(NSInteger)topN { + if (embeddingsData.count == 0) { + return nil; + } + NSMutableArray *closestTexts = [NSMutableArray array]; + + std::vector inputEmbedding = [self embeddingsForSentence:sentence]; + std::vector> allSimilarities; + + for (BERTEmbeddingsData *data in embeddingsData) { + // Original sentences array from the resource file. we will use this to find the sentence match after applying the math on the embeddings. 
+ NSArray *sentenceArray; + if (data.resourceURL) { + sentenceArray = [self sentencesFromResourceURL:[data resourceURL]]; + } + if (data.fileContents) { + sentenceArray = [self sentencesFromFileContent:[data fileContents]]; + } + std::vector> allEmbeddings = [[data embeddings] stdVectorArray]; + if (allEmbeddings.size() <= 0) { + return [NSArray new]; + } + std::vector> topNSimilarities = [self findTopNSimilarInputVector:inputEmbedding textVectors:allEmbeddings N:topN]; + + // Get the actual sentences corresponding to the indices and store them with their similarity scores + for (const auto& similarity : topNSimilarities) { + size_t index = similarity.second; + if (index < [sentenceArray count]) { + NSString *similarText = sentenceArray[index]; + float similarityScore = similarity.first; + // Storing the tuple will allow us to later sort based on similarity score. + allSimilarities.push_back(std::make_pair(similarityScore, similarText)); + } + } + } + + // We need to sort based on similarity score to make sure we get the most accurate results across all files. 
+ std::sort(allSimilarities.begin(), allSimilarities.end(), [](const auto& a, const auto& b) { + return a.first > b.first; + }); + + // Return the top N closest texts from all data combined + for (size_t i = 0; i < MIN(topN, allSimilarities.size()); ++i) { + [closestTexts addObject:allSimilarities[i].second]; + } + + return [closestTexts copy]; +} + +// Calculate cosine similarity between two vectors +float cosineSimilarity(const std::vector& vec1, const std::vector& vec2) { + // Check if the vectors have the same length + if (vec1.size() != vec2.size()) { + return 0.0; + } + + // Calculate dot product + float dotProduct = 0.0; + for (size_t i = 0; i < vec1.size(); ++i) { + dotProduct += vec1[i] * vec2[i]; + } + + // Calculate magnitudes + float magnitude1 = 0.0; + float magnitude2 = 0.0; + for (size_t i = 0; i < vec1.size(); ++i) { + magnitude1 += vec1[i] * vec1[i]; + magnitude2 += vec2[i] * vec2[i]; + } + + // Calculate cosine similarity + if (magnitude1 == 0.0 || magnitude2 == 0.0) { + return 0.0; + } else { + return dotProduct / (sqrt(magnitude1) * sqrt(magnitude2)); + } +} + +@end diff --git a/sources/Bert/NSArray+Vector.h b/sources/Bert/NSArray+Vector.h new file mode 100644 index 0000000000000..40b7fc015f825 --- /dev/null +++ b/sources/Bert/NSArray+Vector.h @@ -0,0 +1,19 @@ +// +// NSArray+Vector.h +// +// +// Created by Marc Terns on 9/23/23. +// + +#import +#include + +NS_ASSUME_NONNULL_BEGIN + +@interface NSArray (Vector) + +- (std::vector>)stdVectorArray; + +@end + +NS_ASSUME_NONNULL_END diff --git a/sources/Bert/NSArray+Vector.mm b/sources/Bert/NSArray+Vector.mm new file mode 100644 index 0000000000000..45d4b21fe69f8 --- /dev/null +++ b/sources/Bert/NSArray+Vector.mm @@ -0,0 +1,29 @@ +// +// NSArray+Vector.m +// +// +// Created by Marc Terns on 9/23/23. 
+// + +#import "NSArray+Vector.h" + +@implementation NSArray (Vector) + +- (std::vector>)stdVectorArray { + std::vector> result; + + for (NSArray *innerArray in self) { + std::vector innerVector; + + for (NSNumber *number in innerArray) { + float floatValue = [number floatValue]; + innerVector.push_back(floatValue); + } + + result.push_back(innerVector); + } + + return result; +} + +@end diff --git a/sources/Bert/Resources/ggml-model-f32.bin b/sources/Bert/Resources/ggml-model-f32.bin new file mode 100644 index 0000000000000..f655b11346776 Binary files /dev/null and b/sources/Bert/Resources/ggml-model-f32.bin differ diff --git a/sources/Bert/bert.cpp b/sources/Bert/bert.cpp new file mode 100644 index 0000000000000..29510b907ccd6 --- /dev/null +++ b/sources/Bert/bert.cpp @@ -0,0 +1,1011 @@ +#include "bert.h" +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct bert_layer +{ + // normalization + struct ggml_tensor *ln_att_w; + struct ggml_tensor *ln_att_b; + + struct ggml_tensor *ln_out_w; + struct ggml_tensor *ln_out_b; + + // attention + struct ggml_tensor *q_w; + struct ggml_tensor *q_b; + struct ggml_tensor *k_w; + struct ggml_tensor *k_b; + struct ggml_tensor *v_w; + struct ggml_tensor *v_b; + + struct ggml_tensor *o_w; + struct ggml_tensor *o_b; + + // ff + struct ggml_tensor *ff_i_w; + struct ggml_tensor *ff_i_b; + + struct ggml_tensor *ff_o_w; + struct ggml_tensor *ff_o_b; +}; + +struct bert_vocab +{ + std::map token_to_id; + std::map subword_token_to_id; + + std::map _id_to_token; + std::map _id_to_subword_token; +}; + +struct bert_model +{ + bert_hparams hparams; + + // embeddings weights + struct ggml_tensor *word_embeddings; + struct ggml_tensor *token_type_embeddings; + struct ggml_tensor *position_embeddings; + struct ggml_tensor *ln_e_w; + struct ggml_tensor *ln_e_b; + + std::vector layers; + + struct ggml_context *ctx; + std::map tensors; +}; + +// Replacement for 
std::vector that doesn't require zero-initialization. +struct bert_buffer { + uint8_t * data = NULL; + size_t size = 0; + + void resize(size_t size) { + delete[] data; + data = new uint8_t[size]; + this->size = size; + } + + ~bert_buffer() { + delete[] data; + } +}; + + +struct bert_ctx +{ + bert_model model; + bert_vocab vocab; + + size_t mem_per_token; + int64_t mem_per_input; + int32_t max_batch_n; + bert_buffer buf_compute; +}; + +int32_t bert_n_embd(bert_ctx * ctx) +{ + return ctx->model.hparams.n_embd; +} + +int32_t bert_n_max_tokens(bert_ctx * ctx) +{ + return ctx->model.hparams.n_max_tokens; +} + +const char* bert_vocab_id_to_token(bert_ctx * ctx, bert_vocab_id id) { + bert_vocab & vocab = ctx->vocab; + auto it = vocab._id_to_token.find(id); + if (it != vocab._id_to_token.end()) + { + return it->second.c_str(); + } + it = vocab._id_to_subword_token.find(id); + if (it != vocab._id_to_subword_token.end()) + { + return it->second.c_str(); + } + return "[UNK TOKEN from bert_vocab]"; +} + +// +// Cli interface +// + +void bert_print_usage(char **argv, const bert_params ¶ms) +{ + fprintf(stderr, "usage: %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "options:\n"); + fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); + fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); + fprintf(stderr, " prompt to start generation with (default: random)\n"); + fprintf(stderr, " --port p port to bind in server mode (default: %d)\n", params.port); + fprintf(stderr, " -m FNAME, --model FNAME\n"); + fprintf(stderr, " model path (default: %s)\n", params.model); + fprintf(stderr, "\n"); +} + + +bool bert_params_parse(int argc, char **argv, bert_params ¶ms) +{ + for (int i = 1; i < argc; i++) + { + std::string arg = argv[i]; + + if (arg == "-t" || arg == 
"--threads") + { + params.n_threads = std::stoi(argv[++i]); + } + else if (arg == "-p" || arg == "--prompt") + { + params.prompt = argv[++i]; + } + else if (arg == "--port") + { + params.port = std::stoi(argv[++i]); + } + else if (arg == "-m" || arg == "--model") + { + params.model = argv[++i]; + } + else if (arg == "-h" || arg == "--help") + { + bert_print_usage(argv, params); + exit(0); + } + else + { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + bert_print_usage(argv, params); + exit(0); + } + } + + return true; +} + +// +// Tokenizing +// + +static size_t utf8_len(char src) +{ + const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4}; + uint8_t highbits = static_cast(src) >> 4; + return lookup[highbits]; +} + +std::string stripAccents(const std::string &inputString) +{ + std::string resultString; + std::map accentMap = {{"À", 'A'},{"Á", 'A'}, + {"Â", 'A'},{"Ã", 'A'},{"Ä", 'A'},{"Å", 'A'},{"à", 'a'},{"á", 'a'}, + {"â", 'a'},{"ã", 'a'},{"ä", 'a'},{"å", 'a'},{"È", 'E'},{"É", 'E'}, + {"Ê", 'E'},{"Ë", 'E'},{"è", 'e'},{"é", 'e'},{"ê", 'e'},{"ë", 'e'}, + {"Ì", 'I'},{"Í", 'I'},{"Î", 'I'},{"Ï", 'I'},{"ì", 'i'},{"í", 'i'}, + {"î", 'i'},{"ï", 'i'},{"Ò", 'O'},{"Ó", 'O'},{"Ô", 'O'},{"Õ", 'O'}, + {"Ö", 'O'},{"ò", 'o'},{"ó", 'o'},{"ô", 'o'},{"õ", 'o'},{"ö", 'o'}, + {"Ù", 'U'},{"Ú", 'U'},{"Û", 'U'},{"Ü", 'U'},{"ù", 'u'},{"ú", 'u'}, + {"û", 'u'},{"ü", 'u'},{"Ý", 'Y'},{"ý", 'y'},{"Ç", 'C'},{"ç", 'c'}, + {"Ñ", 'N'},{"ñ", 'n'}, + }; + + for (size_t i = 0; i < inputString.length();) + { + int len = utf8_len(inputString[i]); + std::string curChar = inputString.substr(i, len); + auto iter = accentMap.find(curChar); + if (iter != accentMap.end()) + { + resultString += iter->second; + } + else + { + resultString += curChar; + } + i += len; + } + + return resultString; +} + +std::string bert_normalize_prompt(const std::string &text) +{ + // TODO: handle chinese characters? 
https://github.com/huggingface/tokenizers/blob/ef5f50605ddf9f8caef1598c0e4853862b9707a7/tokenizers/src/normalizers/bert.rs#L98 + std::string text2 = stripAccents(text); + for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) + { + char c = text2[i]; + if (c >= 'A' && c <= 'Z') + text2[i] = c - 'A' + 'a'; + } + return text2; +} +void bert_tokenize( + struct bert_ctx * ctx, + const char * text, + bert_vocab_id * tokens, + int32_t * n_tokens, + int32_t n_max_tokens) +{ + int cls_tok_id = 101; + int sep_tok_id = 102; + const bert_vocab &vocab = ctx->vocab; + + std::string str = text; + + std::vector words; + // first split the text into words + { + str = bert_normalize_prompt(str); + + std::string pat = R"([[:punct:]]|[[:alpha:]]+|[[:digit:]]+)"; + + std::regex re(pat); + std::smatch m; + + while (std::regex_search(str, m, re)) + { + for (std::string x : m) + { + words.push_back(x); + } + str = m.suffix(); + } + } + + int32_t t = 0; + tokens[t++] = cls_tok_id; + + // find the longest tokens that form the words: + for (const auto &word : words) + { + if (word.size() == 0) + continue; + + int i = 0; + int n = word.size(); + auto *token_map = &vocab.token_to_id; + loop: + while (i < n) + { + if (t >= n_max_tokens - 1) + break; + int j = n; + while (j > i) + { + auto it = token_map->find(word.substr(i, j - i)); + if (it != token_map->end()) + { + tokens[t++] = it->second; + i = j; + token_map = &vocab.subword_token_to_id; + goto loop; + } + --j; + } + if (j == i) + { + fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).data()); + token_map = &vocab.subword_token_to_id; + ++i; + } + } + } + tokens[t++] = sep_tok_id; + *n_tokens = t; +} + +// +// Loading and setup +// + +struct bert_ctx * bert_load_from_file(const char *fname, struct bert_hparams bert_params) +{ + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) + { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname); + return nullptr; + } + + // verify magic + { + 
uint32_t magic; + fin.read((char *)&magic, sizeof(magic)); + if (magic != 0x67676d6c) + { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname); + return nullptr; + } + } + + bert_ctx * new_bert = new bert_ctx; + bert_model & model = new_bert->model; + bert_vocab & vocab = new_bert->vocab; + + // load hparams + { + auto &hparams = model.hparams; + + fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *)&hparams.n_max_tokens, sizeof(hparams.n_max_tokens)); + fin.read((char *)&hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *)&hparams.n_intermediate, sizeof(hparams.n_intermediate)); + fin.read((char *)&hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *)&hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *)&hparams.f16, sizeof(hparams.f16)); + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_max_tokens = %d\n", __func__, hparams.n_max_tokens); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_intermediate = %d\n", __func__, hparams.n_intermediate); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: f16 = %d\n", __func__, hparams.f16); + } + + // load vocab + { + int32_t n_vocab = model.hparams.n_vocab; + + std::string word; + for (int i = 0; i < n_vocab; i++) + { + uint32_t len; + fin.read((char *)&len, sizeof(len)); + + word.resize(len); + fin.read((char *)word.data(), len); + + if (word[0] == '#' && word[1] == '#') + { + vocab.subword_token_to_id[word.substr(2)] = i; + vocab._id_to_subword_token[i] = word; + } + + if (vocab.token_to_id.count(word) == 0) + { + vocab.token_to_id[word] = i; + vocab._id_to_token[i] = word; + } + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = GGML_TYPE_COUNT; + switch (model.hparams.f16) + { + 
case 0: + wtype = GGML_TYPE_F32; + break; + case 1: + wtype = GGML_TYPE_F16; + break; + case 2: + wtype = GGML_TYPE_Q4_0; + break; + case 3: + wtype = GGML_TYPE_Q4_1; + break; + default: + { + fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", + __func__, fname, model.hparams.f16); + bert_free(new_bert); + return nullptr; + } + } + + auto &ctx = model.ctx; + + size_t model_mem_req = 0; + + { + const auto &hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_max_tokens = hparams.n_max_tokens; + const int n_intermediate = hparams.n_intermediate; + const int n_vocab = hparams.n_vocab; + + // Calculate size requirements + + model_mem_req += n_embd * n_vocab * ggml_type_sizef(wtype); // word_embeddings + model_mem_req += n_embd * 2 * ggml_type_sizef(wtype); // token_type_embeddings + model_mem_req += n_embd * n_max_tokens * ggml_type_sizef(wtype); // position_embeddings + + model_mem_req += 2 * n_embd * ggml_type_sizef(GGML_TYPE_F32); // ln_e_* + + model_mem_req += 4 * n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ln_* + + model_mem_req += 4 * n_layer * (n_embd * n_embd * ggml_type_sizef(wtype)); // kqvo weights + model_mem_req += 4 * n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // kqvo bias + + model_mem_req += 2 * n_layer * (n_embd * n_intermediate * ggml_type_sizef(wtype)); // ff_*_w + model_mem_req += n_layer * (n_intermediate * ggml_type_sizef(GGML_TYPE_F32)); // ff_i_b + model_mem_req += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ff_o_b + + model_mem_req += (5 + 16 * n_layer) * 512; // object overhead + + printf("%s: ggml ctx size = %6.2f MB\n", __func__, model_mem_req / (1024.0 * 1024.0)); + } + + // create the ggml context + { + struct ggml_init_params params = { + .mem_size = model_mem_req, + .mem_buffer = NULL, + .no_alloc = false, + }; + + model.ctx = ggml_init(params); + if (!model.ctx) + { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + 
bert_free(new_bert); + return nullptr; + } + } + + // prepare memory for the weights + { + const auto &hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_intermediate = hparams.n_intermediate; + const int n_max_tokens = hparams.n_max_tokens; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + model.word_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.token_type_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, 2); + model.position_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_max_tokens); + + model.ln_e_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_e_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["embeddings.word_embeddings.weight"] = model.word_embeddings; + model.tensors["embeddings.token_type_embeddings.weight"] = model.token_type_embeddings; + model.tensors["embeddings.position_embeddings.weight"] = model.position_embeddings; + + model.tensors["embeddings.LayerNorm.weight"] = model.ln_e_w; + model.tensors["embeddings.LayerNorm.bias"] = model.ln_e_b; + + for (int i = 0; i < n_layer; ++i) + { + auto &layer = model.layers[i]; + + layer.ln_att_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_att_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_out_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.q_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.k_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.k_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.v_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.o_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.o_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + 
layer.ff_i_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_intermediate); + layer.ff_i_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_intermediate); + + layer.ff_o_w = ggml_new_tensor_2d(ctx, wtype, n_intermediate, n_embd); + layer.ff_o_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + + model.tensors["encoder.layer." + std::to_string(i) + ".attention.self.query.weight"] = layer.q_w; + model.tensors["encoder.layer." + std::to_string(i) + ".attention.self.query.bias"] = layer.q_b; + model.tensors["encoder.layer." + std::to_string(i) + ".attention.self.key.weight"] = layer.k_w; + model.tensors["encoder.layer." + std::to_string(i) + ".attention.self.key.bias"] = layer.k_b; + model.tensors["encoder.layer." + std::to_string(i) + ".attention.self.value.weight"] = layer.v_w; + model.tensors["encoder.layer." + std::to_string(i) + ".attention.self.value.bias"] = layer.v_b; + model.tensors["encoder.layer." + std::to_string(i) + ".attention.output.LayerNorm.weight"] = layer.ln_att_w; + model.tensors["encoder.layer." + std::to_string(i) + ".attention.output.LayerNorm.bias"] = layer.ln_att_b; + model.tensors["encoder.layer." + std::to_string(i) + ".attention.output.dense.weight"] = layer.o_w; + model.tensors["encoder.layer." + std::to_string(i) + ".attention.output.dense.bias"] = layer.o_b; + + model.tensors["encoder.layer." + std::to_string(i) + ".intermediate.dense.weight"] = layer.ff_i_w; + model.tensors["encoder.layer." + std::to_string(i) + ".intermediate.dense.bias"] = layer.ff_i_b; + + model.tensors["encoder.layer." + std::to_string(i) + ".output.LayerNorm.weight"] = layer.ln_out_w; + model.tensors["encoder.layer." + std::to_string(i) + ".output.LayerNorm.bias"] = layer.ln_out_b; + model.tensors["encoder.layer." + std::to_string(i) + ".output.dense.weight"] = layer.ff_o_w; + model.tensors["encoder.layer." 
+ std::to_string(i) + ".output.dense.bias"] = layer.ff_o_b; + } + } + + // load weights + { + int n_tensors = 0; + size_t total_size = 0; + + printf("%s: ", __func__); + + while (true) + { + int32_t n_dims; + int32_t length; + int32_t ftype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ftype), sizeof(ftype)); + + if (fin.eof()) + { + break; + } + + int64_t nelements = 1; + int64_t ne[2] = {1, 1}; + for (int i = 0; i < n_dims; ++i) + { + int32_t ne_cur; + fin.read(reinterpret_cast(&ne_cur), sizeof(ne_cur)); + ne[i] = ne_cur; + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name.data()) == model.tensors.end()) + { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); + bert_free(new_bert); + return nullptr; + } + + auto tensor = model.tensors[name.data()]; + if (ggml_nelements(tensor) != nelements) + { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); + bert_free(new_bert); + return nullptr; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) + { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld], expected [%lld, %lld]\n", + __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); + bert_free(new_bert); + return nullptr; + } + + if (0) + { + static const char *ftype_str[] = { + "f32", + "f16", + "q4_0", + "q4_1", + }; + printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor) / 1024.0 / 1024.0, ggml_nbytes(tensor)); + } + + size_t bpe = 0; + + switch (ftype) + { + case 0: + bpe = ggml_type_size(GGML_TYPE_F32); + break; + case 1: + bpe = ggml_type_size(GGML_TYPE_F16); + break; + case 2: + bpe = ggml_type_size(GGML_TYPE_Q4_0); + assert(ne[0] % 64 == 0); + break; + case 3: + bpe = ggml_type_size(GGML_TYPE_Q4_1); 
+ assert(ne[0] % 64 == 0); + break; + default: + { + fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); + bert_free(new_bert); + return nullptr; + } + }; + + if ((nelements * bpe) / ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) + { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %llu\n", + __func__, name.data(), ggml_nbytes(tensor), nelements * bpe); + bert_free(new_bert); + return nullptr; + } + + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + + // printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); + total_size += ggml_nbytes(tensor); + if (++n_tensors % 8 == 0) + { + printf("."); + fflush(stdout); + } + } + + printf(" done\n"); + + printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size / 1024.0 / 1024.0, n_tensors); + } + + fin.close(); + + // Calculate space requirements for setting up context buffers later + { + bert_vocab_id tokens[] = {0, 1, 2, 3}; + // TODO: We set the initial buffer size to 32MB and hope it's enough. Maybe there is a better way to do this? + new_bert->buf_compute.resize(32 * 1024 * 1024); + bert_eval(new_bert, 1, tokens, 4, nullptr); + new_bert->max_batch_n = 0; + + int32_t N = bert_params.n_max_tokens; + new_bert->mem_per_input = 1.1 * (new_bert->mem_per_token * N); // add 10% to account for ggml object overhead + + } + printf("%s: mem_per_token %zu KB, mem_per_input %lld MB\n", __func__, new_bert->mem_per_token / (1 << 10), new_bert->mem_per_input / (1 << 20)); + + return new_bert; +} + +void bert_resize_ctx(bert_ctx * ctx, int32_t new_size) { + int64_t buf_size_new = ctx->mem_per_input * new_size; + + // TODO: Max memory should be a param? 
Now just 1 GB + int64_t GB = 1 << 30; + //printf("%s: requested_buf_size %lldMB\n", __func__, buf_size_new / (1 << 20)); + if (buf_size_new > GB) { + int32_t adjusted_new_size = GB / ctx->mem_per_input; + if (adjusted_new_size < 1) adjusted_new_size = 1; + //printf("%s: requested batch size %d, actual new batch size %d\n", __func__, new_size, adjusted_new_size); + new_size = adjusted_new_size; + buf_size_new = ctx->mem_per_input * new_size; + } + if (new_size > ctx->max_batch_n) { + ctx->buf_compute.resize(buf_size_new); + ctx->max_batch_n = new_size; + } +} + +void bert_free(bert_ctx * ctx) { + ggml_free(ctx->model.ctx); + delete ctx; +} + +void bert_eval( + struct bert_ctx *ctx, + int32_t n_threads, + bert_vocab_id *tokens, + int32_t n_tokens, + float *embeddings) +{ + bert_eval_batch(ctx, n_threads, 1, &tokens, &n_tokens, embeddings ? &embeddings : nullptr); +} + +void bert_eval_batch( + bert_ctx * ctx, + int32_t n_threads, + int32_t n_batch_size, + bert_vocab_id ** batch_tokens, + int32_t * n_tokens, + float ** batch_embeddings) +{ + const bert_model& model = ctx->model; + bool mem_req_mode = !batch_embeddings; + // batch_embeddings is nullptr for the initial memory requirements run + if (!mem_req_mode && n_batch_size > ctx->max_batch_n) { + bert_resize_ctx(ctx, n_batch_size); + if (n_batch_size > ctx->max_batch_n) { + fprintf(stderr, "%s: tried to increase buffers to batch size %d but failed\n", __func__, n_batch_size); + return; + } + } + + // TODO: implement real batching + for (int ba = 0; ba < n_batch_size; ba++) + { + const int N = n_tokens[ba]; + const auto &tokens = batch_tokens[ba]; + + const auto &hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_max_tokens = hparams.n_max_tokens; + const int n_head = hparams.n_head; + + const int d_head = n_embd / n_head; + + std::vector result; + if (N > n_max_tokens) + { + fprintf(stderr, "Too many tokens, maximum is %d\n", n_max_tokens); + return; 
+ } + + auto & mem_per_token = ctx->mem_per_token; + auto & buf_compute = ctx->buf_compute; + + struct ggml_init_params params = { + .mem_size = buf_compute.size, + .mem_buffer = buf_compute.data, + .no_alloc = false, + }; + + struct ggml_context *ctx0 = ggml_init(params); + struct ggml_cgraph *gf = ggml_new_graph(ctx0); + + // Embeddings. word_embeddings + token_type_embeddings + position_embeddings + struct ggml_tensor *token_layer = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + memcpy(token_layer->data, tokens, N * ggml_element_size(token_layer)); + + struct ggml_tensor *token_types = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + ggml_set_zero(token_types); + + struct ggml_tensor *positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); + for (int i = 0; i < N; i++) + { + ggml_set_i32_1d(positions, i, i); + } + + struct ggml_tensor *inpL = ggml_get_rows(ctx0, model.word_embeddings, token_layer); + + inpL = ggml_add(ctx0, + ggml_get_rows(ctx0, model.token_type_embeddings, token_types), + inpL); + inpL = ggml_add(ctx0, + ggml_get_rows(ctx0, model.position_embeddings, positions), + inpL); + + // embd norm + { + inpL = ggml_norm(ctx0, inpL, 1e-5); + + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.ln_e_w, inpL), + inpL), + ggml_repeat(ctx0, model.ln_e_b, inpL)); + } + // layers + for (int il = 0; il < n_layer; il++) + { + struct ggml_tensor *cur = inpL; + + // self-attention + { + struct ggml_tensor *Qcur = cur; + Qcur = ggml_reshape_3d(ctx0, + ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].q_b, Qcur), + ggml_mul_mat(ctx0, model.layers[il].q_w, Qcur)), + d_head, n_head, N); + struct ggml_tensor *Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); + + struct ggml_tensor *Kcur = cur; + Kcur = ggml_reshape_3d(ctx0, + ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].k_b, Kcur), + ggml_mul_mat(ctx0, model.layers[il].k_w, Kcur)), + d_head, n_head, N); + struct ggml_tensor *K = ggml_permute(ctx0, Kcur, 0, 2, 1, 3); + + struct ggml_tensor *Vcur = cur; + Vcur = 
ggml_reshape_3d(ctx0, + ggml_add(ctx0, ggml_repeat(ctx0, model.layers[il].v_b, Vcur), + ggml_mul_mat(ctx0, model.layers[il].v_w, Vcur)), + d_head, n_head, N); + struct ggml_tensor *V = ggml_permute(ctx0, Vcur, 0, 2, 1, 3); + + struct ggml_tensor *KQ = ggml_mul_mat(ctx0, K, Q); + // KQ = soft_max(KQ / sqrt(head width)) + KQ = ggml_soft_max(ctx0, + ggml_scale(ctx0, + KQ, + 1.0f / sqrt((float)d_head))); + + V = ggml_cont(ctx0, ggml_transpose(ctx0, V)); + struct ggml_tensor *KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cpy(ctx0, + KQV, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + } + // attention output + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].o_b, cur), + ggml_mul_mat(ctx0, model.layers[il].o_w, cur)); + + // re-add the layer input + cur = ggml_add(ctx0, cur, inpL); + + // attention norm + { + cur = ggml_norm(ctx0, cur, 1e-5); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_att_w, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_att_b, cur)); + } + struct ggml_tensor *att_output = cur; + // intermediate_output = self.intermediate(attention_output) + cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].ff_i_b, cur), + cur); + cur = ggml_gelu(ctx0, cur); + + // layer_output = self.output(intermediate_output, attention_output) + cur = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, model.layers[il].ff_o_b, cur), + cur); + // attentions bypass the intermediate layer + cur = ggml_add(ctx0, att_output, cur); + + // output norm + { + cur = ggml_norm(ctx0, cur, 1e-5); + + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, model.layers[il].ln_out_w, cur), + cur), + ggml_repeat(ctx0, model.layers[il].ln_out_b, cur)); + } + inpL = cur; + } + inpL = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); + // pooler + struct ggml_tensor *sum = 
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, 1); + ggml_set_f32(sum, 1.0f / N); + inpL = ggml_mul_mat(ctx0, inpL, sum); + + // normalizer + ggml_tensor *length = ggml_sqrt(ctx0, + ggml_sum(ctx0, ggml_sqr(ctx0, inpL))); + +// inpL = ggml_scale(ctx0, inpL, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length)); + struct ggml_tensor* divResult = ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length); + float divScalar = *ggml_get_data_f32(divResult); + inpL = ggml_scale_inplace(ctx0, inpL, divScalar); + + ggml_tensor *output = inpL; + // run the computation + ggml_build_forward_expand(gf, output); + ggml_graph_compute_with_ctx(ctx0, gf, n_threads); + + + // float *dat = ggml_get_data_f32(output); + // pretty_print_tensor(dat, output->ne, output->nb, output->n_dims - 1, ""); + + #ifdef GGML_PERF + // print timing information per ggml operation (for debugging purposes) + // requires GGML_PERF to be defined + ggml_graph_print(&gf); + #endif + + if (!mem_req_mode) { + memcpy(batch_embeddings[ba], (float *)ggml_get_data(output), sizeof(float) * n_embd); + } else { + mem_per_token = ggml_used_mem(ctx0) / N; + + // printf("used_mem = %zu KB \n", ggml_used_mem(ctx0) / 1024); + // printf("mem_per_token = %zu KB \n", mem_per_token / 1024); + } + + ggml_free(ctx0); + } +} + +void bert_encode( + struct bert_ctx *ctx, + int32_t n_threads, + const char *texts, + float *embeddings) +{ + bert_encode_batch(ctx, n_threads, 1, 1, &texts, &embeddings); +} + +void bert_encode_batch( + struct bert_ctx *ctx, + int32_t n_threads, + int32_t n_batch_size, + int32_t n_inputs, + const char ** texts, + float **embeddings) +{ + // TODO: Disable batching for now + n_batch_size = 1; + /* + if (n_batch_size > n_inputs) { + n_batch_size = n_inputs; + } + if (n_batch_size > ctx->max_batch_n) { + bert_resize_ctx(ctx, n_batch_size); + n_batch_size = ctx->max_batch_n; + } + */ + + int32_t N = bert_n_max_tokens(ctx); + + std::vector buf_tokens; + // Most of this buffer will be unused in typical case where inputs are 
not that long. + buf_tokens.resize(N * n_inputs); + std::vector n_tokens = std::vector(n_inputs); + std::vector unsorted_tokens(n_inputs); + bert_vocab_id* it_tokens = buf_tokens.data(); + for (int i = 0; i < n_inputs; i++) { + unsorted_tokens[i] = it_tokens; + bert_tokenize(ctx, texts[i], it_tokens, &n_tokens[i], N); + it_tokens += n_tokens[i]; + } + + if (n_batch_size == n_inputs) { + bert_eval_batch(ctx, n_threads, n_batch_size, unsorted_tokens.data(), n_tokens.data(), embeddings); + } else { + // sort the inputs by tokenized length, batch and eval + + std::vector indices; + indices.reserve(n_inputs); + for (int i = 0; i < n_inputs; i++) + { + indices.push_back(i); + } + + std::vector sorted_n_tokens = std::vector(n_inputs); + + std::vector sorted_tokens(n_inputs); + + std::sort(indices.begin(), indices.end(), [&](int a, int b) + { return n_tokens[a] < n_tokens[b]; }); + + std::vector sorted_embeddings(n_inputs); + memcpy(sorted_embeddings.data(), embeddings, n_inputs * sizeof(float *)); + + for (int i = 0; i < n_inputs; i++) { + sorted_embeddings[i] = embeddings[indices[i]]; + sorted_tokens[i] = unsorted_tokens[indices[i]]; + sorted_n_tokens[i] = n_tokens[indices[i]]; + } + + for (int i = 0; i < n_inputs; i += n_batch_size) + { + if (i + n_batch_size > n_inputs) { + n_batch_size = n_inputs - i; + } + bert_eval_batch(ctx, n_threads, n_batch_size, &sorted_tokens[i], &sorted_n_tokens[i], &sorted_embeddings[i]); + } + } +} diff --git a/sources/Bert/bert.h b/sources/Bert/bert.h new file mode 100644 index 0000000000000..808b82bedd8cc --- /dev/null +++ b/sources/Bert/bert.h @@ -0,0 +1,99 @@ +#ifndef BERT_H +#define BERT_H + +#include +#include +#include + +#if defined(_WIN32) +#define BERT_API __declspec(dllexport) +#else +#define BERT_API __attribute__ ((visibility ("default"))) +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +struct bert_params +{ + int32_t n_threads = 6; + int32_t port = 8080; // server mode port to bind + + const char* model = 
"models/all-MiniLM-L6-v2/ggml-model-q4_0.bin"; // model path + const char* prompt = "test prompt"; +}; + +struct bert_hparams +{ + int32_t n_vocab = 30522; + int32_t n_max_tokens = 2048; + int32_t n_embd = 256; + int32_t n_intermediate = 1536; + int32_t n_head = 12; + int32_t n_layer = 6; + int32_t f16 = 1; +}; + +BERT_API bool bert_params_parse(int argc, char **argv, bert_params ¶ms); + +struct bert_ctx; + +typedef int32_t bert_vocab_id; + +BERT_API struct bert_ctx * bert_load_from_file(const char * fname, struct bert_hparams bert_params); +BERT_API void bert_free(bert_ctx * ctx); + +// Main api, does both tokenizing and evaluation + +BERT_API void bert_encode( + struct bert_ctx * ctx, + int32_t n_threads, + const char * texts, + float * embeddings); + +// n_batch_size - how many to process at a time +// n_inputs - total size of texts and embeddings arrays +BERT_API void bert_encode_batch( + struct bert_ctx * ctx, + int32_t n_threads, + int32_t n_batch_size, + int32_t n_inputs, + const char ** texts, + float ** embeddings); + +// Api for separate tokenization & eval + +BERT_API void bert_tokenize( + struct bert_ctx * ctx, + const char * text, + bert_vocab_id * tokens, + int32_t * n_tokens, + int32_t n_max_tokens); + +BERT_API void bert_eval( + struct bert_ctx * ctx, + int32_t n_threads, + bert_vocab_id * tokens, + int32_t n_tokens, + float * embeddings); + +// NOTE: for batch processing the longest input must be first +BERT_API void bert_eval_batch( + struct bert_ctx * ctx, + int32_t n_threads, + int32_t n_batch_size, + bert_vocab_id ** batch_tokens, + int32_t * n_tokens, + float ** batch_embeddings); + +BERT_API int32_t bert_n_embd(bert_ctx * ctx); +BERT_API int32_t bert_n_max_tokens(bert_ctx * ctx); + +BERT_API const char* bert_vocab_id_to_token(bert_ctx * ctx, bert_vocab_id id); + +#ifdef __cplusplus +} +#endif + +#endif // BERT_H diff --git a/sources/Bert/include/BERTEmbeddingsData.h b/sources/Bert/include/BERTEmbeddingsData.h new file mode 100644 index 
0000000000000..d7615614f03a5 --- /dev/null +++ b/sources/Bert/include/BERTEmbeddingsData.h @@ -0,0 +1,24 @@ +// +// EmbeddingsData.h +// +// +// Created by Marc Terns on 9/23/23. +// + +#import <Foundation/Foundation.h> + +NS_ASSUME_NONNULL_BEGIN + +@interface BERTEmbeddingsData : NSObject + +@property (nonatomic, strong, nullable) NSURL *resourceURL; +@property (nonatomic, strong, nullable) NSString *fileContents; +@property (nonatomic, strong) NSArray<NSArray<NSNumber *> *> *embeddings; + +- (instancetype)initWithResourceURL:(NSURL *)resourceURL embeddings:(NSArray<NSArray<NSNumber *> *> *)embeddings; + +- (instancetype)initWithFileContents:(NSString *)fileContents embeddings:(NSArray<NSArray<NSNumber *> *> *)embeddings; +@end + + +NS_ASSUME_NONNULL_END diff --git a/sources/Bert/include/BertEncoder.h b/sources/Bert/include/BertEncoder.h new file mode 100644 index 0000000000000..a1e2b1ba2331b --- /dev/null +++ b/sources/Bert/include/BertEncoder.h @@ -0,0 +1,73 @@ +// +// BertEncoder.h +// Bert +// +// Created by Marc Terns on 9/10/23. +// + +#import <Foundation/Foundation.h> +#import "BERTEmbeddingsData.h" + +NS_ASSUME_NONNULL_BEGIN + +@interface BertEncoder : NSObject + +- (instancetype)initWithModelURL:(NSURL *)modelURL; + +- (void)start; + +- (void)stop; + +/// Given a sentence and a file to look from, returns the top N results that closely match the input sentence in semantic meaning. +/// +/// - Parameters: +/// - sentence: The input sentence you want to find the closest texts from. +/// - resourceURL: The file you want to find similarities from. +/// - topN: The topN results. +/// +- (NSArray *)findClosestTextForSentence:(NSString *)sentence inResourceURL:(NSURL *)resourceURL topN:(NSInteger)topN; + +/// Given a text file, and a threshold, summarizes the contents within the resource file. +/// - Parameters: +/// - resourceURL: The file to summarize +/// - threshold: The threshold for summarization. Values range from 0.0 to 1.0. The higher the value, the smaller the length of the summary. +/// Values over 0.79 might not produce a summary. 
+/// For concise summary, with only the most critical information, you might set a relatively high threshold (0.7 or higher). +/// The larger the text, the larger the customization of threshold values. +/// +- (NSString *)summarizeFromResourceURL:(NSURL *)resourceURL threshold:(double)threshold; + +/// Given the contents of a file, and a threshold, summarizes the contents of it. +/// - Parameters: +/// - fileContents: The file contents to summarize +/// - threshold: The threshold for summarization. Values range from 0.0 to 1.0. The higher the value, the smaller the length of the summary. +/// Values over 0.79 might not produce a summary. +/// For concise summary, with only the most critical information, you might set a relatively high threshold (0.7 or higher). +/// The larger the text, the larger the customization of threshold values. +/// +- (NSString *)summarizeFileContents:(NSString *)fileContents threshold:(double)threshold; + +/// Returns all the sentence embeddings found in a given file. This API allows customers to store their own embeddings to disk and avoid having to calculate them every time. +/// - Parameters: +/// - resourceURL: The file where you want to get the embeddings from. +/// +- (BERTEmbeddingsData *)embeddingsForResourceURL:(NSURL *)resourceURL; + +/// Returns all the sentence embeddings found in the given file contents as NSString. This API allows customers to store their own embeddings to disk and avoid having to calculate them every time. +/// - Parameters: +/// - fileContent: The file contents you want to get the embeddings from. +/// +- (BERTEmbeddingsData *)embeddingsForFileContent:(NSString *)fileContent; + +/// Given a sentence and embeddings to look from, returns the top N results that closely match the input sentence in semantic meaning. +/// +/// - Parameters: +/// - sentence: The input sentence you want to find the closest texts from. +/// - embeddingsData: The embeddings to perform the similarity check on the sentence. 
+/// - topN: The topN results. +/// +- (nullable NSArray *)findClosestTextForSentence:(NSString *)sentence embeddingsData:(NSArray *)embeddingsData topN:(NSInteger)topN; + +@end + +NS_ASSUME_NONNULL_END diff --git a/spm-headers/common.h b/spm-headers/common.h new file mode 120000 index 0000000000000..a7a0ec76817bf --- /dev/null +++ b/spm-headers/common.h @@ -0,0 +1 @@ +../common/common.h \ No newline at end of file diff --git a/spm-headers/ggml-alloc.h b/spm-headers/ggml-alloc.h new file mode 120000 index 0000000000000..a49d385a1b864 --- /dev/null +++ b/spm-headers/ggml-alloc.h @@ -0,0 +1 @@ +../ggml-alloc.h \ No newline at end of file diff --git a/spm-headers/ggml-backend.h b/spm-headers/ggml-backend.h new file mode 120000 index 0000000000000..17c2cf14fe02b --- /dev/null +++ b/spm-headers/ggml-backend.h @@ -0,0 +1 @@ +../ggml-backend.h \ No newline at end of file diff --git a/spm-headers/ggml-metal.h b/spm-headers/ggml-metal.h new file mode 120000 index 0000000000000..71a79bb4e70f3 --- /dev/null +++ b/spm-headers/ggml-metal.h @@ -0,0 +1 @@ +../ggml-metal.h \ No newline at end of file diff --git a/spm-headers/grammar-parser.h b/spm-headers/grammar-parser.h new file mode 120000 index 0000000000000..4a163770c8270 --- /dev/null +++ b/spm-headers/grammar-parser.h @@ -0,0 +1 @@ +../common/grammar-parser.h \ No newline at end of file diff --git a/spm-headers/log.h b/spm-headers/log.h new file mode 120000 index 0000000000000..aad60170eedde --- /dev/null +++ b/spm-headers/log.h @@ -0,0 +1 @@ +../common/log.h \ No newline at end of file diff --git a/spm-headers/sampling.h b/spm-headers/sampling.h new file mode 120000 index 0000000000000..2f12a8713b726 --- /dev/null +++ b/spm-headers/sampling.h @@ -0,0 +1 @@ +../common/sampling.h \ No newline at end of file diff --git a/tests/BertTests/BertTests.mm b/tests/BertTests/BertTests.mm new file mode 100644 index 0000000000000..3bc0383b726ea --- /dev/null +++ b/tests/BertTests/BertTests.mm @@ -0,0 +1,109 @@ +// +// BertTests.m +// 
+// +// Created by Marc Terns on 9/13/23. +// + +#import <XCTest/XCTest.h> +#import "BertEncoder.h" + +@interface BertTests : XCTestCase + +@end + +@implementation BertTests + +- (void)setUp { + // Put setup code here. This method is called before the invocation of each test method in the class. +} + +- (void)tearDown { + // Put teardown code here. This method is called after the invocation of each test method in the class. +} + +- (void)testSimilarTexts { + NSBundle *bundle = SWIFTPM_MODULE_BUNDLE; + NSURL *resourceURL = [bundle URLForResource:@"ggml-model-f32" withExtension:@"bin"]; + NSURL *earningsResourceURL = [bundle URLForResource:@"earnings" withExtension:@"txt"]; + BertEncoder * encoder = [[BertEncoder alloc] init]; + [encoder start]; + NSArray * result = [encoder findClosestTextForSentence:@"Who is the CEO?" inResourceURL:earningsResourceURL topN:3]; + NSLog(@"%@", result); + XCTAssertEqual(result.count, 3); + XCTAssertTrue([result.firstObject isEqualToString:@"Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer\n"]); +} + +- (void)testSimilarTextsFromEmbeddings { + NSBundle *bundle = SWIFTPM_MODULE_BUNDLE; + NSURL *resourceURL = [bundle URLForResource:@"ggml-model-f32" withExtension:@"bin"]; + NSURL *earningsResourceURL = [bundle URLForResource:@"earnings" withExtension:@"txt"]; + BertEncoder * encoder = [[BertEncoder alloc] initWithModelURL:resourceURL]; + [encoder start]; + BERTEmbeddingsData *data = [encoder embeddingsForResourceURL:earningsResourceURL]; + NSArray * result = [encoder findClosestTextForSentence:@"Who is the CEO?" 
embeddingsData:@[data] topN:3]; + NSLog(@"%@", result); + XCTAssertEqual(result.count, 3); + XCTAssertTrue([result.firstObject isEqualToString:@"Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer\n"]); +} + +- (void)testSimilarTextsFromEmbeddingsInMultipleFiles { + NSBundle *bundle = SWIFTPM_MODULE_BUNDLE; + NSURL *resourceURL = [bundle URLForResource:@"ggml-model-f32" withExtension:@"bin"]; + NSURL *earningsResourceURL = [bundle URLForResource:@"earnings" withExtension:@"txt"]; + NSURL *example_EN = [bundle URLForResource:@"example_EN" withExtension:@"txt"]; + BertEncoder * encoder = [[BertEncoder alloc] initWithModelURL:resourceURL]; + [encoder start]; + BERTEmbeddingsData *earningsData = [encoder embeddingsForResourceURL:earningsResourceURL]; + BERTEmbeddingsData *exampleData = [encoder embeddingsForResourceURL:example_EN]; + NSArray * result = [encoder findClosestTextForSentence:@"What is summarization?" embeddingsData:@[earningsData, exampleData] topN:3]; + NSLog(@"%@", result); + XCTAssertEqual(result.count, 3); + XCTAssertTrue([result.firstObject isEqualToString:@"In text summarization, the goal is to create a concise and coherent summary of a document. 
"]); +} + +- (void)testSummary_earnings { + NSBundle *bundle = SWIFTPM_MODULE_BUNDLE; + NSURL *resourceURL = [bundle URLForResource:@"ggml-model-f32" withExtension:@"bin"]; + NSURL *earningsResourceURL = [bundle URLForResource:@"earnings" withExtension:@"txt"]; + BertEncoder * encoder = [[BertEncoder alloc] initWithModelURL:resourceURL]; + [encoder start]; + NSLog(@"%@", [encoder summarizeFromResourceURL:earningsResourceURL threshold:0.50]); +} + +- (void)testSummary_EN { + NSBundle *bundle = SWIFTPM_MODULE_BUNDLE; + NSURL *resourceURL = [bundle URLForResource:@"ggml-model-f32" withExtension:@"bin"]; + NSURL *earningsResourceURL = [bundle URLForResource:@"example_EN" withExtension:@"txt"]; + BertEncoder * encoder = [[BertEncoder alloc] initWithModelURL:resourceURL]; + [encoder start]; + NSLog(@"%@", [encoder summarizeFromResourceURL:earningsResourceURL threshold:0.75]); +} + +- (void)testSummary_large_EN { + NSBundle *bundle = SWIFTPM_MODULE_BUNDLE; + NSURL *resourceURL = [bundle URLForResource:@"ggml-model-f32" withExtension:@"bin"]; + NSURL *earningsResourceURL = [bundle URLForResource:@"example_large_EN" withExtension:@"txt"]; + BertEncoder * encoder = [[BertEncoder alloc] initWithModelURL:resourceURL]; + [encoder start]; + NSLog(@"%@", [encoder summarizeFromResourceURL:earningsResourceURL threshold:0.6]); +} + +-(void)testSummary_ES { + NSBundle *bundle = SWIFTPM_MODULE_BUNDLE; + NSURL *resourceURL = [bundle URLForResource:@"ggml-model-f32" withExtension:@"bin"]; + NSURL *earningsResourceURL = [bundle URLForResource:@"example_ES" withExtension:@"txt"]; + BertEncoder * encoder = [[BertEncoder alloc] initWithModelURL:resourceURL]; + [encoder start]; + NSLog(@"%@", [encoder summarizeFromResourceURL:earningsResourceURL threshold:0.75]); +} + +-(void)testSummary_CAT { + NSBundle *bundle = SWIFTPM_MODULE_BUNDLE; + NSURL *resourceURL = [bundle URLForResource:@"ggml-model-f32" withExtension:@"bin"]; + NSURL *earningsResourceURL = [bundle URLForResource:@"example_CAT" 
withExtension:@"txt"]; + BertEncoder * encoder = [[BertEncoder alloc] initWithModelURL:resourceURL]; + [encoder start]; + NSLog(@"%@", [encoder summarizeFromResourceURL:earningsResourceURL threshold:0.55]); +} +@end diff --git a/tests/BertTests/resources/earnings.txt b/tests/BertTests/resources/earnings.txt new file mode 100644 index 0000000000000..c3144e3922f1d --- /dev/null +++ b/tests/BertTests/resources/earnings.txt @@ -0,0 +1,605 @@ +Zscaler (ZS) Q4 2023 Earnings Call Transcript +ZS earnings call for the period ending June 30, 2023. + +Motley Fool Transcribing +Zscaler (ZS 1.69%) +Q4 2023 Earnings Call +Sep 05, 2023, 4:30 p.m. ET + +Contents: + +Prepared Remarks +Questions and Answers +Call Participants +Prepared Remarks: + + +Operator + +Thank you for standing by, and welcome to Zscaler earnings announcement fiscal year 2023 fourth quarter conference call. At this time, all participants are on a listen-only mode. After the speakers' presentation, there will be a question-and-answer session. [Operator instructions] As a reminder, today's call is being recorded. + +I will now turn the conference over to your host, Mr. Bill Choi, senior vice president of investor relations and strategic finance. Please go ahead. + +Bill Choi -- Senior Vice President, Investor Relations and Strategic Finance + +Good afternoon, everyone, and welcome to the Zscaler fiscal fourth quarter and full year 2023 earnings conference call. On the call with me today are Jay Chaudhry, chairman and CEO; and Remo Canessa, CFO. Please note that we have posted our earnings release and a supplemental financial schedule to our investor relations website. Unless otherwise noted, all numbers we talk about today will be on an adjusted non-GAAP basis. + +You will find the reconciliation of GAAP to the non-GAAP financial measures in our earnings release. 
I'd like to remind you that today's discussion will contain forward-looking statements, including but not limited to the company's anticipated future revenue, calculated billings, operating performance, gross margin, operating expenses, operating income, net income, free cash flow, dollar-based net retention rate, future hiring decisions, remaining performance obligations, income taxes, earnings per share, our objectives and outlook, our customer response to our products, and our market share and market opportunity. These statements and other comments are not guarantees of future performance, but rather are subject to risk and uncertainty, some of which are beyond our control. These forward-looking statements apply as of today, and you should not rely on them as representing our views in the future. + +10 stocks we like better than Zscaler +When our award-winning analyst team has a stock tip, it can pay to listen. After all, the newsletter they have run for over a decade, Motley Fool Stock Advisor, has tripled the market.* + +They just revealed what they believe are the ten best stocks for investors to buy right now… and Zscaler wasn't one of them! That's right -- they think these 10 stocks are even better buys. + +See the 10 stocks + +*Stock Advisor returns as of September 5, 2023 + +We undertake no obligation to update these statements after this call. For a more complete discussion of the risks and uncertainties, please see our filings with the SEC, as well as in today's earnings release. I would also like to inform you that we'll be attending the following upcoming events in September: Goldman Sachs Communacopia and Technology Conference on September 6th, Wolfe Research TMT Conference on September 7th, and Piper Sandler Growth Frontiers Conference on September 12th. Now, I'll turn the call over to Jay. + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +Thank you, Bill. We had a strong close to our fiscal year. 
In Q4, we delivered 43% revenue growth and 38% billings growth, with balanced growth across all verticals, customer segments, and geographies. For the full year, our revenue grew 48% to $1.6 billion, and billings grew 37% to over $2 billion. + +In addition to achieving record billings in the quarter, we also set records across several other measures. We added the highest number of $1 million ARR customers, generated record new pipeline for Q4, and attained record operating profit margin. I'm proud of our team's achievements and humbled by the trust our customers are placing in our platform. While the macro environment remains challenging, we are executing well. + +With cybersecurity as a high priority, IT executives are moving forward with zero trust initiatives driving our business. As I mentioned before, we are partnering earlier with CXOs to create compelling CFO-ready business cases with clear ROI and payback periods. As our results demonstrate, refining our high touch sales process is helping get large deals across the finish line. We have a blueprint for delivering immediate value, which drives faster upsells, often within 12 months of initial purchase. + +We closed a record number of deals over $1 million ACV in Q4, driven by broad-based strength across our key industry verticals. In addition to our industry-leading top-line growth, we are generating record profitability. Due to our spending discipline, we achieved a record 19% operating margin as we more than doubled our operating income on a year-over-year basis. These outstanding results reflect the strong unit economics of our business with best-in-class 80% gross margins. + +Our innovation and customer obsession drove our net promoter score to exceed 80, which is more than two times the average for SaaS companies and contributed to our high 90% gross retention rate. 
I am very pleased to announce that we doubled our annual recurring revenue from $1 billion to over $2 billion in seven quarters, reaching a milestone only a select handful of SaaS companies have achieved. We secure over 7,700 customers and protect over 41 million users. With every customer looking to adopt zero trust architecture in today's world of cloud, AI, and mobility, we believe we are in the early stages of capturing a large share of our $72 billion market opportunity. + +We have our sights set on achieving our next goal of $5 billion in ARR. We are on a mission to take zero trust everywhere to users, workloads, and OT systems and become the go-to platform for vendor consolidation, cost savings, increased business agility, and better cyber and data protection. To fully realize the business value enabled by our platform, customers are increasingly buying Zscaler for users, our complete zero trust solution for user protection, which includes ZIA, ZPA, ZDX, and data protection. In addition, we are gaining traction with workload protection powered by the same core ZIA and ZPA technology. + +These broader platform purchases drove 37% year-over-year growth in customers with greater than $1 million in ARR. We ended the quarter with nearly 450 such customers, including 43 customers exceeding $5 million. Let me highlight one deal where the customer purchased all product pillars. A large global system integrator partner headquartered in Asia became a customer and adopted our platform to enable their work-from-anywhere strategy. + +They purchased ZIA, ZDX, and advanced data protection for 300,000 users and ZPA for 270,000 users. They also purchased workload protection and deception technology to improve application security for their hybrid environment. This customer can now open new offices and offshore development centers much faster and more securely. 
They are also seeing a 50% reduction in the time to onboard employees as a highly distributed organization with data everywhere. + +Data protection was a major consideration for them and accounted for 20% of the deal value. Data protection is an important new pillar of growth for us, approaching a $250 million in ARR and growing 60% per year. We are increasingly replacing incumbent legacy DLP in the largest of enterprises with data protection representing a $10 billion-plus opportunity for us. Due to targeted investments and rapid innovation, we believe our data protection solution is now the widest and the deepest in the market. + +And we are taking data protection beyond users to workloads and devices. Let me highlight a new logo win led by data protection. A large telecom operator purchased Zscaler for users for 80,000 employees. Data protection was a key driver for the win as this customer became increasingly uncomfortable with gaps left by their firewall and VPN-based security, which struggles with data protection for TLS-encrypted traffic that comprises over 85% of their internet traffic. + +With Zscaler, this customer is consolidating multiple point products and expects a payback on the purchase within nine months. Our emerging products, including ZDX and Zscaler for Workloads, continued to see increased adoption and contributed 18% of our new business in fiscal '23. We expect emerging products contribution to increase to over 20% in fiscal '24. I'd like to highlight two deals that were driven by our emerging products. + +In a seven-figure ACV upsell deal, a Fortune 50 insurance company purchased ZDX Advanced for all 170,000 users after realizing value from their initial ZIA deployment. With the Zscaler platform already in place, ZDX gets deployed quickly, reduces troubleshooting time, and improves field agent productivity. We directly impact the customers revenue and their agents ability to earn commissions. 
ZDX exemplifies the platform benefits of our zero trust exchange and expands our share of customer spending beyond security. + +In another seven-figure ACV upsell win, a Fortune 10 healthcare company purchased Zscaler for Workloads just one quarter after making their initial purchase of Zscaler for Users for all 150,000 employees. With workload protection, this customer is accelerating their plans to move most of their on-prem workloads to the cloud, as well as protect workload traffic from over 9,000 locations. Lastly, let me highlight our success in the federal vertical. Twelve of the 15 cabinet level agencies are our customers, and we are starting to see larger awards from these agencies. + +Let me highlight one such agencywide deal. We were awarded a multiyear contract from an agency with more than 100,000 users. The value of this contract will be realized over time based on deployment with the field units. Against this award, we received a mid seven-figure ACV task order for ZIA and ZPA. + +This customer chose the Zscaler over firewall vendors because our cloud-native architecture delivers better security and user experience, all while meeting FedRAMP requirements. We remain the only cloud security service to have two key products at the highest level of FedRAMP certification. These certifications and the executive order for zero trust security are driving a significant opportunity for us in the federal market. Next, let me discuss some key industry trends. + +Cybersecurity remains the No. 1 IT priority, and having the right security architecture is fundamental to reducing cyber risk. According to our latest Zscaler ThreatLabZ VPN risk report, nearly half of enterprises reported they were targeted by cyberattackers who exploited a VPN vulnerability, and a third of enterprises fell victim to ransomware attacks within the past year. 
Growing cyberthreats, including ransomware, are driving IT leaders to transform security from legacy network security to zero trust architecture. + +True zero trust security can't be built by spinning up a bunch of virtual firewalls and VPNs in a public cloud. Do you know any VPN vendor whose products have not been compromised? Our architectural differentiation gives us long-term advantage. As you may have seen, investors and regulators are increasing pressure on companies to improve cybersecurity. With the new SEC requirement to report a material security incident in four business days, there will be increased executive and board level focus on cybersecurity. + +Zscaler's zero trust exchange platform delivers comprehensive security controls, full visibility, and fast reporting, each of which is now a must have for meeting corporate governance requirements. In this environment, customers cannot risk transformational and mission-critical projects with immature offerings from unproven vendors. Both legacy vendors and newcomers in the security industry have tried to mimic our messaging. The reality is that no vendor comes close to providing a depth of functionality and level of performance at our scale. + +Good enough in cybersecurity is never good enough. Next, let me discuss AI, which is top of mind for customers and investors. Generative AI has tremendous potential to unlock insights, improve employee productivity, and solve complex problems. However, the risk of data loss and issue of data sovereignty are limiting the potential of this new technology. + +To address these concerns, we already delivered data protection capabilities that prevent the leakage of sensitive data through AI prompts and appropriated into public training models. For example, our browser isolation session would not let employees paste or type any sensitive information into gen AI prompts. This not only enables employees to use AI, but also do so in a secure manner for enterprises. 
Our engineering teams have innovated rapidly on new AI-driven functionality that's available in our premium price bundles, such as auto classification of unstructured data for advanced data protection and auto segmentation of applications for zero trust access. + +Second, we will have new products based on AI that will provide significant upsell opportunities with our customers. We recently launched Risk360, which enables executive teams and boards to better understand the risk posture of their organizations and provides unparalleled visibility with up-to-date security status and corrective actions they can implement in a timely fashion. Using AI, I believe we will be able to use our unique data set to also predict and prevent most of today's ransomware and other sophisticated attacks on our customers. AI-driven cyber insights and prevention have the potential to add tremendous customer value, and we believe we can monetize that opportunity. + +Zscaler has AI experts and data scientists and valuable anonymized private data to customize and train LLM models for the security domain. Based on a proxy architecture, our zero trust exchange is like a private switchboard that captures all communication logs. We have the largest inline security cloud, inspecting over 320 billion transactions daily with transactions doubling every 18 months. These logs provide more than 500 trillion signals per day that feed our AI models for better detection of user and application traffic anomalies, resulting in a positive network effect of superior threat protection for our customers. + +We have been investing in AI for quite a while, including our first AI acquisition in 2018, and we will continue to invest in fiscal '24 for rapid AI innovations, cloud enhancements, and go-to market to take our AI solutions to the market. All investments will be made within the envelope of margin guidance that Remo will discuss. In closing, we are excited about the opportunities ahead. 
We have a track record of building and growing new innovations like ZDX data protection and zero trust for workloads, and we are now turning our attention to AI. + +We believe these new products will contribute increasingly to our future growth. Our business value message is resonating in this challenging macro environment, and more customers are adopting our broader platform to consolidate multiple-point products. We believe customers trust Zscaler more than any other provider for securing their zero trust journey. We have grown our global team to nearly 6,000 employees with the mission to secure the hyperconnected world of cloud, AI, and mobility. + +I'm extremely proud of the strong growth and profitability we delivered in fiscal '23. I want to thank our employees and our partners for their tireless efforts and commitment to our customers success. We will invest aggressively to delight our customers and capture the large opportunity ahead of us while continuing to deliver operational excellence. Now, I'd like to turn over the call to Remo for our financial results. + +Remo Canessa -- Chief Financial Officer + +Thank you, Jay. We are pleased with our strong performance in Q4 and solid execution even with ongoing customer scrutiny of large deals. Revenue was $455 million, up 43% year over year and up 9% sequentially. ZPA product revenue grew 57% year over year. + +This will be the last quarter that we break out ZPA revenue separately as we are increasingly selling solution bundles that involve multiple product pillars. From a geographic perspective, Americas represented 53% of revenue, EMEA was 32%, and APJ was 15%. For the full year, revenue was $1.62 billion, up 48% year over year. Our total calculated billings in Q4 grew 38% year over year to $719 million. + +On a sequential basis, total billings grew 49% quarter over quarter. Total billings benefited from a $20 million upfront billing on a multiyear deal. 
As a reminder, our contract terms are typically one to three years. We primarily invoice our customers one year in advance. + +Our calculated current billings grew 33% year over year and 42% quarter over quarter. From a vertical perspective, we saw strong growth across our key verticals. Our remaining performance obligations, or RPO, grew 35% from a year ago to $3.51 billion. The current RPO is approximately 49% of the total RPO. + +We ended the year with 449 customers with greater than $1 million in ARR, adding a record 49 $1 million ARR customers in the quarter. The continued strength of this large customer metric speaks to the strategic role we play in our customers' digital transformation initiatives. We also ended the quarter with 2,609 customers with greater than $100,000 in ARR. Our 12-month trailing dollar-based net retention rate was 121% in Q4. + +Turning to the rest of our financial performance total gross margin of 80.7% compares to 80.2% in the prior quarter and 81.6% in the year-ago quarter. Higher public cloud usage for emerging products drove the year-over-year change in gross margin. As Jay mentioned, emerging products are growing as a percentage of our new business and contributed 18% of new ACV in fiscal 2023 compared to 14% in fiscal 2022. Moving on, our total operating expenses increased 3% sequentially and 27% year over year to $281 million. + +Due to our focus on spending discipline this year, we generated significant operating leverage in our model, with operating margin reaching 19%, an increase of approximately 200 basis points year over year. Our free cash flow margin was 22%, including data center capex of approximately 6% of revenue. For the full year, our operating margin was 15%, and free cash flow margin was 21%. We ended the quarter with over $2 billion in cash, cash equivalents, and short-term investments. + +In August, we've completed an assessment of the useful lives of our servers and network equipment. 
With advances in technology and efficiencies in how we operate our server and network equipment, starting in fiscal 2024, we're extending the depreciable useful life for these assets in our cloud infrastructure from four to five years.
+ +I would also like to remind investors that a number of our emerging products, including newer products like ZDX and Zscaler for workloads will initially have lower gross margins than our core products. We're currently managing the emerging products for time to market and grow, not optimizing them for gross margins. In addition, we'll continue to invest in our cloud and AI infrastructure to scale with the growing demand; operating profit in the range of $70 million to $72 million; net other income of $14 million; income taxes of $8 million; earnings per share in the range of $0.48 to $0.49, assuming 159 million fully diluted shares. For the full year fiscal 2024, we expect revenue in the range of $2.05 billion to $2.065 billion, or year-over-year growth of approximately 27% to 28%; calculated billings in the range of $2.52 billion to $2.56 billion, or a year-over-year growth of 24% to 26%. + +We expect Q1 billings to grow approximately 30% on a year-over-year basis. We also expect our first-half mix to be approximately 42% of our full year billings guide; operating profit in the range of $330 million to $340 million, which reflects approximately 100 to 150 basis points of operating margin improvement compared to last year; income taxes of $35 million; earnings per share in the range of $2.20 to $2.25, assuming approximately 161 million fully diluted shares. We expect our free cash flow margin to be slightly above 20%. Finally, we expect our data center capex to be high single-digit percentage of revenue for the full year. + +Operator, you may now open the call for questions. + +Questions & Answers: + + +Operator + +[Operator instructions] Our first question comes from the line of Brad Zelnick of Deutsche Bank. Your line is open. + +Brad Zelnick -- Deutsche Bank -- Analyst + +Great, thanks so much. And congrats on such a strong finish to the year. 
Jay, can you address why Zscaler wasn't included in Gartner's most recent Magic Quadrant for Single-Vendor SASE and if that's slowing you down at all? Because it certainly doesn't seem to be. And maybe just a quick one for Remo. + +Remo, guiding 25% billings growth off of a year where you're clearly outperforming peers is quite impressive. What should we be mindful of beyond what you've already told us in terms of your inputs and degrees of upside and downside risk versus what you've delivered historically? Thanks. + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +Right. So, Brad, this MQ is not slowing us down at all. SASE is a broad generic term. When Gartner started it a few years ago, it was an integration of SD-WAN and SSE, the gateway products we have. + +And we had done integration with every SD-WAN vendor that matters out there. So, but the MQ that got started this time was for single vendors SASE with SD-WAN offering in it. You know, we have often said that SD-WAN is the opposite of zero trust. We do offer zero trust SASE, but we don't offer SD-WAN SASE. + +So, we are not in MQ. We are about changing the world, not really propagating the old world, and we are very successful in doing so. + +Remo Canessa -- Chief Financial Officer + +And, Brad, related to guidance, you know, the positives are, our pipeline is record pipeline. Our execution was very good in Q4, which gives us confidence. You know, the potential downside is the global macro environment. We've taken that into account. + +From our view, you know, for fiscal '24, we're seeing a similar environment as we did in '23. You know, the guidance that we have, you called out, you know, 24% to 26%, you know, we feel is very strong guidance, which is supported by, you know, our optimism that we're seeing in our business related to also what we see, you know, global macro, you know, situation. + +Brad Zelnick -- Deutsche Bank -- Analyst + +Thank you very much, guys. 
+ +Operator + +Thank you. One moment, please. Our next question comes from the line of Matt Hedberg of RBC. Your line is open. + +Matt Hedberg -- RBC Capital Markets -- Analyst + +Great, thanks for taking my questions, guys. Congrats on the quarter. Jay, I was particularly impressed. You know, comments on emerging product success seems to be really kicking in here this year and even maybe more so next year. + +I was curious on workload protection seems to be resonating in a lot of partner conversations, and you mentioned on the call. Maybe just a little bit more details on why that's resonating because I think if some of these add-on products continue to do what they're doing, it certainly opens up much-larger TAM than kind of core ZIA/ZPA. + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +So, when we started ZIA/ZPA, we started for users -- a user can securely communicate with applications. It's natural to extend ZIA for Workloads, ZPA for Workloads because workflows need to talk to each other in zero trust fashion. So, our customers understand it and appreciate it. It's a great upsell opportunity for us to expand our ARPU, as well as customer spend with us. + +So, we are seeing good success. The deal size is still smaller because the number of workloads in various stages, various customers. But we literally have no competition when it comes to this area because we are the only provider that's actually offering zero trust communication among workloads or workloads through internet. We are upbeat about it for fiscal 24 'as well. + +Matt Hedberg -- RBC Capital Markets -- Analyst + +Thanks, Jay. Congrats. + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +Thank you. + +Operator + +Thank you. One moment, please. Our next question comes from the line of Roger Boyd of UBS. Again, Roger Boyd of Your Line is open. + +Roger Boyd -- UBS -- Analyst + +Hey, great. Thanks for taking the question, and congrats on a nice quarter. 
Jay, a lot has been made about Microsoft's entry into this market. Very high level, but Microsoft pretty specifically is targeting the SSE security edge space and not SASE space. + +And just maybe a follow-up to Brad's question, do you see that distinction as maybe further legitimizing the SSE approach relative to single-vendor SASE? Thanks. + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +Thank you. So, SSE widely understood and accepted. It's essentially the combination of ZIA and ZPA we built. But the TAM for SSE is large, and it's getting larger than the market appreciates today. + +You know, for Microsoft, it was natural to be in the market for identity and endpoint security as they have traditionally owned Active Directory and Windows operating system. But inline security is a totally new area for them. This area of SSE that we pioneered has a very high bar, high performance, great security, no slowing down, and supporting a range of destinations. In fact, it's like being the [Inaudible] we have to connect to applications that are in Microsoft and AWS and GCP and a thousand SaaS applications out there. + +So, customers like the positioning of a provider like Zscaler that's not tied to applications itself. And Zscaler has earned the trust of large enterprises that take time for any new entrant. Yes, Microsoft entries further validation, but we don't believe it will impact us because of a positioning of the large enterprise market. And there may be some impact on the lower end of the market. + +Operator + +Thank you. One moment, please. Our next question comes from the line of Ittai Kidron of Oppenheimer and Company. Your line is open. + +Ittai Kidron -- Oppenheimer and Company -- Analyst + +Thanks. Hi, gentlemen. Congrats, great finish for the year. I guess I want to talk about the competitive landscape of the more traditional firewall guys, Palo Alto and Fortinet more recently. 
+ +And, I guess, checkpoint for an acquisition recently as well and clearly trying to make big efforts here. Maybe you could talk about the competitive environment. In what way is it today different than what it was a year ago? And what you seeing from your competitors more kind of near term here? + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +Thank you. On the high end of the market, we -- where we do extremely well, we really haven't seen a change. If there's any change, I would say our position has further solidified, and I wouldn't say it has gotten somewhat easier on the higher end of the market. When it comes to the firewall market, you know, we are replacing firewalls in the branches. + +When Zscaler gets deployed with zero trust architecture, there's no room for any firewall on the branch office. Now, there are still firewalls in the data center and the like because we don't play inside the data center for east-west traffic and the like. But as customers are doing local breakout or traffic from every location, the amount of traffic going through the data center is slowing down, which is bound to impact the sales and demand for firewalls out there. So, we do believe that the shift we have been talking about to truly zero trust away from traditional firewalls and VPNs is happening, and it will continue. + +Ittai Kidron -- Oppenheimer and Company -- Analyst + +Appreciate it. Thank you. + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +Thank you. + +Operator + +Thank you. One moment, please. Our next question comes from the line of Andrew Nowinski of Wells Fargo. Your line is open. + +Andy Nowinski -- Wells Fargo Securities -- Analyst + +Great, thank you. Congrats on a great quarter, particularly on the billings. I mean it looks like you solidly exceeded that even if you pull out that $20 million deal upfront. So, I wanted to ask about the data protection products that you call out. 
+ +You spent a lot of time talking about it on the call. And, you know, based on the info in your slide deck, it looks like there's a lot of components to that beyond just data protection. So, maybe could you just talk about some of the vendors that you're competing against there? I think you said one customer, you replaced two different vendors. If you just expand on, you know, what you're seeing there from a competitive perspective and how competitive is it relative to like the markets for Zia and ZPA? Thanks. + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +Yeah. So, data protection becomes a natural thing for us when traffic starts flowing through Zscaler. So, literally, it's hard to have any other data protection vendor when Zscaler is actually sitting in the traffic path. So, what are some of the pieces of success here? First of all, DLP, data loss prevention, which essentially is done on the traffic that's in line. + +We are replacing -- the No. 1 vendor we are replacing there is Symantec, which is widely deployed. So, first, we deployed Symantec Blue Coat boxes, and now, the DLP, is the secondary piece. The third big area we are replacing is some of the CASB vendors. + +CASB early on got sold as a point product as a module. For us, it's the feature. So, any customer who has pure-play CASB deployed essentially gets kind of replaced by our data protection platform. So, those are two big areas. + +But in addition, now we are seeing some of the newer areas coming up. For example, our end-point DLP which we recently launched, is getting tremendous attraction out there. The email DLP module we added is wonderful as well. Through acquisition of Canonic, we have added the SaaS -- the ecosystem -- sorry, supply chain data protection. + +So, all this has made these pillars the most comprehensive platform, and it's setting traffic line. 
That's why we kind of called it out because the growth is great, over 60% year over year, and we are close to a quarter of $1 billion in ARR.
I would expect, you know, ramps, you know, just to kind of follow into fiscal '24, I would expect the same level of ramps in fiscal '24, you know, at this point. You know, the ramps do give us better visibility into billings because, basically, they'll start off with, you know, lower billings and then ramp up their billings in future periods. + +But, you know, it is a vehicle that we use. It's been very effective, and so we'll continue to use it. And, Jay? + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +The bigger the platform we sell, the more likely you need to provide ramp because of the more pieces to be done. Number two, there's some of the tighter macro environment with bigger deal scrutiny, ramp did pick up in the past year or so as compared to two years ago. We factor that in as a part of doing business, and it's not a bad thing. We just have to manage it right away. + +Brian Essex -- JPMorgan Chase and Company -- Analyst + +Great, thank you very much. + +Operator + +Thank you. One moment, please. Our next question comes from the line of Gray Powell of BTIG. Your line is open. + +Gray Powell -- BTIG -- Analyst + +OK, great. Yeah, Gray Powell from BTIG. Thanks for taking the question, and congrats on the good results. So, a couple related questions on my side. + +Can you talk about the visibility you have on late-stage pipeline today relative to this time last year? And I guess I'm just trying to get your confidence in billings. The higher percentage of deals today, does that help give you better visibility on the growth outlook? + +Remo Canessa -- Chief Financial Officer + +Our visibility is good. So, you know, as we talked about, you know, we've had a record pipeline and our execution also in Q4. So, I would say, you know, visibility for us is good and supports our guidance. You know, certainly the billings, you know, with the ramps, you know, give us also good visibility to. 
+ +So, you know, our guidance I feel, you know, 24% to 26% is, you know, very good guidance and takes into account our visibility and all factors. + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +And also, if I may add, we have a record pipeline, and we're seeing pretty good momentum on our business. We are in Vegas doing our sales kickoff a couple of weeks ago. The energy and excitement in the room could be felt actually. It was very good because our sales team have a lot of confidence. + +We talked about the record pipeline, the record deals out there. We talked about the new momentum the channel is adding to us. So, we feel very good about our fiscal '24 business. + +Gray Powell -- BTIG -- Analyst + +That's perfect. Thank you very much. + +Operator + +Thank you. One moment, please. Our next question comes from the line of Jonathan Ruykhaver of Cantor. Your line is open. + +Jonathan Ruykhaver -- Cantor Fitzgerald -- Analyst + +Yes, thank you. So, I have a question on the emerging product portfolio. You highlighted how it represented 18% of new business in fiscal '23. You expect that to get to plus 20% in fiscal '24. + +It just seems to me like, you know, plus 20%, I'm not sure, you know, what that plus could be. But it seems like a low bar just given your comments on data security, ZDX, and cloud, including the tailwind that you talked about from AI that is going to benefit some of those products. So, can you just help us understand, you know, demand dynamics within that portfolio, the puts and takes on specific products where you expect to see the strongest demand? + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +Yes, if I may start, it's all coming out of 100% only, OK? That's the most important point, when the overall growth is pretty strong, OK? And for the new product, even if it grows 70%, 80% a year, it has to work very hard to even take away 1% or 2% from the total new ACV. That's pretty significant. 
So, the growth of these new products is much faster than the growth of the overall new ACV. But they are fairly small and they're growing into good business. + +We called out data protection as it became a significant part of the business. And I think here, too, you're seeing a combination of workload protection and ZDX. Both are growing at a much faster rate. So, I think we are actually happy with the growth rate. + +But the point I was trying to make is they are trying to steal away the market share from the rest of the overall portfolio, which has a much bigger base. That's why the math looks small, but it is pretty impressive. + +Jonathan Ruykhaver -- Cantor Fitzgerald -- Analyst + +Yeah, understood. Good point, Jay. Thank you. + +Operator + +Thank you. One moment, please. Our next question comes from the line of Joshua Tilton of Wolfe Research. Your line is open. + +Josh Tilton -- Wolfe Research -- Analyst + +Hey, guys, thanks for taking my question, and I echo my congrats on a good quarter. I kind of want to go back to the first question. And if you look at the guidance, the implied new billings kind of looks like a little bit more aggressive, I would say, in the last two years. So, maybe just, you know, level set for us or set some guardrails or expectations around kind of the puts and takes on what it would take for you guys to kind of outperform what you laid out for the next 12 months for us, please. + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +Let me start to give you a big picture, and, Remo, you can add your color to it. So, our guidance is starting with, first of all, a record pipeline and the momentum we have in the business. And we have plenty of product to sell. We have a growing market opportunity and many business drivers. + +We have a lot of new customers who are really buying zero trust for better security, lots of market that's not covered. We are still in about 30% of the Global 2000 market. 
And we are seeing public sector coming strong, federal market coming strong, and that's really further pushed by some of the mandates that are happening out there. And you saw in some of the deals that are announced, customers are increasingly buying, more of these kind of platform leading to bigger deals. + +ZIA, ZPA, our flagship products, are still doing strong. We have factored into good growth for data protection, which complements -- it actually needs to go in every Zscaler customer. Emerging products are contributing nicely, and they are contributing. So, when you look at all this, that's what we took into factor to give you our guidance. + +Remo Canessa -- Chief Financial Officer + +Yeah, I mean, you know our guidance basically balances our business optimism that we see with our company, also with the macro environment. We feel, you know, in this market, this is very solid guidance. + +Josh Tilton -- Wolfe Research -- Analyst + +Super helpful. Thank you. + +Operator + +Thank you. One moment, please. Our next question comes from the line of Saket Kalia of Barclays. Your line is open. + +Saket Kalia -- Barclays -- Analyst + +OK, great. Hey, guys, thanks for taking my question here. And I echo my congrats on the result. Remo, maybe for you just to switch it up a little bit, great to see the free cash flow margin expansion thus far. + +Maybe looking forward, can you just talk about some of the puts and takes on free cash flow margin with, I think, the 20%-plus guide for next year? You know, how are you thinking about things like billing duration or capex or anything that's maybe influencing that number? Because it's great to see, you know, operating margin expand, why isn't free cash flow margin maybe expanding in the same way? Thanks. + +Remo Canessa -- Chief Financial Officer + +Yeah, Saket, that's a great question. You know, the billing duration, I would assume, so similar type billing duration as we had in fiscal '23, so no change. 
Capex, we do expect capex to be a higher percent. If you take a look at the last two years, our capex has been in the 6% range of our revenue. + +We expect our capex to be in the high single-digit type range. We're seeing the expansion of our business. We're making investments in our cloud. And again, it's -- that's the main reason. + +Saket Kalia -- Barclays -- Analyst + +Very helpful. Thanks. + +Operator + +Thank you. One moment, please. Our next question comes from the line of Gregg Moskowitz of Mizuho. Your line is open. + +Gregg Moskowitz -- Mizuho Securities -- Analyst + +All right, thank you for taking the question, and I'll echo my congrats. Jay, getting back to data protection. So, your multimodal DLP that combines video and audio formats, that's interesting to me. Curious if you had any early customer feedback on that feature. + +And then, secondly, what has been the early uptake around your new Risk360 offering just in terms of visualizing risk, etc.? Thank you. + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +So, multimodal DLP is still in development. It's not shipping, but there's a bunch of engagement with customers. And interest is high because there's nothing like this out there in the market. You know, Zscaler always likes to pioneer new things that no one has done out there. + +Risk360, on the other hand, is actually shipping. We have taken a bunch of orders. This product has more interest upfront than any of the other products I can think of because our engagement with CISOs are strong. When a CISO looks at Risk360 and say, "Wow, I have actually a single point to really tell me the holistic view of my business and actually where my risk factors are and what tangible prioritized actions I could take to improve my risk." So, this is a significant velocity going on. + +Very, very good feedback, and the product is growing very rapidly in terms of functionality as well. 
+ +Gregg Moskowitz -- Mizuho Securities -- Analyst + +Perfect. Thank you. + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +Thank you. + +Operator + +Thank you. One moment, please. Our next question comes from the line of John DiFucci of Guggenheim. Your line is open. + +Again, Mr. DiFucci, your line is open. + +John DiFucci -- Guggenheim Partners -- Analyst + +Oh, thank you. So, it does -- everybody has said it, the numbers look good. It was a nice quarter. But when we try to look at the new business signings, it looks like it saw a tick down against a similar comp versus, what I think, was sort of a monster quarter, the third quarter. + +And, Remo, you mentioned the continued macro pressure. And, of course, everybody sees that. But is this quarter how we should expect Zscaler to sort of be going forward in regards to business momentum and new business signings? Or was the third quarter a better gauge on what we should expect going forward against this macro backdrop? + +Remo Canessa -- Chief Financial Officer + +Yeah, I mean our new and upsell bookings were up year over year, so, you know, for Q4. So, it was a good quarter. You know, from my perspective, John, I mean, it's a huge market opportunity, you know, on the part of Zscaler. I don't want to make any projections related to our doing upsell billings because we don't give that guidance. + +But let me just say that, you know, our pipeline for new and upsell is very strong. You know, we had really strong and good execution in our Q4 which gives us confidence. So, I feel that we're well positioned to go forward and really, you know, do well. As I mentioned, it has come up a few times, you know, the wild card really is the global macro backdrop. + +And so, we're expecting the global macro environment to stay similar year over year. And so, we'll see how that plays through. 
But from a company perspective, for what we did in Q4 and how our business tracked, we had a very strong quarter, which gives us optimism going into fiscal '24. + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +The point I'll also add is, since Zscaler can actually reduce cost while reducing business risk, there's added attractions. We are able to engage with customers and close deals even when the macro is tight. We also kind of felt very good about the record number of $1 million deals we closed in Q4. So, I see strengths across all major areas, major vertical regions, and that's why we feel good about it. + +John DiFucci -- Guggenheim Partners -- Analyst + +That all makes sense, and thanks for taking my question. I mean what you guys are doing is better than most everybody out there. But I guess just a quick follow-up, you know, you said your bookings were up year over year. If you look at current billings -- and I don't know if you look at it that way, I guess I do, but -- and I'm looking -- I'm trying to back it with new ACV or new ARR, depending upon the company. + +For you, it's new ACV. Was that up year over year? For most companies, it's not. And we calculate it being down a little bit, but it's still better than most. + +Remo Canessa -- Chief Financial Officer + +Yeah, we're not commenting on new ACV. But our bookings were up year over year, and I think that's a good way to look at the business. + +John DiFucci -- Guggenheim Partners -- Analyst + +Thank you. + +Remo Canessa -- Chief Financial Officer + +Thank you. + +Operator + +Thank you. One moment, please. Our next question comes from the line of Adam Borg of Stifel. Your line is open. + +Adam Borg -- Stifel Financial Corp. -- Analyst + +Awesome, and thanks so much for taking the question. Maybe just for Remo, a couple of related housekeeping follow-up. 
So, NRR, I think, was at 121, below 125 for the first time after a number of quarters, was hoping you could talk a little bit more about that and expectations for next year. And I apologize if I missed it as a follow-up, but billing duration in the quarter, just what did that turn out to be? Was that a headwind or tailwind? Thanks so much. + +Remo Canessa -- Chief Financial Officer + +Yeah, so NRR, 121%, you know, we feel is outstanding. What we're seeing is that we're seeing more customers buying more of our platform upfront. So, when customers are buying more of your platform upfront, that'll impact, you know, purchase in the future. Also, you know, we called out on the call a Fortune 10 company, you know, bought basically within one quarter. + +So, if you're buying within the year based on the calculation for NRR, that impacts it. You know, from my perspective, you know, we've been saying this since our public offering, is billings is really the best measure to really, you know, look at Zscaler. And we still feel that. The only time we look at NRR is really at the end of the quarter. + +It's a metric that we look at. But, you know, really, what I look at is basically our total billings, whether it comes from new or upsell. You know, related to, you know, billing duration, it was a tailwind in Q4. + +Adam Borg -- Stifel Financial Corp. -- Analyst + +Great. Thanks so much. + +Operator + +Thank you. One moment, please. Our next question comes from the line of Peter Levine of Evercore. Your line is open. + +Peter Levine -- Evercore ISI -- Analyst + +Great, thanks for squeezing me in here, guys. Maybe one just follow-up on the gen AI opportunity, you know, your analyst -- not your analyst day, but your customer conference a couple of weeks ago, you announced the security autopilot. Maybe, Remo, for one, can you just put a final point in terms of how you plan on monetizing that as usage-based? Is it just an upsell to kind of the normal contract subscription? 
And then, second to Jay, is, you know, your competitors are all saying the same thing in terms of their data is proprietary. Maybe the same question to you is what do you think makes, you know, Zscaler data proprietary? What's your competitive advantage when you go into an RFP versus, call it, like a Palo, whoever it might be, that your data is proprietary? There is a moat around your business. + +What would your answer to that be? + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +So, let's start with data. There are lots of companies have lots of data. What's exciting about Zscaler is we are designed as a switchboard for all communication between different parties. A firewall is not a switchboard. + +A firewall is a door, it's a gate. It says you are inside, you are outside. That's number one. Number two, firewall logs are often what's known as short logs. + +Still, a small number of firewall transactions, whether they are on-prem or in the cloud [Inaudible] access and decrypted. If the records aren't -- if the transactions aren't decrypted, your logs aren't of much use at all. Or if you take DNS logs, they're not very useful at all. It says which DNS domain are you going to? We have full logs after decryption about all the information, including the full URL. + +The URL has a lot of useful information that we have. Before any reconnaissance -- before any breach happens, reconnaissance starts, it could be trying to exploit software vulnerabilities. It could be trying to phish those employees. All of that traffic goes through us, and we collect those logs. + +And that's why we can actually do some of the things like being able to predict potential breach and the like. So, that's one big part on the log side of it. The second question of how do we monetize AI-driven products, don't think of Zscaler having only AI-driven products as a separate charge. It is going to influence all of our product lines. 
+ +Today, some of our premium bundles include AI-powered products. So, we are actually monetizing it as a part of the premium bundles. And they are actually fairly highly priced as compared to other bundles. Products like Risk360, leveraging AI/ML big time, is charged SKU. + +Some of the new products we are building, they will have upsell, they'll have their own SKU. So, we think there's plenty of opportunities to charge because customers are seeing values from the products we're building. + +Peter Levine -- Evercore ISI -- Analyst + +Thank you. + +Operator + +Thank you. Our next question comes from the line of Patrick Colville. Your line is open. One moment. + +Patrick Colville -- Deutsche Bank -- Analyst + +All right. Thank you so much for putting me in. + +Remo Canessa -- Chief Financial Officer + + Hello? + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +Hello? + +Remo Canessa -- Chief Financial Officer + +Hey, Patrick, are you on? Patrick? Can we take our last question? + +Operator + +Our last -- Patrick's line was open. I don't know if he disconnected. One moment. Patrick, if you could star 1, 1 again. + +Remo Canessa -- Chief Financial Officer + +Let's try the next call. + +Operator + +OK, thank you. One moment. Our next question comes from the line of Ben Bollin of Cleveland Research. Your line is open. + +Ben Bollin -- Cleveland Research Company -- Analyst + +Good afternoon. Thanks for taking the question. I'm interested in how you feel about your progress throughout the broader channel, traditional two-tier cloud marketplace GSI. Any thoughts you have around kind of what you've learned over the last six, 12 months and how that's playing into your strategy over the next several years? Thank you. + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +Yeah, as you know, traditionally channel has played a limited role for us. We have been working on it. We are seeing more and more progress being made. 
Just to clarify, I mean, we do take all these to channel with some exceptions when customers insist on us. + +So, that's one part. We are seeing year after year, or in the past several quarters, channel providing more leverage to us. That means doing more work for us. We recently hired our channel chief, Karl Soderlund. + +He comes from extensive experience and strong relationships in the channel. And he has done a lot of work in three areas of the channel, system integrators, VARs, and service providers. In the system integration space, we have actually most of the large SIs. They have selected and deployed Zscaler for their own transformation and zero trust architecture. + +So, they are embedding our solution into the SI advisory services, which is very good because, then, it becomes a lot better. We also -- so we are counting on more leverage from broad channel, even the wider channel itself. They're going through some focused training program, enablement program, which will help them do more transformation with us. In fact, we had nearly 200 channel partners come to our sales kickoff and join hands and conduct sessions with our sales teams. + +We think that's bringing our teams a lot closer to work together, common account planning, and the like. So, we're pleased with the progress, and we think we're moving in the right direction. + +Ben Bollin -- Cleveland Research Company -- Analyst + +Thanks guys. Have a great night. + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +Thank you. + +Remo Canessa -- Chief Financial Officer + +Thank you. + +Operator + +Thank you. I'd like to turn the call back over to Jay Chaudhry for any closing remarks. + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +Well, thank you all for your interest in Zscaler. I'm looking forward to seeing you at some of our investor conferences. Thank you again. + +Remo Canessa -- Chief Financial Officer + +Thank you. + +Operator + +Thank you. 
Ladies and gentlemen, that concludes today's conference. Thank you all for participating. [Operator signoff] + +Duration: 0 minutes + +Call participants: + +Bill Choi -- Senior Vice President, Investor Relations and Strategic Finance + +Jay Chaudhry -- Founder, Chairman, and Chief Executive Officer + +Remo Canessa -- Chief Financial Officer + +Brad Zelnick -- Deutsche Bank -- Analyst + +Matt Hedberg -- RBC Capital Markets -- Analyst + +Roger Boyd -- UBS -- Analyst + +Ittai Kidron -- Oppenheimer and Company -- Analyst + +Andy Nowinski -- Wells Fargo Securities -- Analyst + +Brian Essex -- JPMorgan Chase and Company -- Analyst + +Gray Powell -- BTIG -- Analyst + +Jonathan Ruykhaver -- Cantor Fitzgerald -- Analyst + +Josh Tilton -- Wolfe Research -- Analyst + +Saket Kalia -- Barclays -- Analyst + +Gregg Moskowitz -- Mizuho Securities -- Analyst + +John DiFucci -- Guggenheim Partners -- Analyst + +Adam Borg -- Stifel Financial Corp. -- Analyst + +Peter Levine -- Evercore ISI -- Analyst + +Patrick Colville -- Deutsche Bank -- Analyst + +Ben Bollin -- Cleveland Research Company -- Analyst + +More ZS analysis + +All earnings call transcripts + +This article is a transcript of this conference call produced for The Motley Fool. While we strive for our Foolish Best, there may be errors, omissions, or inaccuracies in this transcript. As with all our articles, The Motley Fool does not assume any responsibility for your use of this content, and we strongly encourage you to do your own research, including listening to the call yourself and reading the company's SEC filings. Please see our Terms and Conditions for additional details, including our Obligatory Capitalized Disclaimers of Liability. 
\ No newline at end of file diff --git a/tests/BertTests/resources/example_CAT.txt b/tests/BertTests/resources/example_CAT.txt new file mode 100644 index 0000000000000..8e3ec67070ff9 --- /dev/null +++ b/tests/BertTests/resources/example_CAT.txt @@ -0,0 +1,8 @@ +En l'era de la tecnologia de la informació, les dades són la nova moneda. Cada dia, generem grans quantitats de dades, des de publicacions a les xarxes socials i correus electrònics fins a lectures de sensors i transaccions financeres. El repte no és recopilar dades, sinó donar-los sentit. +L'anàlisi de dades és un procés fonamental per extreure coneixements valuosos de les dades. Una de les tècniques clau en l'anàlisi de dades és la agrupació. La agrupació és un mètode per agrupar punts de dades similars. S'utilitza en diversos camps, incloent-hi l'aprenentatge automàtic, el màrqueting i la biologia. +Un dels algoritmes d'agrupació més populars és el k-means. El k-means és un algoritme d'aprenentatge no supervisat que divideix les dades en k grups. L'objectiu és minimitzar la distància entre els punts de dades dins del mateix grup i maximitzar la distància entre els grups. El k-means s'utilitza àmpliament per a la segmentació de clients, la compressió d'imatges i més. +Quan es tracta de dades de text, l'agrupació k-means pot ser una eina potent per a tasques com la resumificació de text. En la resumificació de text, l'objectiu és crear un resum concís i coherent d'un document. En lloc de llegir llargs articles o documents, pots comprendre ràpidament els punts principals d'un bon resum. +Per utilitzar l'agrupació k-means per a la resumificació de text, primer has de convertir les dades de text en representacions numèriques. Aquí és on entren en joc tècniques com les embeddings de paraules i les embeddings de frases. Les embeddings de paraules transformen les paraules en vectors, capturant el seu significat semàntic. 
Les embeddings de frases, així mateix, representen les frases com a vectors basats en les embeddings de les seves paraules constituents. +Un cop tinguis les embeddings de frases per a un document, pots aplicar l'agrupació k-means. Els grups resultants representen conjunts de frases similars. En seleccionar una o més frases de cada grup, pots crear un resum que abasti diversos aspectes del document. +La resumificació de text no consisteix només en seleccionar frases a l'atzar; requereix un enfocament sistemàtic. Per exemple, pots triar la frase amb la major similitud al centre del grup o utilitzar altres criteris. El repte rau a trobar l'equilibri adequat entre la diversitat i la rellevància en el resum. +En conclusió, l'agrupació k-means és una eina valuosa en el camp de la resumificació de text. Permet la generació automàtica de resums que capturin l'essència d'un document. A mesura que seguim tractant amb un volum cada vegada major de dades de text, tècniques com l'agrupació k-means es tornen cada vegada més importants per extreure informació significativa. diff --git a/tests/BertTests/resources/example_EN.txt b/tests/BertTests/resources/example_EN.txt new file mode 100644 index 0000000000000..05c0259185bee --- /dev/null +++ b/tests/BertTests/resources/example_EN.txt @@ -0,0 +1 @@ +In the age of information technology, data is the new currency. Every day, we generate vast amounts of data, from social media posts and emails to sensor readings and financial transactions. The challenge is not collecting data but making sense of it all. Data analysis is a fundamental process for extracting valuable insights from data. One of the key techniques in data analysis is clustering. Clustering is a method of grouping similar data points together. It's used in various fields, including machine learning, marketing, and biology. One popular clustering algorithm is k-means. K-means is an unsupervised learning algorithm that partitions data into k clusters. 
The goal is to minimize the distance between data points within the same cluster and maximize the distance between clusters. K-means is widely used for customer segmentation, image compression, and more. When it comes to text data, k-means clustering can be a powerful tool for tasks like text summarization. In text summarization, the goal is to create a concise and coherent summary of a document. Instead of reading through long articles or documents, you can quickly grasp the main points from a well-generated summary. To use k-means clustering for text summarization, you first need to convert text data into numerical representations. This is where techniques like word embeddings and sentence embeddings come into play. Word embeddings transform words into vectors, capturing their semantic meaning. Sentence embeddings, in turn, represent sentences as vectors based on the embeddings of their constituent words. Once you have sentence embeddings for a document, you can apply k-means clustering. The resulting clusters represent groups of similar sentences. By selecting one or more sentences from each cluster, you can create a summary that covers various aspects of the document. Text summarization is not just about selecting sentences randomly; it requires a systematic approach. For instance, you can choose the sentence with the highest similarity to the cluster center or use other criteria. The challenge lies in finding the right balance between diversity and relevance in the summary. In conclusion, k-means clustering is a valuable tool in the field of text summarization. It enables the automatic generation of summaries that capture the essence of a document. As we continue to deal with an ever-increasing volume of text data, techniques like k-means clustering become increasingly important for extracting meaningful information. 
diff --git a/tests/BertTests/resources/example_ES.txt b/tests/BertTests/resources/example_ES.txt new file mode 100644 index 0000000000000..800a937c6ff94 --- /dev/null +++ b/tests/BertTests/resources/example_ES.txt @@ -0,0 +1,8 @@ +En la era de la tecnología de la información, los datos son la nueva moneda. Todos los días, generamos grandes cantidades de datos, desde publicaciones en redes sociales y correos electrónicos hasta lecturas de sensores y transacciones financieras. El desafío no es recopilar datos, sino darles sentido. +El análisis de datos es un proceso fundamental para extraer conocimientos valiosos de los datos. Una de las técnicas clave en el análisis de datos es la agrupación. La agrupación es un método para agrupar puntos de datos similares. Se utiliza en diversos campos, incluyendo el aprendizaje automático, el marketing y la biología. +Uno de los algoritmos de agrupación más populares es el k-means. K-means es un algoritmo de aprendizaje no supervisado que divide los datos en k grupos. El objetivo es minimizar la distancia entre los puntos de datos dentro del mismo grupo y maximizar la distancia entre los grupos. K-means se utiliza ampliamente para la segmentación de clientes, la compresión de imágenes y más. +Cuando se trata de datos de texto, la agrupación k-means puede ser una herramienta poderosa para tareas como la summarización de texto. En la summarización de texto, el objetivo es crear un resumen conciso y coherente de un documento. En lugar de leer largos artículos o documentos, puedes comprender rápidamente los puntos principales de un resumen bien generado. +Para utilizar la agrupación k-means para la summarización de texto, primero debes convertir los datos de texto en representaciones numéricas. Aquí es donde entran en juego técnicas como las embeddings de palabras y las embeddings de frases. Las embeddings de palabras transforman las palabras en vectores, capturando su significado semántico. 
Las embeddings de frases, a su vez, representan las frases como vectores basados en las embeddings de sus palabras constituyentes. +Una vez que tengas las embeddings de frases para un documento, puedes aplicar la agrupación k-means. Los grupos resultantes representan conjuntos de frases similares. Al seleccionar una o más frases de cada grupo, puedes crear un resumen que abarque diversos aspectos del documento. +La summarización de texto no consiste solo en seleccionar frases al azar; requiere un enfoque sistemático. Por ejemplo, puedes elegir la frase con la mayor similitud al centro del grupo o utilizar otros criterios. El desafío radica en encontrar el equilibrio adecuado entre la diversidad y la relevancia en el resumen. +En conclusión, la agrupación k-means es una herramienta valiosa en el campo de la summarización de texto. Permite la generación automática de resúmenes que capturan la esencia de un documento. A medida que seguimos lidiando con un volumen cada vez mayor de datos de texto, técnicas como la agrupación k-means se vuelven cada vez más importantes para extraer información significativa. diff --git a/tests/BertTests/resources/example_large_EN.txt b/tests/BertTests/resources/example_large_EN.txt new file mode 100644 index 0000000000000..1ef88bf166e70 --- /dev/null +++ b/tests/BertTests/resources/example_large_EN.txt @@ -0,0 +1,21 @@ +The rapid advancement of technology in the 21st century has brought about profound changes in nearly every aspect of our lives. From the way we communicate and work to how we entertain ourselves and solve complex problems, technology is at the core of our modern existence. +At the heart of this technological revolution is artificial intelligence (AI). AI is a broad field that encompasses a range of technologies and applications. Machine learning, natural language processing, computer vision, and robotics are just a few of the branches of AI that have seen remarkable progress. 
+One of the most exciting applications of AI is in autonomous vehicles. Self-driving cars are no longer a distant dream but a reality on our roads. These vehicles use a combination of sensors, cameras, and AI algorithms to navigate safely and efficiently. The potential benefits in terms of reducing accidents and congestion are substantial. +In the realm of healthcare, AI has made significant strides. Medical image analysis powered by deep learning can detect diseases earlier and with higher accuracy. AI-driven drug discovery is accelerating the development of new treatments. Virtual health assistants provide personalized medical advice, improving patient outcomes. +Natural language processing, a subfield of AI, has transformed how we interact with technology. Voice assistants like Siri and Alexa understand and respond to our spoken commands. Language translation apps enable us to communicate seamlessly across borders. Sentiment analysis tools help businesses gauge customer feedback and make informed decisions. +In content creation and curation, AI is playing a central role. News articles are being generated by AI algorithms, and content recommendation systems personalize our online experiences. Video games use AI to create dynamic and immersive virtual worlds. AI-generated art challenges our notions of creativity. +Despite the promise of AI, ethical considerations and challenges remain. Bias in AI algorithms can lead to unfair outcomes. Privacy concerns arise as AI systems collect and analyze vast amounts of personal data. The future of work is being reshaped as automation and AI impact various industries. +To harness the potential of AI and address these challenges, interdisciplinary collaboration is essential. Ethicists, engineers, policymakers, and the public must engage in meaningful dialogue. AI education and literacy are crucial for individuals to navigate this evolving landscape. +As AI continues to advance, it will touch every facet of our lives. 
The possibilities are limitless, from predicting climate change and understanding the universe's mysteries to transforming education and revolutionizing transportation. How we navigate this future depends on our collective wisdom and responsible stewardship of this remarkable technology. +The journey of AI has been nothing short of transformative. From its early days as a concept in computer science to today's real-world applications, AI has come of age. It has crossed the threshold from theory to practicality, fundamentally altering industries and reshaping society. +The foundations of AI were laid in the mid-20th century when computer scientists and mathematicians began exploring the idea of machines that could mimic human intelligence. Early AI pioneers like Alan Turing and John McCarthy paved the way for what would become a revolutionary field. +The birth of AI can be traced to the Dartmouth Conference in 1956, where the term "artificial intelligence" was coined. It was a time of great optimism, with researchers believing that human-level AI was just around the corner. However, progress proved to be slower than anticipated, leading to what became known as the "AI winter." +During the AI winter, funding and interest in AI research waned. The grand visions of intelligent machines gave way to more modest goals and practical applications. Expert systems, which used rule-based reasoning to solve specific problems, became a focus of AI research. +In the 1990s, AI experienced a resurgence. Advances in machine learning, fueled by the availability of large datasets and more powerful computers, reignited interest in the field. Researchers began to develop algorithms that could learn from data, a departure from the rule-based systems of the past. +Machine learning algorithms, including neural networks, support vector machines, and decision trees, became the building blocks of AI applications. 
These algorithms could recognize patterns in data, enabling tasks like image recognition, natural language understanding, and recommendation systems. +The rise of the internet and the abundance of digital data further propelled AI. Companies like Google, Facebook, and Amazon invested heavily in AI research, driving innovations in search engines, social media, and e-commerce. AI-powered applications became integral to our daily lives. +The breakthroughs in AI were exemplified by the success of IBM's Watson, which defeated human champions in the quiz show "Jeopardy!" Watson's ability to understand and respond to natural language questions marked a significant milestone in AI. +Today, AI is everywhere. It powers virtual assistants like Siri and Alexa, making our smartphones and smart homes smarter. AI algorithms help diagnose diseases from medical images, recommend products, and optimize supply chains. AI-driven chatbots provide customer support, and autonomous vehicles are becoming a reality. +The future of AI holds even greater promise. Researchers are pushing the boundaries of AI with advancements in reinforcement learning, generative adversarial networks, and quantum computing. These technologies have the potential to revolutionize industries and address complex global challenges. +The ethical and societal implications of AI cannot be ignored. As AI becomes more integrated into our lives, questions of bias, transparency, and accountability come to the fore. Ensuring that AI benefits all of humanity and respects our values is a pressing concern. +In conclusion, AI's journey from concept to practicality has been marked by challenges and triumphs. It has transformed industries, enhanced our capabilities, and opened up new frontiers of possibility. As AI continues to advance, it is up to us to shape its path and ensure that it serves as a force for good in the world.