@@ -284,6 +284,27 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]:
"""BERT tokenizer based on WordPiece"""
+ # // pre-tokenization types
+ # enum llama_vocab_pre_type {
+ #     LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
+ #     LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
+ #     LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
+ #     LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+ #     LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
+ #     LLAMA_VOCAB_PRE_TYPE_MPT = 5,
+ #     LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
+ #     LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
+ # };
+ LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0
+ LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3
+ LLAMA_VOCAB_PRE_TYPE_FALCON = 4
+ LLAMA_VOCAB_PRE_TYPE_MPT = 5
+ LLAMA_VOCAB_PRE_TYPE_STARCODER = 6
+ LLAMA_VOCAB_PRE_TYPE_GPT2 = 7
+
+
# // note: these values should be synchronized with ggml_rope
# // TODO: maybe move this enum to ggml.h (ggml_rope_type)
# enum llama_rope_type {
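
For illustration, a minimal usage sketch (not part of this commit): the new module-level constants can be mapped back to their enum names when logging which pre-tokenizer type a model reports. The dict LLAMA_VOCAB_PRE_TYPE_NAMES below is a hypothetical helper for this example only, not an API exposed by llama_cpp.

import llama_cpp

# Hypothetical reverse lookup built from the constants added in the hunk above;
# illustrative only, not part of llama_cpp itself.
LLAMA_VOCAB_PRE_TYPE_NAMES = {
    llama_cpp.LLAMA_VOCAB_PRE_TYPE_DEFAULT: "DEFAULT",
    llama_cpp.LLAMA_VOCAB_PRE_TYPE_LLAMA3: "LLAMA3",
    llama_cpp.LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: "DEEPSEEK_LLM",
    llama_cpp.LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: "DEEPSEEK_CODER",
    llama_cpp.LLAMA_VOCAB_PRE_TYPE_FALCON: "FALCON",
    llama_cpp.LLAMA_VOCAB_PRE_TYPE_MPT: "MPT",
    llama_cpp.LLAMA_VOCAB_PRE_TYPE_STARCODER: "STARCODER",
    llama_cpp.LLAMA_VOCAB_PRE_TYPE_GPT2: "GPT2",
}

print(LLAMA_VOCAB_PRE_TYPE_NAMES[llama_cpp.LLAMA_VOCAB_PRE_TYPE_GPT2])  # -> GPT2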