Commit 2fffc52

llama : fix Roberta embeddings (ggml-org#10856)
* fix: Use gpt2 tokenizer for roberta and add eos/bos tokens

  Branch: RobertaTokenizer

  Signed-off-by: Gabe Goodhart <[email protected]>

* fixes to position embeddings

  Signed-off-by: Sukriti-Sharma4 <[email protected]>

* map roberta-bpe to gpt-2

  Signed-off-by: Sukriti-Sharma4 <[email protected]>

* fix linting

  Signed-off-by: Sukriti-Sharma4 <[email protected]>

---------

Signed-off-by: Gabe Goodhart <[email protected]>
Signed-off-by: Sukriti-Sharma4 <[email protected]>
Co-authored-by: Gabe Goodhart <[email protected]>
1 parent 7585edb commit 2fffc52
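
The commit message covers two tokenizer facts that the changes below encode into the GGUF: RoBERTa uses a GPT-2-style byte-level BPE, and it always wraps sequences in <s> ... </s>. The following is only an illustrative sketch; it assumes the transformers package and access to the roberta-base checkpoint, neither of which is part of this commit.

# Hedged illustration: why the converter emits add_add_bos_token(True) /
# add_add_eos_token(True), and why "roberta-bpe" can reuse the GPT-2 pre-tokenizer.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("roberta-base")   # assumed checkpoint, not from this commit
enc = tok("Hello world")

print(type(tok).__name__)
# RobertaTokenizerFast -- a byte-level BPE in the GPT-2 family

print(tok.convert_ids_to_tokens(enc["input_ids"]))
# roughly: ['<s>', 'Hello', 'Ġworld', '</s>'] -- BOS and EOS are added on every encode,
# which is what the GGUF add_bos/add_eos flags mirror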

File tree

2 files changed: +48 -2 lines changed

convert_hf_to_gguf.py

Lines changed: 46 additions & 1 deletion

@@ -2628,7 +2628,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
-@Model.register("BertModel", "CamembertModel", "RobertaModel")
+@Model.register("BertModel", "CamembertModel")
 class BertModel(Model):
     model_arch = gguf.MODEL_ARCH.BERT
 
@@ -2701,6 +2701,51 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         return [(self.map_tensor_name(name), data_torch)]
 
 
+@Model.register("RobertaModel")
+class RobertaModel(BertModel):
+    model_arch = gguf.MODEL_ARCH.BERT
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # we need the pad_token_id to know how to chop down position_embd matrix
+        if (pad_token_id := self.hparams.get("pad_token_id")) is not None:
+            self._position_offset = 1 + pad_token_id
+            if "max_position_embeddings" in self.hparams:
+                self.hparams["max_position_embeddings"] -= self._position_offset
+        else:
+            self._position_offset = None
+
+    def set_vocab(self):
+        """Support BPE tokenizers for roberta models"""
+        bpe_tok_path = self.dir_model / "tokenizer.json"
+        if bpe_tok_path.exists():
+            self._set_vocab_gpt2()
+            self.gguf_writer.add_add_bos_token(True)
+            self.gguf_writer.add_add_eos_token(True)
+
+            # we need this to validate the size of the token_type embeddings
+            # though currently we are passing all zeros to the token_type embeddings
+            # "Sequence A" or "Sequence B"
+            self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
+
+        else:
+            return super().set_vocab()
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # if name starts with "roberta.", remove the prefix
+        # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main
+        if name.startswith("roberta."):
+            name = name[8:]
+
+        # position embeddings start at pad_token_id + 1, so just chop down the weight tensor
+        if name == "embeddings.position_embeddings.weight":
+            if self._position_offset is not None:
+                data_torch = data_torch[self._position_offset:,:]
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
 @Model.register("NomicBertModel")
 class NomicBertModel(BertModel):
     model_arch = gguf.MODEL_ARCH.NOMIC_BERT
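
The chop in modify_tensors follows from how upstream RoBERTa numbers positions: Hugging Face reserves ids 0..pad_token_id and gives the first real token position pad_token_id + 1, while llama.cpp's BERT graph indexes positions from 0. The plain-Python sketch below (no llama.cpp or torch APIs, illustrative token ids only) shows why the first 1 + pad_token_id rows of the position embedding matrix are dead weight after conversion.

# Mimics HF RoBERTa's create_position_ids_from_input_ids (cumulative sum of the
# non-pad mask, shifted by padding_idx); pad_token_id = 1 is RoBERTa's usual value.
def hf_roberta_position_ids(input_ids, pad_token_id=1):
    pos, seen = [], 0
    for tok in input_ids:
        if tok == pad_token_id:
            pos.append(pad_token_id)          # padding keeps position pad_token_id
        else:
            seen += 1
            pos.append(pad_token_id + seen)   # real tokens start at pad_token_id + 1
    return pos

tokens = [0, 31414, 232, 2, 1, 1]             # illustrative ids: <s> ... </s> <pad> <pad>
print(hf_roberta_position_ids(tokens))        # [2, 3, 4, 5, 1, 1]

# llama.cpp feeds positions 0, 1, 2, ... so after the converter drops the first
# 1 + pad_token_id rows, position i lines up with HF row i + pad_token_id + 1.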

src/llama.cpp

Lines changed: 2 additions & 1 deletion

@@ -6592,7 +6592,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "jina-v1-en" ||
                 tokenizer_pre == "jina-v2-es" ||
                 tokenizer_pre == "jina-v2-de" ||
-                tokenizer_pre == "jina-v2-code") {
+                tokenizer_pre == "jina-v2-code" ||
+                tokenizer_pre == "roberta-bpe") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
             } else if (
                 tokenizer_pre == "refact") {
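
Mapping "roberta-bpe" to LLAMA_VOCAB_PRE_TYPE_GPT2 means the loader pre-splits text with the GPT-2 byte-level BPE pattern, which RoBERTa's tokenizer shares. A small sketch of that split follows; it assumes the third-party regex package, and the pattern is the published GPT-2 one, not copied from llama.cpp.

import regex  # supports \p{L} / \p{N} classes, unlike the stdlib re module

# The GPT-2 byte-level BPE split pattern; RoBERTa's tokenizer uses the same one,
# which is why "roberta-bpe" can piggyback on LLAMA_VOCAB_PRE_TYPE_GPT2.
GPT2_SPLIT = r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

print(regex.findall(GPT2_SPLIT, "Roberta's embeddings, fixed at last!"))
# expected: ['Roberta', "'s", ' embeddings', ',', ' fixed', ' at', ' last', '!']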
