Commit 3681540

gguf : deprecate old FIM token KVs
1 parent 3ae8670

3 files changed: +36 -14 lines
gguf-py/gguf/constants.py (+21 -5)
@@ -152,6 +152,8 @@ class Tokenizer:
         MERGES               = "tokenizer.ggml.merges"
         BOS_ID               = "tokenizer.ggml.bos_token_id"
         EOS_ID               = "tokenizer.ggml.eos_token_id"
+        EOT_ID               = "tokenizer.ggml.eot_token_id"
+        EOM_ID               = "tokenizer.ggml.eom_token_id"
         UNK_ID               = "tokenizer.ggml.unknown_token_id"
         SEP_ID               = "tokenizer.ggml.seperator_token_id"
         PAD_ID               = "tokenizer.ggml.padding_token_id"
@@ -168,11 +170,16 @@ class Tokenizer:
         CHAT_TEMPLATE_N      = "tokenizer.chat_template.{name}"
         CHAT_TEMPLATES       = "tokenizer.chat_templates"
         # FIM/Infill special tokens constants
+        FIM_PRE_ID           = "tokenizer.ggml.fim_pre_token_id"
+        FIM_SUF_ID           = "tokenizer.ggml.fim_suf_token_id"
+        FIM_MID_ID           = "tokenizer.ggml.fim_mid_token_id"
+        FIM_PAD_ID           = "tokenizer.ggml.fim_pad_token_id"
+        FIM_REP_ID           = "tokenizer.ggml.fim_rep_token_id"
+        FIM_SEP_ID           = "tokenizer.ggml.fim_sep_token_id"
+        # deprecated:
         PREFIX_ID            = "tokenizer.ggml.prefix_token_id"
         SUFFIX_ID            = "tokenizer.ggml.suffix_token_id"
         MIDDLE_ID            = "tokenizer.ggml.middle_token_id"
-        EOT_ID               = "tokenizer.ggml.eot_token_id"
-        EOM_ID               = "tokenizer.ggml.eom_token_id"
 
     class Adapter:
         TYPE = "adapter.type"
@@ -1579,15 +1586,24 @@ def get_type(val: Any) -> GGUFValueType:
 KEY_TOKENIZER_MERGES     = Keys.Tokenizer.MERGES
 KEY_TOKENIZER_BOS_ID     = Keys.Tokenizer.BOS_ID
 KEY_TOKENIZER_EOS_ID     = Keys.Tokenizer.EOS_ID
+KEY_TOKENIZER_EOT_ID     = Keys.Tokenizer.EOT_ID
+KEY_TOKENIZER_EOM_ID     = Keys.Tokenizer.EOM_ID
 KEY_TOKENIZER_UNK_ID     = Keys.Tokenizer.UNK_ID
 KEY_TOKENIZER_SEP_ID     = Keys.Tokenizer.SEP_ID
 KEY_TOKENIZER_PAD_ID     = Keys.Tokenizer.PAD_ID
 KEY_TOKENIZER_CLS_ID     = Keys.Tokenizer.CLS_ID
 KEY_TOKENIZER_MASK_ID    = Keys.Tokenizer.MASK_ID
 KEY_TOKENIZER_HF_JSON    = Keys.Tokenizer.HF_JSON
 KEY_TOKENIZER_RWKV       = Keys.Tokenizer.RWKV
-KEY_TOKENIZER_PRIFIX_ID  = Keys.Tokenizer.PREFIX_ID
+
+KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID
+KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID
+KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID
+KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID
+KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID
+KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID
+
+# deprecated
+KEY_TOKENIZER_PREFIX_ID  = Keys.Tokenizer.PREFIX_ID
 KEY_TOKENIZER_SUFFIX_ID  = Keys.Tokenizer.SUFFIX_ID
 KEY_TOKENIZER_MIDDLE_ID  = Keys.Tokenizer.MIDDLE_ID
-KEY_TOKENIZER_EOT_ID     = Keys.Tokenizer.EOT_ID
-KEY_TOKENIZER_EOM_ID     = Keys.Tokenizer.EOM_ID
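
Note: the deprecated prefix/suffix/middle keys map one-to-one onto the new FIM keys (FIM_PRE, FIM_SUF, FIM_MID), as the src/llama.cpp hunk below makes explicit. A minimal Python sketch of that correspondence, for tooling that wants to normalize key names (modern_key is a hypothetical helper, not part of gguf-py):

from gguf import Keys

# Deprecated tokenizer KV -> replacement, per this commit.
DEPRECATED_TO_NEW = {
    Keys.Tokenizer.PREFIX_ID: Keys.Tokenizer.FIM_PRE_ID,
    Keys.Tokenizer.SUFFIX_ID: Keys.Tokenizer.FIM_SUF_ID,
    Keys.Tokenizer.MIDDLE_ID: Keys.Tokenizer.FIM_MID_ID,
}

def modern_key(key: str) -> str:
    # Return the non-deprecated name for a tokenizer KV key.
    return DEPRECATED_TO_NEW.get(key, key)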

gguf-py/gguf/gguf_writer.py (-9)
@@ -843,15 +843,6 @@ def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
 
         self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
 
-    def add_prefix_token_id(self, id: int) -> None:
-        self.add_uint32(Keys.Tokenizer.PREFIX_ID, id)
-
-    def add_suffix_token_id(self, id: int) -> None:
-        self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id)
-
-    def add_middle_token_id(self, id: int) -> None:
-        self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id)
-
     def add_eot_token_id(self, id: int) -> None:
         self.add_uint32(Keys.Tokenizer.EOT_ID, id)
 
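With the three helpers gone, converter code can write the new-style keys through the generic add_uint32 path instead; a minimal sketch, assuming the token IDs are already known (the path, arch, and IDs below are placeholders):

from gguf import GGUFWriter, Keys

writer = GGUFWriter("model.gguf", "llama")  # placeholder path/arch

# Previously: writer.add_prefix_token_id(...) etc., removed above.
writer.add_uint32(Keys.Tokenizer.FIM_PRE_ID, 32016)  # placeholder ID
writer.add_uint32(Keys.Tokenizer.FIM_SUF_ID, 32017)  # placeholder ID
writer.add_uint32(Keys.Tokenizer.FIM_MID_ID, 32018)  # placeholder ID
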
src/llama.cpp (+15)
@@ -368,6 +368,11 @@ enum llm_kv {
 
     LLM_KV_ADAPTER_TYPE,
     LLM_KV_ADAPTER_LORA_ALPHA,
+
+    // deprecated:
+    LLM_KV_TOKENIZER_PREFIX_ID,
+    LLM_KV_TOKENIZER_SUFFIX_ID,
+    LLM_KV_TOKENIZER_MIDDLE_ID,
 };
 
 static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
@@ -479,6 +484,11 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 
     { LLM_KV_ADAPTER_TYPE,       "adapter.type"       },
     { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" },
+
+    // deprecated
+    { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" },
+    { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" },
+    { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" },
 };
 
 struct LLM_KV {
@@ -6533,6 +6543,11 @@ static void llm_load_vocab(
             { LLM_KV_TOKENIZER_FIM_PAD_ID, vocab.special_fim_pad_id },
             { LLM_KV_TOKENIZER_FIM_REP_ID, vocab.special_fim_rep_id },
             { LLM_KV_TOKENIZER_FIM_SEP_ID, vocab.special_fim_sep_id },
+
+            // deprecated
+            { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_fim_pre_id },
+            { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_fim_suf_id },
+            { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_fim_mid_id },
         };
 
         for (const auto & it : special_token_types) {
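
Because the deprecated KVs stay in special_token_types and feed the same vocab.special_fim_* fields, older GGUF files keep loading. A rough gguf-py equivalent of that fallback (read_fim_ids and the scalar-extraction detail are illustrative assumptions, not llama.cpp or gguf-py API):

from gguf import GGUFReader, Keys

def read_fim_ids(path: str) -> dict[str, int]:
    # Prefer the new FIM keys; fall back to the deprecated ones.
    reader = GGUFReader(path)
    pairs = [
        ("fim_pre", Keys.Tokenizer.FIM_PRE_ID, Keys.Tokenizer.PREFIX_ID),
        ("fim_suf", Keys.Tokenizer.FIM_SUF_ID, Keys.Tokenizer.SUFFIX_ID),
        ("fim_mid", Keys.Tokenizer.FIM_MID_ID, Keys.Tokenizer.MIDDLE_ID),
    ]
    out: dict[str, int] = {}
    for name, new_key, old_key in pairs:
        field = reader.get_field(new_key) or reader.get_field(old_key)
        if field is not None:
            # For a scalar uint32 KV the value is the last part of the field.
            out[name] = int(field.parts[-1][0])
    return out

Files written with only the old keys and files written with only the new keys both resolve to the same IDs this way.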
