
Commit 23bd0a6

richardddngxson and CISC authored
model : add HunyuanOCR support (ggml-org#21395)
* HunyuanOCR: add support for text and vision models

  - Add HunyuanOCR vision projector (perceiver-based) with Conv2d merge
  - Add separate HUNYUAN_OCR chat template (content-before-role format)
  - Handle HunyuanOCR's invalid pad_token_id=-1 in converter
  - Fix EOS/EOT token IDs from generation_config.json
  - Support xdrope RoPE scaling type
  - Add tensor mappings for perceiver projector (mm.before_rms, mm.after_rms, etc.)
  - Register HunYuanVLForConditionalGeneration for both text and mmproj conversion

* fix proper mapping

* Update gguf-py/gguf/tensor_mapping.py

  Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

* Update tools/mtmd/clip.cpp

  Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>

* address comments

* update

* Fix typecheck

* Update convert_hf_to_gguf.py

  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* Update convert_hf_to_gguf.py

  Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

---------

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
1 parent af03221 commit 23bd0a6

12 files changed: 273 additions & 10 deletions


convert_hf_to_gguf.py

Lines changed: 91 additions & 9 deletions
@@ -11521,13 +11521,50 @@ def prepare_tensors(self):
             raise ValueError(f"Unprocessed experts: {experts}")


-@ModelBase.register("HunYuanDenseV1ForCausalLM")
+@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
 class HunYuanModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE

+    def _get_eod_token_id(self) -> int | None:
+        """Get the actual end-of-generation token from config (eod_token_id)."""
+        return self.hparams.get("eod_token_id")
+
+    def _get_eot_token_id(self) -> int | None:
+        """Get the end-of-turn token from generation_config.json.
+        This is the first entry in eos_token_id when it's a list."""
+        gen_cfg_path = self.dir_model / "generation_config.json"
+        if gen_cfg_path.is_file():
+            with open(gen_cfg_path, encoding="utf-8") as f:
+                gen_cfg = json.load(f)
+            eos = gen_cfg.get("eos_token_id")
+            if isinstance(eos, list) and len(eos) >= 2:
+                return eos[0]
+        return None
+
+    def _fix_special_tokens(self):
+        """Fix EOS/EOT tokens that are incorrect in upstream configs."""
+        eod_id = self._get_eod_token_id()
+        if eod_id is not None:
+            self.gguf_writer.add_eos_token_id(eod_id)
+        eot_id = self._get_eot_token_id()
+        if eot_id is not None:
+            self.gguf_writer.add_eot_token_id(eot_id)
+
     def set_vocab(self):
         if (self.dir_model / "tokenizer.json").is_file():
-            self._set_vocab_gpt2()
+            tokens, toktypes, tokpre = self.get_vocab_base()
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+
+            # HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab
+            token_types = None
+            if (self.hparams.get("pad_token_id") or 0) < 0:
+                token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask')
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True, special_token_types=token_types)
+            special_vocab.add_to_gguf(self.gguf_writer)
+            self._fix_special_tokens()
         else:
             from transformers import AutoTokenizer
             tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
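The special-token fix above pulls EOS from config.json's eod_token_id and EOT from generation_config.json. A standalone sketch of the same lookup, with hypothetical token IDs in the trailing comment (the real values come from the checkpoint's own configs):

import json
from pathlib import Path

def resolve_special_tokens(model_dir: Path) -> tuple[int | None, int | None]:
    # EOS: config.json's eod_token_id is the true end-of-generation token
    config = json.loads((model_dir / "config.json").read_text(encoding="utf-8"))
    eos_id = config.get("eod_token_id")

    # EOT: first entry of generation_config.json's eos_token_id,
    # but only when it is a list with at least two entries
    eot_id = None
    gen_cfg_path = model_dir / "generation_config.json"
    if gen_cfg_path.is_file():
        gen_cfg = json.loads(gen_cfg_path.read_text(encoding="utf-8"))
        eos = gen_cfg.get("eos_token_id")
        if isinstance(eos, list) and len(eos) >= 2:
            eot_id = eos[0]
    return eos_id, eot_id

# With a hypothetical generation_config.json of {"eos_token_id": [127960, 127967]},
# the EOT token resolves to 127960; EOS comes from eod_token_id in config.json.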
@@ -11579,13 +11616,18 @@ def set_vocab(self):
         # FIX for BOS token: Overwrite incorrect id read from config.json
         if self.hparams['hidden_size'] == 4096:
             self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token
+        self._fix_special_tokens()

     def set_gguf_parameters(self):
+        # HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it
+        saved_num_experts = self.hparams.pop("num_experts", None)
         super().set_gguf_parameters()
+        if saved_num_experts is not None and saved_num_experts > 1:
+            self.hparams["num_experts"] = saved_num_experts
         hparams = self.hparams

         # Rope
-        if self.rope_parameters.get("rope_type") == "dynamic":
+        if self.rope_parameters.get("rope_type") in ("dynamic", "xdrope"):
             # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
             # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
             alpha = self.rope_parameters.get("alpha", 50)
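For reference, the NTK-aware alpha scaling mentioned in the comments folds the scaling factor into the RoPE base frequency rather than emitting a separate scaling type. A minimal sketch of that computation, assuming (as the existing HunYuan converter path does) that the exponent uses the per-head dimension; the numbers are illustrative:

def ntk_alpha_scaled_base(base: float, alpha: float, head_dim: int) -> float:
    # NTK-aware alpha scaling: grow the RoPE base so the low-frequency
    # dimensions span a longer context without retraining
    return base * alpha ** (head_dim / (head_dim - 2))

# e.g. base=10000.0, alpha=50, head_dim=128 -> roughly 5.3e5
scaled_base = ntk_alpha_scaled_base(10000.0, 50, 128)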
@@ -11595,22 +11637,62 @@ def set_gguf_parameters(self):
             self.gguf_writer.add_rope_freq_base(scaled_base)
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
             self.gguf_writer.add_rope_scaling_factor(1)
-            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
-            self.gguf_writer.add_context_length(256 * 1024) # 256k context length
+            if self.rope_parameters.get("rope_type") == "dynamic":
+                # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
+                self.gguf_writer.add_context_length(256 * 1024) # 256k context length

-            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
-            assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
-                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
+                # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
+                assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
+                    "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if name == "lm_head.weight":
             if self.hparams.get("tie_word_embeddings", False):
                 logger.info("Skipping tied output layer 'lm_head.weight'")
                 return

+        # skip vision tensors for HunyuanVL models
+        if name.startswith("vit."):
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("HunYuanVLForConditionalGeneration")
+class HunyuanOCRVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        # HunyuanOCR uses max_image_size instead of image_size
+        if "image_size" not in self.hparams_vision:
+            self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+        hparams = self.hparams_vision
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
+        self.gguf_writer.add_vision_use_gelu(True)
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
+        self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
+        self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
+        self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if not name.startswith("vit."):
+            return # skip text tensors
+        # strip CLS token (row 0) from position embeddings so resize_position_embeddings works
+        if "position_embedding" in name:
+            data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd]
         yield from super().modify_tensors(data_torch, name, bid)

+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
+        if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
+            return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+

 @ModelBase.register("SmolLM3ForCausalLM")
 class SmolLM3Model(LlamaModel):
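The CLS strip in the vision model's modify_tensors is a one-row slice. A tiny torch illustration of the reshaping, with made-up sizes:

import torch

n_patches, n_embd = 1024, 1152                 # illustrative sizes
pos_embd = torch.randn(n_patches + 1, n_embd)  # row 0 holds the CLS position

pos_embd = pos_embd[1:]                        # [n_patches+1, n_embd] -> [n_patches, n_embd]
assert pos_embd.shape == (n_patches, n_embd)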

gguf-py/gguf/constants.py

Lines changed: 10 additions & 0 deletions
@@ -734,6 +734,7 @@ class MODEL_TENSOR(IntEnum):
     V_LAYER_OUT_SCALE = auto()
     V_PRE_NORM = auto()
     V_POST_NORM = auto()
+    V_MM_PRE_NORM = auto() # hunyuanocr
     V_MM_POST_NORM = auto()
     V_MM_INP_NORM = auto()
     V_MM_INP_PROJ = auto() # gemma3
@@ -769,6 +770,8 @@ class MODEL_TENSOR(IntEnum):
     V_MM_GATE = auto() # cogvlm
     V_TOK_BOI = auto() # cogvlm
     V_TOK_EOI = auto() # cogvlm
+    V_TOK_IMG_BEGIN = auto() # hunyuanocr
+    V_TOK_IMG_END = auto() # hunyuanocr
     V_STD_BIAS = auto() # gemma4
     V_STD_SCALE = auto() # gemma4
     V_SAM_POS_EMBD = auto() # Deepseek-OCR
@@ -1246,6 +1249,9 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.V_MM_GATE: "mm.gate",
     MODEL_TENSOR.V_TOK_BOI: "v.boi",
     MODEL_TENSOR.V_TOK_EOI: "v.eoi",
+    MODEL_TENSOR.V_MM_PRE_NORM: "mm.pre_norm",
+    MODEL_TENSOR.V_TOK_IMG_BEGIN: "mm.image_begin",
+    MODEL_TENSOR.V_TOK_IMG_END: "mm.image_end",
     MODEL_TENSOR.V_STD_BIAS: "v.std_bias", # gemma4
     MODEL_TENSOR.V_STD_SCALE: "v.std_scale", # gemma4
     # DeepSeek-OCR SAM
@@ -1393,6 +1399,9 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.V_MM_GATE,
         MODEL_TENSOR.V_TOK_BOI,
         MODEL_TENSOR.V_TOK_EOI,
+        MODEL_TENSOR.V_MM_PRE_NORM,
+        MODEL_TENSOR.V_TOK_IMG_BEGIN,
+        MODEL_TENSOR.V_TOK_IMG_END,
         MODEL_TENSOR.V_STD_BIAS,
         MODEL_TENSOR.V_STD_SCALE,
         MODEL_TENSOR.V_SAM_POS_EMBD,
@@ -4113,6 +4122,7 @@ class VisionProjectorType:
     GLM4V = "glm4v"
     YOUTUVL = "youtuvl"
     NEMOTRON_V2_VL = "nemotron_v2_vl"
+    HUNYUANOCR = "hunyuanocr"


 # Items here are (block size, type size)

gguf-py/gguf/tensor_mapping.py

Lines changed: 27 additions & 0 deletions
@@ -1359,13 +1359,15 @@ class TensorNameMap:
             "visual.merger.mlp.{bid}", # qwen2vl
             "mlp_AR.linear_{bid}", # PaddleOCR-VL
             "merger.mlp.{bid}",
+            "vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2)
         ),

         MODEL_TENSOR.V_MMPROJ_FC: (
             "model.connector.modality_projection.proj", # SmolVLM
             "model.vision.linear_proj.linear_proj", # cogvlm
             "model.projector.layers", # Deepseek-OCR
             "visual.merger.proj", # glm4v
+            "vit.perceive.mlp", # HunyuanOCR
         ),

         MODEL_TENSOR.V_MMPROJ_MLP: (
@@ -1393,6 +1395,7 @@ class TensorNameMap:
             "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
             "vpm.embeddings.patch_embedding",
             "model.vision_model.embeddings.patch_embedding", # SmolVLM
+            "vit.embeddings.patch_embedding", # HunyuanOCR
             "vision_tower.patch_conv", # pixtral-hf
             "vision_encoder.patch_conv", # pixtral
             "vision_model.patch_embedding.linear", # llama 4
@@ -1414,6 +1417,7 @@ class TensorNameMap:
             "model.vision_tower.embeddings.position_embeddings", # Intern-S1
             "vpm.embeddings.position_embedding",
             "model.vision_model.embeddings.position_embedding", # SmolVLM
+            "vit.embeddings.position_embedding", # HunyuanOCR
             "vision_model.positional_embedding_vlm", # llama 4
             "vision_tower.patch_embed.pos_emb", # kimi-vl
             "visual.pos_embed", # qwen3vl
@@ -1425,10 +1429,12 @@ class TensorNameMap:

         MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
             "model.image_newline", # Deepseek-OCR
+            "vit.perceive.image_newline", # HunyuanOCR
         ),

         MODEL_TENSOR.V_ENC_EMBD_VSEP: (
             "model.view_seperator", # Deepseek-OCR
+            "vit.perceive.image_sep", # HunyuanOCR
         ),

         MODEL_TENSOR.V_ENC_ATTN_QKV: (
@@ -1444,6 +1450,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.q_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
+            "vit.layers.{bid}.self_attn.q_proj", # HunyuanOCR
             "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
@@ -1466,6 +1473,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.k_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
+            "vit.layers.{bid}.self_attn.k_proj", # HunyuanOCR
             "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
@@ -1488,6 +1496,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.v_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
+            "vit.layers.{bid}.self_attn.v_proj", # HunyuanOCR
             "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
@@ -1504,6 +1513,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
             "vpm.encoder.layers.{bid}.layer_norm1",
             "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
+            "vit.layers.{bid}.input_layernorm", # HunyuanOCR
             "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
             "vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4
@@ -1521,6 +1531,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.out_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
+            "vit.layers.{bid}.self_attn.o_proj", # HunyuanOCR
             "model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
             "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
@@ -1540,6 +1551,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1
             "vpm.encoder.layers.{bid}.layer_norm2",
             "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
+            "vit.layers.{bid}.post_attention_layernorm", # HunyuanOCR
             "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
             "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral
@@ -1557,6 +1569,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
             "vpm.encoder.layers.{bid}.mlp.fc1",
             "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
+            "vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanOCR
             "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
             "vision_model.model.layers.{bid}.mlp.fc1", # llama4
@@ -1583,6 +1596,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
             "vpm.encoder.layers.{bid}.mlp.fc2",
             "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
+            "vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanOCR
             "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
             "vision_model.model.layers.{bid}.mlp.fc2", # llama4
@@ -1639,6 +1653,7 @@ class TensorNameMap:

         MODEL_TENSOR.V_MM_POST_NORM: (
             "visual.merger.post_projection_norm", # glm4v
+            "vit.perceive.after_rms", # HunyuanOCR
         ),

         MODEL_TENSOR.V_MM_INP_PROJ: (
@@ -1806,6 +1821,18 @@ class TensorNameMap:
             "model.vision.eoi", # cogvlm
         ),

+        MODEL_TENSOR.V_MM_PRE_NORM: (
+            "vit.perceive.before_rms", # HunyuanOCR
+        ),
+
+        MODEL_TENSOR.V_TOK_IMG_BEGIN: (
+            "vit.perceive.image_begin", # HunyuanOCR
+        ),
+
+        MODEL_TENSOR.V_TOK_IMG_END: (
+            "vit.perceive.image_end", # HunyuanOCR
+        ),
+
         MODEL_TENSOR.V_STD_BIAS: (
             "model.vision_tower.std_bias", # gemma4
         ),
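Taken together with the constants.py changes above, these entries resolve HunyuanOCR's perceiver tensors to the newly registered GGUF names. A few of the resulting source-to-GGUF pairs, written out as plain data for reference:

# HunyuanOCR checkpoint name  ->  GGUF tensor name (per constants.py in this commit)
hunyuanocr_projector_names = {
    "vit.perceive.before_rms":  "mm.pre_norm",     # V_MM_PRE_NORM
    "vit.perceive.image_begin": "mm.image_begin",  # V_TOK_IMG_BEGIN
    "vit.perceive.image_end":   "mm.image_end",    # V_TOK_IMG_END
}
# vit.perceive.after_rms maps to the pre-existing V_MM_POST_NORM entry.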

src/llama-chat.cpp

Lines changed: 19 additions & 0 deletions
@@ -73,6 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
     { "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
     { "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
+    { "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR },
     { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
     { "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
     { "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
@@ -216,6 +217,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
     } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
         return LLM_CHAT_TEMPLATE_OPENAI_MOE;
+    } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_begin▁of▁sentence|>")) {
+        return LLM_CHAT_TEMPLATE_HUNYUAN_OCR;
     } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
@@ -822,6 +825,22 @@ int32_t llm_chat_apply_template(
                 ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
             }
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) {
+        // tencent/HunyuanOCR
+        ss << "<|hy_begin▁of▁sentence|>";
+        for (size_t i = 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (i == 0 && role == "system") {
+                ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
+                continue;
+            }
+
+            if (role == "user") {
+                ss << chat[i]->content << "<|hy_User|>";
+            } else if (role == "assistant") {
+                ss << chat[i]->content << "<|hy_Assistant|>";
+            }
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
         // moonshotai/Kimi-K2-Instruct
         for (auto message : chat) {
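Note the content-before-role ordering in the new branch: each message's content is written first, then the role marker. A Python sketch mirroring the C++ loop above, with a hypothetical one-message chat to show the rendered prompt:

def render_hunyuan_ocr(chat: list[dict[str, str]]) -> str:
    ss = "<|hy_begin▁of▁sentence|>"
    for i, msg in enumerate(chat):
        role, content = msg["role"], msg["content"]
        if i == 0 and role == "system":
            ss += content + "<|hy_place▁holder▁no▁3|>"
            continue
        if role == "user":
            ss += content + "<|hy_User|>"        # content BEFORE the role marker
        elif role == "assistant":
            ss += content + "<|hy_Assistant|>"
    return ss

print(render_hunyuan_ocr([{"role": "user", "content": "OCR this image."}]))
# -> <|hy_begin▁of▁sentence|>OCR this image.<|hy_User|>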

src/llama-chat.h

Lines changed: 1 addition & 0 deletions
@@ -53,6 +53,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
     LLM_CHAT_TEMPLATE_OPENAI_MOE,
     LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
+    LLM_CHAT_TEMPLATE_HUNYUAN_OCR,
     LLM_CHAT_TEMPLATE_KIMI_K2,
     LLM_CHAT_TEMPLATE_SEED_OSS,
     LLM_CHAT_TEMPLATE_GROK_2,

tools/mtmd/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ add_library(mtmd
             models/conformer.cpp
             models/gemma4v.cpp
             models/glm4v.cpp
+            models/hunyuanocr.cpp
             models/internvl.cpp
             models/kimivl.cpp
             models/kimik25.cpp
