@@ -1359,13 +1359,15 @@ class TensorNameMap:
13591359 "visual.merger.mlp.{bid}" , # qwen2vl
13601360 "mlp_AR.linear_{bid}" , # PaddleOCR-VL
13611361 "merger.mlp.{bid}" ,
1362+ "vit.perceive.proj.{bid}" , # HunyuanOCR (proj.0 = conv1, proj.2 = conv2)
13621363 ),
13631364
13641365 MODEL_TENSOR .V_MMPROJ_FC : (
13651366 "model.connector.modality_projection.proj" , # SmolVLM
13661367 "model.vision.linear_proj.linear_proj" , # cogvlm
13671368 "model.projector.layers" , # Deepseek-OCR
13681369 "visual.merger.proj" , # glm4v
1370+ "vit.perceive.mlp" , # HunyuanOCR
13691371 ),
13701372
13711373 MODEL_TENSOR .V_MMPROJ_MLP : (
@@ -1393,6 +1395,7 @@ class TensorNameMap:
13931395 "model.vision_tower.embeddings.patch_embeddings.projection" , # Intern-S1
13941396 "vpm.embeddings.patch_embedding" ,
13951397 "model.vision_model.embeddings.patch_embedding" , # SmolVLM
1398+ "vit.embeddings.patch_embedding" , # HunyuanOCR
13961399 "vision_tower.patch_conv" , # pixtral-hf
13971400 "vision_encoder.patch_conv" , # pixtral
13981401 "vision_model.patch_embedding.linear" , # llama 4
@@ -1414,6 +1417,7 @@ class TensorNameMap:
14141417 "model.vision_tower.embeddings.position_embeddings" , # Intern-S1
14151418 "vpm.embeddings.position_embedding" ,
14161419 "model.vision_model.embeddings.position_embedding" , # SmolVLM
1420+ "vit.embeddings.position_embedding" , # HunyuanOCR
14171421 "vision_model.positional_embedding_vlm" , # llama 4
14181422 "vision_tower.patch_embed.pos_emb" , # kimi-vl
14191423 "visual.pos_embed" , # qwen3vl
@@ -1425,10 +1429,12 @@ class TensorNameMap:
14251429
14261430 MODEL_TENSOR .V_ENC_EMBD_IMGNL : (
14271431 "model.image_newline" , # Deepseek-OCR
1432+ "vit.perceive.image_newline" , # HunyuanOCR
14281433 ),
14291434
14301435 MODEL_TENSOR .V_ENC_EMBD_VSEP : (
14311436 "model.view_seperator" , # Deepseek-OCR
1437+ "vit.perceive.image_sep" , # HunyuanOCR
14321438 ),
14331439
14341440 MODEL_TENSOR .V_ENC_ATTN_QKV : (
@@ -1444,6 +1450,7 @@ class TensorNameMap:
14441450 "model.vision_tower.encoder.layer.{bid}.attention.q_proj" , # Intern-S1
14451451 "vpm.encoder.layers.{bid}.self_attn.q_proj" ,
14461452 "model.vision_model.encoder.layers.{bid}.self_attn.q_proj" , # SmolVLM
1453+ "vit.layers.{bid}.self_attn.q_proj" , # HunyuanOCR
14471454 "vision_model.model.layers.{bid}.self_attn.q_proj" , # llama4
14481455 "vision_tower.transformer.layers.{bid}.attention.q_proj" , # pixtral-hf
14491456 "vision_encoder.transformer.layers.{bid}.attention.wq" , # pixtral
@@ -1466,6 +1473,7 @@ class TensorNameMap:
14661473 "model.vision_tower.encoder.layer.{bid}.attention.k_proj" , # Intern-S1
14671474 "vpm.encoder.layers.{bid}.self_attn.k_proj" ,
14681475 "model.vision_model.encoder.layers.{bid}.self_attn.k_proj" , # SmolVLM
1476+ "vit.layers.{bid}.self_attn.k_proj" , # HunyuanOCR
14691477 "vision_model.model.layers.{bid}.self_attn.k_proj" , # llama4
14701478 "vision_tower.transformer.layers.{bid}.attention.k_proj" , # pixtral-hf
14711479 "vision_encoder.transformer.layers.{bid}.attention.wk" , # pixtral
@@ -1488,6 +1496,7 @@ class TensorNameMap:
14881496 "model.vision_tower.encoder.layer.{bid}.attention.v_proj" , # Intern-S1
14891497 "vpm.encoder.layers.{bid}.self_attn.v_proj" ,
14901498 "model.vision_model.encoder.layers.{bid}.self_attn.v_proj" , # SmolVLM
1499+ "vit.layers.{bid}.self_attn.v_proj" , # HunyuanOCR
14911500 "vision_model.model.layers.{bid}.self_attn.v_proj" , # llama4
14921501 "vision_tower.transformer.layers.{bid}.attention.v_proj" , # pixtral-hf
14931502 "vision_encoder.transformer.layers.{bid}.attention.wv" , # pixtral
@@ -1504,6 +1513,7 @@ class TensorNameMap:
15041513 "model.vision_tower.encoder.layer.{bid}.layernorm_before" , # Intern-S1
15051514 "vpm.encoder.layers.{bid}.layer_norm1" ,
15061515 "model.vision_model.encoder.layers.{bid}.layer_norm1" , # SmolVLM
1516+ "vit.layers.{bid}.input_layernorm" , # HunyuanOCR
15071517 "vision_tower.transformer.layers.{bid}.attention_norm" , # pixtral-hf
15081518 "vision_encoder.transformer.layers.{bid}.attention_norm" , # pixtral
15091519 "vision_model.model.layers.{bid}.input_layernorm" , # llama4, gemma4
@@ -1521,6 +1531,7 @@ class TensorNameMap:
15211531 "model.vision_tower.encoder.layer.{bid}.attention.projection_layer" , # Intern-S1
15221532 "vpm.encoder.layers.{bid}.self_attn.out_proj" ,
15231533 "model.vision_model.encoder.layers.{bid}.self_attn.out_proj" , # SmolVLM
1534+ "vit.layers.{bid}.self_attn.o_proj" , # HunyuanOCR
15241535 "model.vision_model.encoder.layers.{bid}.self_attn.projection_layer" , # Janus Pro
15251536 "vision_model.model.layers.{bid}.self_attn.o_proj" , # llama4
15261537 "vision_tower.transformer.layers.{bid}.attention.o_proj" , # pixtral-hf
@@ -1540,6 +1551,7 @@ class TensorNameMap:
15401551 "model.vision_tower.encoder.layer.{bid}.layernorm_after" , # Intern-S1
15411552 "vpm.encoder.layers.{bid}.layer_norm2" ,
15421553 "model.vision_model.encoder.layers.{bid}.layer_norm2" , # SmolVLM
1554+ "vit.layers.{bid}.post_attention_layernorm" , # HunyuanOCR
15431555 "vision_model.model.layers.{bid}.post_attention_layernorm" , # llama4
15441556 "vision_tower.transformer.layers.{bid}.ffn_norm" , # pixtral-hf
15451557 "vision_encoder.transformer.layers.{bid}.ffn_norm" , # pixtral
@@ -1557,6 +1569,7 @@ class TensorNameMap:
15571569 "model.vision_tower.encoder.layer.{bid}.mlp.fc1" , # Intern-S1
15581570 "vpm.encoder.layers.{bid}.mlp.fc1" ,
15591571 "model.vision_model.encoder.layers.{bid}.mlp.fc1" , # SmolVLM, gemma3
1572+ "vit.layers.{bid}.mlp.dense_h_to_4h" , # HunyuanOCR
15601573 "vision_tower.transformer.layers.{bid}.feed_forward.up_proj" , # pixtral-hf
15611574 "vision_encoder.transformer.layers.{bid}.feed_forward.w3" , # pixtral
15621575 "vision_model.model.layers.{bid}.mlp.fc1" , # llama4
@@ -1583,6 +1596,7 @@ class TensorNameMap:
15831596 "model.vision_tower.encoder.layer.{bid}.mlp.fc2" , # Intern-S1
15841597 "vpm.encoder.layers.{bid}.mlp.fc2" ,
15851598 "model.vision_model.encoder.layers.{bid}.mlp.fc2" , # SmolVLM, gemma3
1599+ "vit.layers.{bid}.mlp.dense_4h_to_h" , # HunyuanOCR
15861600 "vision_tower.transformer.layers.{bid}.feed_forward.down_proj" , # pixtral-hf
15871601 "vision_encoder.transformer.layers.{bid}.feed_forward.w2" , # pixtral
15881602 "vision_model.model.layers.{bid}.mlp.fc2" , # llama4
@@ -1639,6 +1653,7 @@ class TensorNameMap:
16391653
16401654 MODEL_TENSOR .V_MM_POST_NORM : (
16411655 "visual.merger.post_projection_norm" , # glm4v
1656+ "vit.perceive.after_rms" , # HunyuanOCR
16421657 ),
16431658
16441659 MODEL_TENSOR .V_MM_INP_PROJ : (
@@ -1806,6 +1821,18 @@ class TensorNameMap:
18061821 "model.vision.eoi" , # cogvlm
18071822 ),
18081823
1824+ MODEL_TENSOR .V_MM_PRE_NORM : (
1825+ "vit.perceive.before_rms" , # HunyuanOCR
1826+ ),
1827+
1828+ MODEL_TENSOR .V_TOK_IMG_BEGIN : (
1829+ "vit.perceive.image_begin" , # HunyuanOCR
1830+ ),
1831+
1832+ MODEL_TENSOR .V_TOK_IMG_END : (
1833+ "vit.perceive.image_end" , # HunyuanOCR
1834+ ),
1835+
18091836 MODEL_TENSOR .V_STD_BIAS : (
18101837 "model.vision_tower.std_bias" , # gemma4
18111838 ),