Locke committed
Commit 341db92 · 1 parent: 3e934f1

set model_type; add nmm_infer/config.json
README.md CHANGED

@@ -268,9 +268,10 @@ messages = [

 ```python
 # Simply replace the messages in the main example with the messages below.
+# Suffix user content with '<longcat_img_start>' to force image generation.
 messages = [
     {"role": "system", "content": ""},
-    {"role": "user", "content": "A small kitten sitting naturally on a moss-covered forest floor, centered in the frame, holding a rectangular wooden sign gently with its front paws resting over the top edge. The kitten has soft, fluffy fur, a natural relaxed posture, and a calm, curious expression with a slightly open mouth (not exaggerated), looking directly at the camera.\n\nThe sign is positioned firmly in front of the kitten\'s chest, supported by its paws, with realistic contact and no floating effect. The board reads \"LongCat-Next: When Modalities Internalize as Multilingual Tokens\" in clean, sharp black text, perfectly legible.\n\nThe environment is a lush forest with tall trees, ferns, and soft green foliage. The ground is covered with moss and small plants. Background softly blurred with natural depth of field. Lighting is soft, diffused sunlight filtering through the trees, creating gentle highlights and shadows. Realistic photography style, natural colors, high detail, no cartoonish exaggeration.<longcat_img_start>"}
+    {"role": "user", "content": "A small kitten sitting naturally on a moss-covered forest floor, centered in the frame, holding a rectangular wooden sign gently with its front paws resting over the top edge. The kitten has soft, fluffy fur, a natural relaxed posture, and a calm, curious expression with a slightly open mouth (not exaggerated), looking directly at the camera.\n\nThe sign is positioned firmly in front of the kitten\'s chest, supported by its paws, with realistic contact and no floating effect. The board reads \"LongCat-Next: Lexicalizing Modalities as Discrete Tokens\" in clean, sharp black text, perfectly legible.\n\nThe environment is a lush forest with tall trees, ferns, and soft green foliage. The ground is covered with moss and small plants. Background softly blurred with natural depth of field. Lighting is soft, diffused sunlight filtering through the trees, creating gentle highlights and shadows. Realistic photography style, natural colors, high detail, no cartoonish exaggeration.<longcat_img_start>"}
 ]
 ```

@@ -295,6 +296,7 @@ messages = [

 ```python
 # Simply replace the messages in the main example with the messages below.
+# Suffix user content with '<longcat_audiogen_start>' to force audio generation.
 messages = [
     {"role": "system", "content": "Replicate the voice in the audio clip to formulate an answer:<longcat_audio_start>./assets/system_audio.wav<longcat_audio_end>"},
     {"role": "user", "content": "<longcat_audio_start>./assets/math1.wav<longcat_audio_end><longcat_audiogen_start>"}

@@ -308,6 +310,7 @@ messages = [

 ```python
 # Simply replace the messages in the main example with the messages below.
+# Suffix user content with '<longcat_audiogen_start>' to force audio generation.
 messages = [
     {"role": "system", "content": "Replicate the voice in the audio clip to formulate an answer:<longcat_audio_start>./assets/vc_zh3.wav<longcat_audio_end>"},
     {"role": "user", "content": "用这个声音合成以下内容:明天的meeting在三楼的Conference Room举行。<longcat_audiogen_start>"}

@@ -317,8 +320,7 @@ messages = [
 </details>


-<!-- > [!Tip] -->
-
+> [!Tip]
 > We recommend using the following set of sampling parameters for generation:
 >
 > - Text: `{"max_new_tokens":2048,"do_sample":false}`

@@ -333,30 +335,8 @@ messages = [

 ## Deployment

-We have implemented basic adaptations in SGLang(Code is being uploaded) to support the deployment of LongCat-Next.
-
-```shell
-git clone [TBU]
-cd nmm_infer
-git checkout master
-sh setup.sh
-```
-
-```shell
-# Require CUDA >= 12.9
-
-# Setup environment
-source create_env.sh
-source set_env.sh
-
-# Run tests
-python3 demo.py \
-  --model-path meituan-longcat/LongCat-Next \
-  --sequential \
-  --output-dir output \
-  --tasks vis_gen vis_und aud_qa spk_syn
-```
+We have implemented basic adaptations in SGLang to support the deployment of LongCat-Next. Please refer to this repository for more information: [meituan-longcat/LongCat-Next-inference](https://github.com/meituan-longcat/LongCat-Next-inference)


 ## License Agreement
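The README hunks above all follow one convention: the output modality is selected by suffixing the final user turn with a trigger token (`<longcat_img_start>` for image generation, `<longcat_audiogen_start>` for audio generation). A minimal sketch of that convention; the `force_modality` helper and `TRIGGER_TOKENS` dict are ours for illustration, not part of the LongCat-Next repo:

```python
# Hypothetical helper (not from the LongCat-Next repo): appends the modality
# trigger token that the README examples place at the end of the user turn.
TRIGGER_TOKENS = {
    "image": "<longcat_img_start>",
    "audio": "<longcat_audiogen_start>",
}

def force_modality(messages, modality):
    """Return a copy of `messages` whose last user turn ends with the trigger token."""
    token = TRIGGER_TOKENS[modality]
    out = [dict(m) for m in messages]
    for m in reversed(out):
        if m["role"] == "user":
            # Append the trigger only once, even if called repeatedly.
            if not m["content"].endswith(token):
                m["content"] += token
            break
    return out

messages = [
    {"role": "system", "content": ""},
    {"role": "user", "content": "A small kitten holding a wooden sign."},
]
forced = force_modality(messages, "image")
```

The copy-then-mutate pattern keeps the original `messages` list reusable across modalities.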
assets/evaluation.png CHANGED

Git LFS Details (before)
  • SHA256: 82bc8ab1a053e71f1328241be1739f3b9d0f0c0f84501500070c2e8a49542759
  • Pointer size: 131 Bytes
  • Size of remote file: 267 kB

Git LFS Details (after)
  • SHA256: a4ad74ef615c6524fad0d8aa959cc85ebfb5cbb543ba1dc4fbb142801709d50e
  • Pointer size: 131 Bytes
  • Size of remote file: 267 kB
config.json CHANGED

@@ -38,6 +38,7 @@
 "emb_split_num": 4,
 "torch_dtype": "bfloat16",
 "transformers_version": "4.57.6",
+"model_type": "longcat_next",

 "text_vocab_size": 131072,
 "text_vocab_plus_multimodal_special_token_size": 131125,
configuration_longcat_next.py CHANGED

@@ -5,6 +5,7 @@ from transformers.models.whisper.configuration_whisper import WhisperConfig
 from .configuration_longcat_ngram import LongcatFlashNgramConfig

 class LongcatNextConfig(LongcatFlashNgramConfig):
+    model_type = "longcat_next"
     def __init__(
         self,
         vocab_size=131072,
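Why the class-level `model_type` attribute matters: the Hugging Face auto classes dispatch on the `model_type` string stored in config.json, mapping it to the matching config class. A simplified, self-contained stand-in for that registry mechanism (illustration only, not the actual transformers code):

```python
# Simplified stand-in for transformers' AutoConfig dispatch (illustration only).
CONFIG_REGISTRY = {}

def register_config(cls):
    """Map cls.model_type -> cls, in the spirit of AutoConfig.register()."""
    CONFIG_REGISTRY[cls.model_type] = cls
    return cls

@register_config
class LongcatNextConfigSketch:
    model_type = "longcat_next"

    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)

def auto_config_from_dict(config_dict):
    """Pick the config class named by the "model_type" key, as AutoConfig does."""
    cls = CONFIG_REGISTRY[config_dict["model_type"]]
    return cls(**{k: v for k, v in config_dict.items() if k != "model_type"})

cfg = auto_config_from_dict({"model_type": "longcat_next", "vocab_size": 131072})
```

Without the `model_type` attribute (and the matching key in config.json), this lookup has nothing to dispatch on, which is what the commit fixes.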
nmm_infer/config.json ADDED
@@ -0,0 +1,1153 @@
{
  "architectures": [
    "LongcatCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_omni.LongcatConfig"
  },
  "vocab_size": 282624,
  "hidden_size": 3072,
  "ffn_hidden_size": 6144,
  "expert_ffn_hidden_size": 1024,
  "num_layers": 14,
  "num_attention_heads": 32,
  "kv_lora_rank": 512,
  "q_lora_rank": 1536,
  "qk_rope_head_dim": 64,
  "v_head_dim": 128,
  "qk_nope_head_dim": 128,
  "mla_scale_q_lora": true,
  "mla_scale_kv_lora": true,
  "routed_scaling_factor": 6.0,
  "n_routed_experts": 256,
  "max_position_embeddings": 131072,
  "rms_norm_eps": 1e-5,
  "use_cache": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "rope_theta": 10000000,
  "attention_method": "MLA",
  "zero_expert_num": 128,
  "zero_expert_type": "identity",
  "moe_topk": 12,
  "use_mla": 1,
  "moe_switch_token_num": 1024,
  "moe_impl": "mix",
  "oe_vocab_size_ratio": 78,
  "oe_neighbor_num": 4,
  "oe_split_num": 4,
  "embP": 16,
  "visual_offset": 150581,
  "vocab_size_text": 131072,
  "vocab_size_special": 131125,
  "audio_offset": 131125,
  "audio_config": {
    "_attn_implementation_autoset": false,
    "_name_or_path": "",
    "activation_dropout": 0.0,
    "activation_function": "gelu",
    "add_cross_attention": false,
    "apply_spec_augment": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "audio_codebook_loss_weights": [1.0, 0.5, 0.25, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.125],
    "audio_delim_token_id": 131116,
    "audio_end_token_id": 131104,
    "audio_head_transformer_dims": 3072,
    "audio_head_transformer_ffn_scale": 16,
    "audio_head_transformer_layers": 4,
    "audio_loss_weight": 1.0,
    "audio_pad_token_id": 131105,
    "audio_start_token_id": 131103,
    "audio_text_delay": 1,
    "audiogen_end_token_id": 131124,
    "audiogen_start_token_id": 131123,
    "audiotext_end_token_id": 131121,
    "audiotext_loss_weight": 1.0,
    "audiotext_pad_token_id": 131122,
    "audiotext_start_token_id": 131120,
    "avg_pooler": 4,
    "bad_words_ids": null,
    "begin_suppress_tokens": [220, 50256],
    "bos_token_id": 50256,
    "chunk_size_feed_forward": 0,
    "classifier_proj_size": 256,
    "cross_attention_hidden_size": null,
    "d_model": 1280,
    "decoder_attention_heads": 20,
    "decoder_ffn_dim": 5120,
    "decoder_kernel_size": 3,
    "decoder_layerdrop": 0.0,
    "decoder_layers": 8,
    "decoder_start_token_id": 50257,
    "decoder_stride_size": 2,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.0,
    "early_stopping": false,
    "enable": true,
    "encoder_attention_heads": 20,
    "encoder_ffn_dim": 5120,
    "encoder_layerdrop": 0.0,
    "encoder_layers": 32,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 50256,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hop_length": 160,
    "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
    "init_std": 0.02,
    "is_decoder": false,
    "is_encoder_decoder": true,
    "kernel_size": 3,
    "label2id": {"LABEL_0": 0, "LABEL_1": 1},
    "length_penalty": 1.0,
    "mask_feature_length": 10,
    "mask_feature_min_masks": 0,
    "mask_feature_prob": 0.0,
    "mask_time_length": 10,
    "mask_time_min_masks": 2,
    "mask_time_prob": 0.05,
    "max_audio_seconds": 30,
    "max_length": 20,
    "max_source_positions": 1500,
    "max_target_positions": 448,
    "median_filter_width": 7,
    "min_length": 0,
    "model_type": "whisper",
    "n_fft": 400,
    "no_repeat_ngram_size": 0,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_hidden_layers": 32,
    "num_mel_bins": 128,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": 50256,
    "prefix": null,
    "problem_type": null,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sampling_rate": 16000,
    "scale_embedding": false,
    "sep_token_id": null,
    "split_overlap": 0.0,
    "stride_size": 2,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "typical_p": 1.0,
    "use_bfloat16": false,
    "use_cache": true,
    "use_weighted_layer_sum": false,
    "vocab_size": 51865,
    "vq_config": {
      "_attn_implementation_autoset": false,
      "_name_or_path": "",
      "add_cross_attention": false,
      "architectures": null,
      "bad_words_ids": null,
      "begin_suppress_tokens": null,
      "bos_token_id": null,
      "chunk_size_feed_forward": 0,
      "codebook_sizes": [8192, 4096, 2048, 1024, 1024, 1024, 1024, 1024],
      "cross_attention_hidden_size": null,
      "decoder_start_token_id": null,
      "diversity_penalty": 0.0,
      "do_sample": false,
      "early_stopping": false,
      "enable": true,
      "encoder_no_repeat_ngram_size": 0,
      "eos_token_id": null,
      "exponential_decay_length_penalty": null,
      "finetuning_task": null,
      "forced_bos_token_id": null,
      "forced_eos_token_id": null,
      "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
      "is_decoder": false,
      "is_encoder_decoder": false,
      "label2id": {"LABEL_0": 0, "LABEL_1": 1},
      "length_penalty": 1.0,
      "max_length": 20,
      "min_length": 0,
      "model_type": "",
      "no_repeat_ngram_size": 0,
      "num_beam_groups": 1,
      "num_beams": 1,
      "num_return_sequences": 1,
      "output_attentions": false,
      "output_hidden_states": false,
      "output_scores": false,
      "pad_token_id": null,
      "prefix": null,
      "problem_type": null,
      "pruned_heads": {},
      "remove_invalid_values": false,
      "repetition_penalty": 1.0,
      "return_dict": true,
      "return_dict_in_generate": false,
      "sep_token_id": null,
      "suppress_tokens": null,
      "task_specific_params": null,
      "temperature": 1.0,
      "tf_legacy_loss": false,
      "tie_encoder_decoder": false,
      "tie_word_embeddings": true,
      "tokenizer_class": null,
      "top_k": 50,
      "top_p": 1.0,
      "torch_dtype": null,
      "torchscript": false,
      "typical_p": 1.0,
      "use_bfloat16": false
    }
  },
  "flow_matching_config": {
    "_attn_implementation_autoset": false,
    "_name_or_path": "",
    "act_fn": "gelu",
    "add_cross_attention": false,
    "architectures": null,
    "attention_head_dim": 64,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "cal_mel_mae": true,
    "cfm_params": {
      "_attn_implementation_autoset": false,
      "_name_or_path": "",
      "add_cross_attention": false,
      "architectures": null,
      "bad_words_ids": null,
      "begin_suppress_tokens": null,
      "bos_token_id": null,
      "chunk_size_feed_forward": 0,
      "cross_attention_hidden_size": null,
      "decoder_start_token_id": null,
      "diversity_penalty": 0.0,
      "do_sample": false,
      "early_stopping": false,
      "encoder_no_repeat_ngram_size": 0,
      "eos_token_id": null,
      "exponential_decay_length_penalty": null,
      "finetuning_task": null,
      "forced_bos_token_id": null,
      "forced_eos_token_id": null,
      "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
      "inference_cfg_rate": 0.7,
      "is_decoder": false,
      "is_encoder_decoder": false,
      "label2id": {"LABEL_0": 0, "LABEL_1": 1},
      "length_penalty": 1.0,
      "max_length": 20,
      "min_length": 0,
      "model_type": "",
      "no_repeat_ngram_size": 0,
      "num_beam_groups": 1,
      "num_beams": 1,
      "num_return_sequences": 1,
      "output_attentions": false,
      "output_hidden_states": false,
      "output_scores": false,
      "pad_token_id": null,
      "prefix": null,
      "problem_type": null,
      "pruned_heads": {},
      "reg_loss_type": "l1",
      "remove_invalid_values": false,
      "repetition_penalty": 1.0,
      "return_dict": true,
      "return_dict_in_generate": false,
      "sep_token_id": null,
      "sigma_min": 1e-06,
      "solver": "euler",
      "suppress_tokens": null,
      "t_scheduler": "cosine",
      "task_specific_params": null,
      "temperature": 1.0,
      "tf_legacy_loss": false,
      "tie_encoder_decoder": false,
      "tie_word_embeddings": true,
      "tokenizer_class": null,
      "top_k": 50,
      "top_p": 1.0,
      "torch_dtype": null,
      "torchscript": false,
      "training_cfg_rate": 0.2,
      "typical_p": 1.0,
      "use_bfloat16": false
    },
    "channels": [256],
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diffusion_steps": 10,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.0,
    "early_stopping": false,
    "enable": true,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hop_length": 480,
    "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
    "in_channels": 80,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {"LABEL_0": 0, "LABEL_1": 1},
    "length_penalty": 1.0,
    "loss_weight": 1.0,
    "max_audio_seconds": 30,
    "max_length": 20,
    "min_length": 0,
    "model_type": "",
    "n_blocks": 4,
    "no_repeat_ngram_size": 0,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_heads": 8,
    "num_mid_blocks": 12,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "prefix": null,
    "prenet_activation_function": "gelu",
    "prenet_attention_heads": 8,
    "prenet_d_model": 512,
    "prenet_ffn_dim": 2048,
    "prenet_in_dim": 1280,
    "prenet_loss_weight": 1.0,
    "prenet_max_source_positions": 5000,
    "prenet_nlayers": 12,
    "prenet_out_dim": 80,
    "prenet_target_mel_length_scale_ratio": 1.0,
    "problem_type": null,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sampling_rate": 24000,
    "sep_token_id": null,
    "spk_emb_dim": 0,
    "split_overlap": 0.1,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "typical_p": 1.0,
    "unet_use_omni_attn": false,
    "use_bfloat16": false,
    "use_hidden_states_before_dconv2": true,
    "use_hires_mel": true
  },
  "head_dim": 128,
  "hidden_act": "silu",
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_window_layers": 28,
  "model_type": "omni",
  "multimodal": ["image", "video", "audio", "audiogen"],
  "multimodal_special_token_list": [
    131072, 131073, 131074, 131075, 131076, 131077, 131078, 131079,
    131080, 131081, 131082, 131083, 131084, 131085, 131086, 131087,
    131088, 131089, 131090, 131091, 131092, 131093, 131094, 131095,
    131096, 131097, 131098, 131099, 131100, 131101, 131102, 131103,
    131104, 131105, 131106, 131107, 131108, 131109, 131110, 131111,
    131112, 131113, 131114, 131115, 131116, 131117, 131118, 131119,
    131120, 131121, 131122, 131123, 131124
  ],
  "num_hidden_layers": 14,
  "num_key_value_heads": 4,
  "omni_tokenizer_type": "auto",
  "pad_token_id": 0,
  "position_embedding_type": "rope",
  "rope_scaling": {
    "mrope_section": [16, 24, 24],
    "type": "mrope"
  },
  "sliding_window": 131072,
  "sparse_attention_heads": null,
  "sparse_attention_layers": [],
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "train_multimodal_special_tokens_only": false,
  "transformers_version": "4.51.3",
  "use_norm_head": false,
  "use_sliding_window": false,
  "video_config": {
    "_attn_implementation_autoset": false,
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decode_way": "1fps",
    "decoder_start_token_id": null,
    "depth": 32,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "enable": true,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "frame_pattern": "<frame><TIMEIDX>",
    "hidden_act": "quick_gelu",
    "hidden_size": 3584,
    "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
    "image_delimiter_token_id": 131115,
    "image_end_token_id": 131107,
    "image_line_token_id": 131109,
    "image_mean": [0.48145466, 0.4578275, 0.40821073],
    "image_pad_token_id": 131108,
    "image_size": 224,
    "image_start_token_id": 131106,
    "image_std": [0.26862954, 0.26130258, 0.27577711],
    "in_channels": 3,
    "in_chans": 3,
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 3420,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {"LABEL_0": 0, "LABEL_1": 1},
    "layer_norm_eps": 1e-05,
    "length_penalty": 1.0,
    "max_frame_num": 128,
    "max_length": 20,
    "max_pixels": 150528,
    "merge_size": 2,
    "min_length": 0,
    "min_pixels": 78400,
    "mlp_ratio": 4,
    "model_type": "clip_vision_model",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 12,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_channels": 3,
    "num_heads": 16,
    "num_hidden_layers": 12,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "patch_size": 14,
    "prefix": null,
    "problem_type": null,
    "projection_dim": 512,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "spatial_merge_size": 2,
    "spatial_patch_size": 14,
    "split_video": true,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "temporal_patch_size": 2,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "typical_p": 1.0,
    "use_bfloat16": false,
    "video_end_token_id": 131119,
    "video_place_token_id": 131117,
    "video_start_token_id": 131118
  },
  "visual_config": {
    "_attn_implementation_autoset": true,
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "depth": 32,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "enable": true,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "freeze": true,
    "fullatt_block_indexes": [7, 15, 23, 31],
    "hidden_act": "silu",
    "hidden_size": 1280,
    "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
    "image_delimiter_token_id": 131115,
    "image_end_token_id": 131107,
    "image_head_config": {
      "_attn_implementation_autoset": false,
      "_name_or_path": "",
      "add_cross_attention": false,
      "architectures": null,
      "bad_words_ids": null,
      "begin_suppress_tokens": null,
      "bos_token_id": null,
      "chunk_size_feed_forward": 0,
      "cross_attention_hidden_size": null,
      "decoder_start_token_id": null,
      "diversity_penalty": 0.0,
      "do_sample": false,
      "early_stopping": false,
      "enable": true,
      "encoder_no_repeat_ngram_size": 0,
      "eos_token_id": null,
      "exponential_decay_length_penalty": null,
      "finetuning_task": null,
      "forced_bos_token_id": null,
      "forced_eos_token_id": null,
      "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
      "image_head_transformer_dims": 2048,
      "image_head_transformer_ffn_scale": 16,
      "image_head_transformer_layers": 4,
      "is_decoder": false,
      "is_encoder_decoder": false,
      "label2id": {"LABEL_0": 0, "LABEL_1": 1},
      "length_penalty": 1.0,
      "max_length": 20,
      "min_length": 0,
      "model_type": "",
      "no_repeat_ngram_size": 0,
      "num_beam_groups": 1,
      "num_beams": 1,
      "num_return_sequences": 1,
      "output_attentions": false,
      "output_hidden_states": false,
      "output_scores": false,
      "pad_token_id": null,
      "prefix": null,
      "problem_type": null,
      "pruned_heads": {},
      "remove_invalid_values": false,
      "repetition_penalty": 1.0,
      "return_dict": true,
      "return_dict_in_generate": false,
      "sep_token_id": null,
      "suppress_tokens": null,
      "task_specific_params": null,
      "temperature": 1.0,
      "tf_legacy_loss": false,
      "tie_encoder_decoder": false,
      "tie_word_embeddings": true,
      "tokenizer_class": null,
      "top_k": 50,
      "top_p": 1.0,
      "torch_dtype": null,
      "torchscript": false,
      "typical_p": 1.0,
      "use_bfloat16": false,
      "visual_codebook_loss_weights": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    },
    "image_line_token_id": 131109,
    "image_mean": [0.48145466, 0.4578275, 0.40821073],
    "image_pad_token_id": 131108,
    "image_size": 224,
    "image_start_token_id": 131106,
    "image_std": [0.26862954, 0.26130258, 0.27577711],
    "in_channels": 3,
    "in_chans": 3,
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 3420,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {"LABEL_0": 0, "LABEL_1": 1},
    "layer_norm_eps": 1e-05,
    "length_penalty": 1.0,
    "max_length": 20,
    "max_pixels": 3211264,
    "merge_size": 2,
    "min_length": 0,
    "min_pixels": 50176,
    "mlp_ratio": 4,
    "model_type": "clip_vision_model",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 12,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_channels": 3,
    "num_heads": 16,
    "num_hidden_layers": 12,
    "num_return_sequences": 1,
    "out_hidden_size": 3584,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "patch_size": 14,
    "prefix": null,
    "problem_type": null,
    "projection_dim": 512,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "spatial_merge_size": 2,
    "spatial_patch_size": 14,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "temporal_patch_size": 2,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "tokens_per_second": 2,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "typical_p": 1.0,
    "use_bfloat16": false,
    "visual_loss_weight": 0.125,
    "window_size": 112
  },
  "visual_quantizer_config": {
    "_attn_implementation_autoset": false,
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "codebook_dim": 3584,
    "codebook_size": 16384,
    "codebook_sizes": [16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384],
    "commit_loss_ratio": 0.25,
    "cross_attention_hidden_size": null,
    "decay": 0.99,
    "decoder_start_token_id": null,
    "depth": 8,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "enable": true,
    "encoder_no_repeat_ngram_size": 0,
    "entropy_loss_ratio": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "feature_decoder_config": {
      "_attn_implementation_autoset": false,
      "_name_or_path": "",
      "add_cross_attention": false,
      "architectures": null,
      "bad_words_ids": null,
      "begin_suppress_tokens": null,
      "bos_token_id": null,
      "chunk_size_feed_forward": 0,
      "cross_attention_hidden_size": null,
      "decoder_start_token_id": null,
      "depth": 3,
      "diversity_penalty": 0.0,
      "do_sample": false,
      "early_stopping": false,
      "enable": false,
      "encoder_no_repeat_ngram_size": 0,
      "eos_token_id": null,
      "exponential_decay_length_penalty": null,
      "finetuning_task": null,
      "forced_bos_token_id": null,
      "forced_eos_token_id": null,
      "fullatt_block_indexes": [0, 1, 2],
      "hidden_act": "silu",
      "hidden_size": 1280,
      "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
      "in_channels": 1280,
      "intermediate_size": 3420,
      "is_decoder": false,
      "is_encoder_decoder": false,
      "label2id": {"LABEL_0": 0, "LABEL_1": 1},
      "length_penalty": 1.0,
      "max_length": 20,
      "min_length": 0,
      "model_type": "",
      "no_repeat_ngram_size": 0,
      "num_beam_groups": 1,
      "num_beams": 1,
      "num_heads": 16,
      "num_return_sequences": 1,
+ "out_channels": 3584,
903
+ "output_attentions": false,
904
+ "output_hidden_states": false,
905
+ "output_scores": false,
906
+ "pad_token_id": null,
907
+ "patch_size": 1,
908
+ "post_quant_conv": true,
909
+ "prefix": null,
910
+ "problem_type": null,
911
+ "pruned_heads": {},
912
+ "remove_invalid_values": false,
913
+ "repetition_penalty": 1.0,
914
+ "return_dict": true,
915
+ "return_dict_in_generate": false,
916
+ "sep_token_id": null,
917
+ "spatial_merge_size": 1,
918
+ "suppress_tokens": null,
919
+ "task_specific_params": null,
920
+ "temperature": 1.0,
921
+ "temporal_patch_size": 1,
922
+ "tf_legacy_loss": false,
923
+ "tie_encoder_decoder": false,
924
+ "tie_word_embeddings": true,
925
+ "tokenizer_class": null,
926
+ "top_k": 50,
927
+ "top_p": 1.0,
928
+ "torch_dtype": null,
929
+ "torchscript": false,
930
+ "typical_p": 1.0,
931
+ "use_bfloat16": false,
932
+ "use_sliding_window": false,
933
+ "window_size": -1
934
+ },
935
+ "feature_reconstruction_ratio": 1.0,
936
+ "finetuning_task": null,
937
+ "forced_bos_token_id": null,
938
+ "forced_eos_token_id": null,
939
+ "id2label": {
940
+ "0": "LABEL_0",
941
+ "1": "LABEL_1"
942
+ },
943
+ "in_channels": 3584,
944
+ "is_decoder": false,
945
+ "is_encoder_decoder": false,
946
+ "label2id": {
947
+ "LABEL_0": 0,
948
+ "LABEL_1": 1
949
+ },
950
+ "length_penalty": 1.0,
951
+ "max_length": 20,
952
+ "min_length": 0,
953
+ "model_type": "",
954
+ "no_repeat_ngram_size": 0,
955
+ "num_beam_groups": 1,
956
+ "num_beams": 1,
957
+ "num_return_sequences": 1,
958
+ "output_attentions": false,
959
+ "output_hidden_states": false,
960
+ "output_scores": false,
961
+ "pad_token_id": null,
962
+ "prefix": null,
963
+ "problem_type": null,
964
+ "pruned_heads": {},
965
+ "quant_conv": true,
966
+ "quantizer_type": "rq",
967
+ "remove_invalid_values": false,
968
+ "repetition_penalty": 1.0,
969
+ "restart_unused_codes": true,
970
+ "return_dict": true,
971
+ "return_dict_in_generate": false,
972
+ "sep_token_id": null,
973
+ "shared_codebook": true,
974
+ "suppress_tokens": null,
975
+ "task_specific_params": null,
976
+ "temperature": 1.0,
977
+ "tf_legacy_loss": false,
978
+ "tie_encoder_decoder": false,
979
+ "tie_word_embeddings": true,
980
+ "tokenizer_class": null,
981
+ "top_k": 50,
982
+ "top_p": 1.0,
983
+ "torch_dtype": null,
984
+ "torchscript": false,
985
+ "typical_p": 1.0,
986
+ "use_bfloat16": false,
987
+ "vq_loss_ratio": 0
988
+ },
989
+ "vocoder_config": {
990
+ "_attn_implementation_autoset": false,
991
+ "_name_or_path": "",
992
+ "add_cross_attention": false,
993
+ "architectures": null,
994
+ "bad_words_ids": null,
995
+ "begin_suppress_tokens": null,
996
+ "bos_token_id": null,
997
+ "channels": [
998
+ 256,
999
+ 256,
1000
+ 256,
1001
+ 256,
1002
+ 256
1003
+ ],
1004
+ "chunk_size_feed_forward": 0,
1005
+ "cross_attention_hidden_size": null,
1006
+ "decoder_start_token_id": null,
1007
+ "diversity_penalty": 0.0,
1008
+ "do_sample": false,
1009
+ "early_stopping": false,
1010
+ "enable": true,
1011
+ "enable_multi_scale": true,
1012
+ "encoder_no_repeat_ngram_size": 0,
1013
+ "eos_token_id": null,
1014
+ "exponential_decay_length_penalty": null,
1015
+ "finetuning_task": null,
1016
+ "forced_bos_token_id": null,
1017
+ "forced_eos_token_id": null,
1018
+ "hop_length": 256,
1019
+ "id2label": {
1020
+ "0": "LABEL_0",
1021
+ "1": "LABEL_1"
1022
+ },
1023
+ "is_decoder": false,
1024
+ "is_encoder_decoder": false,
1025
+ "label2id": {
1026
+ "LABEL_0": 0,
1027
+ "LABEL_1": 1
1028
+ },
1029
+ "length_penalty": 1.0,
1030
+ "max_audio_seconds": 30,
1031
+ "max_length": 20,
1032
+ "min_length": 0,
1033
+ "model_type": "",
1034
+ "n_fft": 1024,
1035
+ "no_repeat_ngram_size": 0,
1036
+ "num_beam_groups": 1,
1037
+ "num_beams": 1,
1038
+ "num_mel_bins": 80,
1039
+ "num_return_sequences": 1,
1040
+ "output_attentions": false,
1041
+ "output_hidden_states": false,
1042
+ "output_scores": false,
1043
+ "pad_token_id": null,
1044
+ "prefix": null,
1045
+ "problem_type": null,
1046
+ "pruned_heads": {},
1047
+ "remove_invalid_values": false,
1048
+ "repetition_penalty": 1.0,
1049
+ "return_dict": true,
1050
+ "return_dict_in_generate": false,
1051
+ "sampling_rate": 16000,
1052
+ "sep_token_id": null,
1053
+ "split_overlap": 0.0,
1054
+ "suppress_tokens": null,
1055
+ "task_specific_params": null,
1056
+ "temperature": 1.0,
1057
+ "tf_legacy_loss": false,
1058
+ "tie_encoder_decoder": false,
1059
+ "tie_word_embeddings": true,
1060
+ "tokenizer_class": null,
1061
+ "top_k": 50,
1062
+ "top_p": 1.0,
1063
+ "torch_dtype": null,
1064
+ "torchscript": false,
1065
+ "typical_p": 1.0,
1066
+ "use_bfloat16": false
1067
+ },
1068
+ "visual_decoder_config": {
1069
+ "codebook_dim": 3584,
1070
+
1071
+ "image_decoder_config": {
1072
+ "attention_dropout": 0.0,
1073
+ "codebook_dim": 3584,
1074
+ "distill_taps": [
1075
+ 3,
1076
+ 7,
1077
+ 15,
1078
+ 23
1079
+ ],
1080
+ "hidden_act": "gelu",
1081
+ "hidden_size": 1024,
1082
+ "intermediate_size": 2730,
1083
+ "k_bias": false,
1084
+ "layer_norm_eps": 1e-06,
1085
+ "num_attention_heads": 16,
1086
+ "num_hidden_layers": 32,
1087
+ "patch_size": 14,
1088
+ "q_bias": true,
1089
+ "spatial_merge_size": 2,
1090
+ "subln": true,
1091
+ "swiglu": true,
1092
+ "teacher_dims": {
1093
+ "15": 1280,
1094
+ "23": 1280,
1095
+ "3": 1280,
1096
+ "7": 1280
1097
+ },
1098
+ "temporal_patch_size": 2,
1099
+ "v_bias": true
1100
+ },
1101
+
1102
+ "transformer_config": {
1103
+ "patch_size": 2,
1104
+ "in_channels": 16,
1105
+ "hidden_size": 2520,
1106
+ "num_layers": 32,
1107
+ "num_refiner_layers": 2,
1108
+ "num_attention_heads": 21,
1109
+ "num_kv_heads": 7,
1110
+ "multiple_of": 256,
1111
+ "norm_eps": 1e-5,
1112
+ "axes_dim_rope": [40, 40, 40],
1113
+ "axes_lens": [10000, 10000, 10000],
1114
+ "text_feat_dim": 2048,
1115
+ "timestep_scale": 1000.0
1116
+ },
1117
+
1118
+ "vae_config": {
1119
+ "act_fn": "silu",
1120
+ "block_out_channels": [128, 256, 512, 512],
1121
+ "down_block_types": [
1122
+ "DownEncoderBlock2D",
1123
+ "DownEncoderBlock2D",
1124
+ "DownEncoderBlock2D",
1125
+ "DownEncoderBlock2D"
1126
+ ],
1127
+ "in_channels": 3,
1128
+ "latent_channels": 16,
1129
+ "layers_per_block": 2,
1130
+ "mid_block_add_attention": true,
1131
+ "norm_num_groups": 32,
1132
+ "out_channels": 3,
1133
+ "sample_size": 1024,
1134
+ "scaling_factor": 0.3611,
1135
+ "shift_factor": 0.1159,
1136
+ "up_block_types": [
1137
+ "UpDecoderBlock2D",
1138
+ "UpDecoderBlock2D",
1139
+ "UpDecoderBlock2D",
1140
+ "UpDecoderBlock2D"
1141
+ ],
1142
+ "use_post_quant_conv": false,
1143
+ "use_quant_conv": false,
1144
+ "force_upcast": true
1145
+ },
1146
+
1147
+ "scheduler_config": {
1148
+ "num_train_timesteps": 1000,
1149
+ "dynamic_time_shift": true
1150
+ }
1151
+ },
1152
+ "ignored_token_ids": "131072:131125"
1153
+ }
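Note that the trailing `"ignored_token_ids"` entry encodes a token-id range as a `"start:end"` string rather than a JSON list. A minimal sketch of how such a spec could be expanded, assuming a Python-style half-open range (the actual loading code defines the real semantics, and `parse_ignored_token_ids` is a hypothetical helper, not part of this repo):

```python
import json

def parse_ignored_token_ids(spec: str) -> range:
    # "start:end" is assumed to be a half-open range of token ids,
    # mirroring Python's range() convention.
    start, end = (int(part) for part in spec.split(":"))
    return range(start, end)

# Minimal fragment of the config above, for illustration only.
fragment = json.loads('{"ignored_token_ids": "131072:131125"}')
ids = parse_ignored_token_ids(fragment["ignored_token_ids"])
print(len(ids), ids[0], ids[-1])  # 53 ids: 131072 .. 131124
```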
nmm_infer/configuration_omni.py ADDED
@@ -0,0 +1,248 @@
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.modeling_rope_utils import rope_config_validation
+ from transformers.utils import logging
+ from transformers import WhisperConfig
+ from transformers import CLIPVisionConfig
+
+ logger = logging.get_logger(__name__)
+
+ LONGCAT_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+
+ class LongcatConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`LongcatModel`]. It is used to instantiate a
+     Longcat model according to the specified arguments, defining the model architecture. Instantiating a
+     configuration with the defaults will yield a similar configuration to that of the Longcat.
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 282624):
+             Vocabulary size of the Longcat model. Defines the number of different tokens that can be represented by
+             the `inputs_ids` passed when calling [`LongcatModel`].
+         hidden_size (`int`, *optional*, defaults to 3072):
+             Dimension of the hidden representations.
+         ffn_hidden_size (`int`, *optional*, defaults to 6144):
+             Dimension of the MLP representations.
+         expert_ffn_hidden_size (`int`, *optional*, defaults to 1024):
+             Dimension of the MoE representations.
+         num_layers (`int`, *optional*, defaults to 14):
+             Number of hidden layers in the Transformer decoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         num_key_value_heads (`int`, *optional*, defaults to 4):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
+             `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
+             constructed by meanpooling all the original heads within that group. For more details check out [this
+             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+             `num_attention_heads`.
+         n_routed_experts (`int`, *optional*, defaults to 256):
+             Number of routed experts.
+         routed_scaling_factor (`float`, *optional*, defaults to 6.0):
+             Scaling factor for routed experts.
+         kv_lora_rank (`int`, *optional*, defaults to 512):
+             Rank of the LoRA matrices for key and value projections.
+         q_lora_rank (`int`, *optional*, defaults to 1536):
+             Rank of the LoRA matrices for query projections.
+         qk_rope_head_dim (`int`, *optional*, defaults to 64):
+             Dimension of the query/key heads that use rotary position embeddings.
+         v_head_dim (`int`, *optional*, defaults to 128):
+             Dimension of the value heads.
+         qk_nope_head_dim (`int`, *optional*, defaults to 128):
+             Dimension of the query/key heads that don't use rotary position embeddings.
+         norm_topk_prob (`bool`, *optional*, defaults to `False`):
+             Whether to normalize the weights of the routed experts.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to 8192):
+             The maximum sequence length that this model might ever be used with.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         pad_token_id (`int`, *optional*):
+             Padding token id.
+         bos_token_id (`int`, *optional*, defaults to 1):
+             Beginning of stream token id.
+         eos_token_id (`int`, *optional*, defaults to 2):
+             End of stream token id.
+         tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+             Whether to tie weight embeddings.
+         rope_theta (`float`, *optional*, defaults to 1000000):
+             The base period of the RoPE embeddings.
+         attention_bias (`bool`, *optional*, defaults to `False`):
+             Whether to use a bias in the query, key, value and output projection layers during self-attention.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+
+     ```python
+     >>> from transformers import LongcatModel, LongcatConfig
+
+     >>> # Initializing a Longcat style configuration
+     >>> configuration = LongcatConfig()
+
+     >>> # Initializing a model from the configuration
+     >>> model = LongcatModel(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "longcat"
+     keys_to_ignore_at_inference = ["past_key_values"]
+     base_model_tp_plan = {  # TODO: only replicate attention layers when > first_k_dense_replace
+         "layers.*.self_attn.k_proj": "colwise",
+         "layers.*.self_attn.v_proj": "colwise",
+         "layers.*.self_attn.o_proj": "rowwise",
+         "layers.*.mlp.experts.*.gate_proj": "local_colwise",
+         "layers.*.mlp.experts.*.up_proj": "local_colwise",
+         "layers.*.mlp.experts.*.down_proj": "local_rowwise",
+         "layers.*.mlps.*.gate_proj": "local_colwise",
+         "layers.*.mlps.*.up_proj": "local_colwise",
+         "layers.*.mlps.*.down_proj": "local_rowwise",
+     }
+     base_model_pp_plan = {
+         "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+         "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+         "norm": (["hidden_states"], ["hidden_states"]),
+     }
+
+     def __init__(
+         self,
+         vocab_size=282624,
+         hidden_size=3072,
+         ffn_hidden_size=6144,
+         expert_ffn_hidden_size=1024,
+         num_layers=14,
+         num_attention_heads=32,
+         num_key_value_heads=4,
+         n_routed_experts=256,
+         routed_scaling_factor=6.0,
+         kv_lora_rank=512,
+         q_lora_rank=1536,
+         qk_rope_head_dim=64,
+         v_head_dim=128,
+         head_dim=128,
+         qk_nope_head_dim=128,
+         mla_scale_q_lora=True,
+         mla_scale_kv_lora=True,
+         moe_topk=12,
+         norm_topk_prob=False,
+         hidden_act="silu",
+         max_position_embeddings=8192,
+         rms_norm_eps=1e-5,
+         use_cache=True,
+         pad_token_id=None,
+         bos_token_id=1,
+         eos_token_id=2,
+         tie_word_embeddings=True,
+         rope_theta=1000000,
+         attention_bias=False,
+         attention_dropout=0.0,
+         attention_method='MLA',
+         initializer_range=0.02,
+         router_bias=False,
+         zero_expert_num=None,
+         zero_expert_type=None,
+         oe_vocab_size_ratio=None,
+         oe_neighbor_num=None,
+         oe_split_num=None,
+         embP=None,
+         audio_config=None,
+         visual_config=None,
+         video_config=None,
+         vocoder_config=None,
+         flow_matching_config=None,
+         visual_quantizer_config=None,
+         visual_decoder_config=None,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.ffn_hidden_size = ffn_hidden_size
+         self.expert_ffn_hidden_size = expert_ffn_hidden_size
+         self.num_layers = num_layers
+         self.num_attention_heads = num_attention_heads
+         self.n_routed_experts = n_routed_experts
+         self.routed_scaling_factor = routed_scaling_factor
+         self.kv_lora_rank = kv_lora_rank
+         self.q_lora_rank = q_lora_rank
+         self.qk_rope_head_dim = qk_rope_head_dim
+         self.v_head_dim = v_head_dim
+         self.qk_nope_head_dim = qk_nope_head_dim
+         self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
+         self.moe_topk = moe_topk
+         self.norm_topk_prob = norm_topk_prob
+         self.mla_scale_q_lora = mla_scale_q_lora
+         self.mla_scale_kv_lora = mla_scale_kv_lora
+         self.attention_method = attention_method
+         self.initializer_range = initializer_range
+         self.router_bias = router_bias
+         self.zero_expert_num = zero_expert_num
+         self.zero_expert_type = zero_expert_type
+         self.oe_vocab_size_ratio = oe_vocab_size_ratio
+         self.oe_neighbor_num = oe_neighbor_num
+         self.oe_split_num = oe_split_num
+         self.embP = embP
+
+         if self.attention_method == "GQA":
+             self.head_dim = head_dim
+         elif self.attention_method == "MLA":
+             self.head_dim = qk_rope_head_dim
+         else:
+             raise ValueError('attention_method should be one of ["GQA", "MLA"]')
+
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.rms_norm_eps = rms_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.attention_bias = attention_bias
+         self.attention_dropout = attention_dropout
+         # Validate the correctness of rotary position embeddings parameters
+         # BC: if there is a 'type' field, copy it to 'rope_type'.
+         rope_config_validation(self)
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+         if audio_config is not None:
+             self.audio_config = WhisperConfig(**audio_config)
+             if self.audio_config.vq_config is not None:
+                 self.audio_config.vq_config = PretrainedConfig(**self.audio_config.vq_config)
+         if vocoder_config is not None:
+             self.vocoder_config = PretrainedConfig(**vocoder_config)
+         if flow_matching_config is not None:
+             self.flow_matching_config = PretrainedConfig(**flow_matching_config)
+             self.flow_matching_config.cfm_params = PretrainedConfig(**self.flow_matching_config.cfm_params)
+         if visual_config is not None:
+             self.visual_config = CLIPVisionConfig(**visual_config)
+             if hasattr(self.visual_config, 'vq_config') and self.visual_config.vq_config is not None:
+                 self.visual_config.vq_config = PretrainedConfig(**self.visual_config.vq_config)
+             if hasattr(self.visual_config, 'image_head_config') and self.visual_config.image_head_config is not None:
+                 self.visual_config.image_head_config = PretrainedConfig(**self.visual_config.image_head_config)
+         if video_config is not None:
+             self.video_config = CLIPVisionConfig(**video_config)
+         if visual_quantizer_config is not None:
+             self.visual_quantizer_config = PretrainedConfig(**visual_quantizer_config)
+             self.visual_quantizer_config.feature_decoder_config = PretrainedConfig(
+                 **self.visual_quantizer_config.feature_decoder_config)
+         if visual_decoder_config is not None:
+             self.visual_decoder_config = PretrainedConfig(**visual_decoder_config)
+             self.visual_decoder_config.image_decoder_config = PretrainedConfig(
+                 **getattr(self.visual_decoder_config, "image_decoder_config", {}))
+             self.visual_decoder_config.transformer_config = PretrainedConfig(
+                 **getattr(self.visual_decoder_config, "transformer_config", {}))
+             self.visual_decoder_config.vae_config = PretrainedConfig(
+                 **getattr(self.visual_decoder_config, "vae_config", {}))
+             self.visual_decoder_config.scheduler_config = PretrainedConfig(
+                 **getattr(self.visual_decoder_config, "scheduler_config", {}))
+
+
+ __all__ = ["LongcatConfig"]
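For reference, the head-dimension bookkeeping in `LongcatConfig.__init__` can be exercised without a `transformers` install. This standalone sketch mirrors the defaults above (`qk_nope_head_dim=128`, `qk_rope_head_dim=64`, `head_dim=128`); `head_dims` is a hypothetical helper for illustration, not part of the module:

```python
# Mirrors LongcatConfig.__init__: qk_head_dim is the sum of the no-RoPE and
# RoPE components, while head_dim depends on the attention method.
def head_dims(attention_method="MLA", qk_nope_head_dim=128,
              qk_rope_head_dim=64, head_dim=128):
    qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
    if attention_method == "GQA":
        return qk_head_dim, head_dim          # GQA keeps head_dim verbatim
    if attention_method == "MLA":
        return qk_head_dim, qk_rope_head_dim  # MLA reuses the RoPE dim
    raise ValueError('attention_method should be one of ["GQA", "MLA"]')

print(head_dims())       # (192, 64)
print(head_dims("GQA"))  # (192, 128)
```

This also makes the bug fixed above visible: without the `raise`, an unknown `attention_method` would fall through silently and leave `head_dim` unset.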