xiwenyoumu commited on
Commit
0fda810
·
verified ·
1 Parent(s): d242e45

Add Fast-dDrive: SASD ckpt-7000 weights, modeling code, configs

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: transformers
4
+ pipeline_tag: image-text-to-text
5
+ tags:
6
+ - block-diffusion
7
+ - vision-language-action
8
+ - autonomous-driving
9
+ - qwen2.5-vl
10
+ ---
11
+
12
+ # Fast-dDrive
13
+
14
+ Fast-dDrive is a block-diffusion Vision-Language-Action (VLA) model for
15
+ end-to-end autonomous driving, built on Qwen2.5-VL-3B. It pairs section-aware
16
+ structured-diffusion training (SASD) with scaffold-aware speculative decoding
17
+ (Scaffold Spec) and an optional shared-prefix multi-trajectory inference
18
+ scaling scheme, and reaches SOTA accuracy on the Waymo Open Dataset
19
+ End-to-End Driving (WOD-E2E) benchmark at over 200 tokens / second on a
20
+ single H100.
21
+
22
+ ## Quick start
23
+
24
+ ```python
25
+ import torch
26
+ from transformers import AutoModelForCausalLM, AutoProcessor
27
+
28
+ MODEL = "Efficient-Large-Model/Fast_dDrive_3B" # or your local clone
29
+
30
+ processor = AutoProcessor.from_pretrained(MODEL, trust_remote_code=True)
31
+ model = AutoModelForCausalLM.from_pretrained(
32
+ MODEL,
33
+ trust_remote_code=True,
34
+ dtype=torch.bfloat16,
35
+ ).cuda().eval()
36
+
37
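+ # NOTE: `input_ids`, `attention_mask`, `pixel_values` and `image_grid_thw` are
+ # not defined above; build them with `processor(...)` from your prompt and images.
+ # Scaffold Spec (paper canonical, threshold = 0.0)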
38
+ output_ids = model.scaffold_speculative_sample(
39
+ input_ids=input_ids,
40
+ attention_mask=attention_mask,
41
+ pixel_values=pixel_values,
42
+ image_grid_thw=image_grid_thw,
43
+ confidence_threshold=0.0,
44
+ block_size=32,
45
+ max_new_tokens=512,
46
+ )
47
+ ```
48
+
49
+ ## Inference paths
50
+
51
+ This release exposes three decoding paths as bound methods on the model:
52
+
53
+ | Method | Description | Threshold |
54
+ |---|---|---|
55
+ | `mdm_sample_deep_scaffold` | **Section Diffusion (SD)** — iterative MDM denoising over a pre-filled JSON scaffold | `0.9` |
56
+ | `scaffold_speculative_sample` | **Scaffold Spec (SS)** — scaffold-aware self-speculative decoding (MDM draft + AR verify per block). Paper canonical. | `0.0` |
57
+ | `scaffold_spec_with_ss_multi_traj` | **SS multi-rollout** — shared-prefix N-rollout inference scaling on the trajectory section | `0.0` |
58
+
59
+ > **Important:** `scaffold_speculative_sample` and its multi-traj variant must
60
+ > be run with `confidence_threshold=0.0` to reproduce the paper numbers.
61
+ > Running at `0.9` silently degrades both ADE and throughput.
62
+
63
+ ## Headline results — WOD-E2E test set (single H100)
64
+
65
+ | Mode | RFS ↑ | ADE@3s ↓ | ADE@5s ↓ | TPS ↑ | Tok/Step ↑ |
66
+ |---|---|---|---|---|---|
67
+ | Scaffold Spec | 7.823 | 1.254 | 2.907 | 210.4 | 4.90 |
68
+ | + Inference scaling (N=4) | 7.827 | 1.240 | 2.821 | 114.7 | 2.76 |
69
+
70
+ On the WOD-E2E val set, Scaffold Spec runs at 1919 ms / sample (4.1× over the
71
+ AR baseline); fused with SGLang the same configuration drops to 665 ms /
72
+ sample at 608.5 TPS — the 11.8× / 12× speedup over AR cited in the paper.
73
+
74
+ ## Files
75
+
76
+ - `modeling.py` — model definition (`Fast_dDriveForConditionalGeneration`)
77
+ - `configuration.py` — config classes
78
+ - `section_utils.py` — scaffold construction + section-aligned block index utilities
79
+ - `generation_utils.py` — the three inference paths, attached to the model class on import
80
+ - `config.json`, `generation_config.json`, `preprocessor_config.json`, `chat_template.jinja`, tokenizer files — standard HF artifacts
81
+ - `model-0000{1..4}-of-00004.safetensors` — model weights (4 shards)
82
+
83
+ ## Citation
84
+
85
+ ```bibtex
86
+ @misc{zhang2026fastddriveefficientblockdiffusionvlm,
87
+ title={Fast-dDrive: Efficient Block-Diffusion VLM for Autonomous Driving},
88
+ author={Kewei Zhang and Jin Wang and Sensen Gao and Chengyue Wu and Yulong Cao and Songyang Han and Boris Ivanovic and Langechuan Liu and Marco Pavone and Song Han and Daquan Zhou and Enze Xie},
89
+ year={2026},
90
+ eprint={2605.23163},
91
+ archivePrefix={arXiv},
92
+ primaryClass={cs.CV},
93
+ url={https://arxiv.org/abs/2605.23163},
94
+ }
95
+ ```
__init__.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Fast-dDrive HF release package.
2
+
3
+ Mirrors the layout of ``Efficient-Large-Model/Fast_dVLM_3B`` on the Hugging Face
4
+ Hub: a :mod:`modeling` module that holds the model definition, a
5
+ :mod:`generation_utils` module with the inference-time decoding paths (Section
6
+ Diffusion, Scaffold Spec, and Scaffold Spec with multi-trajectory rollouts),
7
+ and a :mod:`configuration` module with the config classes.
8
+
9
+ Users normally load the model via::
10
+
11
+ from transformers import AutoModelForCausalLM, AutoTokenizer
12
+ model = AutoModelForCausalLM.from_pretrained(
13
+ "Efficient-Large-Model/Fast_dDrive_3B", # or local path
14
+ trust_remote_code=True,
15
+ )
16
+
17
+ with the ``auto_map`` entry in :file:`config.json` pointing back to the classes
18
+ defined here.
19
+ """
20
+
21
+ from .configuration import (
22
+ Fast_dDriveConfig,
23
+ Fast_dDriveTextConfig,
24
+ Fast_dDriveVisionConfig,
25
+ )
26
+ from .modeling import Fast_dDriveForConditionalGeneration
27
+
28
+ __all__ = [
29
+ "Fast_dDriveConfig",
30
+ "Fast_dDriveTextConfig",
31
+ "Fast_dDriveVisionConfig",
32
+ "Fast_dDriveForConditionalGeneration",
33
+ ]
added_tokens.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|NULL|>": 151666,
5
+ "<|box_end|>": 151649,
6
+ "<|box_start|>": 151648,
7
+ "<|endoftext|>": 151643,
8
+ "<|file_sep|>": 151664,
9
+ "<|fim_middle|>": 151660,
10
+ "<|fim_pad|>": 151662,
11
+ "<|fim_prefix|>": 151659,
12
+ "<|fim_suffix|>": 151661,
13
+ "<|im_end|>": 151645,
14
+ "<|im_start|>": 151644,
15
+ "<|image_pad|>": 151655,
16
+ "<|object_ref_end|>": 151647,
17
+ "<|object_ref_start|>": 151646,
18
+ "<|quad_end|>": 151651,
19
+ "<|quad_start|>": 151650,
20
+ "<|repo_name|>": 151663,
21
+ "<|video_pad|>": 151656,
22
+ "<|vision_end|>": 151653,
23
+ "<|vision_pad|>": 151654,
24
+ "<|vision_start|>": 151652,
25
+ "|<MASK>|": 151665
26
+ }
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
config.json ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "always_mask_im_end": true,
3
+ "anneal_block_size": false,
4
+ "architectures": [
5
+ "Fast_dDriveForConditionalGeneration"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "bd_size": 32,
9
+ "block_causal_no_dynamic": false,
10
+ "complementary_mask": true,
11
+ "deep_json_scaffold": true,
12
+ "dtype": "float32",
13
+ "enable_efficient_vision_embed": false,
14
+ "entropy_loss": false,
15
+ "entropy_loss_weight": 1.0,
16
+ "eos_token_id": 151645,
17
+ "flexible_bd_size": false,
18
+ "hidden_act": "silu",
19
+ "hidden_size": 2048,
20
+ "image_token_id": 151655,
21
+ "initializer_range": 0.02,
22
+ "intermediate_size": 11008,
23
+ "max_position_embeddings": 128000,
24
+ "max_window_layers": 70,
25
+ "minimum_noise_level": 0.001,
26
+ "model_type": "fast_d_drive",
27
+ "num_attention_heads": 16,
28
+ "num_hidden_layers": 36,
29
+ "num_key_value_heads": 2,
30
+ "pad_token_id": 151643,
31
+ "rms_norm_eps": 1e-06,
32
+ "rope_scaling": {
33
+ "mrope_section": [
34
+ 16,
35
+ 24,
36
+ 24
37
+ ],
38
+ "rope_type": "default",
39
+ "type": "default"
40
+ },
41
+ "rope_theta": 1000000.0,
42
+ "section_block_steps": null,
43
+ "section_loss_weights": {
44
+ "critical_objects": 1.5,
45
+ "explanation": 1.0,
46
+ "future_meta_behavior": 2.0,
47
+ "trajectory": 3.0
48
+ },
49
+ "section_noise_schedule": {
50
+ "critical_objects": "1.0,2.0",
51
+ "explanation": "1.0,1.0",
52
+ "future_meta_behavior": "1.0,1.5",
53
+ "trajectory": "2.0,1.0"
54
+ },
55
+ "section_token_budgets": null,
56
+ "sliding_window": 32768,
57
+ "static_json_scaffold": false,
58
+ "text_config": {
59
+ "always_mask_im_end": true,
60
+ "anneal_block_size": false,
61
+ "architectures": [
62
+ "Fast_dDriveForConditionalGeneration"
63
+ ],
64
+ "attention_dropout": 0.0,
65
+ "auto_map": {
66
+ "AutoConfig": "configuration.Fast_dDriveConfig",
67
+ "AutoModel": "modeling.Fast_dDriveForConditionalGeneration",
68
+ "AutoModelForCausalLM": "modeling.Fast_dDriveForConditionalGeneration"
69
+ },
70
+ "bd_size": 32,
71
+ "block_causal_no_dynamic": false,
72
+ "block_length": null,
73
+ "bos_token_id": 151643,
74
+ "complementary_mask": true,
75
+ "deep_json_scaffold": true,
76
+ "dtype": "float32",
77
+ "enable_efficient_vision_embed": false,
78
+ "entropy_loss": false,
79
+ "entropy_loss_weight": 1.0,
80
+ "eos_token_id": 151645,
81
+ "flexible_bd_size": false,
82
+ "hidden_act": "silu",
83
+ "hidden_size": 2048,
84
+ "image_token_id": null,
85
+ "initializer_range": 0.02,
86
+ "intermediate_size": 11008,
87
+ "layer_types": [
88
+ "full_attention",
89
+ "full_attention",
90
+ "full_attention",
91
+ "full_attention",
92
+ "full_attention",
93
+ "full_attention",
94
+ "full_attention",
95
+ "full_attention",
96
+ "full_attention",
97
+ "full_attention",
98
+ "full_attention",
99
+ "full_attention",
100
+ "full_attention",
101
+ "full_attention",
102
+ "full_attention",
103
+ "full_attention",
104
+ "full_attention",
105
+ "full_attention",
106
+ "full_attention",
107
+ "full_attention",
108
+ "full_attention",
109
+ "full_attention",
110
+ "full_attention",
111
+ "full_attention",
112
+ "full_attention",
113
+ "full_attention",
114
+ "full_attention",
115
+ "full_attention",
116
+ "full_attention",
117
+ "full_attention",
118
+ "full_attention",
119
+ "full_attention",
120
+ "full_attention",
121
+ "full_attention",
122
+ "full_attention",
123
+ "full_attention"
124
+ ],
125
+ "max_position_embeddings": 128000,
126
+ "max_window_layers": 70,
127
+ "minimum_noise_level": 0.001,
128
+ "model_type": "fast_d_drive_for_causal_lm",
129
+ "num_attention_heads": 16,
130
+ "num_hidden_layers": 36,
131
+ "num_key_value_heads": 2,
132
+ "rms_norm_eps": 1e-06,
133
+ "rope_scaling": {
134
+ "mrope_section": [
135
+ 16,
136
+ 24,
137
+ 24
138
+ ],
139
+ "rope_type": "default",
140
+ "type": "default"
141
+ },
142
+ "rope_theta": 1000000.0,
143
+ "section_block_steps": null,
144
+ "section_token_budgets": null,
145
+ "self_spec_inference_mode": null,
146
+ "sliding_window": null,
147
+ "static_json_scaffold": false,
148
+ "tie_word_embeddings": true,
149
+ "use_block_causal_mask": true,
150
+ "use_cache": true,
151
+ "use_json_scaffold": true,
152
+ "use_sliding_window": false,
153
+ "video_token_id": null,
154
+ "vision_end_token_id": 151653,
155
+ "vision_start_token_id": 151652,
156
+ "vision_token_id": 151654,
157
+ "vocab_size": 151936
158
+ },
159
+ "transformers_version": "4.57.1",
160
+ "use_block_causal_mask": true,
161
+ "use_cache": true,
162
+ "use_json_scaffold": true,
163
+ "use_sliding_window": false,
164
+ "video_token_id": 151656,
165
+ "vision_config": {
166
+ "depth": 32,
167
+ "dtype": "float32",
168
+ "fullatt_block_indexes": [
169
+ 7,
170
+ 15,
171
+ 23,
172
+ 31
173
+ ],
174
+ "hidden_act": "silu",
175
+ "hidden_size": 1280,
176
+ "in_channels": 3,
177
+ "in_chans": 3,
178
+ "initializer_range": 0.02,
179
+ "intermediate_size": 3420,
180
+ "model_type": "fast_d_drive",
181
+ "num_heads": 16,
182
+ "out_hidden_size": 2048,
183
+ "patch_size": 14,
184
+ "spatial_merge_size": 2,
185
+ "spatial_patch_size": 14,
186
+ "temporal_patch_size": 2,
187
+ "tokens_per_second": 2,
188
+ "window_size": 112
189
+ },
190
+ "vision_end_token_id": 151653,
191
+ "vision_start_token_id": 151652,
192
+ "vision_token_id": 151654,
193
+ "vocab_size": 151936,
194
+ "auto_map": {
195
+ "AutoConfig": "configuration.Fast_dDriveConfig",
196
+ "AutoModel": "modeling.Fast_dDriveForConditionalGeneration",
197
+ "AutoModelForCausalLM": "modeling.Fast_dDriveForConditionalGeneration"
198
+ }
199
+ }
configuration.py ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # limitations under the License.
3
+ from transformers.configuration_utils import PretrainedConfig, layer_type_validation
4
+ from transformers.modeling_rope_utils import rope_config_validation
5
+
6
+
7
class Fast_dDriveVisionConfig(PretrainedConfig):
    """Configuration of the Fast-dDrive vision tower.

    Every constructor argument is stored as an attribute of the same name.
    Remaining keyword arguments are forwarded to ``PretrainedConfig``.

    The defaults below are only fallbacks; a checkpoint's ``config.json`` may
    override them (the released one sets, for example, ``hidden_size=1280``,
    ``out_hidden_size=2048`` and ``tokens_per_second=2``).

    Args:
        fullatt_block_indexes: Optional iterable of ints. ``None`` (the
            default) selects ``[7, 15, 23, 31]``. The value is always copied
            into a fresh list so that instances never share (and cannot
            accidentally mutate) a common default list.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "fast_d_drive"
    base_config_key = "vision_config"

    def __init__(
        self,
        depth=32,
        hidden_size=3584,
        hidden_act="silu",
        intermediate_size=3420,
        num_heads=16,
        in_channels=3,
        patch_size=14,
        spatial_merge_size=2,
        temporal_patch_size=2,
        tokens_per_second=4,
        window_size=112,
        out_hidden_size=3584,
        fullatt_block_indexes=None,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.depth = depth
        self.hidden_size = hidden_size
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.num_heads = num_heads
        self.in_channels = in_channels
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.temporal_patch_size = temporal_patch_size
        self.tokens_per_second = tokens_per_second
        self.window_size = window_size
        # A list literal in the signature would be created once and shared by
        # every instance; build a per-instance list instead.
        self.fullatt_block_indexes = (
            [7, 15, 23, 31] if fullatt_block_indexes is None else list(fullatt_block_indexes)
        )
        self.out_hidden_size = out_hidden_size
        self.initializer_range = initializer_range
45
+
46
+
47
class Fast_dDriveTextConfig(PretrainedConfig):
    """Configuration of the Fast-dDrive language-model backbone.

    Constructor arguments are stored as same-named attributes, with these
    exceptions and derived values:

    * ``sliding_window`` is kept only when ``use_sliding_window`` is true,
      otherwise it is stored as ``None``.
    * ``num_key_value_heads=None`` falls back to ``num_attention_heads``.
    * ``layer_types=None`` is expanded to one entry per hidden layer
      (``"sliding_attention"`` for layers at or beyond ``max_window_layers``
      when a sliding window is active, ``"full_attention"`` otherwise).
    * A legacy ``rope_scaling["type"]`` key is mirrored into
      ``rope_scaling["rope_type"]`` (``"mrope"`` is rewritten to
      ``"default"``). The dict passed by the caller is modified in place.

    ``bd_size``, ``self_spec_inference_mode``, ``block_length``,
    ``use_block_causal_mask``, ``complementary_mask``, ``minimum_noise_level``,
    ``entropy_loss``, ``entropy_loss_weight`` and ``block_causal_no_dynamic``
    are stored unchanged; their consumers are not part of this module.

    ``tie_word_embeddings`` and any remaining keyword arguments are forwarded
    to ``PretrainedConfig.__init__``.
    """

    model_type = "fast_d_drive_for_causal_lm"
    base_config_key = "text_config"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Per-module sharding styles, keyed by parameter-name pattern.
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    # Stage name -> (input names, output names).
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=152064,
        hidden_size=8192,
        intermediate_size=29568,
        num_hidden_layers=80,
        num_attention_heads=64,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-05,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=1000000.0,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=80,
        layer_types=None,
        attention_dropout=0.0,
        rope_scaling=None,
        image_token_id=None,
        video_token_id=None,
        bd_size=8,
        self_spec_inference_mode=None,
        block_length=None,
        use_block_causal_mask=False,
        complementary_mask=True,
        minimum_noise_level=1e-3,
        entropy_loss=False,
        entropy_loss_weight=1.0,
        block_causal_no_dynamic=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        # The window size is discarded entirely when sliding-window attention is off.
        self.sliding_window = sliding_window if self.use_sliding_window else None
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.rope_scaling = rope_scaling
        self.bd_size = bd_size
        self.layer_types = layer_types
        self.use_block_causal_mask = use_block_causal_mask
        self.complementary_mask = complementary_mask
        self.minimum_noise_level = minimum_noise_level
        self.entropy_loss = entropy_loss
        self.entropy_loss_weight = entropy_loss_weight
        self.block_causal_no_dynamic = block_causal_no_dynamic
        self.self_spec_inference_mode = self_spec_inference_mode
        self.block_length = block_length
        # Derive one attention type per layer when the caller did not supply them.
        if self.layer_types is None:
            self.layer_types = [
                "sliding_attention"
                if self.sliding_window is not None and i >= self.max_window_layers
                else "full_attention"
                for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)

        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, move it to 'rope_type'.
        # and change type from 'mrope' to 'default' because `mrope` does default RoPE calculations
        # one can set it to "linear"/"dynamic" etc. to have scaled RoPE
        # TODO: @raushan update config in the hub
        # NOTE: this writes into the caller-supplied ``rope_scaling`` dict.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            if self.rope_scaling["type"] == "mrope":
                self.rope_scaling["type"] = "default"
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self, ignore_keys={"mrope_section"})
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        # The base initializer runs last, after all attributes above are set.
        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
155
+
156
+
157
class Fast_dDriveConfig(PretrainedConfig):
    """Top-level Fast-dDrive configuration combining text and vision sub-configs.

    Args:
        text_config: ``dict`` of ``Fast_dDriveTextConfig`` arguments, an
            already-built config object, or ``None``. With ``None`` the text
            config is built from ``**kwargs`` (backward-compatible flat layout).
        vision_config: ``dict`` of ``Fast_dDriveVisionConfig`` arguments, an
            already-built config object, or ``None`` (all defaults).
        image_token_id: Stored as ``self.image_token_id``.
        video_token_id: Stored as ``self.video_token_id``.
        enable_efficient_vision_embed: Stored unchanged.
        always_mask_im_end: Stored unchanged.
        flexible_bd_size: Stored unchanged.
        anneal_block_size: Stored unchanged.
        **kwargs: Forwarded to ``PretrainedConfig.__init__`` (and also to the
            text config when ``text_config`` is ``None``).
    """

    model_type = "fast_d_drive"
    sub_configs = {"vision_config": Fast_dDriveVisionConfig, "text_config": Fast_dDriveTextConfig}
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        image_token_id=151655,
        video_token_id=151656,
        enable_efficient_vision_embed=False,
        always_mask_im_end=False,
        flexible_bd_size=False,
        anneal_block_size=False,
        **kwargs,
    ):
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"]()
        else:
            # An already-built config object: keep it as-is. Without this
            # branch the attribute was never set for this input.
            self.vision_config = vision_config

        if isinstance(text_config, dict):
            self.text_config = self.sub_configs["text_config"](**text_config)
        elif text_config is None:
            # For BC use all kwargs to init `TextConfig`
            self.text_config = self.sub_configs["text_config"](**kwargs)
        else:
            # An already-built config object: keep it as-is (see above).
            self.text_config = text_config

        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.enable_efficient_vision_embed = enable_efficient_vision_embed
        self.always_mask_im_end = always_mask_im_end
        self.flexible_bd_size = flexible_bd_size
        self.anneal_block_size = anneal_block_size

        super().__init__(**kwargs)
194
+
195
+ # def to_dict(self):
196
+ # output = super().to_dict()
197
+ # output.pop("auto_map", None)
198
+ # return output
199
+
200
+
201
# Public API of this module. The vision config is listed too because the
# package ``__init__`` imports it from here and re-exports it.
__all__ = ["Fast_dDriveConfig", "Fast_dDriveTextConfig", "Fast_dDriveVisionConfig"]
generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "repetition_penalty": 1.05,
9
+ "temperature": 1e-06,
10
+ "transformers_version": "4.57.1"
11
+ }
generation_utils.py ADDED
@@ -0,0 +1,1192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generation utilities for Fast-dDrive.
2
+
3
+ This module provides the three inference paths exposed by the canonical paper
4
+ release:
5
+
6
+ * ``mdm_sample_deep_scaffold`` — Section Diffusion (SD): iterative MDM
7
+ denoising over a pre-filled JSON scaffold, no AR verification.
8
+ * ``scaffold_speculative_sample`` — Scaffold Spec (SS): scaffold-aware
9
+ self-speculative decoding (MDM draft + AR verify per block).
10
+ * ``scaffold_spec_with_ss_multi_traj`` — SS with shared-prefix multi-trajectory
11
+ rollouts (the test-time inference-scaling path).
12
+
13
+ All three are attached as bound methods on
14
+ :class:`Fast_dDriveForConditionalGeneration` when this module is
15
+ imported (see ``modeling.py`` for the import hook).
16
+ """
17
+
18
+ import os
19
+ import re
20
+ import sys
21
+ import math
22
+ import torch
23
+ import types
24
+ import numpy as np
25
+ from transformers.cache_utils import DynamicCache
26
+
27
+
28
def _crop_cache(past_key_values, max_length: int):
    """Return a new ``DynamicCache`` holding only the first ``max_length`` positions.

    Each per-layer entry of ``past_key_values`` is indexed element by element
    and sliced along its third dimension (``[:, :, :max_length, :]``); the
    sliced tuples are then wrapped in a fresh ``DynamicCache``.
    """
    cropped_layers = []
    for layer_idx in range(len(past_key_values)):
        layer_entry = past_key_values[layer_idx]
        cropped_layers.append(
            tuple(
                layer_entry[slot][:, :, :max_length, :]
                for slot in range(len(layer_entry))
            )
        )
    return DynamicCache(cropped_layers)
37
+
38
+
39
def _sample_from_logits(logits, temperature=0.0):
    """Pick token ids from ``logits`` along the last dimension.

    Args:
        logits: Tensor whose last dimension indexes the candidates.
        temperature: ``<= 0`` selects the argmax (greedy). Any positive value
            divides the logits by it and draws one sample per position from
            the resulting softmax distribution.

    Returns:
        Integer tensor with the shape of ``logits`` minus its last dimension.
    """
    if temperature <= 0:
        return logits.argmax(dim=-1)
    # Scale in float32: in half precision, dividing by a small temperature can
    # overflow to inf, which turns the softmax into NaN and makes
    # ``torch.multinomial`` fail.
    scaled = logits.float() / temperature
    probs = torch.softmax(scaled, dim=-1)
    original_shape = probs.shape[:-1]
    # ``torch.multinomial`` accepts at most 2-D input, so flatten leading dims.
    flat_probs = probs.reshape(-1, probs.shape[-1])
    sampled = torch.multinomial(flat_probs, num_samples=1).squeeze(-1)
    return sampled.reshape(original_shape)
52
+
53
+
54
+ # ---------------------------------------------------------------------------
55
+ # mdm_sample_deep_scaffold — Section Diffusion (SD)
56
+ # ---------------------------------------------------------------------------
57
+
58
+ def mdm_sample_deep_scaffold(
59
+ self,
60
+ input_ids,
61
+ tokenizer,
62
+ max_tokens=512,
63
+ pixel_values=None,
64
+ image_grid_thw=None,
65
+ mask_id=151665,
66
+ null_id=151666,
67
+ threshold=0.9,
68
+ stop_token=151645,
69
+ explanation_block_size=32,
70
+ explanation_max_blocks=6,
71
+ block_size=32,
72
+ return_stats=False,
73
+ use_kv_cache=True,
74
+ temperature=0.0,
75
+ ):
76
+ """
77
+ Deep scaffold MDM generation with train-consistent hybrid block causal mask.
78
+
79
+ Pre-fills the entire JSON scaffold (including sub-keys for critical_objects,
80
+ future_meta_behavior, trajectory) with MASK tokens at value positions only.
81
+ Then denoises each section's value tokens via iterative unmasking.
82
+
83
+ The attention mask matches training: prompt tokens use causal attention,
84
+ response tokens use block-causal attention where each section's denoise
85
+ steps form separate blocks. Block i can see all prompt tokens and blocks
86
+ 0..i, but NOT blocks i+1..N (which still contain MASK tokens).
87
+
88
+ For explanation (variable length), NULL tokens in the output signal that
89
+ the section content is complete — trailing NULLs are stripped.
90
+
91
+ KV-cache path (``use_kv_cache=True``, default):
92
+ Prompt K/V is computed once with vision embedding scatter, then each
93
+ response block, once fully denoised, gets its K/V appended to the
94
+ cache. Subsequent blocks' iterative unmasking only forwards their
95
+ own ~block_size tokens against the cache (plus prior committed
96
+ blocks), avoiding O(seqlen^2) recomputation of the prompt + prior
97
+ blocks every iteration. Correctness is preserved because
98
+ block-causal attention means block k only attends to prompt +
99
+ blocks 0..k, which is exactly what the cache provides.
100
+ """
101
+ import math
102
+ import os as _os
103
+ from .section_utils import (
104
+ build_deep_json_scaffold,
105
+ SECTION_KEYS,
106
+ NULL_TOKEN_ID,
107
+ )
108
+
109
+ # Env override for A/B testing the KV cache path without editing code.
110
+ _kv_env = _os.environ.get("MDM_DS_USE_KV_CACHE")
111
+ if _kv_env is not None:
112
+ use_kv_cache = _kv_env not in ("0", "false", "False", "")
113
+
114
+ scaffold_tokens, section_ranges, scaffold_mask_list = build_deep_json_scaffold(
115
+ tokenizer,
116
+ mask_id=mask_id,
117
+ null_id=null_id,
118
+ explanation_block_size=explanation_block_size,
119
+ explanation_max_blocks=explanation_max_blocks,
120
+ )
121
+
122
+ tokens_per_step = []
123
+ original_input_length = input_ids.shape[1]
124
+
125
+ # Phase 1: Build sequence with scaffold appended
126
+ scaffold_tensor = torch.tensor(scaffold_tokens, device=self.device, dtype=torch.long).unsqueeze(0)
127
+ x_t = torch.cat([input_ids, scaffold_tensor], dim=1)
128
+ seqlen = x_t.shape[1]
129
+
130
+ # Track scaffold (frozen) vs value (to denoise) positions in scaffold region
131
+ scaffold_frozen = torch.tensor(scaffold_mask_list, device=self.device, dtype=torch.bool)
132
+
133
+ # ── Build response_block_idx matching training's compute_section_block_idx_deep_static ──
134
+ response_block_idx = torch.full((seqlen,), -1, device=self.device, dtype=torch.long)
135
+ current_block = 0
136
+ assigned = set()
137
+
138
+ for section_name in SECTION_KEYS:
139
+ if section_name not in section_ranges:
140
+ continue
141
+ sec_start, sec_end = section_ranges[section_name]
142
+
143
+ # Find value positions (non-scaffold) in this section
144
+ value_positions = []
145
+ for i in range(sec_start, sec_end):
146
+ if not scaffold_mask_list[i]: # 0 = value token
147
+ value_positions.append(original_input_length + i)
148
+
149
+ if not value_positions:
150
+ current_block += 1
151
+ continue
152
+
153
+ # Block assignment MUST match training's
154
+ # compute_section_block_idx_deep_static: n_blocks = ceil(value/block_size)
155
+ # for every section. Previously non-explanation sections were forced to
156
+ # a single block; that broke attention alignment for trajectory
157
+ # (70 value tokens → training 3 blocks vs inference 1 block), causing
158
+ # trajectory over-extrapolation. CO (12) and FMB (6) still resolve
159
+ # to 1 block since their value counts are < block_size.
160
+ tokens_per_step_sec = block_size
161
+ n_steps = max(1, math.ceil(len(value_positions) / tokens_per_step_sec))
162
+
163
+ # Assign block indices to value tokens
164
+ for vi, abs_pos in enumerate(value_positions):
165
+ block_in_section = min(vi // tokens_per_step_sec, n_steps - 1)
166
+ response_block_idx[abs_pos] = current_block + block_in_section
167
+ assigned.add(abs_pos)
168
+
169
+ # Assign scaffold tokens to nearest value token's block
170
+ for i in range(sec_start, sec_end):
171
+ abs_pos = original_input_length + i
172
+ if scaffold_mask_list[i] and abs_pos not in assigned:
173
+ best_block = -1
174
+ for delta in range(1, sec_end - sec_start + 10):
175
+ for cand in [abs_pos + delta, abs_pos - delta]:
176
+ if cand in assigned:
177
+ best_block = response_block_idx[cand].item()
178
+ break
179
+ if best_block >= 0:
180
+ break
181
+ if best_block >= 0:
182
+ response_block_idx[abs_pos] = best_block
183
+ assigned.add(abs_pos)
184
+
185
+ current_block += n_steps
186
+
187
+ # Assign any remaining unassigned scaffold tokens (e.g. top-level separators)
188
+ for i in range(len(scaffold_tokens)):
189
+ abs_pos = original_input_length + i
190
+ if abs_pos not in assigned:
191
+ # Find nearest assigned position
192
+ best_block = -1
193
+ for delta in range(1, seqlen):
194
+ for cand in [abs_pos + delta, abs_pos - delta]:
195
+ if 0 <= cand < seqlen and cand in assigned:
196
+ best_block = response_block_idx[cand].item()
197
+ break
198
+ if best_block >= 0:
199
+ break
200
+ if best_block >= 0:
201
+ response_block_idx[abs_pos] = best_block
202
+ assigned.add(abs_pos)
203
+
204
+ # ── Build hybrid block causal mask (computed once, reused for all forward passes) ──
205
+ attention_mask = self.model.eval_hybrid_mask(seqlen, response_block_idx).to(self.device)
206
+
207
+ # Section-MoE-LoRA: set section_ids before language model forward
208
+ set_section_ids = lambda *a, **kw: None # noqa: E731 (Section-MoE-LoRA disabled in release)
209
+ # Map block indices to section IDs (0=CO, 1=Exp, 2=FMB, 3=Traj, 4=Other/Prompt)
210
+ _sec_ids = torch.full((seqlen,), 4, device=self.device, dtype=torch.long)
211
+ for section_name, (sec_start, sec_end) in section_ranges.items():
212
+ abs_start = original_input_length + sec_start
213
+ abs_end = original_input_length + sec_end
214
+ if section_name == "critical_objects":
215
+ _sec_ids[abs_start:abs_end] = 0
216
+ elif section_name == "explanation":
217
+ _sec_ids[abs_start:abs_end] = 1
218
+ elif section_name == "future_meta_behavior":
219
+ _sec_ids[abs_start:abs_end] = 2
220
+ elif section_name == "trajectory":
221
+ _sec_ids[abs_start:abs_end] = 3
222
+
223
+ # Add batch dimension
224
+ _sec_ids_batch = _sec_ids.unsqueeze(0)
225
+ set_section_ids(_sec_ids_batch)
226
+
227
+ # ── Precompute vision embeddings and position_ids once ──
228
+ # BUG FIX: Previously pixel_values was only passed on the first forward
229
+ # (step==0) but with use_cache=False every forward is independent, so all
230
+ # subsequent forwards lost vision information entirely.
231
+ _embed_fn = self.model.get_input_embeddings()
232
+ _cached_image_embeds = None
233
+ _cached_image_mask = None
234
+
235
+ if pixel_values is not None:
236
+ _cached_image_embeds = self.model.get_image_features(pixel_values, image_grid_thw)
237
+ _cached_image_embeds = torch.cat(_cached_image_embeds, dim=0).to(
238
+ self.device, _embed_fn.weight.dtype
239
+ )
240
+ _tmp_embeds = _embed_fn(x_t)
241
+ _cached_image_mask, _ = self.model.get_placeholder_mask(
242
+ x_t, inputs_embeds=_tmp_embeds, image_features=_cached_image_embeds
243
+ )
244
+
245
+ # Compute position_ids once with correct image_grid_thw (3D RoPE)
246
+ _position_ids, _rope_deltas = self.model.get_rope_index(
247
+ x_t, image_grid_thw, None
248
+ )
249
+ self.model.rope_deltas = _rope_deltas
250
+
251
+ # ── Compute contiguous block ranges in the response region ──
252
+ # Each block's absolute [start, end) range in x_t is the maximal
253
+ # contiguous span of positions sharing the same response_block_idx.
254
+ # Blocks are ordered by block_idx and cover the entire response.
255
+ _block_ranges = [] # list of (block_idx, abs_start, abs_end)
256
+ _cur_bi = None
257
+ _cur_start = None
258
+ for _p in range(seqlen):
259
+ _bi = int(response_block_idx[_p].item())
260
+ if _bi < 0:
261
+ if _cur_bi is not None:
262
+ _block_ranges.append((_cur_bi, _cur_start, _p))
263
+ _cur_bi, _cur_start = None, None
264
+ continue
265
+ if _cur_bi is None:
266
+ _cur_bi, _cur_start = _bi, _p
267
+ elif _bi != _cur_bi:
268
+ _block_ranges.append((_cur_bi, _cur_start, _p))
269
+ _cur_bi, _cur_start = _bi, _p
270
+ if _cur_bi is not None:
271
+ _block_ranges.append((_cur_bi, _cur_start, seqlen))
272
+
273
+ # Map block_idx -> section_name for downstream logic (section-specific
274
+ # behaviors like explanation NULL handling can still be scoped).
275
+ _block_idx_to_section = {}
276
+ for _sname, (_sstart, _send) in section_ranges.items():
277
+ _sabs_start = original_input_length + _sstart
278
+ _sabs_end = original_input_length + _send
279
+ for _bi, _bs, _be in _block_ranges:
280
+ # Assign section by whether the block's range overlaps the section
281
+ if _bs < _sabs_end and _be > _sabs_start:
282
+ _block_idx_to_section.setdefault(_bi, _sname)
283
+
284
+ # ── Phase 2: Denoise block-by-block with optional KV cache ──
285
+ # Without cache (fallback): each forward replays the entire sequence.
286
+ # With cache: prompt K/V computed once; each block's finalized K/V is
287
+ # appended after denoising, so later blocks only forward their own
288
+ # ~block_size tokens against the cache.
289
+ step = 0
290
+
291
+ past_kv = None
292
+ prev_last_logit = None # logit at the position just before the next block
293
+
294
+ if use_kv_cache:
295
+ # Phase 0: prompt prefill. Includes vision scatter; cache becomes
296
+ # the reusable foundation for every scaffold block.
297
+ prompt_tokens = x_t[:, :original_input_length]
298
+ prompt_embeds = _embed_fn(prompt_tokens)
299
+ if _cached_image_embeds is not None:
300
+ prompt_image_mask = _cached_image_mask[:, :original_input_length]
301
+ prompt_embeds = prompt_embeds.masked_scatter(
302
+ prompt_image_mask, _cached_image_embeds
303
+ )
304
+ prompt_position_ids = _position_ids[..., :original_input_length]
305
+
306
+ # Causal over prompt (matches training's prompt-side attention).
307
+ # When attention_mask=None, the model's eval_mask auto-builds causal
308
+ # because use_block_causal_mask=True and update_kv_cache=True.
309
+ prompt_out = self.forward(
310
+ inputs_embeds=prompt_embeds,
311
+ position_ids=prompt_position_ids,
312
+ attention_mask=None,
313
+ past_key_values=None,
314
+ use_cache=True,
315
+ update_kv_cache=True,
316
+ )
317
+ past_kv = prompt_out.past_key_values
318
+ # Logit at position (original_input_length - 1); used to predict
319
+ # the first token of the first response block via causal shift.
320
+ prev_last_logit = prompt_out.logits[:, -1:, :]
321
+
322
+ # ── Iterate blocks in order ──
323
+ for _block_idx, block_abs_start, block_abs_end in _block_ranges:
324
+ B = block_abs_end - block_abs_start
325
+ section_name = _block_idx_to_section.get(_block_idx, None)
326
+
327
+ # Count MASK tokens in this block
328
+ block_slice = x_t[0, block_abs_start:block_abs_end]
329
+ n_masks_in_block = int((block_slice == mask_id).sum().item())
330
+
331
+ # ── Iterative unmasking within this block (if any MASKs) ──
332
+ if n_masks_in_block > 0:
333
+ max_iter = n_masks_in_block + 5 # safety limit
334
+ for _ in range(max_iter):
335
+ current_block_masks = (x_t[:, block_abs_start:block_abs_end] == mask_id)
336
+ if current_block_masks.sum() == 0:
337
+ break
338
+
339
+ if use_kv_cache:
340
+ # Feed only this block; past_kv covers prompt + prior blocks.
341
+ block_tokens = x_t[:, block_abs_start:block_abs_end]
342
+ block_embeds = _embed_fn(block_tokens)
343
+ block_position_ids = _position_ids[..., block_abs_start:block_abs_end]
344
+ L_cached = past_kv.get_seq_length() if past_kv is not None else 0
345
+ # Block-causal + bidirectional-within-block ⇒ this
346
+ # block's queries attend to all cached KV plus all
347
+ # fresh block KV ⇒ all-True mask of shape [B, L+B].
348
+ block_attn = torch.ones(
349
+ B, L_cached + B, device=self.device, dtype=torch.bool
350
+ )
351
+ output = self.forward(
352
+ inputs_embeds=block_embeds,
353
+ attention_mask=block_attn,
354
+ position_ids=block_position_ids,
355
+ past_key_values=past_kv,
356
+ use_cache=True,
357
+ update_kv_cache=False, # read-only during iteration
358
+ )
359
+ logits = output.logits # [1, B, V]
360
+ # Shift: pred for abs_pos uses logit at abs_pos-1.
361
+ # logit at block_abs_start-1 is prev_last_logit; the
362
+ # rest come from this forward's earlier positions.
363
+ sec_logits = torch.cat([prev_last_logit, logits[:, :-1, :]], dim=1)
364
+ else:
365
+ # Full-sequence forward (fallback path, same as before)
366
+ _cur_embeds = _embed_fn(x_t)
367
+ if _cached_image_embeds is not None:
368
+ _cur_embeds = _cur_embeds.masked_scatter(
369
+ _cached_image_mask, _cached_image_embeds
370
+ )
371
+ output = self.forward(
372
+ input_ids=x_t,
373
+ inputs_embeds=_cur_embeds,
374
+ attention_mask=attention_mask,
375
+ position_ids=_position_ids,
376
+ use_cache=False,
377
+ )
378
+ logits = output.logits
379
+ sec_logits = logits[:, block_abs_start:block_abs_end, :]
380
+ sec_logits = torch.cat(
381
+ [logits[:, block_abs_start - 1:block_abs_start, :],
382
+ sec_logits[:, :-1, :]], dim=1
383
+ )
384
+
385
+ if temperature > 0:
386
+ # Temperature sampling for diverse generation (e.g. GRPO rollouts)
387
+ sampling_probs = torch.softmax(sec_logits / temperature, dim=-1)
388
+ x_1 = torch.multinomial(
389
+ sampling_probs.view(-1, sampling_probs.shape[-1]), num_samples=1
390
+ ).view(sampling_probs.shape[:-1])
391
+ else:
392
+ # Greedy (default, backward compatible)
393
+ x_1 = sec_logits.argmax(dim=-1)
394
+ probs = torch.softmax(sec_logits, dim=-1)
395
+ x1_p = torch.gather(probs, dim=-1, index=x_1.unsqueeze(-1)).squeeze(-1)
396
+
397
+ # Only consider currently-masked positions in this block
398
+ x1_p = torch.where(current_block_masks, x1_p, -torch.inf)
399
+ unmask_idx = (x1_p > threshold)
400
+
401
+ if unmask_idx.sum() > 0:
402
+ x_t[:, block_abs_start:block_abs_end][unmask_idx] = x_1[unmask_idx]
403
+ tokens_per_step.append(int(unmask_idx.sum()))
404
+ else:
405
+ # Fallback: unmask highest-confidence token
406
+ pos = x1_p.argmax()
407
+ row = 0
408
+ col = pos.item()
409
+ x_t[:, block_abs_start:block_abs_end][row, col] = x_1[row, col]
410
+ tokens_per_step.append(1)
411
+
412
+ step += 1
413
+ if step > max_tokens:
414
+ break
415
+
416
+ # ── Commit this block's K/V to the cache ──
417
+ # Run one final forward at block's fully-denoised state with
418
+ # update_kv_cache=True so future blocks can attend to it via cache.
419
+ # prev_last_logit is refreshed to the logit at the last position
420
+ # of this block for the NEXT block's first-position prediction.
421
+ if use_kv_cache:
422
+ block_tokens = x_t[:, block_abs_start:block_abs_end]
423
+ block_embeds = _embed_fn(block_tokens)
424
+ block_position_ids = _position_ids[..., block_abs_start:block_abs_end]
425
+ L_cached = past_kv.get_seq_length() if past_kv is not None else 0
426
+ block_attn = torch.ones(
427
+ B, L_cached + B, device=self.device, dtype=torch.bool
428
+ )
429
+ commit_out = self.forward(
430
+ inputs_embeds=block_embeds,
431
+ attention_mask=block_attn,
432
+ position_ids=block_position_ids,
433
+ past_key_values=past_kv,
434
+ use_cache=True,
435
+ update_kv_cache=True,
436
+ )
437
+ past_kv = commit_out.past_key_values
438
+ prev_last_logit = commit_out.logits[:, -1:, :]
439
+
440
+ # NOTE: a previous null_ratio>0.3 early-stopping heuristic was
441
+ # removed. It computed the ratio globally across the whole
442
+ # explanation and, when tripped, force-filled every remaining
443
+ # MASK with NULL — including MASKs in middle positions that
444
+ # should have held real text — which cut short explanations
445
+ # mid-sentence. Training always produces 192 value tokens
446
+ # (real text + <|NULL|> padding at the tail) and the model
447
+ # learned to emit NULL cleanly at the tail, so the final
448
+ # NULL-strip below is sufficient. Cost: every sample now
449
+ # denoises all 6 explanation blocks.
450
+
451
+ # Post-process: strip NULL tokens from the output
452
+ gen_tokens = x_t[0, original_input_length:].tolist()
453
+ cleaned = [t for t in gen_tokens if t != null_id and t != mask_id]
454
+ x_t = torch.cat([
455
+ input_ids,
456
+ torch.tensor([cleaned], device=self.device, dtype=torch.long)
457
+ ], dim=1)
458
+
459
+ gen_length = x_t.shape[1] - original_input_length
460
+
461
+ if return_stats:
462
+ stats = {
463
+ "tokens_per_step": tokens_per_step,
464
+ "total_steps": step,
465
+ "gen_length": gen_length,
466
+ "null_tokens_stripped": len(gen_tokens) - len(cleaned),
467
+ "block_size": block_size,
468
+ }
469
+ return x_t, stats
470
+ return x_t
471
+
472
+ @torch.no_grad()
473
+
474
+
475
+ # ---------------------------------------------------------------------------
476
+ # scaffold_speculative_sample — Scaffold Spec (SS)
477
+ # ---------------------------------------------------------------------------
478
+
479
+ def scaffold_speculative_sample(
480
+ self,
481
+ input_ids,
482
+ tokenizer,
483
+ block_size=32,
484
+ max_tokens=1024,
485
+ pixel_values=None,
486
+ image_grid_thw=None,
487
+ mask_id=151665,
488
+ null_id=151666,
489
+ threshold=0.9,
490
+ stop_token=151645,
491
+ explanation_block_size=32,
492
+ explanation_max_blocks=6,
493
+ return_stats=False,
494
+ draft_temperature=0.0,
495
+ verify_temperature=0.0,
496
+ ):
497
+ """
498
+ Scaffold-aware self-speculative decoding.
499
+
500
+ Minimal modification of standard self-spec
501
+ (speculative_block_causal_sample_cache): scaffold (structural JSON)
502
+ tokens are pre-filled in the draft block instead of MASK and
503
+ auto-accepted during causal verification.
504
+
505
+ Key design: uses *exactly the same* attention patterns as standard
506
+ self-spec (block-diff for draft, **causal** for verify via
507
+ auto eval_mask). Only the draft block content differs — scaffold
508
+ positions carry known tokens instead of MASK, giving the draft
509
+ better context while scaffold tokens are "free" during acceptance.
510
+ """
511
+ from .section_utils import (
512
+ build_deep_json_scaffold,
513
+ NULL_TOKEN_ID,
514
+ )
515
+
516
+ scaffold_tokens, section_ranges, scaffold_mask_list = build_deep_json_scaffold(
517
+ tokenizer,
518
+ mask_id=mask_id,
519
+ null_id=null_id,
520
+ explanation_block_size=explanation_block_size,
521
+ explanation_max_blocks=explanation_max_blocks,
522
+ )
523
+
524
+ scaffold_len = len(scaffold_tokens)
525
+ original_input_length = input_ids.shape[1]
526
+ tokens_per_step = []
527
+ self.model.bd_size = block_size
528
+
529
+ _ss_profile = bool(os.environ.get("SS_PROFILE"))
530
+ _ss_traj_start = section_ranges.get("trajectory", (None, None))[0]
531
+ if _ss_profile:
532
+ import time as _time
533
+ torch.cuda.synchronize()
534
+ _ss_t = {"start": _time.perf_counter()}
535
+ _ss_marked_traj_start = False
536
+ _ss_n_fwd_prefix = 0
537
+ _ss_n_fwd_traj = 0
538
+
539
+ # Pre-convert to tensors for vectorized operations in the loop
540
+ scaffold_tok_t = torch.tensor(
541
+ scaffold_tokens, device=self.device, dtype=torch.long
542
+ )
543
+ scaffold_is_fixed = torch.tensor(
544
+ scaffold_mask_list, device=self.device, dtype=torch.bool
545
+ )
546
+
547
+ # ── Phase 1: Prefill prompt (identical to standard self-spec) ──
548
+ output = self.forward(
549
+ input_ids=input_ids,
550
+ pixel_values=pixel_values,
551
+ image_grid_thw=image_grid_thw,
552
+ use_cache=True,
553
+ update_kv_cache=True,
554
+ )
555
+ logits, past_key_values = output.logits, output.past_key_values
556
+ if _ss_profile:
557
+ torch.cuda.synchronize()
558
+ _ss_t["after_prefill"] = _time.perf_counter()
559
+
560
+ # First token — use scaffold token (always '{')
561
+ next_token = torch.tensor(
562
+ [[scaffold_tokens[0]]], device=self.device, dtype=torch.long
563
+ )
564
+ input_ids = torch.cat([input_ids, next_token], dim=1)
565
+ tokens_per_step.append(1)
566
+ scaffold_cursor = 1
567
+ step = 1
568
+
569
+ # ── Phase 2: Self-speculative decoding loop ──
570
+ # Follows the exact same structure as
571
+ # speculative_block_causal_sample_cache, with scaffold-aware draft.
572
+ while scaffold_cursor < scaffold_len:
573
+ if _ss_profile and (not _ss_marked_traj_start) and (
574
+ _ss_traj_start is not None and scaffold_cursor >= _ss_traj_start
575
+ ):
576
+ torch.cuda.synchronize()
577
+ _ss_t["enter_traj"] = _time.perf_counter()
578
+ _ss_marked_traj_start = True
579
+ prompt_length = input_ids.shape[1]
580
+ n_draft = min(block_size - 1, scaffold_len - scaffold_cursor)
581
+
582
+ # Build draft block: [seed, scaffold_or_MASK × n_draft]
583
+ sc_end = scaffold_cursor + n_draft
584
+ is_fixed = scaffold_is_fixed[scaffold_cursor:sc_end]
585
+ draft_tensor = torch.where(
586
+ is_fixed,
587
+ scaffold_tok_t[scaffold_cursor:sc_end],
588
+ mask_id,
589
+ ).unsqueeze(0)
590
+ x_t = torch.cat([input_ids[:, -1:], draft_tensor], dim=1)
591
+ mask_idx = (x_t == mask_id)
592
+
593
+ # ── Draft (block-diff bidirectional via auto eval_mask) ──
594
+ logits = self.forward(
595
+ input_ids=x_t,
596
+ use_cache=True,
597
+ past_key_values=past_key_values,
598
+ update_kv_cache=False,
599
+ eval_bd_size=block_size,
600
+ ).logits
601
+ tokens_per_step.append(0)
602
+ step += 1
603
+
604
+ # Shift logits (same as standard self-spec)
605
+ logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1)
606
+ if draft_temperature > 0:
607
+ # Temperature sampling for draft diversity
608
+ scaled = logits / draft_temperature
609
+ draft_probs = torch.softmax(scaled, dim=-1)
610
+ x_1 = torch.multinomial(
611
+ draft_probs.view(-1, draft_probs.shape[-1]), num_samples=1
612
+ ).view(draft_probs.shape[:-1])
613
+ # Confidence uses unscaled probs for thresholding
614
+ probs = torch.softmax(logits, dim=-1)
615
+ x1_p = torch.gather(
616
+ probs, dim=-1, index=x_1.unsqueeze(-1)
617
+ ).squeeze(-1)
618
+ else:
619
+ x_1 = logits.argmax(dim=-1)
620
+ probs = torch.softmax(logits, dim=-1)
621
+ x1_p = torch.gather(
622
+ probs, dim=-1, index=x_1.unsqueeze(-1)
623
+ ).squeeze(-1)
624
+
625
+ # Only fill MASK positions; scaffold positions keep their tokens
626
+ x1_p = torch.where(mask_idx, x1_p, -torch.inf)
627
+ unmask_idx = (x1_p > 0) # threshold=0 for draft filling
628
+
629
+ if unmask_idx.sum() > 0:
630
+ x_t[unmask_idx] = x_1[unmask_idx]
631
+ else:
632
+ # Fallback: fill most confident MASK
633
+ mask_only_p = x1_p.clone()
634
+ mask_only_p[~mask_idx] = -torch.inf
635
+ if mask_only_p.max() > -torch.inf:
636
+ best = mask_only_p.argmax()
637
+ x_t.view(-1)[best] = x_1.view(-1)[best]
638
+
639
+ # ── Verify (causal via auto eval_mask, commit to cache) ──
640
+ output = self.forward(
641
+ input_ids=x_t,
642
+ use_cache=True,
643
+ past_key_values=past_key_values,
644
+ update_kv_cache=True,
645
+ eval_bd_size=block_size,
646
+ )
647
+ past_key_values = output.past_key_values
648
+ if verify_temperature > 0:
649
+ verify_logits = output.logits / verify_temperature
650
+ verify_probs = torch.softmax(verify_logits, dim=-1)
651
+ ar_block_token = torch.multinomial(
652
+ verify_probs.view(-1, verify_probs.shape[-1]), num_samples=1
653
+ ).view(verify_probs.shape[:-1])
654
+ else:
655
+ ar_block_token = output.logits.argmax(dim=-1)
656
+
657
+ # ── AR acceptance (scaffold positions auto-pass) ──
658
+ ar_matches = (ar_block_token[0, :n_draft] == x_t[0, 1:n_draft + 1])
659
+ accepted_token_num = 0
660
+ for i in range(n_draft):
661
+ if is_fixed[i] or ar_matches[i]:
662
+ accepted_token_num += 1
663
+ else:
664
+ break
665
+ accepted_token_num += 1 # bonus token
666
+
667
+ tokens_per_step.append(accepted_token_num)
668
+
669
+ # Force scaffold tokens at scaffold positions, AR predictions elsewhere
670
+ accepted_ids = ar_block_token[:, :accepted_token_num].clone()
671
+ acc_end = min(scaffold_cursor + accepted_token_num, scaffold_len)
672
+ acc_fixed = scaffold_is_fixed[scaffold_cursor:acc_end]
673
+ accepted_ids[0, :len(acc_fixed)][acc_fixed] = \
674
+ scaffold_tok_t[scaffold_cursor:acc_end][acc_fixed]
675
+
676
+ input_ids = torch.cat([input_ids, accepted_ids], dim=1)
677
+ scaffold_cursor += accepted_token_num
678
+
679
+ past_key_values = _crop_cache(past_key_values, input_ids.shape[1] - 1)
680
+
681
+ step += 1
682
+
683
+ # Stop conditions
684
+ if input_ids.shape[1] - original_input_length > max_tokens:
685
+ break
686
+ if stop_token in input_ids[:, prompt_length:]:
687
+ stop_token_idx = (
688
+ input_ids[:, prompt_length:] == stop_token
689
+ ).nonzero()[0][1]
690
+ if (
691
+ input_ids[:, prompt_length:prompt_length + stop_token_idx]
692
+ == mask_id
693
+ ).sum() == 0:
694
+ break
695
+
696
+ if _ss_profile:
697
+ torch.cuda.synchronize()
698
+ _ss_t["end"] = _time.perf_counter()
699
+ _t_total = _ss_t["end"] - _ss_t["start"]
700
+ _t_pre = _ss_t["after_prefill"] - _ss_t["start"]
701
+ _t_traj_in = _ss_t.get("enter_traj")
702
+ if _t_traj_in is not None:
703
+ _t_prefix = _t_traj_in - _ss_t["after_prefill"]
704
+ _t_traj = _ss_t["end"] - _t_traj_in
705
+ else:
706
+ _t_prefix = _ss_t["end"] - _ss_t["after_prefill"]
707
+ _t_traj = 0.0
708
+ print(
709
+ f"[ss profile] total={_t_total*1000:.0f}ms "
710
+ f"prefill={_t_pre*1000:.0f}ms "
711
+ f"prefix-decode={_t_prefix*1000:.0f}ms "
712
+ f"traj-decode={_t_traj*1000:.0f}ms",
713
+ flush=True,
714
+ )
715
+
716
+ # ── Phase 3: Post-process — truncate at stop, strip NULL ──
717
+ if stop_token in input_ids[:, original_input_length:]:
718
+ stop_token_idx = (
719
+ input_ids[:, original_input_length:] == stop_token
720
+ ).nonzero()[0][1]
721
+ input_ids = input_ids[
722
+ :, :stop_token_idx + original_input_length + 1
723
+ ]
724
+
725
+ gen_tokens = input_ids[0, original_input_length:].tolist()
726
+ cleaned = [t for t in gen_tokens if t != null_id and t != mask_id]
727
+ output_ids = torch.cat(
728
+ [
729
+ input_ids[:, :original_input_length],
730
+ torch.tensor(
731
+ [cleaned], device=self.device, dtype=torch.long
732
+ ),
733
+ ],
734
+ dim=1,
735
+ )
736
+
737
+ gen_length = output_ids.shape[1] - original_input_length
738
+
739
+ if return_stats:
740
+ stats = {
741
+ "tokens_per_step": tokens_per_step,
742
+ "total_steps": step,
743
+ "gen_length": gen_length,
744
+ "null_tokens_stripped": len(gen_tokens) - len(cleaned),
745
+ "block_size": block_size,
746
+ "method": "scaffold_speculative_v5",
747
+ }
748
+ return output_ids, stats
749
+ return output_ids
750
+
751
+ @torch.no_grad()
752
+
753
+
754
+ # ---------------------------------------------------------------------------
755
+ # scaffold_spec_with_ss_multi_traj — SS multi-rollout inference scaling
756
+ # ---------------------------------------------------------------------------
757
+
758
+ def scaffold_spec_with_ss_multi_traj(
759
+ self,
760
+ input_ids,
761
+ tokenizer,
762
+ block_size=32,
763
+ max_tokens=1024,
764
+ pixel_values=None,
765
+ image_grid_thw=None,
766
+ mask_id=151665,
767
+ null_id=151666,
768
+ threshold=0.9,
769
+ stop_token=151645,
770
+ explanation_block_size=32,
771
+ explanation_max_blocks=6,
772
+ return_stats=False,
773
+ num_traj_rollouts=4,
774
+ traj_verify_temperature=0.5,
775
+ traj_draft_temperature=0.0,
776
+ merge_weights=None,
777
+ batch_parallel=False,
778
+ ):
779
+ """Scaffold Spec with shared prefix + N SS rollouts on the trajectory section.
780
+
781
+ Decoding pipeline:
782
+ 0) Prompt prefill [shared]
783
+ 1) Scaffold Spec for sections 1-3 (CoT) at verify_temp = 0 [shared, deterministic]
784
+ 2) Fork KV cache N times [O(N) memory]
785
+ 3) For each fork: continue Scaffold Spec on the trajectory
786
+ section with verify_temperature = traj_verify_temperature
787
+ (each rollout draws different samples in the AR-verify step
788
+ because torch.multinomial is invoked with a global RNG).
789
+ 4) Parse all N trajectories and return their weighted mean.
790
+
791
+ Cost: roughly 1 full SS pass (sections 1-3 are ~88%% of decoded tokens
792
+ on our schema) + N x trajectory-only SS passes. For N = 4 this is
793
+ ~1.5x the cost of a single SS, vs ~4x for naive sequential rerolling.
794
+
795
+ If batch_parallel = True, the N trajectory rollouts are executed in a
796
+ batched (batch_size = N) manner: one shared model.forward per
797
+ speculative draft / verify step over an N-replicated trajectory
798
+ suffix, which removes the per-rollout serial overhead at the cost of
799
+ replicating the per-layer KV cache N-fold along the batch dimension.
800
+
801
+ Returns: (output_ids, stats) if return_stats else output_ids.
802
+ """
803
+ from .section_utils import (
804
+ build_deep_json_scaffold,
805
+ SECTION_KEYS,
806
+ )
807
+
808
+ scaffold_tokens, section_ranges, scaffold_mask_list = build_deep_json_scaffold(
809
+ tokenizer,
810
+ mask_id=mask_id,
811
+ null_id=null_id,
812
+ explanation_block_size=explanation_block_size,
813
+ explanation_max_blocks=explanation_max_blocks,
814
+ )
815
+
816
+ scaffold_len = len(scaffold_tokens)
817
+ original_input_length = input_ids.shape[1]
818
+ tokens_per_step = []
819
+ self.model.bd_size = block_size
820
+
821
+ scaffold_tok_t = torch.tensor(scaffold_tokens, device=self.device, dtype=torch.long)
822
+ scaffold_is_fixed = torch.tensor(scaffold_mask_list, device=self.device, dtype=torch.bool)
823
+ traj_start_in_scaffold = section_ranges["trajectory"][0]
824
+
825
+ _profile = bool(os.environ.get("SS_MT_PROFILE"))
826
+ if _profile:
827
+ import time as _time
828
+ torch.cuda.synchronize()
829
+ _t_phase = {"start": _time.perf_counter()}
830
+ _phase_clone_total = 0.0
831
+ _phase_rollout_each = []
832
+
833
+ # ── Phase 0: Prefill prompt ──
834
+ output = self.forward(
835
+ input_ids=input_ids, pixel_values=pixel_values,
836
+ image_grid_thw=image_grid_thw,
837
+ use_cache=True, update_kv_cache=True,
838
+ )
839
+ logits, past_key_values = output.logits, output.past_key_values
840
+ if _profile:
841
+ torch.cuda.synchronize()
842
+ _t_phase["after_prefill"] = _time.perf_counter()
843
+
844
+ next_token = torch.tensor(
845
+ [[scaffold_tokens[0]]], device=self.device, dtype=torch.long,
846
+ )
847
+ input_ids = torch.cat([input_ids, next_token], dim=1)
848
+ tokens_per_step.append(1)
849
+ scaffold_cursor = 1
850
+ step = 1
851
+
852
+ # ── Phase 1: Scaffold Spec for non-trajectory sections (shared, vt=0) ──
853
+ while scaffold_cursor < scaffold_len and scaffold_cursor < traj_start_in_scaffold:
854
+ remaining_before_traj = traj_start_in_scaffold - scaffold_cursor
855
+ n_draft = min(block_size - 1, remaining_before_traj)
856
+ if n_draft <= 0:
857
+ break
858
+
859
+ sc_end = scaffold_cursor + n_draft
860
+ is_fixed = scaffold_is_fixed[scaffold_cursor:sc_end]
861
+ draft_tensor = torch.where(
862
+ is_fixed, scaffold_tok_t[scaffold_cursor:sc_end], mask_id,
863
+ ).unsqueeze(0)
864
+ x_t = torch.cat([input_ids[:, -1:], draft_tensor], dim=1)
865
+ mask_idx = (x_t == mask_id)
866
+
867
+ # Draft (block-bidirectional)
868
+ logits = self.forward(
869
+ input_ids=x_t, use_cache=True,
870
+ past_key_values=past_key_values,
871
+ update_kv_cache=False, eval_bd_size=block_size,
872
+ ).logits
873
+ tokens_per_step.append(0)
874
+ step += 1
875
+
876
+ logits = torch.cat([logits[:, :1, :], logits[:, :-1, :]], dim=1)
877
+ x_1 = logits.argmax(dim=-1)
878
+ probs = torch.softmax(logits, dim=-1)
879
+ x1_p = torch.gather(probs, dim=-1, index=x_1.unsqueeze(-1)).squeeze(-1)
880
+ x1_p = torch.where(mask_idx, x1_p, -torch.inf)
881
+ unmask_idx = (x1_p > 0)
882
+ if unmask_idx.sum() > 0:
883
+ x_t[unmask_idx] = x_1[unmask_idx]
884
+ else:
885
+ mask_only_p = x1_p.clone()
886
+ mask_only_p[~mask_idx] = -torch.inf
887
+ if mask_only_p.max() > -torch.inf:
888
+ best = mask_only_p.argmax()
889
+ x_t.view(-1)[best] = x_1.view(-1)[best]
890
+
891
+ # Verify (causal, greedy)
892
+ output = self.forward(
893
+ input_ids=x_t, use_cache=True,
894
+ past_key_values=past_key_values,
895
+ update_kv_cache=True, eval_bd_size=block_size,
896
+ )
897
+ past_key_values = output.past_key_values
898
+ ar_block_token = output.logits.argmax(dim=-1)
899
+
900
+ ar_matches = (ar_block_token[0, :n_draft] == x_t[0, 1:n_draft + 1])
901
+ accepted_token_num = 0
902
+ for i in range(n_draft):
903
+ if is_fixed[i] or ar_matches[i]:
904
+ accepted_token_num += 1
905
+ else:
906
+ break
907
+ accepted_token_num += 1
908
+
909
+ max_accept = traj_start_in_scaffold - scaffold_cursor
910
+ if accepted_token_num > max_accept:
911
+ accepted_token_num = max_accept
912
+
913
+ tokens_per_step.append(accepted_token_num)
914
+ accepted_ids = ar_block_token[:, :accepted_token_num].clone()
915
+ acc_end = min(scaffold_cursor + accepted_token_num, scaffold_len)
916
+ acc_fixed = scaffold_is_fixed[scaffold_cursor:acc_end]
917
+ accepted_ids[0, :len(acc_fixed)][acc_fixed] = \
918
+ scaffold_tok_t[scaffold_cursor:acc_end][acc_fixed]
919
+
920
+ input_ids = torch.cat([input_ids, accepted_ids], dim=1)
921
+ scaffold_cursor += accepted_token_num
922
+ past_key_values = _crop_cache(past_key_values, input_ids.shape[1] - 1)
923
+ step += 1
924
+
925
+ if input_ids.shape[1] - original_input_length > max_tokens:
926
+ break
927
+
928
+ if _profile:
929
+ torch.cuda.synchronize()
930
+ _t_phase["after_phase1"] = _time.perf_counter()
931
+
932
+ # ── Phase 2: Fork KV cache N times (one per trajectory rollout) ──
933
+ prefix_input_ids = input_ids.clone()
934
+ prefix_len = prefix_input_ids.shape[1]
935
+
936
+ def _clone_cache(kv):
937
+ if _profile:
938
+ torch.cuda.synchronize()
939
+ _t0 = _time.perf_counter()
940
+ cloned = []
941
+ for layer_num in range(len(kv)):
942
+ cloned.append(tuple(t.clone() for t in kv[layer_num]))
943
+ ret = DynamicCache(cloned)
944
+ if _profile:
945
+ torch.cuda.synchronize()
946
+ nonlocal _phase_clone_total
947
+ _phase_clone_total += _time.perf_counter() - _t0
948
+ return ret
949
+
950
+ # ── Phase 3: N SS rollouts on trajectory section, each with vt > 0 ──
951
+ # All rollouts start from the same prefix; randomness comes from
952
+ # the multinomial calls in draft / verify (RNG is process-global).
953
+ N = max(1, int(num_traj_rollouts))
954
+
955
+ def _run_one_traj_rollout(start_kv, start_input_ids):
956
+ """Continue Scaffold Spec from start_kv / start_input_ids over the
957
+ trajectory section, applying traj_*_temperature. Returns the
958
+ final ss_input_ids (with trajectory tokens appended) and the
959
+ extracted trajectory value tokens."""
960
+ local_kv = start_kv
961
+ local_input = start_input_ids
962
+ local_cursor = scaffold_cursor
963
+
964
+ while local_cursor < scaffold_len:
965
+ n_draft = min(block_size - 1, scaffold_len - local_cursor)
966
+ sc_end = local_cursor + n_draft
967
+ is_fixed = scaffold_is_fixed[local_cursor:sc_end]
968
+ draft_tensor = torch.where(
969
+ is_fixed, scaffold_tok_t[local_cursor:sc_end], mask_id,
970
+ ).unsqueeze(0)
971
+ x_t = torch.cat([local_input[:, -1:], draft_tensor], dim=1)
972
+ mask_idx = (x_t == mask_id)
973
+
974
+ # Draft (block-bidirectional, optionally temp-sampled)
975
+ draft_logits = self.forward(
976
+ input_ids=x_t, use_cache=True, past_key_values=local_kv,
977
+ update_kv_cache=False, eval_bd_size=block_size,
978
+ ).logits
979
+ draft_logits = torch.cat(
980
+ [draft_logits[:, :1, :], draft_logits[:, :-1, :]], dim=1,
981
+ )
982
+ if traj_draft_temperature > 0:
983
+ scaled = draft_logits / traj_draft_temperature
984
+ draft_probs = torch.softmax(scaled, dim=-1)
985
+ x_1 = torch.multinomial(
986
+ draft_probs.view(-1, draft_probs.shape[-1]),
987
+ num_samples=1,
988
+ ).view(draft_probs.shape[:-1])
989
+ else:
990
+ x_1 = draft_logits.argmax(dim=-1)
991
+ probs = torch.softmax(draft_logits, dim=-1)
992
+ x1_p = torch.gather(
993
+ probs, dim=-1, index=x_1.unsqueeze(-1),
994
+ ).squeeze(-1)
995
+ x1_p = torch.where(mask_idx, x1_p, -torch.inf)
996
+ unmask_idx = (x1_p > 0)
997
+ if unmask_idx.sum() > 0:
998
+ x_t[unmask_idx] = x_1[unmask_idx]
999
+ else:
1000
+ mask_only_p = x1_p.clone()
1001
+ mask_only_p[~mask_idx] = -torch.inf
1002
+ if mask_only_p.max() > -torch.inf:
1003
+ x_t.view(-1)[mask_only_p.argmax()] = \
1004
+ x_1.view(-1)[mask_only_p.argmax()]
1005
+
1006
+ # Verify (causal, optionally temp-sampled)
1007
+ v_out = self.forward(
1008
+ input_ids=x_t, use_cache=True, past_key_values=local_kv,
1009
+ update_kv_cache=True, eval_bd_size=block_size,
1010
+ )
1011
+ local_kv = v_out.past_key_values
1012
+ if traj_verify_temperature > 0:
1013
+ v_logits = v_out.logits / traj_verify_temperature
1014
+ v_probs = torch.softmax(v_logits, dim=-1)
1015
+ ar_block_token = torch.multinomial(
1016
+ v_probs.view(-1, v_probs.shape[-1]),
1017
+ num_samples=1,
1018
+ ).view(v_probs.shape[:-1])
1019
+ else:
1020
+ ar_block_token = v_out.logits.argmax(dim=-1)
1021
+
1022
+ ar_matches = (ar_block_token[0, :n_draft] == x_t[0, 1:n_draft + 1])
1023
+ accepted_token_num = 0
1024
+ for i in range(n_draft):
1025
+ if is_fixed[i] or ar_matches[i]:
1026
+ accepted_token_num += 1
1027
+ else:
1028
+ break
1029
+ accepted_token_num += 1
1030
+
1031
+ accepted_ids = ar_block_token[:, :accepted_token_num].clone()
1032
+ acc_end = min(local_cursor + accepted_token_num, scaffold_len)
1033
+ acc_fixed = scaffold_is_fixed[local_cursor:acc_end]
1034
+ accepted_ids[0, :len(acc_fixed)][acc_fixed] = \
1035
+ scaffold_tok_t[local_cursor:acc_end][acc_fixed]
1036
+
1037
+ local_input = torch.cat([local_input, accepted_ids], dim=1)
1038
+ local_cursor += accepted_token_num
1039
+ local_kv = _crop_cache(local_kv, local_input.shape[1] - 1)
1040
+
1041
+ if local_input.shape[1] - original_input_length > max_tokens:
1042
+ break
1043
+ if stop_token in local_input[:, prefix_len:]:
1044
+ st_idx = (local_input[:, prefix_len:] == stop_token).nonzero()
1045
+ if st_idx.numel() > 0:
1046
+ cand_st = st_idx[0][1].item()
1047
+ if (local_input[:, prefix_len:prefix_len + cand_st] == mask_id).sum() == 0:
1048
+ break
1049
+
1050
+ traj_values = [
1051
+ t for i, t in enumerate(local_input[0, original_input_length:].tolist())
1052
+ if i >= traj_start_in_scaffold and i < scaffold_len
1053
+ and not scaffold_mask_list[i] and t != null_id and t != mask_id
1054
+ ]
1055
+ return local_input, traj_values
1056
+
1057
+ # Sequential N rollouts (Option A; batch_parallel=False).
1058
+ rollout_inputs = []
1059
+ rollout_traj_values = []
1060
+ for _i in range(N):
1061
+ if _profile:
1062
+ torch.cuda.synchronize()
1063
+ _t_r0 = _time.perf_counter()
1064
+ cand_kv = _clone_cache(past_key_values)
1065
+ cand_input = prefix_input_ids.clone()
1066
+ cand_input, traj_vals = _run_one_traj_rollout(cand_kv, cand_input)
1067
+ rollout_inputs.append(cand_input)
1068
+ rollout_traj_values.append(traj_vals)
1069
+ step += 1
1070
+ if _profile:
1071
+ torch.cuda.synchronize()
1072
+ _phase_rollout_each.append(_time.perf_counter() - _t_r0)
1073
+
1074
+ if _profile:
1075
+ torch.cuda.synchronize()
1076
+ _t_phase["after_rollouts"] = _time.perf_counter()
1077
+ _t_total = _t_phase["after_rollouts"] - _t_phase["start"]
1078
+ _t_pre = _t_phase["after_prefill"] - _t_phase["start"]
1079
+ _t_p1 = _t_phase["after_phase1"] - _t_phase["after_prefill"]
1080
+ _t_rolls = _t_phase["after_rollouts"] - _t_phase["after_phase1"]
1081
+ print(
1082
+ f"[ss_mt profile] total={_t_total*1000:.0f}ms "
1083
+ f"prefill(P0)={_t_pre*1000:.0f}ms "
1084
+ f"prefix-decode(P1)={_t_p1*1000:.0f}ms "
1085
+ f"rollouts(P2+P3)={_t_rolls*1000:.0f}ms "
1086
+ f"of which kv-clone={_phase_clone_total*1000:.0f}ms "
1087
+ f"per-rollout={[f'{r*1000:.0f}' for r in _phase_rollout_each]}ms",
1088
+ flush=True,
1089
+ )
1090
+
1091
+ # ── Phase 4: Parse all rollouts, weighted-merge waypoints ──
1092
def _decode_trajectory(traj_tokens):
    """Decode trajectory token ids into a list of ``[x, y]`` float pairs.

    The ids are decoded with the enclosing scope's ``tokenizer`` (special
    tokens are kept), every ``<|NULL|>`` marker is removed from the text,
    and all signed decimal numbers found in what remains are paired up in
    order of appearance.  A trailing unpaired number is dropped.

    Args:
        traj_tokens: token ids handed straight to ``tokenizer.decode``.

    Returns:
        A list of two-element ``[float, float]`` lists; empty when the
        decoded text contains fewer than two numbers.
    """
    text = tokenizer.decode(traj_tokens, skip_special_tokens=False)
    text = text.replace("<|NULL|>", "").strip()
    numbers = [float(tok) for tok in re.findall(r"[+-]?\d+\.?\d*", text)]
    # zip() stops at the shorter slice, so an odd trailing value is ignored.
    return [[x, y] for x, y in zip(numbers[0::2], numbers[1::2])]
1100
+
1101
+ rollout_waypoints = [_decode_trajectory(v) for v in rollout_traj_values]
1102
+
1103
+ if merge_weights is None or len(merge_weights) != N:
1104
+ ws = [1.0 / N] * N
1105
+ else:
1106
+ total = sum(merge_weights)
1107
+ ws = [w / total for w in merge_weights]
1108
+
1109
+ if rollout_waypoints and all(len(w) > 0 for w in rollout_waypoints):
1110
+ n_wp = min(len(w) for w in rollout_waypoints)
1111
+ merged_waypoints = []
1112
+ for i in range(n_wp):
1113
+ mx = sum(ws[c] * rollout_waypoints[c][i][0] for c in range(N))
1114
+ my = sum(ws[c] * rollout_waypoints[c][i][1] for c in range(N))
1115
+ merged_waypoints.append([mx, my])
1116
+ else:
1117
+ merged_waypoints = next(
1118
+ (w for w in rollout_waypoints if w), [],
1119
+ )
1120
+
1121
+ # Output text: take rollout 0's full text but replace its trajectory
1122
+ # with the merged waypoints.
1123
+ base_input = rollout_inputs[0]
1124
+ if stop_token in base_input[:, original_input_length:]:
1125
+ st_idx = (base_input[:, original_input_length:] == stop_token).nonzero()[0][1]
1126
+ base_input = base_input[:, :st_idx + original_input_length + 1]
1127
+ base_raw_tokens = base_input[0, original_input_length:].tolist()
1128
+ base_cleaned = [t for t in base_raw_tokens if t != null_id and t != mask_id]
1129
+ base_null_stripped = len(base_raw_tokens) - len(base_cleaned)
1130
+ base_text = tokenizer.decode(base_cleaned, skip_special_tokens=False)
1131
+
1132
+ traj_parts = [
1133
+ f"[{x:+07.2f},{y:+06.2f}]" for x, y in merged_waypoints
1134
+ ]
1135
+ merged_traj_str = "[" + ", ".join(traj_parts) + "]"
1136
+ replaced_text = re.sub(
1137
+ r'("trajectory"\s*:\s*")(\[\[.*?\]\])',
1138
+ r"\g<1>" + merged_traj_str, base_text,
1139
+ )
1140
+
1141
+ merged_tokens = tokenizer.encode(replaced_text, add_special_tokens=False)
1142
+ output_ids = torch.cat([
1143
+ input_ids[:, :original_input_length],
1144
+ torch.tensor([merged_tokens], device=self.device, dtype=torch.long),
1145
+ ], dim=1)
1146
+
1147
+ gen_length = output_ids.shape[1] - original_input_length
1148
+
1149
+ if return_stats:
1150
+ stats = {
1151
+ "tokens_per_step": tokens_per_step,
1152
+ "total_steps": step,
1153
+ "gen_length": gen_length,
1154
+ "null_tokens_stripped": base_null_stripped,
1155
+ "block_size": block_size,
1156
+ "method": "scaffold_spec_with_ss_multi_traj",
1157
+ "num_traj_rollouts": N,
1158
+ "traj_verify_temperature": traj_verify_temperature,
1159
+ "rollout_waypoints": rollout_waypoints,
1160
+ "merged_waypoints": merged_waypoints,
1161
+ "merge_weights": ws,
1162
+ }
1163
+ return output_ids, stats
1164
+ return output_ids
1165
+
1166
# ---------------------------------------------------------------------------
# Bind decoding methods onto the model class.
#
# ``modeling.py`` imports this module at the bottom of the file, after the
# ``Fast_dDriveForConditionalGeneration`` class has been defined. We
# attach the three decoding paths as ordinary methods so callers can invoke
# them as ``model.mdm_sample_deep_scaffold(...)`` etc. without any extra
# registration step.
# ---------------------------------------------------------------------------


def attach_generation_methods(cls):
    """Attach the three release decoding paths as methods of ``cls``.

    The module-level functions ``mdm_sample_deep_scaffold``,
    ``scaffold_speculative_sample`` and ``scaffold_spec_with_ss_multi_traj``
    are assigned as class attributes under their own names, so instances of
    ``cls`` can call them as bound methods.

    This helper only assigns attributes; it performs no tensor computation
    and therefore carries no autograd guard of its own.

    Args:
        cls: the class to receive the decoding methods.

    Returns:
        ``cls`` itself, which also allows use in decorator position.
    """
    cls.mdm_sample_deep_scaffold = mdm_sample_deep_scaffold
    cls.scaffold_speculative_sample = scaffold_speculative_sample
    cls.scaffold_spec_with_ss_multi_traj = scaffold_spec_with_ss_multi_traj
    return cls


# Public API of this module: the three decoding paths plus the binder.
__all__ = [
    "mdm_sample_deep_scaffold",
    "scaffold_speculative_sample",
    "scaffold_spec_with_ss_multi_traj",
    "attach_generation_methods",
]
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08a8945f208e0b7e62c71542d3301d755d95d02bcdb54d7deec28f8a819b4a2d
3
+ size 4972304384
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f399e3ccf37f2016964e531e2cfb5371a3d96d46205cb1b3eba24cd13d0de6aa
3
+ size 4932949248
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1c77c01870520022fd0111459ce27eff492ce932a64740dd26d23598289f3ed
3
+ size 4932949336
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c13c8eff2f5e9435ac7692790d1028ff1dbc5f8bd53db6e5a91b22c322162008
3
+ size 1425040040
model.safetensors.index.json ADDED
@@ -0,0 +1,833 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 234663856,
4
+ "total_size": 16263151616
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
20
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
32
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
33
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
44
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
56
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
68
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
80
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
92
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
104
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
116
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
119
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
120
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
121
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
122
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
123
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
125
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
126
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
127
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
128
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
129
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
130
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
131
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
134
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
135
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
137
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
138
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
139
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
140
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
141
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
144
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
147
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
148
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
149
+ "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
150
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
151
+ "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
152
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
153
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
164
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
165
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
176
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
188
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
200
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
212
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
224
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
236
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
242
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
248
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
250
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
251
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
252
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
253
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
254
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
255
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
257
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
258
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
259
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
260
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
261
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
262
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
263
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
264
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
265
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
266
+ "model.layers.28.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
267
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
268
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
269
+ "model.layers.28.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
270
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
271
+ "model.layers.28.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
272
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
273
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
274
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
275
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
276
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
277
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
278
+ "model.layers.29.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
279
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
280
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
281
+ "model.layers.29.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
282
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
283
+ "model.layers.29.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
284
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
285
+ "model.layers.3.input_layernorm.weight": "model-00002-of-00004.safetensors",
286
+ "model.layers.3.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
287
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.3.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
289
+ "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
290
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
291
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
293
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
294
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
295
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
296
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
297
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
298
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
299
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
300
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
301
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
302
+ "model.layers.30.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
303
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
304
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
305
+ "model.layers.30.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
306
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
307
+ "model.layers.30.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
308
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
309
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
310
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
311
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
312
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
313
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
314
+ "model.layers.31.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
315
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
316
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
317
+ "model.layers.31.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
318
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
319
+ "model.layers.31.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
320
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
321
+ "model.layers.32.input_layernorm.weight": "model-00003-of-00004.safetensors",
322
+ "model.layers.32.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
323
+ "model.layers.32.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
324
+ "model.layers.32.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
325
+ "model.layers.32.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
326
+ "model.layers.32.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
327
+ "model.layers.32.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
328
+ "model.layers.32.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
329
+ "model.layers.32.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
330
+ "model.layers.32.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
331
+ "model.layers.32.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
332
+ "model.layers.32.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
333
+ "model.layers.33.input_layernorm.weight": "model-00003-of-00004.safetensors",
334
+ "model.layers.33.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
335
+ "model.layers.33.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
336
+ "model.layers.33.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
337
+ "model.layers.33.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
338
+ "model.layers.33.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
339
+ "model.layers.33.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
340
+ "model.layers.33.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
341
+ "model.layers.33.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
342
+ "model.layers.33.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
343
+ "model.layers.33.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
344
+ "model.layers.33.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
345
+ "model.layers.34.input_layernorm.weight": "model-00003-of-00004.safetensors",
346
+ "model.layers.34.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
347
+ "model.layers.34.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
348
+ "model.layers.34.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
349
+ "model.layers.34.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
350
+ "model.layers.34.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
351
+ "model.layers.34.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
352
+ "model.layers.34.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
353
+ "model.layers.34.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
354
+ "model.layers.34.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
355
+ "model.layers.34.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
356
+ "model.layers.34.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
357
+ "model.layers.35.input_layernorm.weight": "model-00004-of-00004.safetensors",
358
+ "model.layers.35.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
359
+ "model.layers.35.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
360
+ "model.layers.35.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
361
+ "model.layers.35.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
362
+ "model.layers.35.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
363
+ "model.layers.35.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
364
+ "model.layers.35.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
365
+ "model.layers.35.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
366
+ "model.layers.35.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
367
+ "model.layers.35.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
368
+ "model.layers.35.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
369
+ "model.layers.4.input_layernorm.weight": "model-00002-of-00004.safetensors",
370
+ "model.layers.4.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
371
+ "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
372
+ "model.layers.4.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
373
+ "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
374
+ "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
375
+ "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
376
+ "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
377
+ "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
378
+ "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
379
+ "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
380
+ "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
381
+ "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors",
382
+ "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
383
+ "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
384
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
385
+ "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
386
+ "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
387
+ "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
388
+ "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
389
+ "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
390
+ "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
391
+ "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
392
+ "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
393
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
394
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
395
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
396
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
397
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
398
+ "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
399
+ "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
400
+ "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
401
+ "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
402
+ "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
403
+ "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
404
+ "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
405
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
406
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
407
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
408
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
409
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
410
+ "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
411
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
412
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
413
+ "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
414
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
415
+ "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
416
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
417
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
418
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
419
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
420
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
421
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
422
+ "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
423
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
424
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
425
+ "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
426
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
427
+ "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
428
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
429
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
430
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
431
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
432
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
433
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
434
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
435
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
436
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
437
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
438
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
439
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
440
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
441
+ "model.norm.weight": "model-00004-of-00004.safetensors",
442
+ "visual.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors",
443
+ "visual.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors",
444
+ "visual.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
445
+ "visual.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
446
+ "visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
447
+ "visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
448
+ "visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
449
+ "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
450
+ "visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
451
+ "visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
452
+ "visual.blocks.0.norm1.weight": "model-00001-of-00004.safetensors",
453
+ "visual.blocks.0.norm2.weight": "model-00001-of-00004.safetensors",
454
+ "visual.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors",
455
+ "visual.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
456
+ "visual.blocks.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
457
+ "visual.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
458
+ "visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
459
+ "visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
460
+ "visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
461
+ "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
462
+ "visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
463
+ "visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
464
+ "visual.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
465
+ "visual.blocks.1.norm2.weight": "model-00001-of-00004.safetensors",
466
+ "visual.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors",
467
+ "visual.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors",
468
+ "visual.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
469
+ "visual.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
470
+ "visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
471
+ "visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
472
+ "visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
473
+ "visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
474
+ "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
475
+ "visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
476
+ "visual.blocks.10.norm1.weight": "model-00001-of-00004.safetensors",
477
+ "visual.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
478
+ "visual.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors",
479
+ "visual.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors",
480
+ "visual.blocks.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
481
+ "visual.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
482
+ "visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
483
+ "visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
484
+ "visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
485
+ "visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
486
+ "visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
487
+ "visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
488
+ "visual.blocks.11.norm1.weight": "model-00001-of-00004.safetensors",
489
+ "visual.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
490
+ "visual.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors",
491
+ "visual.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors",
492
+ "visual.blocks.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
493
+ "visual.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
494
+ "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
495
+ "visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
496
+ "visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
497
+ "visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
498
+ "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
499
+ "visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
500
+ "visual.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
501
+ "visual.blocks.12.norm2.weight": "model-00001-of-00004.safetensors",
502
+ "visual.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
503
+ "visual.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
504
+ "visual.blocks.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
505
+ "visual.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
506
+ "visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
507
+ "visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
508
+ "visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
509
+ "visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
510
+ "visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
511
+ "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
512
+ "visual.blocks.13.norm1.weight": "model-00001-of-00004.safetensors",
513
+ "visual.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
514
+ "visual.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors",
515
+ "visual.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors",
516
+ "visual.blocks.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
517
+ "visual.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
518
+ "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
519
+ "visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
520
+ "visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
521
+ "visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
522
+ "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
523
+ "visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
524
+ "visual.blocks.14.norm1.weight": "model-00001-of-00004.safetensors",
525
+ "visual.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
526
+ "visual.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors",
527
+ "visual.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors",
528
+ "visual.blocks.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
529
+ "visual.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
530
+ "visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
531
+ "visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
532
+ "visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
533
+ "visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
534
+ "visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
535
+ "visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
536
+ "visual.blocks.15.norm1.weight": "model-00001-of-00004.safetensors",
537
+ "visual.blocks.15.norm2.weight": "model-00001-of-00004.safetensors",
538
+ "visual.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors",
539
+ "visual.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
540
+ "visual.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
541
+ "visual.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
542
+ "visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
543
+ "visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
544
+ "visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
545
+ "visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
546
+ "visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
547
+ "visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
548
+ "visual.blocks.16.norm1.weight": "model-00001-of-00004.safetensors",
549
+ "visual.blocks.16.norm2.weight": "model-00001-of-00004.safetensors",
550
+ "visual.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors",
551
+ "visual.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors",
552
+ "visual.blocks.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
553
+ "visual.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
554
+ "visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
555
+ "visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
556
+ "visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
557
+ "visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
558
+ "visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
559
+ "visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
560
+ "visual.blocks.17.norm1.weight": "model-00001-of-00004.safetensors",
561
+ "visual.blocks.17.norm2.weight": "model-00001-of-00004.safetensors",
562
+ "visual.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors",
563
+ "visual.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors",
564
+ "visual.blocks.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
565
+ "visual.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
566
+ "visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
567
+ "visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
568
+ "visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
569
+ "visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
570
+ "visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
571
+ "visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
572
+ "visual.blocks.18.norm1.weight": "model-00001-of-00004.safetensors",
573
+ "visual.blocks.18.norm2.weight": "model-00001-of-00004.safetensors",
574
+ "visual.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors",
575
+ "visual.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors",
576
+ "visual.blocks.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
577
+ "visual.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
578
+ "visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
579
+ "visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
580
+ "visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
581
+ "visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
582
+ "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
583
+ "visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
584
+ "visual.blocks.19.norm1.weight": "model-00001-of-00004.safetensors",
585
+ "visual.blocks.19.norm2.weight": "model-00001-of-00004.safetensors",
586
+ "visual.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors",
587
+ "visual.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors",
588
+ "visual.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
589
+ "visual.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
590
+ "visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
591
+ "visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
592
+ "visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
593
+ "visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
594
+ "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
595
+ "visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
596
+ "visual.blocks.2.norm1.weight": "model-00001-of-00004.safetensors",
597
+ "visual.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
598
+ "visual.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
599
+ "visual.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors",
600
+ "visual.blocks.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
601
+ "visual.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
602
+ "visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
603
+ "visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
604
+ "visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
605
+ "visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
606
+ "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
607
+ "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
608
+ "visual.blocks.20.norm1.weight": "model-00001-of-00004.safetensors",
609
+ "visual.blocks.20.norm2.weight": "model-00001-of-00004.safetensors",
610
+ "visual.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors",
611
+ "visual.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors",
612
+ "visual.blocks.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
613
+ "visual.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
614
+ "visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
615
+ "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
616
+ "visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
617
+ "visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
618
+ "visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
619
+ "visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
620
+ "visual.blocks.21.norm1.weight": "model-00001-of-00004.safetensors",
621
+ "visual.blocks.21.norm2.weight": "model-00001-of-00004.safetensors",
622
+ "visual.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors",
623
+ "visual.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors",
624
+ "visual.blocks.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
625
+ "visual.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
626
+ "visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
627
+ "visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
628
+ "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
629
+ "visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
630
+ "visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
631
+ "visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
632
+ "visual.blocks.22.norm1.weight": "model-00001-of-00004.safetensors",
633
+ "visual.blocks.22.norm2.weight": "model-00001-of-00004.safetensors",
634
+ "visual.blocks.23.attn.proj.bias": "model-00001-of-00004.safetensors",
635
+ "visual.blocks.23.attn.proj.weight": "model-00001-of-00004.safetensors",
636
+ "visual.blocks.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
637
+ "visual.blocks.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
638
+ "visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
639
+ "visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
640
+ "visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
641
+ "visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
642
+ "visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
643
+ "visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
644
+ "visual.blocks.23.norm1.weight": "model-00001-of-00004.safetensors",
645
+ "visual.blocks.23.norm2.weight": "model-00001-of-00004.safetensors",
646
+ "visual.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
647
+ "visual.blocks.24.attn.proj.weight": "model-00001-of-00004.safetensors",
648
+ "visual.blocks.24.attn.qkv.bias": "model-00001-of-00004.safetensors",
649
+ "visual.blocks.24.attn.qkv.weight": "model-00001-of-00004.safetensors",
650
+ "visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
651
+ "visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
652
+ "visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
653
+ "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
654
+ "visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
655
+ "visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
656
+ "visual.blocks.24.norm1.weight": "model-00001-of-00004.safetensors",
657
+ "visual.blocks.24.norm2.weight": "model-00001-of-00004.safetensors",
658
+ "visual.blocks.25.attn.proj.bias": "model-00001-of-00004.safetensors",
659
+ "visual.blocks.25.attn.proj.weight": "model-00001-of-00004.safetensors",
660
+ "visual.blocks.25.attn.qkv.bias": "model-00001-of-00004.safetensors",
661
+ "visual.blocks.25.attn.qkv.weight": "model-00001-of-00004.safetensors",
662
+ "visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
663
+ "visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
664
+ "visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
665
+ "visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
666
+ "visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
667
+ "visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
668
+ "visual.blocks.25.norm1.weight": "model-00001-of-00004.safetensors",
669
+ "visual.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
670
+ "visual.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
671
+ "visual.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
672
+ "visual.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
673
+ "visual.blocks.26.attn.qkv.weight": "model-00001-of-00004.safetensors",
674
+ "visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
675
+ "visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
676
+ "visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
677
+ "visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
678
+ "visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
679
+ "visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
680
+ "visual.blocks.26.norm1.weight": "model-00001-of-00004.safetensors",
681
+ "visual.blocks.26.norm2.weight": "model-00001-of-00004.safetensors",
682
+ "visual.blocks.27.attn.proj.bias": "model-00001-of-00004.safetensors",
683
+ "visual.blocks.27.attn.proj.weight": "model-00001-of-00004.safetensors",
684
+ "visual.blocks.27.attn.qkv.bias": "model-00001-of-00004.safetensors",
685
+ "visual.blocks.27.attn.qkv.weight": "model-00001-of-00004.safetensors",
686
+ "visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
687
+ "visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
688
+ "visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
689
+ "visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
690
+ "visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
691
+ "visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
692
+ "visual.blocks.27.norm1.weight": "model-00001-of-00004.safetensors",
693
+ "visual.blocks.27.norm2.weight": "model-00001-of-00004.safetensors",
694
+ "visual.blocks.28.attn.proj.bias": "model-00001-of-00004.safetensors",
695
+ "visual.blocks.28.attn.proj.weight": "model-00001-of-00004.safetensors",
696
+ "visual.blocks.28.attn.qkv.bias": "model-00001-of-00004.safetensors",
697
+ "visual.blocks.28.attn.qkv.weight": "model-00001-of-00004.safetensors",
698
+ "visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
699
+ "visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
700
+ "visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
701
+ "visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
702
+ "visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
703
+ "visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
704
+ "visual.blocks.28.norm1.weight": "model-00001-of-00004.safetensors",
705
+ "visual.blocks.28.norm2.weight": "model-00001-of-00004.safetensors",
706
+ "visual.blocks.29.attn.proj.bias": "model-00001-of-00004.safetensors",
707
+ "visual.blocks.29.attn.proj.weight": "model-00001-of-00004.safetensors",
708
+ "visual.blocks.29.attn.qkv.bias": "model-00001-of-00004.safetensors",
709
+ "visual.blocks.29.attn.qkv.weight": "model-00001-of-00004.safetensors",
710
+ "visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
711
+ "visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
712
+ "visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
713
+ "visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
714
+ "visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
715
+ "visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
716
+ "visual.blocks.29.norm1.weight": "model-00001-of-00004.safetensors",
717
+ "visual.blocks.29.norm2.weight": "model-00001-of-00004.safetensors",
718
+ "visual.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors",
719
+ "visual.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors",
720
+ "visual.blocks.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
721
+ "visual.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
722
+ "visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
723
+ "visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
724
+ "visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
725
+ "visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
726
+ "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
727
+ "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
728
+ "visual.blocks.3.norm1.weight": "model-00001-of-00004.safetensors",
729
+ "visual.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
730
+ "visual.blocks.30.attn.proj.bias": "model-00001-of-00004.safetensors",
731
+ "visual.blocks.30.attn.proj.weight": "model-00001-of-00004.safetensors",
732
+ "visual.blocks.30.attn.qkv.bias": "model-00001-of-00004.safetensors",
733
+ "visual.blocks.30.attn.qkv.weight": "model-00001-of-00004.safetensors",
734
+ "visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
735
+ "visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
736
+ "visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
737
+ "visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
738
+ "visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
739
+ "visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
740
+ "visual.blocks.30.norm1.weight": "model-00001-of-00004.safetensors",
741
+ "visual.blocks.30.norm2.weight": "model-00001-of-00004.safetensors",
742
+ "visual.blocks.31.attn.proj.bias": "model-00001-of-00004.safetensors",
743
+ "visual.blocks.31.attn.proj.weight": "model-00001-of-00004.safetensors",
744
+ "visual.blocks.31.attn.qkv.bias": "model-00001-of-00004.safetensors",
745
+ "visual.blocks.31.attn.qkv.weight": "model-00001-of-00004.safetensors",
746
+ "visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
747
+ "visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
748
+ "visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
749
+ "visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
750
+ "visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
751
+ "visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
752
+ "visual.blocks.31.norm1.weight": "model-00001-of-00004.safetensors",
753
+ "visual.blocks.31.norm2.weight": "model-00001-of-00004.safetensors",
754
+ "visual.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors",
755
+ "visual.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors",
756
+ "visual.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
757
+ "visual.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
758
+ "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
759
+ "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
760
+ "visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
761
+ "visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
762
+ "visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
763
+ "visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
764
+ "visual.blocks.4.norm1.weight": "model-00001-of-00004.safetensors",
765
+ "visual.blocks.4.norm2.weight": "model-00001-of-00004.safetensors",
766
+ "visual.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
767
+ "visual.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors",
768
+ "visual.blocks.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
769
+ "visual.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
770
+ "visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
771
+ "visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
772
+ "visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
773
+ "visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
774
+ "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
775
+ "visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
776
+ "visual.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
777
+ "visual.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
778
+ "visual.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
779
+ "visual.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
780
+ "visual.blocks.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
781
+ "visual.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
782
+ "visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
783
+ "visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
784
+ "visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
785
+ "visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
786
+ "visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
787
+ "visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
788
+ "visual.blocks.6.norm1.weight": "model-00001-of-00004.safetensors",
789
+ "visual.blocks.6.norm2.weight": "model-00001-of-00004.safetensors",
790
+ "visual.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors",
791
+ "visual.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
792
+ "visual.blocks.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
793
+ "visual.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
794
+ "visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
795
+ "visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
796
+ "visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
797
+ "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
798
+ "visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
799
+ "visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
800
+ "visual.blocks.7.norm1.weight": "model-00001-of-00004.safetensors",
801
+ "visual.blocks.7.norm2.weight": "model-00001-of-00004.safetensors",
802
+ "visual.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors",
803
+ "visual.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors",
804
+ "visual.blocks.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
805
+ "visual.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
806
+ "visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
807
+ "visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
808
+ "visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
809
+ "visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
810
+ "visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
811
+ "visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
812
+ "visual.blocks.8.norm1.weight": "model-00001-of-00004.safetensors",
813
+ "visual.blocks.8.norm2.weight": "model-00001-of-00004.safetensors",
814
+ "visual.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors",
815
+ "visual.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors",
816
+ "visual.blocks.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
817
+ "visual.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
818
+ "visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
819
+ "visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
820
+ "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
821
+ "visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
822
+ "visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
823
+ "visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
824
+ "visual.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
825
+ "visual.blocks.9.norm2.weight": "model-00001-of-00004.safetensors",
826
+ "visual.merger.ln_q.weight": "model-00001-of-00004.safetensors",
827
+ "visual.merger.mlp.0.bias": "model-00001-of-00004.safetensors",
828
+ "visual.merger.mlp.0.weight": "model-00001-of-00004.safetensors",
829
+ "visual.merger.mlp.2.bias": "model-00001-of-00004.safetensors",
830
+ "visual.merger.mlp.2.weight": "model-00001-of-00004.safetensors",
831
+ "visual.patch_embed.proj.weight": "model-00001-of-00004.safetensors"
832
+ }
833
+ }
modeling.py ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "min_pixels": 3136,
3
+ "max_pixels": 12845056,
4
+ "patch_size": 14,
5
+ "temporal_patch_size": 2,
6
+ "merge_size": 2,
7
+ "image_mean": [
8
+ 0.48145466,
9
+ 0.4578275,
10
+ 0.40821073
11
+ ],
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "image_processor_type": "Qwen2VLImageProcessor",
18
+ "processor_class": "Qwen2_5_VLProcessor"
19
+ }
section_utils.py ADDED
@@ -0,0 +1,803 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Section-aware block scheduling for JSON structured output.
3
+
4
+ Inspired by the S3 (Self-adaptive Schema Scaffolding) paper (arXiv:2507.04504),
5
+ this module provides utilities to:
6
+ 1. Parse tokenized JSON output into sections (critical_objects, explanation, etc.)
7
+ 2. Assign section-aware block indices for variable block sizes per section
8
+ 3. Build JSON scaffolds for inference (pre-fill structural tokens)
9
+
10
+ The DVLM-AD output schema has 4 sections:
11
+ - critical_objects: ~88 tokens (12 yes/no fields, nearly constant)
12
+ - explanation: ~114 tokens (variable, 72-172)
13
+ - future_meta_behavior: ~40 tokens (nearly constant)
14
+ - trajectory: ~80 tokens (nearly constant)
15
+ """
16
+
17
+ import torch
18
+ from typing import Dict, List, Optional, Tuple
19
+ import math
20
+
21
+
22
# Top-level JSON sections, listed in the order they appear in the model output.
SECTION_KEYS = ["critical_objects", "explanation", "future_meta_behavior", "trajectory"]

# Default per-section token budgets (based on training data analysis).
DEFAULT_TOKEN_BUDGETS = dict(
    critical_objects=88,
    explanation=128,
    future_meta_behavior=40,
    trajectory=80,
)

# Default number of steps per section.
DEFAULT_SECTION_STEPS = dict(
    critical_objects=1,
    explanation=3,
    future_meta_behavior=1,
    trajectory=1,
)
45
+
46
+
47
+
48
+
49
+ def _v1_removed_parse_json_sections(*args, **kwargs):
50
+ raise NotImplementedError("DS v1 parse_json_sections has been removed. Use deep scaffold v2.")
51
+
52
+
53
+ def _v1_removed_compute_section_block_idx(*args, **kwargs):
54
+ raise NotImplementedError("DS v1 compute_section_block_idx has been removed. Use compute_section_block_idx_deep_static.")
55
+
56
+
57
+ def _v1_removed_build_json_scaffold(*args, **kwargs):
58
+ raise NotImplementedError("DS v1 build_json_scaffold has been removed. Use build_deep_json_scaffold.")
59
+
60
+
61
+ def _v1_removed_compute_section_block_sizes(*args, **kwargs):
62
+ raise NotImplementedError("DS v1 compute_section_block_sizes has been removed.")
63
+
64
+
65
def build_static_scaffold_sequences(tokenizer) -> Dict[str, List[int]]:
    """Tokenize the four top-level JSON key boundaries used for matching.

    Each boundary literal is encoded with
    ``tokenizer.encode(text, add_special_tokens=False)`` and stored under its
    boundary name (``prefix``, ``between_co_exp``, ``between_exp_fmb``,
    ``between_fmb_traj``).

    Used internally by :func:`build_deep_scaffold_sequences`.
    """
    # (boundary name, literal text) in the order the sections appear.
    boundary_literals = (
        ("prefix", '{"critical_objects":'),
        ("between_co_exp", ' "explanation":'),
        ("between_exp_fmb", ' "future_meta_behavior":'),
        ("between_fmb_traj", ' "trajectory":'),
    )
    return {
        name: tokenizer.encode(text, add_special_tokens=False)
        for name, text in boundary_literals
    }
76
+
77
+
78
+ def _v1_removed_compute_section_block_idx_static(*args, **kwargs):
79
+ raise NotImplementedError("DS v1 compute_section_block_idx_static has been removed. Use compute_section_block_idx_deep_static.")
80
+
81
+
82
# The legacy v1 public names stay importable, but each one is bound to a stub
# that raises NotImplementedError, so stale imports fail with a clear message
# instead of an ImportError.
compute_section_block_idx_static = _v1_removed_compute_section_block_idx_static
compute_section_block_sizes = _v1_removed_compute_section_block_sizes
compute_section_block_idx = _v1_removed_compute_section_block_idx
build_json_scaffold = _v1_removed_build_json_scaffold
parse_json_sections = _v1_removed_parse_json_sections
88
+
89
+
90
+ # ═══════════════════════════════════════════════════════════════
91
+ # Deep scaffold v2: constants and utilities
92
+ # ═══════════════════════════════════════════════════════════════
93
+
94
# Token id used for NULL padding (presumably the id of `<|NULL|>` in this
# model's tokenizer -- confirm against the tokenizer config).
NULL_TOKEN_ID = 151666

# critical_objects: 12 sub-keys, each value is exactly 1 token (yes=9693 / no=2152)
CRITICAL_OBJECTS_SUBKEYS = [
    "nearby_vehicle",
    "pedestrian",
    "cyclist",
    "construction",
    "traffic_element",
    "weather_condition",
    "road_hazard",
    "emergency_vehicle",
    "animal",
    "special_vehicle",
    "conflicting_vehicle",
    "door_opening_vehicle",
]

# future_meta_behavior: each sub-key value is exactly 3 tokens
# (e.g., "keep speed" → [4867, 4732, 151667] or "go straight" → [2849, 7833, 151667])
FMB_VALUE_BUDGET = 3
107
+
108
+
109
def build_deep_json_scaffold(
    tokenizer,
    section_token_budgets: Optional[Dict[str, int]] = None,
    mask_id: Optional[int] = None,
    null_id: Optional[int] = None,
    explanation_block_size: int = 32,
    explanation_max_blocks: int = 6,
) -> Tuple[List[int], Dict[str, Tuple[int, int]], List[int]]:
    """Build a deep JSON scaffold for inference (v2).

    A template response is built as a Python dict with placeholder values,
    serialized with ``json.dumps(obj, ensure_ascii=False)`` and tokenized as
    one string. Per the original author's notes this is meant to mirror the
    training dataloader so that BPE tokenization matches the training data.

    Steps:

    1. Build the template dict (all ``critical_objects`` set to ``"no"``,
       filler explanation text, fixed FMB values, a sample trajectory).
    2. Normalize the trajectory to zero-padded ``+XXX.XX`` coordinates with
       a space after ``[`` and ``,``.
    3. Pad the explanation with ``<|NULL|>`` so that its value span is
       ``explanation_block_size * explanation_max_blocks`` tokens; the
       padding count is estimated once and corrected once.
    4. Run ``compute_section_block_idx_deep_static`` on a synthetic
       (prompt + template) sequence to classify scaffold vs. value tokens.
    5. Replace every value token with ``mask_id``.

    Parameters
    ----------
    tokenizer
        Object exposing ``encode(text, add_special_tokens=False) -> list[int]``.
    section_token_budgets
        Accepted for interface compatibility; not used by this function.
    mask_id
        Token id written at value positions. When ``None`` it is taken from
        encoding ``"|<MASK>|"`` if that yields exactly one token, otherwise
        the fallback ``151665`` is used.
    null_id
        Accepted for interface compatibility; not used by this function
        (NULL padding is inserted as the literal ``<|NULL|>`` text).
    explanation_block_size, explanation_max_blocks
        Their product is the explanation token budget (default 192).

    Returns
    -------
    scaffold_tokens : list[int]
        Token IDs with MASK at value positions.
    section_ranges : dict
        Section name -> (start, end) within scaffold_tokens.
    scaffold_mask : list[int]
        0 = value (to denoise), 1 = scaffold (frozen).
    """
    import torch as _torch
    import json as _json
    import re as _re

    if mask_id is None:
        mask_tok = tokenizer.encode("|<MASK>|", add_special_tokens=False)
        mask_id = mask_tok[0] if len(mask_tok) == 1 else 151665

    exp_budget = explanation_block_size * explanation_max_blocks  # default 192

    # ── Step 1: Template content ──
    # Placeholder explanation text (will be replaced with MASK anyway).
    filler_explanation = (
        "The ego vehicle is driving forward on the road. "
        "There are nearby vehicles ahead that may affect the path. "
        "No pedestrians or cyclists are detected in the immediate area. "
        "The road conditions appear normal with no hazards present. "
        "Speed adjustment may be needed based on the traffic ahead. "
        "No lateral maneuvering is required at this time."
    )

    def _fmt_coord(m):
        # Keep the sign character and zero-pad the magnitude to NNN.NN.
        raw = m.group(0)
        return f"{raw[0]}{float(raw[1:]):06.2f}"

    # The trajectory does not depend on the NULL count, so normalize it once
    # instead of on every template build.
    trajectory = "[[+14.70,-00.04], [+29.55,-00.21], [+44.51,-00.56], [+59.50,-01.06], [+74.39,-01.69]]"
    trajectory = _re.sub(r'[+-]\d+\.\d+', _fmt_coord, trajectory)
    trajectory = _re.sub(r',([+-])', r', \1', trajectory)
    trajectory = _re.sub(r'\[([+-])', r'[ \1', trajectory)

    def _build_template(n_exp_nulls: int) -> str:
        """Serialize the template with *n_exp_nulls* NULL markers appended."""
        data_obj = {
            # Use the same key list the sub-key matcher patterns are built
            # from, so template and matcher cannot drift apart.
            "critical_objects": {key: "no" for key in CRITICAL_OBJECTS_SUBKEYS},
            "explanation": filler_explanation + "<|NULL|>" * n_exp_nulls,
            "future_meta_behavior": {
                "longitudinal": "come to stop",
                "lateral": "go straight<|NULL|>",
            },
            "trajectory": trajectory,
        }
        return _json.dumps(data_obj, ensure_ascii=False)

    def _encode_template(n_exp_nulls: int) -> List[int]:
        return tokenizer.encode(_build_template(n_exp_nulls), add_special_tokens=False)

    # ── Step 2: Adjust NULL count so the explanation hits exp_budget ──
    deep_seqs = build_deep_scaffold_sequences(tokenizer)
    top_seqs = deep_seqs["top"]

    def _count_exp_value_tokens(tok_list):
        """Count explanation VALUE tokens (between boundary patterns)."""
        co_exp_pat = top_seqs["between_co_exp"]
        exp_fmb_pat = top_seqs["between_exp_fmb"]
        co_exp_pos = _find_subseq(tok_list, co_exp_pat, 0)
        if co_exp_pos < 0:
            return None
        exp_start = co_exp_pos + len(co_exp_pat)
        exp_fmb_pos = _find_subseq(tok_list, exp_fmb_pat, exp_start)
        if exp_fmb_pos < 0:
            return None
        # exp_start..exp_fmb_pos includes opening/closing quotes (scaffold)
        # value tokens = total - 2 (quotes)
        return (exp_fmb_pos - exp_start) - 2

    # Measure base explanation tokens (no NULLs).
    base_exp = _count_exp_value_tokens(_encode_template(0))
    if base_exp is not None:
        needed_nulls = max(0, exp_budget - base_exp)
    else:
        needed_nulls = exp_budget // 2  # fallback when boundaries are not found

    # Build and measure, then correct once if the count is off.
    template_tokens = _encode_template(needed_nulls)
    actual_exp = _count_exp_value_tokens(template_tokens)
    if actual_exp is not None and actual_exp != exp_budget:
        needed_nulls = max(0, needed_nulls + (exp_budget - actual_exp))
        template_tokens = _encode_template(needed_nulls)

    # ── Step 3: Run training scaffold detection ──
    # A dummy prompt (labels = -100) precedes the template so that the
    # detector sees a prompt/response split.
    prompt_len = 10
    all_tokens = [1] * prompt_len + template_tokens
    labels_list = [-100] * prompt_len + template_tokens

    labels = _torch.tensor([labels_list])
    token_ids = _torch.tensor([all_tokens])

    detection = compute_section_block_idx_deep_static(
        labels, token_ids, deep_seqs, fallback_block_size=32,
    )
    # The scaffold mask is the 4th element of the result. Indexing by position
    # (rather than unpacking a fixed number of names) keeps this working
    # whether the callee returns the documented 4-tuple or a longer tuple.
    scaffold_mask_tensor = detection[3]

    # ── Step 4: Extract scaffold/value and replace value with MASK ──
    response_flags = scaffold_mask_tensor[
        prompt_len : prompt_len + len(template_tokens)
    ].tolist()
    scaffold_mask_list: List[int] = [1 if flag else 0 for flag in response_flags]
    scaffold_tokens = [
        tok if frozen else mask_id
        for tok, frozen in zip(template_tokens, scaffold_mask_list)
    ]

    # ── Step 5: Compute section ranges ──
    section_ranges: Dict[str, Tuple[int, int]] = {}
    boundary_order = [
        ("prefix", "critical_objects"),
        ("between_co_exp", "explanation"),
        ("between_exp_fmb", "future_meta_behavior"),
        ("between_fmb_traj", "trajectory"),
    ]

    search_from = 0
    prev_section_name = None
    prev_value_start = None

    for boundary_key, section_name in boundary_order:
        pattern = top_seqs.get(boundary_key)
        if pattern is None:
            continue
        pos = _find_subseq(template_tokens, pattern, search_from)
        if pos < 0:
            continue
        # A newly found boundary closes the previous section.
        if prev_section_name is not None and prev_value_start is not None:
            section_ranges[prev_section_name] = (prev_value_start, pos)
        value_start = pos + len(pattern)
        prev_section_name = section_name
        prev_value_start = value_start
        search_from = value_start

    # The last section found runs to the end of the template.
    if prev_section_name is not None and prev_value_start is not None:
        section_ranges[prev_section_name] = (prev_value_start, len(template_tokens))

    return scaffold_tokens, section_ranges, scaffold_mask_list
294
+
295
+
296
+ def _find_subseq(seq: List[int], pattern: List[int], start: int = 0) -> int:
297
+ """Find first occurrence of *pattern* in *seq* starting at *start*. Returns -1 if not found."""
298
+ n = len(pattern)
299
+ for i in range(start, len(seq) - n + 1):
300
+ if seq[i : i + n] == pattern:
301
+ return i
302
+ return -1
303
+
304
+
305
def build_deep_scaffold_sequences(tokenizer) -> Dict[str, object]:
    """
    Pre-compute the token patterns used for deep scaffold matching.

    The returned dict holds:
    - ``"top"``: top-level boundary patterns from
      :func:`build_static_scaffold_sequences`
    - ``"co_*"``: sub-key and closing patterns for critical_objects
    - ``"fmb_*"``: prefix / between / closing patterns for future_meta_behavior
    - ``"traj_*"``: structural patterns for trajectory, including the
      trajectory-only boundary variants

    Every pattern is ``tokenizer.encode(text, add_special_tokens=False)``.
    """

    def _enc(text: str) -> List[int]:
        return tokenizer.encode(text, add_special_tokens=False)

    # ── Top-level boundaries (reuse existing) ──
    top_level = build_static_scaffold_sequences(tokenizer)

    # ── critical_objects sub-key patterns ──
    # In context, the CO value starts with ' {"nearby_vehicle": "yes", ...'.
    # Author's notes: token 5212 = ' {"' merges space+brace+quote in context,
    # and token 497 = '","' merges quote+comma.
    #   First entry: ' {"key": "'
    #   Subsequent:  '", "key": "'
    co_subkeys = []
    for position, subkey in enumerate(CRITICAL_OBJECTS_SUBKEYS):
        lead_in = ' {"' if position == 0 else '", "'
        co_subkeys.append(
            {"key": subkey, "pattern": _enc(lead_in + subkey + '": "'), "index": position}
        )

    # The dict literal is evaluated top to bottom, so both the encode call
    # order and the key insertion order follow the listing below.
    return {
        "top": top_level,
        "co_subkeys": co_subkeys,
        "co_closing": _enc('"}'),
        # json.dumps produces "}," which may merge into a single token
        "co_closing_comma": _enc('"},'),
        # ── future_meta_behavior sub-key patterns ──
        # After dataloader processing (mdm markers removed, NULLs cleaned):
        #   ' {"longitudinal": "keep speed", "lateral": "go straight"}'
        # Scaffold = everything except the value content between quotes.
        "fmb_prefix": _enc(' {"longitudinal": "'),
        "fmb_closing": _enc('"}'),
        "fmb_closing_comma": _enc('"},'),
        # Between longitudinal value and lateral value: '", "lateral": "'
        "fmb_between": _enc('", "lateral": "'),
        # ── trajectory structure patterns ──
        # After dataloader processing (no mdm markers), traj is:
        #   ' "[[+14.70,-00.04], [+29.55,-00.21], ...]"'
        "traj_open": _enc(' "[['),
        # After the dataloader inserts spaces (e.g. [+14.70,-00.04] → [ +14.70, -00.04]),
        # tokens split cleanly: '],'(1125), ' ['(508), ','(11) are all independent.
        "traj_wp_sep": _enc('],'),  # [1125]
        "traj_wp_open": _enc(' ['),  # [508]
        "traj_coord_comma": _enc(','),  # [11]
        "traj_close": _enc(']]"}'),
        "traj_close_split": _enc(']]"'),
        "traj_close_split2": _enc(']]'),
        # Trajectory-only output support, e.g. {"trajectory": "..."}.
        "traj_only_boundaries": [
            _enc('{"trajectory":'),
            _enc(' {"trajectory":'),
            _enc('"trajectory":'),
            _enc(' "trajectory":'),
        ],
    }
366
+
367
+
368
+ def _mark_scaffold_range(scaffold_positions: List[int], start: int, length: int):
369
+ """Add positions [start, start+length) to scaffold_positions."""
370
+ for i in range(length):
371
+ scaffold_positions.append(start + i)
372
+
373
+
374
def compute_section_block_idx_deep_static(
    labels: torch.Tensor,
    token_ids: torch.Tensor,
    deep_scaffold_sequences: Dict[str, object],
    fallback_block_size: int = 32,
) -> Tuple[torch.Tensor, torch.Tensor, int, torch.Tensor, Dict[int, str]]:
    """
    Deep-scaffold v2 block index computation.

    Freezes sub-keys within sections:
      - critical_objects: only yes/no values are denoised
      - future_meta_behavior: only value tokens are denoised
      - trajectory: only coordinate digits are denoised
      - explanation: all content is denoised

    Block count per section is computed dynamically:
    ``n_blocks = ceil(num_value_tokens / fallback_block_size)``.

    Only the first batch element (``labels[0]`` / ``token_ids[0]``) is read.

    Args:
        labels: [B, seq_len]; positions equal to ``-100`` are treated as
            non-response tokens.
        token_ids: [B, seq_len]
        deep_scaffold_sequences: output of ``build_deep_scaffold_sequences``
        fallback_block_size: block size (bd_size), default 32

    Returns:
        A 5-tuple ``(response_block_idx, turn_idx, n_blocks, scaffold_mask,
        block_to_section)``:

        - ``response_block_idx``: int64 ``[seq_len]``, block index per
          position, ``-1`` where no block was assigned.
        - ``turn_idx``: int64 ``[seq_len]``, incremented each time
          ``response_block_idx`` changes between neighbouring positions.
        - ``n_blocks``: total number of blocks allocated.
        - ``scaffold_mask``: bool ``[seq_len]``, True on scaffold tokens.
        - ``block_to_section``: block index -> section name for the blocks
          created from named sections.

        When the sample contains no response token the same 5-tuple shape is
        returned with ``n_blocks == 0`` and an empty ``block_to_section``.
    """
    labels_single = labels[0]
    token_list = token_ids[0].tolist()
    seq_len = labels_single.shape[0]
    device = labels.device

    response_mask = (labels_single != -100)
    response_block_idx = torch.full((seq_len,), -1, device=device, dtype=torch.int64)
    turn_idx = torch.zeros((seq_len,), device=device, dtype=torch.int64)
    scaffold_mask = torch.zeros((seq_len,), device=device, dtype=torch.bool)

    response_positions = response_mask.nonzero(as_tuple=True)[0]
    if len(response_positions) == 0:
        # Keep the arity identical to the normal return path so callers can
        # always unpack five values.
        return response_block_idx, turn_idx, 0, scaffold_mask, {}

    # One host copy of the mask; avoids a tensor read per position in the
    # Python loops below.
    resp_mask = response_mask.tolist()

    resp_start = response_positions[0].item()
    resp_end = response_positions[-1].item() + 1
    effective_resp_end = resp_end
    resp_tokens = token_list[resp_start:resp_end]

    top_seqs = deep_scaffold_sequences["top"]

    # ── Step 1: Find top-level section boundaries (same as static version) ──
    boundary_order = [
        ("prefix", "critical_objects"),
        ("between_co_exp", "explanation"),
        ("between_exp_fmb", "future_meta_behavior"),
        ("between_fmb_traj", "trajectory"),
    ]

    # Section name -> (start, end) relative to resp_start, end exclusive.
    sections: Dict[str, Tuple[int, int]] = {}
    # Scaffold positions, relative to resp_start (duplicates are harmless;
    # the list is turned into a set before use).
    scaffold_positions: List[int] = []
    # Top-level boundary scaffold tokens should belong to the *following*
    # section's first block (e.g. `"explanation":` -> explanation block 0).
    boundary_scaffold_to_section: Dict[str, List[int]] = {}

    search_from = 0
    prev_section_name: Optional[str] = None
    prev_value_start: Optional[int] = None

    for boundary_key, section_name in boundary_order:
        pattern = top_seqs.get(boundary_key)
        if pattern is None:
            continue
        pos = _find_subseq(resp_tokens, pattern, search_from)
        if pos < 0:
            continue

        # The previous section ends where this boundary begins.
        if prev_section_name is not None and prev_value_start is not None:
            sections[prev_section_name] = (prev_value_start, pos)

        _mark_scaffold_range(scaffold_positions, pos, len(pattern))
        boundary_scaffold_to_section.setdefault(section_name, []).extend(
            range(pos, pos + len(pattern))
        )

        value_start = pos + len(pattern)
        prev_section_name = section_name
        prev_value_start = value_start
        search_from = value_start

    # The last section found runs to the end of the response.
    if prev_section_name is not None and prev_value_start is not None:
        sections[prev_section_name] = (prev_value_start, len(resp_tokens))

    # New dataset compatibility: response may contain only trajectory.
    # If the 4-section boundaries are not found, try direct trajectory key match.
    if "trajectory" not in sections:
        traj_only_patterns = deep_scaffold_sequences.get("traj_only_boundaries", [])
        # Reuse legacy boundary pattern as additional fallback (contains
        # `"trajectory":` in old-format responses).
        between_fmb_traj = top_seqs.get("between_fmb_traj")
        if between_fmb_traj:
            traj_only_patterns = list(traj_only_patterns) + [between_fmb_traj]

        traj_pos = -1
        traj_pat: Optional[List[int]] = None
        for pat in traj_only_patterns:
            if not pat:
                continue
            pos = _find_subseq(resp_tokens, pat, 0)
            if pos >= 0:
                traj_pos = pos
                traj_pat = pat
                break

        if traj_pos >= 0 and traj_pat is not None:
            _mark_scaffold_range(scaffold_positions, traj_pos, len(traj_pat))
            boundary_scaffold_to_section.setdefault("trajectory", []).extend(
                range(traj_pos, traj_pos + len(traj_pat))
            )
            sections["trajectory"] = (traj_pos + len(traj_pat), len(resp_tokens))

    # ── Step 2: Deep scaffold within critical_objects ──
    if "critical_objects" in sections:
        co_start, co_end = sections["critical_objects"]
        co_tokens = resp_tokens[co_start:co_end]

        co_search = 0
        for entry in deep_scaffold_sequences["co_subkeys"]:
            pattern = entry["pattern"]
            pos = _find_subseq(co_tokens, pattern, co_search)
            if pos < 0:
                continue
            _mark_scaffold_range(scaffold_positions, co_start + pos, len(pattern))
            # The single value token is right after the pattern — skip it
            co_search = pos + len(pattern) + 1

        # Mark closing '"}' or "}," as scaffold; only the tail of the section
        # is searched.
        co_close = deep_scaffold_sequences["co_closing"]
        close_pos = _find_subseq(
            co_tokens, co_close, max(0, len(co_tokens) - len(co_close) - 2)
        )
        if close_pos >= 0:
            _mark_scaffold_range(scaffold_positions, co_start + close_pos, len(co_close))
        else:
            # json.dumps may produce "}," as a single token
            co_close_comma = deep_scaffold_sequences.get("co_closing_comma")
            if co_close_comma:
                close_pos = _find_subseq(
                    co_tokens,
                    co_close_comma,
                    max(0, len(co_tokens) - len(co_close_comma) - 2),
                )
                if close_pos >= 0:
                    _mark_scaffold_range(
                        scaffold_positions, co_start + close_pos, len(co_close_comma)
                    )

    # ── Step 2b: Explanation opening/closing quotes as scaffold ──
    # Explanation content is all VALUE, but the surrounding quotes must be
    # SCAFFOLD so that VALUE tokens are exactly block-aligned (multiple of bd_size).
    if "explanation" in sections:
        exp_start, exp_end = sections["explanation"]
        if exp_start < exp_end:
            # Opening quote: first token of explanation section (e.g. ' "')
            scaffold_positions.append(exp_start)
            # Closing quote+comma: last token (e.g. '",')
            scaffold_positions.append(exp_end - 1)

    # ── Step 3: Deep scaffold within future_meta_behavior ──
    # After dataloader processing, FMB has no <|mdm_start|>/<|mdm_end|> markers.
    # Format: ' {"longitudinal": "keep speed", "lateral": "go straight"}'
    # Strategy: use fmb_prefix to find start, fmb_between to split long/lat values,
    # and fmb_closing to find end. Everything except value content is scaffold.
    if "future_meta_behavior" in sections:
        fmb_start, fmb_end = sections["future_meta_behavior"]
        fmb_tokens = resp_tokens[fmb_start:fmb_end]

        # Positions relative to fmb_start.
        fmb_scaffold_positions = set()

        # 1. Mark fmb_prefix as scaffold: ' {"longitudinal": "'
        fmb_prefix = deep_scaffold_sequences["fmb_prefix"]
        prefix_pos = _find_subseq(fmb_tokens, fmb_prefix, 0)
        if prefix_pos >= 0:
            fmb_scaffold_positions.update(range(prefix_pos, prefix_pos + len(fmb_prefix)))

        # NOTE(review): the three numbered steps are treated as siblings, so
        # this is also evaluated when the prefix was not found — confirm
        # against the upstream file.
        long_value_start = prefix_pos + len(fmb_prefix)

        # 2. Mark fmb_between as scaffold: '", "lateral": "'
        fmb_between = deep_scaffold_sequences.get("fmb_between")
        if fmb_between:
            between_pos = _find_subseq(fmb_tokens, fmb_between, long_value_start)
            if between_pos >= 0:
                fmb_scaffold_positions.update(
                    range(between_pos, between_pos + len(fmb_between))
                )

        # 3. Mark closing '"}' or "}," as scaffold
        fmb_close = deep_scaffold_sequences["fmb_closing"]
        close_pos = _find_subseq(
            fmb_tokens, fmb_close, max(0, len(fmb_tokens) - len(fmb_close) - 2)
        )
        if close_pos < 0:
            fmb_close_comma = deep_scaffold_sequences.get("fmb_closing_comma")
            if fmb_close_comma:
                close_pos = _find_subseq(
                    fmb_tokens,
                    fmb_close_comma,
                    max(0, len(fmb_tokens) - len(fmb_close_comma) - 2),
                )
                if close_pos >= 0:
                    fmb_close = fmb_close_comma
        if close_pos >= 0:
            fmb_scaffold_positions.update(range(close_pos, close_pos + len(fmb_close)))

        for i in fmb_scaffold_positions:
            scaffold_positions.append(fmb_start + i)

    # ── Step 4: Deep scaffold within trajectory ──
    # After dataloader processing (no mdm markers), trajectory is:
    #   ' "[[+14.70,-00.04], [+29.55,-00.21], ...]"'
    if "trajectory" in sections:
        traj_start, traj_end = sections["trajectory"]
        traj_tokens = resp_tokens[traj_start:traj_end]

        # Opening "[[
        traj_open = deep_scaffold_sequences["traj_open"]
        open_pos = _find_subseq(traj_tokens, traj_open, 0)
        if open_pos >= 0:
            _mark_scaffold_range(scaffold_positions, traj_start + open_pos, len(traj_open))

        # Waypoint separators ], (4 of them between 5 waypoints)
        traj_wp_sep = deep_scaffold_sequences["traj_wp_sep"]
        sep_search = 0
        for _ in range(4):
            sep_pos = _find_subseq(traj_tokens, traj_wp_sep, sep_search)
            if sep_pos < 0:
                break
            _mark_scaffold_range(scaffold_positions, traj_start + sep_pos, len(traj_wp_sep))
            sep_search = sep_pos + len(traj_wp_sep)

        # Intermediate waypoint opening ' [' (4 of them, between 5 waypoints)
        traj_wp_open = deep_scaffold_sequences.get("traj_wp_open")
        if traj_wp_open:
            wo_search = 0
            for _ in range(4):
                wo_pos = _find_subseq(traj_tokens, traj_wp_open, wo_search)
                if wo_pos < 0:
                    break
                _mark_scaffold_range(
                    scaffold_positions, traj_start + wo_pos, len(traj_wp_open)
                )
                wo_search = wo_pos + len(traj_wp_open)

        # Coordinate comma ',' between x and y within each waypoint (5 of them)
        traj_coord_comma = deep_scaffold_sequences.get("traj_coord_comma")
        if traj_coord_comma:
            cc_search = 0
            for _ in range(5):
                cc_pos = _find_subseq(traj_tokens, traj_coord_comma, cc_search)
                if cc_pos < 0:
                    break
                _mark_scaffold_range(
                    scaffold_positions, traj_start + cc_pos, len(traj_coord_comma)
                )
                cc_search = cc_pos + len(traj_coord_comma)

        # Closing ]]" or just ]]
        traj_close = deep_scaffold_sequences["traj_close"]
        close_pos = _find_subseq(
            traj_tokens, traj_close, max(0, len(traj_tokens) - len(traj_close) - 6)
        )
        if close_pos < 0:
            for split_key in ["traj_close_split", "traj_close_split2"]:
                tcs = deep_scaffold_sequences.get(split_key)
                if tcs:
                    close_pos = _find_subseq(
                        traj_tokens, tcs, max(0, len(traj_tokens) - len(tcs) - 6)
                    )
                    if close_pos >= 0:
                        traj_close = tcs
                        break
        if close_pos >= 0:
            _mark_scaffold_range(scaffold_positions, traj_start + close_pos, len(traj_close))
            # Align training with inference scaffold: exclude trailing tokens
            # after the JSON closing of trajectory (e.g. "<|im_end|>\n") from
            # section/block scheduling.
            effective_resp_end = min(
                effective_resp_end,
                resp_start + traj_start + close_pos + len(traj_close),
            )

        # Opening quote " (first token of traj value)
        if len(traj_tokens) > 0:
            scaffold_positions.append(traj_start)

    # ── Mark scaffold mask (absolute positions) ──
    scaffold_positions_set = set(scaffold_positions)
    for sp in scaffold_positions_set:
        abs_pos = resp_start + sp
        if abs_pos < seq_len:
            scaffold_mask[abs_pos] = True

    # ── Assign block indices per section ──
    current_block = 0
    assigned = set()  # absolute positions that already carry a block index
    block_to_section = {}  # block_idx -> section_name (for SASD compatibility)
    section_first_block: Dict[str, int] = {}

    for section_name in SECTION_KEYS:
        if section_name not in sections:
            continue

        rel_start, rel_end = sections[section_name]
        abs_start = max(resp_start + rel_start, resp_start)
        abs_end = min(resp_start + rel_end, effective_resp_end)

        num_tokens = abs_end - abs_start
        if num_tokens <= 0:
            continue

        # Count only non-scaffold tokens for block sizing
        value_positions = [
            p
            for p in range(abs_start, abs_end)
            if resp_mask[p] and (p - resp_start) not in scaffold_positions_set
        ]
        num_value_tokens = len(value_positions)

        if num_value_tokens <= 0:
            # A section made only of scaffold tokens still reserves one block.
            section_first_block[section_name] = current_block
            block_to_section[current_block] = section_name
            current_block += 1
            continue

        # Use fixed block size (bd_size) and compute number of blocks dynamically
        tokens_per_step = fallback_block_size
        n_steps = max(1, math.ceil(num_value_tokens / tokens_per_step))

        for b in range(n_steps):
            block_to_section[current_block + b] = section_name

        section_first_block[section_name] = current_block
        for vi, pos in enumerate(value_positions):
            block_in_section = min(vi // tokens_per_step, n_steps - 1)
            response_block_idx[pos] = current_block + block_in_section
            assigned.add(pos)

        current_block += n_steps

    # Assign scaffold tokens within each section to the nearest value token
    # in the SAME section. This keeps section-closing tokens such as `"},`
    # with their section instead of drifting to the next section.
    for section_name in SECTION_KEYS:
        if section_name not in sections:
            continue
        rel_start, rel_end = sections[section_name]
        abs_start = max(resp_start + rel_start, resp_start)
        abs_end = min(resp_start + rel_end, resp_end)
        if abs_end <= abs_start:
            continue

        for abs_pos in range(abs_start, abs_end):
            rel_pos = abs_pos - resp_start
            if (
                abs_pos >= seq_len
                or not resp_mask[abs_pos]
                or abs_pos in assigned
                or rel_pos not in scaffold_positions_set
            ):
                continue

            best_block = -1
            max_delta = max(1, abs_end - abs_start)
            for delta in range(1, max_delta + 1):
                # Prefer left first so closing punctuation tends to stay with
                # the preceding content in the same section.
                for cand in [abs_pos - delta, abs_pos + delta]:
                    if abs_start <= cand < abs_end and cand in assigned:
                        best_block = response_block_idx[cand].item()
                        break
                if best_block >= 0:
                    break

            if best_block < 0:
                best_block = section_first_block.get(section_name, -1)

            if best_block >= 0:
                response_block_idx[abs_pos] = best_block
                assigned.add(abs_pos)

    # Top-level boundary tokens are explicitly attached to the following
    # section's first block, instead of nearest-neighbor assignment.
    for section_name, rel_positions in boundary_scaffold_to_section.items():
        first_block = section_first_block.get(section_name)
        if first_block is None:
            continue
        for rel_pos in rel_positions:
            abs_pos = resp_start + rel_pos
            if abs_pos >= seq_len or not resp_mask[abs_pos]:
                continue
            response_block_idx[abs_pos] = first_block
            assigned.add(abs_pos)

    # Scaffold tokens → block index of nearest assigned neighbour
    for sp in scaffold_positions_set:
        abs_pos = resp_start + sp
        if (
            abs_pos >= seq_len
            or abs_pos >= effective_resp_end
            or not resp_mask[abs_pos]
            or abs_pos in assigned
        ):
            continue
        best_block = -1
        for delta in range(1, seq_len):
            # Here the right-hand neighbour is tried first.
            for cand in [abs_pos + delta, abs_pos - delta]:
                if 0 <= cand < seq_len and cand in assigned:
                    best_block = response_block_idx[cand].item()
                    break
            if best_block >= 0:
                break
        if best_block >= 0:
            response_block_idx[abs_pos] = best_block
            assigned.add(abs_pos)

    # Fallback for unassigned response tokens
    for pos in range(resp_start, effective_resp_end):
        if resp_mask[pos] and pos not in assigned:
            offset = pos - resp_start
            response_block_idx[pos] = current_block + offset // fallback_block_size
            assigned.add(pos)

    # Advance the block counter past any block index created by the fallback.
    fallback_positions = [
        p
        for p in range(resp_start, effective_resp_end)
        if resp_mask[p] and response_block_idx[p].item() >= current_block
    ]
    if fallback_positions:
        current_block = max(response_block_idx[p].item() for p in fallback_positions) + 1

    n_blocks = current_block

    # Turn index: running count of block-index changes between neighbouring
    # positions (position 0 stays 0).
    block_changed = (response_block_idx[1:] != response_block_idx[:-1]).to(torch.int64)
    turn_idx[1:] = torch.cumsum(block_changed, dim=0)

    return response_block_idx, turn_idx, n_blocks, scaffold_mask, block_to_section
special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "|<MASK>|",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ }
10
+ ],
11
+ "eos_token": {
12
+ "content": "<|im_end|>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "pad_token": {
19
+ "content": "<|endoftext|>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ }
25
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a25ec1183126b2a0a76961dba7680d62b2209776fc31d39b85be8833b9386ae9
3
+ size 11422266
tokenizer_config.json ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "|<MASK>|",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<|NULL|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ }
197
+ },
198
+ "additional_special_tokens": [
199
+ "|<MASK>|"
200
+ ],
201
+ "bos_token": null,
202
+ "clean_up_tokenization_spaces": false,
203
+ "eos_token": "<|im_end|>",
204
+ "errors": "replace",
205
+ "extra_special_tokens": {},
206
+ "model_max_length": 131072,
207
+ "pad_token": "<|endoftext|>",
208
+ "padding_side": "right",
209
+ "split_special_tokens": false,
210
+ "tokenizer_class": "Qwen2Tokenizer",
211
+ "unk_token": null
212
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff