qgallouedec HF Staff committed on
Commit
e16e18c
·
verified ·
1 Parent(s): 9d27147

Upload VoxtralForConditionalGeneration

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tekken.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ tags:
4
+ - trl
5
+ ---
6
+
7
+ # Tiny VoxtralForConditionalGeneration
8
+
9
+ This is a minimal model built for unit tests in the [TRL](https://github.com/huggingface/trl) library.
config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "VoxtralForConditionalGeneration"
4
+ ],
5
+ "audio_config": {
6
+ "activation_dropout": 0.0,
7
+ "activation_function": "gelu",
8
+ "attention_dropout": 0.0,
9
+ "dropout": 0.0,
10
+ "head_dim": 64,
11
+ "hidden_size": 1280,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 5120,
14
+ "layerdrop": 0.0,
15
+ "max_source_positions": 1500,
16
+ "model_type": "voxtral_encoder",
17
+ "num_attention_heads": 20,
18
+ "num_hidden_layers": 32,
19
+ "num_key_value_heads": 20,
20
+ "num_mel_bins": 128,
21
+ "scale_embedding": false,
22
+ "vocab_size": 51866
23
+ },
24
+ "audio_token_id": 24,
25
+ "dtype": "bfloat16",
26
+ "hidden_size": 3072,
27
+ "model_type": "voxtral",
28
+ "projector_hidden_act": "gelu",
29
+ "text_config": {
30
+ "attention_bias": false,
31
+ "attention_dropout": 0.0,
32
+ "head_dim": 4,
33
+ "hidden_act": "silu",
34
+ "hidden_size": 16,
35
+ "initializer_range": 0.02,
36
+ "intermediate_size": 32,
37
+ "layer_types": null,
38
+ "max_position_embeddings": 131072,
39
+ "mlp_bias": false,
40
+ "model_type": "llama",
41
+ "num_attention_heads": 4,
42
+ "num_hidden_layers": 2,
43
+ "num_key_value_heads": 2,
44
+ "pretraining_tp": 1,
45
+ "rms_norm_eps": 1e-05,
46
+ "rope_scaling": null,
47
+ "rope_theta": 100000000.0,
48
+ "sliding_window": null,
49
+ "use_cache": true,
50
+ "vocab_size": 131072
51
+ },
52
+ "transformers_version": "4.56.2",
53
+ "vocab_size": 131072
54
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "pad_token_id": 11,
5
+ "transformers_version": "4.56.2"
6
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ccef54545a1e5b005e5d5332b169749da57017aa2de3017f85ffff5a7f552a1d
3
+ size 1282559192
preprocessor_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "dither": 0.0,
4
+ "feature_extractor_type": "WhisperFeatureExtractor",
5
+ "feature_size": 128,
6
+ "hop_length": 160,
7
+ "n_fft": 400,
8
+ "n_samples": 480000,
9
+ "nb_max_frames": 3000,
10
+ "padding_side": "right",
11
+ "padding_value": 0.0,
12
+ "processor_class": "VoxtralProcessor",
13
+ "return_attention_mask": false,
14
+ "sampling_rate": 16000
15
+ }
tekken.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4aaf3836c2a5332f029ce85a7a62255c966f47b6797ef81dedd0ade9c862e4a8
3
+ size 14894206