Yingxu He committed on
Commit
c73efa1
1 Parent(s): ee6fe1d

Upload MERaLiONForConditionalGeneration

Browse files
config.json CHANGED
@@ -1,7 +1,10 @@
1
  {
2
- "_attn_implementation_autoset": true,
 
 
3
  "auto_map": {
4
- "AutoConfig": "configuration_meralion.MERaLiONConfig"
 
5
  },
6
  "head_dim": 256,
7
  "hidden_size": 3584,
@@ -163,5 +166,6 @@
163
  "sliding_window_size": 4096,
164
  "torch_dtype": "bfloat16"
165
  },
 
166
  "transformers_version": "4.46.3"
167
  }
 
1
  {
2
+ "architectures": [
3
+ "MERaLiONForConditionalGeneration"
4
+ ],
5
  "auto_map": {
6
+ "AutoConfig": "configuration_meralion.MERaLiONConfig",
7
+ "AutoModelForSpeechSeq2Seq": "modeling_meralion.MERaLiONForConditionalGeneration"
8
  },
9
  "head_dim": 256,
10
  "hidden_size": 3584,
 
166
  "sliding_window_size": 4096,
167
  "torch_dtype": "bfloat16"
168
  },
169
+ "torch_dtype": "bfloat16",
170
  "transformers_version": "4.46.3"
171
  }
generation_config.json CHANGED
@@ -4,5 +4,5 @@
4
  "cache_implementation": "hybrid",
5
  "eos_token_id": 107,
6
  "pad_token_id": 0,
7
- "transformers_version": "4.44.2"
8
  }
 
4
  "cache_implementation": "hybrid",
5
  "eos_token_id": 107,
6
  "pad_token_id": 0,
7
+ "transformers_version": "4.46.3"
8
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:270fdb3d09f24b4a2cf476b1efb2392f1ec2effc77505b591d68666bebeaa230
3
- size 4984397848
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:109c417e21fe736abd753bb25e7be7400fcd607e557b7809fe2fada6cce16c24
3
+ size 4984029208
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f29e844bfed074de0b3d36255695552b9583abfbc7f76c2bd6d3ff78a541b95
3
  size 4991612592
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2c2d7f48181275b438981c4c760e1df3622f79d935b48d16a2e7cf43c52dea1
3
  size 4991612592
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:72660974f5e6a798f8e269c979c4abf2f74f45a0f1f8bbde9d204aca360e69f6
3
  size 4918183272
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e6ef9acd74f3c0d229be95dc8dfdd3bc59af97547412070c5ca5f722fcd4ebd
3
  size 4918183272
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6a978abdeae43c46a9cb3687aa9ccf1c3e421d01a16fb27f7fa14e64ad4cb9fa
3
  size 4962259664
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:18981d354c807e0cde98e0521acfc3422faec53fcf9a95ce72ac63d5dbd1f7d0
3
  size 4962259664
model.safetensors.index.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
  "metadata": {
3
- "total_size": 19856334336
4
  },
5
  "weight_map": {
6
  "ln_speech.bias": "model-00001-of-00004.safetensors",
 
1
  {
2
  "metadata": {
3
+ "total_size": 19855965696
4
  },
5
  "weight_map": {
6
  "ln_speech.bias": "model-00001-of-00004.safetensors",
modeling_meralion.py CHANGED
@@ -1031,12 +1031,6 @@ MERALION_INPUTS_DOCSTRING = r"""
1031
  [`PreTrainedTokenizer.__call__`] for details.
1032
 
1033
  [What are input IDs?](../glossary#input-ids)
1034
- input_ids_left (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1035
- Indices of left-padded input sequences tokens in the vocabulary. Padding will be ignored by default should you provide
1036
- it.
1037
- input_ids_right (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1038
- Indices of right-padded input sequences tokens in the vocabulary. Padding will be ignored by default should you provide
1039
- it.
1040
  input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`, *optional*):
1041
  Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
1042
  loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
@@ -1063,17 +1057,6 @@ MERALION_INPUTS_DOCSTRING = r"""
1063
 
1064
  - 1 indicates the head is **not masked**,
1065
  - 0 indicates the head is **masked**.
1066
-
1067
- attention_mask_left (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`, *optional*):
1068
- Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
1069
-
1070
- - 1 for tokens that are **not masked**,
1071
- - 0 for tokens that are **masked**.
1072
- attention_mask_right (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`, *optional*):
1073
- Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
1074
-
1075
- - 1 for tokens that are **not masked**,
1076
- - 0 for tokens that are **masked**.
1077
  feature_attention_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`, *optional*):
1078
  Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
1079
 
 
1031
  [`PreTrainedTokenizer.__call__`] for details.
1032
 
1033
  [What are input IDs?](../glossary#input-ids)
 
 
 
 
 
 
1034
  input_features (`torch.FloatTensor` of shape `(batch_size, feature_size, feature_sequence_length)`, *optional*):
1035
  Float values mel features extracted from the raw speech waveform. Raw speech waveform can be obtained by
1036
  loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
 
1057
 
1058
  - 1 indicates the head is **not masked**,
1059
  - 0 indicates the head is **masked**.
 
 
 
 
 
 
 
 
 
 
 
1060
  feature_attention_mask (`torch.Tensor` of shape `(batch_size, feature_sequence_length)`, *optional*):
1061
  Mask to avoid performing attention on padding feature indices. Mask values selected in `[0, 1]`:
1062