init
- .gitignore +2 -0
- README.md +1 -1
- app.py +79 -0
- model.py +133 -0
- my_model/added_tokens.json +6 -0
- my_model/config.json +138 -0
- my_model/model.safetensors +3 -0
- my_model/preprocessor_config.json +10 -0
- my_model/special_tokens_map.json +6 -0
- my_model/tokenizer_config.json +48 -0
- my_model/vocab.json +1 -0
- requirements.txt +8 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
__pycache__/
.venv/
README.md
CHANGED
@@ -1,6 +1,6 @@
---
title: Anime Speech Emotion Recognition
-emoji:
+emoji: 😊😱😠
colorFrom: gray
colorTo: red
sdk: gradio
app.py
ADDED
@@ -0,0 +1,79 @@
import pprint

import gradio as gr
import librosa
import plotly.graph_objects as go
import spaces
import torch
from loguru import logger
from transformers import AutoFeatureExtractor
from transformers.modeling_outputs import SequenceClassifierOutput

from model import EmotionModel

repo_id = "my_model"
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"device: {device}")
model = EmotionModel.from_pretrained(repo_id, device_map=device)
model.eval()
processor = AutoFeatureExtractor.from_pretrained(repo_id)

label_map = {
    "Angry": "😠 Anger",
    "Disgusted": "😒 Disgust",
    "Embarrassed": "😳 Embarrassment",
    "Fearful": "😨 Fear",
    "Happy": "😊 Happiness",
    "Sad": "😢 Sadness",
    "Surprised": "😲 Surprise",
    "Neutral": "😐 Neutral",
    "Sexual1": "🥰 NSFW1",
    "Sexual2": "🍭 NSFW2",
}


@spaces.GPU
def pipe(filename: str) -> tuple[dict[str, float], go.Figure]:
    audio, sr = librosa.load(filename, sr=16000)
    duration = librosa.get_duration(y=audio, sr=sr)
    logger.info(f"filename: {filename}, duration: {duration}")
    if duration > 30.0:
        return (
            {f"Error: the audio file is too long: {duration} s": 0.0},
            go.Figure(),
        )
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs: SequenceClassifierOutput = model(**inputs)
    logits = outputs.logits  # shape: (batch_size, num_labels)
    # Extract the logits for the single input clip
    logits = logits[0].cpu().numpy()
    labels = [label_map[label] for id, label in model.config.id2label.items()]
    sorted_pairs = sorted(zip(logits, labels), key=lambda x: x[0])
    sorted_logits, sorted_labels = zip(*sorted_pairs)
    logger.info(f"Result:\n{pprint.pformat(sorted_pairs)}")
    probabilities = outputs.logits.softmax(dim=-1)
    scores_dict = {label: prob.item() for label, prob in zip(labels, probabilities[0])}
    fig = go.Figure([go.Bar(x=sorted_logits, y=sorted_labels, orientation="h")])
    return scores_dict, fig


md = """
# Emotion Recognition from Speech ver 0.1

- Predicts the emotion of an audio file and shows the probabilities and the logits (pre-softmax values)
- Audio files longer than 30 seconds are not accepted
"""

with gr.Blocks() as app:
    gr.Markdown(md)
    audio = gr.Audio(type="filepath")
    btn = gr.Button("Predict emotion")
    with gr.Row():
        result = gr.Label(label="Result")
        plot = gr.Plot(label="Logits")

    btn.click(pipe, inputs=audio, outputs=[result, plot])

app.launch()
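A minimal smoke-test sketch for the inference path above, bypassing Gradio entirely. It assumes the repository is run locally with the my_model directory present, and it feeds one second of synthetic silence instead of a real recording; nothing in it is part of the committed app.

import numpy as np
import torch
from transformers import AutoFeatureExtractor

from model import EmotionModel

# Same loading logic as app.py, assumed to run from the repository root.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = EmotionModel.from_pretrained("my_model", device_map=device).eval()
processor = AutoFeatureExtractor.from_pretrained("my_model")

silence = np.zeros(16000, dtype=np.float32)  # 1 s of silence at 16 kHz
inputs = processor(silence, sampling_rate=16000, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}
with torch.no_grad():
    probs = model(**inputs).logits.softmax(dim=-1)[0]
for idx, p in enumerate(probs.tolist()):
    print(f"{model.config.id2label[idx]:>11}: {p:.3f}")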
model.py
ADDED
@@ -0,0 +1,133 @@
from typing import Optional

import torch
import torch.nn as nn
from transformers.activations import get_activation
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)

_HIDDEN_STATES_START_POSITION = 2


class ClassificationHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        print(f"classifier_proj_size: {config.classifier_proj_size}")
        self.dense = nn.Linear(config.hidden_size, config.classifier_proj_size)
        self.layer_norm = nn.LayerNorm(config.classifier_proj_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.classifier_proj_size, config.num_labels)
        print(f"Head activation: {config.head_activation}")
        self.activation = get_activation(config.head_activation)

    def forward(self, features, **kwargs):
        x = features
        x = self.dense(x)
        x = self.layer_norm(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class EmotionModel(Wav2Vec2PreTrainedModel):
    """Speech emotion classifier."""

    def __init__(self, config, counts: Optional[dict[int, int]] = None):
        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = ClassificationHead(config)
        num_layers = (
            config.num_hidden_layers + 1
        )  # transformer layers + input embeddings
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
        self.init_weights()

        # If counts are given, compute class weights
        if counts is not None:
            print(f"Using class weights: {counts}")
            counts_list = [counts[i] for i in range(config.num_labels)]
            counts_tensor = torch.tensor(
                counts_list, dtype=torch.float, device="cuda:0"
            )
            total_samples = counts_tensor.sum()
            class_weights = total_samples / (config.num_labels * counts_tensor)
            # Normalize the weights (optional)
            class_weights = class_weights / class_weights.sum() * config.num_labels
            self.class_weights = class_weights
        else:
            self.class_weights = None  # set to None when counts are not given

    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ):
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        output_hidden_states = (
            True if self.config.use_weighted_layer_sum else output_hidden_states
        )
        # print(f"output_hidden_states: {output_hidden_states}")

        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.use_weighted_layer_sum:
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        if attention_mask is None:
            pooled_output = hidden_states.mean(dim=1)
        else:
            padding_mask = self._get_feature_vector_attention_mask(
                hidden_states.shape[1], attention_mask
            )
            hidden_states[~padding_mask] = 0.0
            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(
                -1, 1
            )

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Apply the weights to CrossEntropyLoss (works even when class_weights is None)
            loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def freeze_base_model(self):
        r"""Freeze the base model."""
        for param in self.wav2vec2.parameters():
            param.requires_grad = False

    def freeze_feature_encoder(self):
        r"""Freeze the feature encoder."""
        self.wav2vec2.freeze_feature_encoder()
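The class-weighting scheme in EmotionModel.__init__ is inverse-frequency weighting, rescaled so the weights average to 1. A small worked sketch with made-up per-class counts (not values from this commit) shows the effect:

import torch

# Hypothetical per-class sample counts for the 10 labels (illustration only).
counts = torch.tensor([120.0, 80.0, 60.0, 90.0, 200.0, 150.0, 70.0, 300.0, 40.0, 30.0])
num_labels = counts.numel()
weights = counts.sum() / (num_labels * counts)   # inverse-frequency weights
weights = weights / weights.sum() * num_labels   # rescale so the mean weight is 1
print(weights)  # rare classes (e.g. the 30-sample class) receive the largest weights
loss_fct = torch.nn.CrossEntropyLoss(weight=weights)

Note that the constructor builds counts_tensor on device "cuda:0", so passing counts assumes a CUDA device is available at construction time, and the resulting class weights live on that device.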
my_model/added_tokens.json
ADDED
@@ -0,0 +1,6 @@
{
  "</s>": 1,
  "<pad>": 3,
  "<s>": 0,
  "<unk>": 2
}
my_model/config.json
ADDED
@@ -0,0 +1,138 @@
{
  "_name_or_path": "my_train\\relu_layer_norm\\checkpoint-0",
  "activation_dropout": 0.1,
  "adapter_attn_dim": null,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "EmotionModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.1,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.1,
  "finetuning_task": "wav2vec2_reg",
  "gradient_checkpointing": false,
  "head_activation": "relu",
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "Angry",
    "1": "Disgusted",
    "2": "Embarrassed",
    "3": "Fearful",
    "4": "Happy",
    "5": "Sad",
    "6": "Surprised",
    "7": "Neutral",
    "8": "Sexual1",
    "9": "Sexual2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "Angry": 0,
    "Disgusted": 1,
    "Embarrassed": 2,
    "Fearful": 3,
    "Happy": 4,
    "Neutral": 7,
    "Sad": 5,
    "Sexual1": 8,
    "Sexual2": 9,
    "Surprised": 6
  },
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.1,
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_prob": 0.05,
  "model_type": "wav2vec2",
  "num_adapter_layers": 3,
  "num_attention_heads": 16,
  "num_codevector_groups": 2,
  "num_codevectors_per_group": 320,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 12,
  "num_negatives": 100,
  "output_hidden_size": 1024,
  "pad_token_id": 0,
  "pooling_mode": "mean",
  "problem_type": "regression",
  "proj_codevector_dim": 768,
  "tdnn_dilation": [
    1,
    2,
    3,
    1,
    1
  ],
  "tdnn_dim": [
    512,
    512,
    512,
    512,
    1500
  ],
  "tdnn_kernel": [
    5,
    3,
    3,
    1,
    1
  ],
  "torch_dtype": "float32",
  "transformers_version": "4.46.0",
  "use_weighted_layer_sum": false,
  "vocab_size": null,
  "xvector_output_dim": 512
}
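app.py derives its display labels by iterating over model.config.id2label, so the order of that mapping fixes the order of the probability outputs. A quick check, sketched under the assumption that it is run next to the my_model directory, prints the resolved mapping:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("my_model")
for idx in sorted(config.id2label):  # the string keys from config.json are converted to ints on load
    print(idx, config.id2label[idx])
# 0 Angry, 1 Disgusted, 2 Embarrassed, 3 Fearful, 4 Happy,
# 5 Sad, 6 Surprised, 7 Neutral, 8 Sexual1, 9 Sexual2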
my_model/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e2f0e4688f66d2ed06c3a664be5d446e16273c060a7bc613a48c079a24e491b1
size 658226968
my_model/preprocessor_config.json
ADDED
@@ -0,0 +1,10 @@
{
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": true,
  "sampling_rate": 16000
}
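Because do_normalize and return_attention_mask are both true, the feature extractor built from this config zero-mean/unit-variance normalizes the raw 16 kHz waveform and, when padding a batch, also returns an attention mask that EmotionModel.forward can use for masked mean pooling. A short sketch, again assuming the my_model directory is on disk:

import numpy as np
from transformers import AutoFeatureExtractor

fe = AutoFeatureExtractor.from_pretrained("my_model")
clips = [np.zeros(16000, dtype=np.float32), np.zeros(8000, dtype=np.float32)]
batch = fe(clips, sampling_rate=16000, padding=True, return_tensors="pt")
print(batch["input_values"].shape)    # torch.Size([2, 16000]); the shorter clip is padded with 0.0
print(batch["attention_mask"].shape)  # torch.Size([2, 16000]); 1 for real samples, 0 for padding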
my_model/special_tokens_map.json
ADDED
@@ -0,0 +1,6 @@
{
  "bos_token": "<s>",
  "eos_token": "</s>",
  "pad_token": "<pad>",
  "unk_token": "<unk>"
}
my_model/tokenizer_config.json
ADDED
@@ -0,0 +1,48 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "do_lower_case": false,
  "eos_token": "</s>",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<pad>",
  "processor_class": "Wav2Vec2Processor",
  "replace_word_delimiter_char": " ",
  "target_lang": null,
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
  "unk_token": "<unk>",
  "word_delimiter_token": "|"
}
my_model/vocab.json
ADDED
@@ -0,0 +1 @@
{}
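The tokenizer-related files (added_tokens.json, special_tokens_map.json, tokenizer_config.json, and the empty vocab.json) appear to be carried over from the Wav2Vec2 CTC checkpoint the classifier was fine-tuned from; app.py only loads the feature extractor, so they are not exercised by the emotion-recognition pipeline.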
requirements.txt
ADDED
@@ -0,0 +1,8 @@
gradio
librosa
loguru
numpy<2.0
plotly
spaces
torch
transformers[torch]
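On Hugging Face Spaces these dependencies are installed automatically from requirements.txt; for a local run (an assumption, not documented in this commit), the usual workflow is `pip install -r requirements.txt` followed by `python app.py`.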