litagin committed
Commit 3e40110
1 Parent(s): 84e2f1d
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ .venv/
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
  title: Anime Speech Emotion Recognition
- emoji: 🔥
+ emoji: 😊😱😠
  colorFrom: gray
  colorTo: red
  sdk: gradio
app.py ADDED
@@ -0,0 +1,79 @@
+ import pprint
+
+ import gradio as gr
+ import librosa
+ import plotly.graph_objects as go
+ import spaces
+ import torch
+ from loguru import logger
+ from transformers import AutoFeatureExtractor
+ from transformers.modeling_outputs import SequenceClassifierOutput
+
+ from model import EmotionModel
+
+ repo_id = "my_model"
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ logger.info(f"device: {device}")
+ model = EmotionModel.from_pretrained(repo_id, device_map=device)
+ model.eval()
+ processor = AutoFeatureExtractor.from_pretrained(repo_id)
+
+ label_map = {
+     "Angry": "😠 怒り",
+     "Disgusted": "😒 嫌悪",
+     "Embarrassed": "😳 戸惑い",
+     "Fearful": "😨 恐怖",
+     "Happy": "😊 幸せ",
+     "Sad": "😢 悲しみ",
+     "Surprised": "😲 驚き",
+     "Neutral": "😐 中立",
+     "Sexual1": "🥰 NSFW1",
+     "Sexual2": "🍭 NSFW2",
+ }
+
+
+ @spaces.GPU
+ def pipe(filename: str) -> tuple[dict[str, float], go.Figure]:
+     audio, sr = librosa.load(filename, sr=16000)
+     duration = librosa.get_duration(y=audio, sr=sr)
+     logger.info(f"filename: {filename}, duration: {duration}")
+     if duration > 30.0:
+         return (
+             {f"Error: 音声ファイルの長さが長すぎます: {duration}秒": 0.0},
+             go.Figure(),
+         )
+     inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
+     inputs = {k: v.to(device) for k, v in inputs.items()}
+     with torch.no_grad():
+         outputs: SequenceClassifierOutput = model(**inputs)
+     logits = outputs.logits  # shape: (batch_size, num_labels)
+     # Get the logits for the single input as a NumPy array
+     logits = logits[0].cpu().numpy()
+     labels = [label_map[label] for _, label in model.config.id2label.items()]
+     sorted_pairs = sorted(zip(logits, labels), key=lambda x: x[0])
+     sorted_logits, sorted_labels = zip(*sorted_pairs)
+     logger.info(f"Result:\n{pprint.pformat(sorted_pairs)}")
+     probabilities = outputs.logits.softmax(dim=-1)
+     scores_dict = {label: prob.item() for label, prob in zip(labels, probabilities[0])}
+     fig = go.Figure([go.Bar(x=sorted_logits, y=sorted_labels, orientation="h")])
+     return scores_dict, fig
+
+
+ md = """
+ # 音声からの感情認識 ver 0.1
+
+ - 音声ファイルから感情を予測して、確率とlogits (softmax前の値) を表示します
+ - 30秒以上の音声ファイルは受け付けません
+ """
+
+ with gr.Blocks() as app:
+     gr.Markdown(md)
+     audio = gr.Audio(type="filepath")
+     btn = gr.Button("感情を予測")
+     with gr.Row():
+         result = gr.Label(label="結果")
+         plot = gr.Plot(label="Logits")
+
+     btn.click(pipe, inputs=audio, outputs=[result, plot])
+
+ app.launch()
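For reference, the prediction path wired into `pipe()` above can also be exercised outside the Gradio UI. The sketch below mirrors app.py under the same assumptions (the `my_model/` checkpoint added in this commit is available locally); `sample.wav` is a placeholder path, not part of the commit.

```python
# Minimal offline sketch of the inference path used by pipe() in app.py.
# Assumes the my_model/ directory from this commit; "sample.wav" is a placeholder.
import librosa
import torch
from transformers import AutoFeatureExtractor

from model import EmotionModel

device = "cuda" if torch.cuda.is_available() else "cpu"
model = EmotionModel.from_pretrained("my_model", device_map=device).eval()
processor = AutoFeatureExtractor.from_pretrained("my_model")

audio, sr = librosa.load("sample.wav", sr=16000)  # resample to the expected 16 kHz
inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}
with torch.no_grad():
    logits = model(**inputs).logits[0]  # shape: (num_labels,)

probs = logits.softmax(dim=-1)
for i, p in sorted(enumerate(probs.tolist()), key=lambda x: -x[1]):
    print(f"{model.config.id2label[i]}: {p:.3f}")
```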
model.py ADDED
@@ -0,0 +1,133 @@
+ from typing import Optional
+
+ import torch
+ import torch.nn as nn
+ from transformers.activations import get_activation
+ from transformers.modeling_outputs import SequenceClassifierOutput
+ from transformers.models.wav2vec2.modeling_wav2vec2 import (
+     Wav2Vec2Model,
+     Wav2Vec2PreTrainedModel,
+ )
+
+ _HIDDEN_STATES_START_POSITION = 2
+
+
+ class ClassificationHead(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         print(f"classifier_proj_size: {config.classifier_proj_size}")
+         self.dense = nn.Linear(config.hidden_size, config.classifier_proj_size)
+         self.layer_norm = nn.LayerNorm(config.classifier_proj_size)
+         self.dropout = nn.Dropout(config.final_dropout)
+         self.out_proj = nn.Linear(config.classifier_proj_size, config.num_labels)
+         print(f"Head activation: {config.head_activation}")
+         self.activation = get_activation(config.head_activation)
+
+     def forward(self, features, **kwargs):
+         x = features
+         x = self.dense(x)
+         x = self.layer_norm(x)
+         x = self.activation(x)
+         x = self.dropout(x)
+         x = self.out_proj(x)
+         return x
+
+
+ class EmotionModel(Wav2Vec2PreTrainedModel):
+     """Speech emotion classifier."""
+
+     def __init__(self, config, counts: Optional[dict[int, int]] = None):
+         super().__init__(config)
+
+         self.config = config
+         self.wav2vec2 = Wav2Vec2Model(config)
+         self.classifier = ClassificationHead(config)
+         num_layers = (
+             config.num_hidden_layers + 1
+         )  # transformer layers + input embeddings
+         if config.use_weighted_layer_sum:
+             self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+         self.init_weights()
+
+         # If counts is given, compute per-class weights from the sample counts
+         if counts is not None:
+             print(f"Using class weights: {counts}")
+             counts_list = [counts[i] for i in range(config.num_labels)]
+             counts_tensor = torch.tensor(
+                 counts_list, dtype=torch.float, device="cuda:0"
+             )
+             total_samples = counts_tensor.sum()
+             class_weights = total_samples / (config.num_labels * counts_tensor)
+             # Normalize the weights (optional)
+             class_weights = class_weights / class_weights.sum() * config.num_labels
+             self.class_weights = class_weights
+         else:
+             self.class_weights = None  # set to None when counts is not given
+
+     def forward(
+         self,
+         input_values: Optional[torch.Tensor],
+         attention_mask: Optional[torch.Tensor] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         labels: Optional[torch.Tensor] = None,
+     ):
+         return_dict = (
+             return_dict if return_dict is not None else self.config.use_return_dict
+         )
+         output_hidden_states = (
+             True if self.config.use_weighted_layer_sum else output_hidden_states
+         )
+         # print(f"output_hidden_states: {output_hidden_states}")
+
+         outputs = self.wav2vec2(
+             input_values,
+             attention_mask=attention_mask,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+         )
+
+         if self.config.use_weighted_layer_sum:
+             hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+             hidden_states = torch.stack(hidden_states, dim=1)
+             norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+             hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+         else:
+             hidden_states = outputs[0]
+
+         if attention_mask is None:
+             pooled_output = hidden_states.mean(dim=1)
+         else:
+             padding_mask = self._get_feature_vector_attention_mask(
+                 hidden_states.shape[1], attention_mask
+             )
+             hidden_states[~padding_mask] = 0.0
+             pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(
+                 -1, 1
+             )
+
+         logits = self.classifier(pooled_output)
+
+         loss = None
+         if labels is not None:
+             # Apply the class weights in CrossEntropyLoss (still works when class_weights is None)
+             loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
+             loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
+
+         return SequenceClassifierOutput(
+             loss=loss,
+             logits=logits,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+     def freeze_base_model(self):
+         r"""Freeze base model."""
+         for param in self.wav2vec2.parameters():
+             param.requires_grad = False
+
+     def freeze_feature_encoder(self):
+         r"""Freeze feature extractor."""
+         self.wav2vec2.freeze_feature_encoder()
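The optional `counts` argument of `EmotionModel.__init__` is the only training-specific hook here: it turns per-class sample counts into inverse-frequency weights for the cross-entropy loss, and it is never passed at inference time (app.py omits it). A minimal sketch, with invented counts and the checkpoint shipped in this commit:

```python
# Sketch only: the class counts below are invented for illustration.
import torch

from model import EmotionModel

# Inference-style load, as in app.py: no counts, so class_weights stays None.
model = EmotionModel.from_pretrained("my_model").eval()

# Passing labels makes forward() also return a (possibly weighted) cross-entropy loss.
dummy_audio = torch.randn(2, 16000)   # two 1-second clips at 16 kHz
dummy_labels = torch.tensor([4, 7])   # Happy, Neutral (see id2label in my_model/config.json)
out = model(input_values=dummy_audio, labels=dummy_labels)
print(out.loss, out.logits.shape)     # scalar loss, logits of shape (2, 10)

# Training-style load: counts maps label id -> sample count and yields
# inverse-frequency class weights. Note that __init__ builds the weight tensor
# on "cuda:0", so this branch assumes a GPU is present.
if torch.cuda.is_available():
    counts = dict(enumerate([500, 120, 300, 200, 800, 400, 250, 900, 150, 100]))
    weighted = EmotionModel.from_pretrained("my_model", counts=counts).to("cuda:0")
    print(weighted.class_weights)     # normalized to sum to num_labels (10)
```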
my_model/added_tokens.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "</s>": 1,
+   "<pad>": 3,
+   "<s>": 0,
+   "<unk>": 2
+ }
my_model/config.json ADDED
@@ -0,0 +1,138 @@
+ {
+   "_name_or_path": "my_train\\relu_layer_norm\\checkpoint-0",
+   "activation_dropout": 0.1,
+   "adapter_attn_dim": null,
+   "adapter_kernel_size": 3,
+   "adapter_stride": 2,
+   "add_adapter": false,
+   "apply_spec_augment": true,
+   "architectures": [
+     "EmotionModel"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "classifier_proj_size": 256,
+   "codevector_dim": 768,
+   "contrastive_logits_temperature": 0.1,
+   "conv_bias": true,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "sum",
+   "ctc_zero_infinity": false,
+   "diversity_loss_weight": 0.1,
+   "do_stable_layer_norm": true,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_dropout": 0.0,
+   "feat_extract_norm": "layer",
+   "feat_proj_dropout": 0.1,
+   "feat_quantizer_dropout": 0.0,
+   "final_dropout": 0.1,
+   "finetuning_task": "wav2vec2_reg",
+   "gradient_checkpointing": false,
+   "head_activation": "relu",
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "id2label": {
+     "0": "Angry",
+     "1": "Disgusted",
+     "2": "Embarrassed",
+     "3": "Fearful",
+     "4": "Happy",
+     "5": "Sad",
+     "6": "Surprised",
+     "7": "Neutral",
+     "8": "Sexual1",
+     "9": "Sexual2"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "label2id": {
+     "Angry": 0,
+     "Disgusted": 1,
+     "Embarrassed": 2,
+     "Fearful": 3,
+     "Happy": 4,
+     "Neutral": 7,
+     "Sad": 5,
+     "Sexual1": 8,
+     "Sexual2": 9,
+     "Surprised": 6
+   },
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_feature_length": 10,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_prob": 0.05,
+   "model_type": "wav2vec2",
+   "num_adapter_layers": 3,
+   "num_attention_heads": 16,
+   "num_codevector_groups": 2,
+   "num_codevectors_per_group": 320,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 12,
+   "num_negatives": 100,
+   "output_hidden_size": 1024,
+   "pad_token_id": 0,
+   "pooling_mode": "mean",
+   "problem_type": "regression",
+   "proj_codevector_dim": 768,
+   "tdnn_dilation": [
+     1,
+     2,
+     3,
+     1,
+     1
+   ],
+   "tdnn_dim": [
+     512,
+     512,
+     512,
+     512,
+     1500
+   ],
+   "tdnn_kernel": [
+     5,
+     3,
+     3,
+     1,
+     1
+   ],
+   "torch_dtype": "float32",
+   "transformers_version": "4.46.0",
+   "use_weighted_layer_sum": false,
+   "vocab_size": null,
+   "xvector_output_dim": 512
+ }
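config.json carries the wav2vec2 backbone hyperparameters plus the fields that EmotionModel and ClassificationHead actually read: `id2label`/`label2id` for the 10 emotion classes, `classifier_proj_size` (256) and `head_activation` ("relu") for the head, and `final_dropout`. The CTC, TDNN, and `problem_type` entries appear to be inherited from the upstream wav2vec2 config and are not used by this model class. A quick inspection sketch (assuming the repo root as working directory):

```python
# Inspect the fields of my_model/config.json that the custom head relies on.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("my_model")
print(config.id2label)              # {0: 'Angry', ..., 9: 'Sexual2'}
print(config.num_labels)            # 10, derived from id2label
print(config.classifier_proj_size)  # 256: width of the ClassificationHead projection
print(config.head_activation)       # "relu": activation used inside ClassificationHead
```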
my_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e2f0e4688f66d2ed06c3a664be5d446e16273c060a7bc613a48c079a24e491b1
+ size 658226968
my_model/preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "do_normalize": true,
+   "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+   "feature_size": 1,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "processor_class": "Wav2Vec2Processor",
+   "return_attention_mask": true,
+   "sampling_rate": 16000
+ }
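This is the feature-extractor config that app.py loads through `AutoFeatureExtractor`: raw 16 kHz mono waveforms (`feature_size: 1`), per-utterance normalization, zero right-padding, and an attention mask, which is what the masked mean-pooling branch in `EmotionModel.forward` consumes for padded batches. A small sketch of batched extraction (the two waveforms are random placeholders):

```python
# Batched feature extraction consistent with my_model/preprocessor_config.json.
import numpy as np
from transformers import AutoFeatureExtractor

extractor = AutoFeatureExtractor.from_pretrained("my_model")
waves = [
    np.random.randn(16000).astype(np.float32),  # 1.0 s at 16 kHz
    np.random.randn(24000).astype(np.float32),  # 1.5 s at 16 kHz
]
batch = extractor(waves, sampling_rate=16000, padding=True, return_tensors="pt")
print(batch["input_values"].shape)    # torch.Size([2, 24000]) after right-padding with 0.0
print(batch["attention_mask"].shape)  # torch.Size([2, 24000]); marks real vs. padded samples
```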
my_model/special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "bos_token": "<s>",
+   "eos_token": "</s>",
+   "pad_token": "<pad>",
+   "unk_token": "<unk>"
+ }
my_model/tokenizer_config.json ADDED
@@ -0,0 +1,48 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "do_lower_case": false,
+   "eos_token": "</s>",
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "<pad>",
+   "processor_class": "Wav2Vec2Processor",
+   "replace_word_delimiter_char": " ",
+   "target_lang": null,
+   "tokenizer_class": "Wav2Vec2CTCTokenizer",
+   "unk_token": "<unk>",
+   "word_delimiter_token": "|"
+ }
my_model/vocab.json ADDED
@@ -0,0 +1 @@
+ {}
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ gradio
+ librosa
+ loguru
+ numpy<2.0
+ plotly
+ spaces
+ torch
+ transformers[torch]