init
- .gitignore +2 -0
- README.md +1 -1
- app.py +79 -0
- model.py +133 -0
- my_model/added_tokens.json +6 -0
- my_model/config.json +138 -0
- my_model/model.safetensors +3 -0
- my_model/preprocessor_config.json +10 -0
- my_model/special_tokens_map.json +6 -0
- my_model/tokenizer_config.json +48 -0
- my_model/vocab.json +1 -0
- requirements.txt +8 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
__pycache__/
.venv/
README.md
CHANGED
@@ -1,6 +1,6 @@
---
title: Anime Speech Emotion Recognition
-emoji:
+emoji: 😊😱😠
colorFrom: gray
colorTo: red
sdk: gradio
app.py
ADDED
@@ -0,0 +1,79 @@
import pprint

import gradio as gr
import librosa
import plotly.graph_objects as go
import spaces
import torch
from loguru import logger
from transformers import AutoFeatureExtractor
from transformers.modeling_outputs import SequenceClassifierOutput

from model import EmotionModel

repo_id = "my_model"
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"device: {device}")
model = EmotionModel.from_pretrained(repo_id, device_map=device)
model.eval()
processor = AutoFeatureExtractor.from_pretrained(repo_id)

label_map = {
    "Angry": "😠 Anger",
    "Disgusted": "😒 Disgust",
    "Embarrassed": "😳 Embarrassment",
    "Fearful": "😨 Fear",
    "Happy": "😊 Happiness",
    "Sad": "😢 Sadness",
    "Surprised": "😲 Surprise",
    "Neutral": "😐 Neutral",
    "Sexual1": "🥰 NSFW1",
    "Sexual2": "🍭 NSFW2",
}


@spaces.GPU
def pipe(filename: str) -> tuple[dict[str, float], go.Figure]:
    audio, sr = librosa.load(filename, sr=16000)
    duration = librosa.get_duration(y=audio, sr=sr)
    logger.info(f"filename: {filename}, duration: {duration}")
    if duration > 30.0:
        return (
            {f"Error: the audio file is too long: {duration} s": 0.0},
            go.Figure(),
        )
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs: SequenceClassifierOutput = model(**inputs)
    logits = outputs.logits  # shape: (batch_size, num_labels)
    # Extract the logits for the single input clip
    logits = logits[0].cpu().numpy()
    labels = [label_map[label] for id, label in model.config.id2label.items()]
    sorted_pairs = sorted(zip(logits, labels), key=lambda x: x[0])
    sorted_logits, sorted_labels = zip(*sorted_pairs)
    logger.info(f"Result:\n{pprint.pformat(sorted_pairs)}")
    probabilities = outputs.logits.softmax(dim=-1)
    scores_dict = {label: prob.item() for label, prob in zip(labels, probabilities[0])}
    fig = go.Figure([go.Bar(x=sorted_logits, y=sorted_labels, orientation="h")])
    return scores_dict, fig


md = """
# Emotion Recognition from Speech ver 0.1

- Predicts the emotion of an audio file and shows the probabilities and the logits (pre-softmax values)
- Audio files longer than 30 seconds are not accepted
"""

with gr.Blocks() as app:
    gr.Markdown(md)
    audio = gr.Audio(type="filepath")
    btn = gr.Button("Predict emotion")
    with gr.Row():
        result = gr.Label(label="Result")
        plot = gr.Plot(label="Logits")

    btn.click(pipe, inputs=audio, outputs=[result, plot])

app.launch()
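A minimal smoke-test sketch for the inference path above, bypassing Gradio entirely. It assumes the repository is run locally with the my_model directory present, and it feeds one second of synthetic silence instead of a real recording; nothing in it is part of the committed app.

import numpy as np
import torch
from transformers import AutoFeatureExtractor

from model import EmotionModel

# Same loading logic as app.py, assumed to run from the repository root.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = EmotionModel.from_pretrained("my_model", device_map=device).eval()
processor = AutoFeatureExtractor.from_pretrained("my_model")

silence = np.zeros(16000, dtype=np.float32)  # 1 s of silence at 16 kHz
inputs = processor(silence, sampling_rate=16000, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}
with torch.no_grad():
    probs = model(**inputs).logits.softmax(dim=-1)[0]
for idx, p in enumerate(probs.tolist()):
    print(f"{model.config.id2label[idx]:>11}: {p:.3f}")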
model.py
ADDED
@@ -0,0 +1,133 @@
from typing import Optional

import torch
import torch.nn as nn
from transformers.activations import get_activation
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)

_HIDDEN_STATES_START_POSITION = 2


class ClassificationHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        print(f"classifier_proj_size: {config.classifier_proj_size}")
        self.dense = nn.Linear(config.hidden_size, config.classifier_proj_size)
        self.layer_norm = nn.LayerNorm(config.classifier_proj_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.classifier_proj_size, config.num_labels)
        print(f"Head activation: {config.head_activation}")
        self.activation = get_activation(config.head_activation)

    def forward(self, features, **kwargs):
        x = features
        x = self.dense(x)
        x = self.layer_norm(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


class EmotionModel(Wav2Vec2PreTrainedModel):
    """Speech emotion classifier."""

    def __init__(self, config, counts: Optional[dict[int, int]] = None):
        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = ClassificationHead(config)
        num_layers = (
            config.num_hidden_layers + 1
        )  # transformer layers + input embeddings
        if config.use_weighted_layer_sum:
            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
        self.init_weights()

        # If counts are given, compute class weights
        if counts is not None:
            print(f"Using class weights: {counts}")
            counts_list = [counts[i] for i in range(config.num_labels)]
            counts_tensor = torch.tensor(
                counts_list, dtype=torch.float, device="cuda:0"
            )
            total_samples = counts_tensor.sum()
            class_weights = total_samples / (config.num_labels * counts_tensor)
            # Normalize the weights (optional)
            class_weights = class_weights / class_weights.sum() * config.num_labels
            self.class_weights = class_weights
        else:
            self.class_weights = None  # set to None when counts are not given

    def forward(
        self,
        input_values: Optional[torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
    ):
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        output_hidden_states = (
            True if self.config.use_weighted_layer_sum else output_hidden_states
        )
        # print(f"output_hidden_states: {output_hidden_states}")

        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if self.config.use_weighted_layer_sum:
            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
            hidden_states = torch.stack(hidden_states, dim=1)
            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
        else:
            hidden_states = outputs[0]

        if attention_mask is None:
            pooled_output = hidden_states.mean(dim=1)
        else:
            padding_mask = self._get_feature_vector_attention_mask(
                hidden_states.shape[1], attention_mask
            )
            hidden_states[~padding_mask] = 0.0
            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(
                -1, 1
            )

        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Apply the weights to CrossEntropyLoss (works even when class_weights is None)
            loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def freeze_base_model(self):
        r"""Freeze the base model."""
        for param in self.wav2vec2.parameters():
            param.requires_grad = False

    def freeze_feature_encoder(self):
        r"""Freeze the feature encoder."""
        self.wav2vec2.freeze_feature_encoder()
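The class-weighting scheme in EmotionModel.__init__ is inverse-frequency weighting, rescaled so the weights average to 1. A small worked sketch with made-up per-class counts (not values from this commit) shows the effect:

import torch

# Hypothetical per-class sample counts for the 10 labels (illustration only).
counts = torch.tensor([120.0, 80.0, 60.0, 90.0, 200.0, 150.0, 70.0, 300.0, 40.0, 30.0])
num_labels = counts.numel()
weights = counts.sum() / (num_labels * counts)   # inverse-frequency weights
weights = weights / weights.sum() * num_labels   # rescale so the mean weight is 1
print(weights)  # rare classes (e.g. the 30-sample class) receive the largest weights
loss_fct = torch.nn.CrossEntropyLoss(weight=weights)

Note that the constructor builds counts_tensor on device "cuda:0", so passing counts assumes a CUDA device is available at construction time, and the resulting class weights live on that device.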
my_model/added_tokens.json
ADDED
@@ -0,0 +1,6 @@
{
  "</s>": 1,
  "<pad>": 3,
  "<s>": 0,
  "<unk>": 2
}
my_model/config.json
ADDED
@@ -0,0 +1,138 @@
{
  "_name_or_path": "my_train\\relu_layer_norm\\checkpoint-0",
  "activation_dropout": 0.1,
  "adapter_attn_dim": null,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "EmotionModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 768,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": true,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": true,
  "eos_token_id": 2,
  "feat_extract_activation": "gelu",
  "feat_extract_dropout": 0.0,
  "feat_extract_norm": "layer",
  "feat_proj_dropout": 0.1,
  "feat_quantizer_dropout": 0.0,
  "final_dropout": 0.1,
  "finetuning_task": "wav2vec2_reg",
  "gradient_checkpointing": false,
  "head_activation": "relu",
  "hidden_act": "gelu",
  "hidden_dropout": 0.1,
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "Angry",
    "1": "Disgusted",
    "2": "Embarrassed",
    "3": "Fearful",
    "4": "Happy",
    "5": "Sad",
    "6": "Surprised",
    "7": "Neutral",
    "8": "Sexual1",
    "9": "Sexual2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "Angry": 0,
    "Disgusted": 1,
    "Embarrassed": 2,
    "Fearful": 3,
    "Happy": 4,
    "Neutral": 7,
    "Sad": 5,
    "Sexual1": 8,
    "Sexual2": 9,
    "Surprised": 6
  },
  "layer_norm_eps": 1e-05,
  "layerdrop": 0.1,
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_prob": 0.05,
  "model_type": "wav2vec2",
  "num_adapter_layers": 3,
  "num_attention_heads": 16,
  "num_codevector_groups": 2,
  "num_codevectors_per_group": 320,
  "num_conv_pos_embedding_groups": 16,
  "num_conv_pos_embeddings": 128,
  "num_feat_extract_layers": 7,
  "num_hidden_layers": 12,
  "num_negatives": 100,
  "output_hidden_size": 1024,
  "pad_token_id": 0,
  "pooling_mode": "mean",
  "problem_type": "regression",
  "proj_codevector_dim": 768,
  "tdnn_dilation": [
    1,
    2,
    3,
    1,
    1
  ],
  "tdnn_dim": [
    512,
    512,
    512,
    512,
    1500
  ],
  "tdnn_kernel": [
    5,
    3,
    3,
    1,
    1
  ],
  "torch_dtype": "float32",
  "transformers_version": "4.46.0",
  "use_weighted_layer_sum": false,
  "vocab_size": null,
  "xvector_output_dim": 512
}
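app.py derives its display labels by iterating over model.config.id2label, so the order of that mapping fixes the order of the probability outputs. A quick check, sketched under the assumption that it is run next to the my_model directory, prints the resolved mapping:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("my_model")
for idx in sorted(config.id2label):  # the string keys from config.json are converted to ints on load
    print(idx, config.id2label[idx])
# 0 Angry, 1 Disgusted, 2 Embarrassed, 3 Fearful, 4 Happy,
# 5 Sad, 6 Surprised, 7 Neutral, 8 Sexual1, 9 Sexual2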
my_model/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e2f0e4688f66d2ed06c3a664be5d446e16273c060a7bc613a48c079a24e491b1
size 658226968
my_model/preprocessor_config.json
ADDED
@@ -0,0 +1,10 @@
{
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "processor_class": "Wav2Vec2Processor",
  "return_attention_mask": true,
  "sampling_rate": 16000
}
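Because do_normalize and return_attention_mask are both true, the feature extractor built from this config zero-mean/unit-variance normalizes the raw 16 kHz waveform and, when padding a batch, also returns an attention mask that EmotionModel.forward can use for masked mean pooling. A short sketch, again assuming the my_model directory is on disk:

import numpy as np
from transformers import AutoFeatureExtractor

fe = AutoFeatureExtractor.from_pretrained("my_model")
clips = [np.zeros(16000, dtype=np.float32), np.zeros(8000, dtype=np.float32)]
batch = fe(clips, sampling_rate=16000, padding=True, return_tensors="pt")
print(batch["input_values"].shape)    # torch.Size([2, 16000]); the shorter clip is padded with 0.0
print(batch["attention_mask"].shape)  # torch.Size([2, 16000]); 1 for real samples, 0 for padding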
my_model/special_tokens_map.json
ADDED
@@ -0,0 +1,6 @@
{
  "bos_token": "<s>",
  "eos_token": "</s>",
  "pad_token": "<pad>",
  "unk_token": "<unk>"
}
my_model/tokenizer_config.json
ADDED
@@ -0,0 +1,48 @@
{
  "added_tokens_decoder": {
    "0": {
      "content": "<s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "1": {
      "content": "</s>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "2": {
      "content": "<unk>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "3": {
      "content": "<pad>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "do_lower_case": false,
  "eos_token": "</s>",
  "model_max_length": 1000000000000000019884624838656,
  "pad_token": "<pad>",
  "processor_class": "Wav2Vec2Processor",
  "replace_word_delimiter_char": " ",
  "target_lang": null,
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
  "unk_token": "<unk>",
  "word_delimiter_token": "|"
}
my_model/vocab.json
ADDED
@@ -0,0 +1 @@
{}
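The tokenizer-related files (added_tokens.json, special_tokens_map.json, tokenizer_config.json, and the empty vocab.json) appear to be carried over from the Wav2Vec2 CTC checkpoint the classifier was fine-tuned from; app.py only loads the feature extractor, so they are not exercised by the emotion-recognition pipeline.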
requirements.txt
ADDED
@@ -0,0 +1,8 @@
gradio
librosa
loguru
numpy<2.0
plotly
spaces
torch
transformers[torch]
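On Hugging Face Spaces these dependencies are installed automatically from requirements.txt; for a local run (an assumption, not documented in this commit), the usual workflow is `pip install -r requirements.txt` followed by `python app.py`.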