SeyedAli committed
Commit 54063ad
1 Parent(s): 827628e

Update app.py

Files changed (1): app.py (+31 -24)
app.py CHANGED
@@ -1,37 +1,44 @@
 import tempfile
 import torch
+import torch.nn as nn
+import torch.nn.functional as F
 import torchaudio
 import gradio as gr
-from transformers import Wav2Vec2FeatureExtractor,AutoConfig,pipeline
+from transformers import Wav2Vec2FeatureExtractor,AutoConfig
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from transformers.models.wav2vec2.modeling_wav2vec2 import (
+    Wav2Vec2PreTrainedModel,
+    Wav2Vec2Model
+)
+from transformers.models.hubert.modeling_hubert import (
+    HubertPreTrainedModel,
+    HubertModel
+)
 
 config = AutoConfig.from_pretrained("SeyedAli/Persian-Speech-Emotion-HuBert-V1")
 model = Wav2Vec2FeatureExtractor.from_pretrained("SeyedAli/Persian-Speech-Emotion-HuBert-V1")
 
-def speech_file_to_array_fn(path, sampling_rate):
-    with tempfile.NamedTemporaryFile(suffix=".wav") as temp_audio_file:
-        # Copy the contents of the uploaded audio file to the temporary file
-        temp_audio_file.write(open(path, "rb").read())
-        temp_audio_file.flush()
-        # Load the audio file using torchaudio
-        speech_array, _sampling_rate = torchaudio.load(temp_audio_file.name)
-        resampler = torchaudio.transforms.Resample(_sampling_rate)
-        speech = resampler(speech_array).squeeze().numpy()
-        return speech
-
-def predict(path, sampling_rate):
-    speech = speech_file_to_array_fn(path, sampling_rate)
-    inputs = model(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
-    inputs = {key: inputs[key].to(device) for key in inputs}
-
-    with torch.no_grad():
-        logits = model(**inputs).logits
-
-    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
-    outputs = [{"Label": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"} for i, score in enumerate(scores)]
-    return outputs
-
-def SER(audio):
-    return predict(audio,model.sampling_rate)
+audio_input = gr.Audio(label="صوت گفتار فارسی",type="filepath")
+text_output = gr.TextArea(label="هیجان موجود در صوت گفتار",text_align="right",rtl=True,type="text")
+
+def SER(audio):
+    with tempfile.NamedTemporaryFile(suffix=".wav") as temp_audio_file:
+        # Copy the contents of the uploaded audio file to the temporary file
+        temp_audio_file.write(open(audio, "rb").read())
+        temp_audio_file.flush()
+        # Load the audio file using torchaudio
+        speech_array, _sampling_rate = torchaudio.load(temp_audio_file.name)
+        resampler = torchaudio.transforms.Resample(_sampling_rate)
+        speech = resampler(speech_array).squeeze().numpy()
+    inputs = model(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+    inputs = {key: inputs[key].to(device) for key in inputs}
+
+    with torch.no_grad():
+        logits = model(**inputs).logits
+
+    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
+    outputs = [{"Label": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"} for i, score in enumerate(scores)]
+    return outputs
 
-iface = gr.Interface(fn=SER, inputs="audio", outputs="text")
+iface = gr.Interface(fn=SER, inputs=audio_input, outputs=text_output)
 iface.launch(share=False)
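
Note on the committed code: SER as written will fail at runtime. sampling_rate and device are never defined inside the function, and model is a Wav2Vec2FeatureExtractor, which converts waveforms into model inputs but has no weights and no .logits. The newly added HubertPreTrainedModel/HubertModel imports (and the unused loss imports) suggest a custom HuBERT classification head was meant to be defined but is absent from this commit. Below is a minimal sketch of that missing piece under that assumption; HubertForSpeechClassification, its mean-pooled head, and the predict helper are illustrations, not code from the commit, and the head layout would have to match whatever layout the checkpoint was trained with.

# Minimal sketch, assuming the custom-head pattern these imports usually
# accompany. HubertForSpeechClassification is NOT a transformers class and
# is not defined anywhere in this commit; the class below is an assumption
# about what the HubertPreTrainedModel/HubertModel imports were building toward.
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoConfig, Wav2Vec2FeatureExtractor
from transformers.models.hubert.modeling_hubert import (
    HubertPreTrainedModel,
    HubertModel,
)

class HubertForSpeechClassification(HubertPreTrainedModel):
    """HuBERT encoder plus a linear head over mean-pooled hidden states."""

    def __init__(self, config):
        super().__init__(config)
        self.hubert = HubertModel(config)
        self.dropout = nn.Dropout(config.final_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_values, attention_mask=None):
        hidden_states = self.hubert(input_values, attention_mask=attention_mask)[0]
        pooled = hidden_states.mean(dim=1)  # mean-pool over the time axis
        return self.classifier(self.dropout(pooled))  # (batch, num_labels)

model_name = "SeyedAli/Persian-Speech-Emotion-HuBert-V1"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = AutoConfig.from_pretrained(model_name)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = HubertForSpeechClassification.from_pretrained(model_name).to(device).eval()

def predict(speech):
    # `speech` is the 1-D numpy array produced by SER's load-and-resample step;
    # the feature extractor (not the model) consumes the raw waveform.
    inputs = feature_extractor(
        speech,
        sampling_rate=feature_extractor.sampling_rate,  # replaces the undefined name
        return_tensors="pt",
        padding=True,
    )
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs)
    scores = F.softmax(logits, dim=1)[0].cpu().numpy()
    return [{"Label": config.id2label[i], "Score": f"{score * 100:.1f}%"}
            for i, score in enumerate(scores)]

With something like this in place, SER would call predict(speech) after the load-and-resample step instead of invoking the feature extractor a second time as if it were the model. Two incidental details: torchaudio.transforms.Resample(_sampling_rate) resamples to torchaudio's default target of 16 kHz, which is the rate HuBERT-family feature extractors expect, and the Persian Gradio labels read "Persian speech audio" (input) and "the emotion present in the speech audio" (output).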