vkn committed
Commit 9d93173 · Parent(s): c602f6e

Update app.py

Files changed (1): app.py (+49 −49)
app.py CHANGED
@@ -1,49 +1,49 @@
- import gradio as gr
-
- import torch
- import torch.nn as nn
- import torch.nn.functional as F
- import torchaudio
- from transformers import AutoConfig, Wav2Vec2FeatureExtractor, Wav2Vec2ForSpeechClassification
-
- import librosa
- import IPython.display as ipd
- import numpy as np
- import pandas as pd
-
-
-
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- model_name_or_path = "m3hrdadfi/wav2vec2-base-100k-voxpopuli-gtzan-music"
- config = AutoConfig.from_pretrained(model_name_or_path)
- feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
- sampling_rate = feature_extractor.sampling_rate
- model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)
-
-
- def speech_file_to_array_fn(path, sampling_rate):
-     speech_array, _sampling_rate = torchaudio.load(path)
-     resampler = torchaudio.transforms.Resample(_sampling_rate)
-     speech = resampler(speech_array).squeeze().numpy()
-     return speech
-
-
- def predict(path, sampling_rate):
-     speech = speech_file_to_array_fn(path, sampling_rate)
-     inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
-     inputs = {key: inputs[key].to(device) for key in inputs}
-
-     with torch.no_grad():
-         logits = model(**inputs).logits
-
-     scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
-     outputs = [{"Label": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"} for i, score in enumerate(scores)]
-     return outputs
-
-
- path = "La Campanella.mp3"
- outputs = predict(path, sampling_rate)
-
-
- iface = gr.Interface(fn=predict, inputs=path, outputs=predict(path, sampling_rate))
- iface.launch()
 
+ import gradio as gr
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torchaudio
+ from transformers import AutoConfig, Wav2Vec2FeatureExtractor
+
+ import librosa
+ import IPython.display as ipd
+ import numpy as np
+ import pandas as pd
+
+
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model_name_or_path = "m3hrdadfi/wav2vec2-base-100k-voxpopuli-gtzan-music"
+ config = AutoConfig.from_pretrained(model_name_or_path)
+ feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
+ sampling_rate = feature_extractor.sampling_rate
+ model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)
+
+
+ def speech_file_to_array_fn(path, sampling_rate):
+     speech_array, _sampling_rate = torchaudio.load(path)
+     resampler = torchaudio.transforms.Resample(_sampling_rate)
+     speech = resampler(speech_array).squeeze().numpy()
+     return speech
+
+
+ def predict(path, sampling_rate):
+     speech = speech_file_to_array_fn(path, sampling_rate)
+     inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+     inputs = {key: inputs[key].to(device) for key in inputs}
+
+     with torch.no_grad():
+         logits = model(**inputs).logits
+
+     scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
+     outputs = [{"Label": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"} for i, score in enumerate(scores)]
+     return outputs
+
+
+ path = "La Campanella.mp3"
+ outputs = predict(path, sampling_rate)
+
+
+ iface = gr.Interface(fn=predict, inputs=path, outputs=predict(path, sampling_rate))
+ iface.launch()
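
Note on this change: the commit drops Wav2Vec2ForSpeechClassification from the transformers import, which is right in the sense that transformers ships no class by that name, but the new file still calls Wav2Vec2ForSpeechClassification.from_pretrained(...), so the app will now raise a NameError at startup. This class normally comes from the model author's companion code. A minimal sketch of such a head, assuming mean pooling over the encoder's hidden states; the real checkpoint's head layout must match its saved weights for from_pretrained to load them, so treat this as illustrative only:

import torch.nn as nn
from transformers import Wav2Vec2Model, Wav2Vec2PreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput

class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """Sketch: wav2vec 2.0 encoder with a mean-pooled linear classifier."""
    def __init__(self, config):
        super().__init__(config)
        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_values, attention_mask=None):
        hidden_states = self.wav2vec2(input_values, attention_mask=attention_mask).last_hidden_state
        pooled = hidden_states.mean(dim=1)  # average over time frames
        logits = self.classifier(pooled)
        # SequenceClassifierOutput gives predict() the .logits attribute it reads
        return SequenceClassifierOutput(logits=logits)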
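Separately, speech_file_to_array_fn passes only the source rate to torchaudio.transforms.Resample, so the transform resamples to its default target of 16000 Hz. That happens to match feature_extractor.sampling_rate for this checkpoint, but naming the target avoids a silent mismatch if the model changes; a sketch of the more explicit form:

def speech_file_to_array_fn(path, sampling_rate):
    # resample from the file's native rate to the feature extractor's rate
    speech_array, _sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(orig_freq=_sampling_rate, new_freq=sampling_rate)
    return resampler(speech_array).squeeze().numpy()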
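Finally, the gr.Interface call passes sample data (a file path and a precomputed prediction) where Gradio expects component specifications, and predict takes two arguments while the interface will supply one. A hedged sketch of the wiring this app presumably wants, using the standard gr.Audio and gr.JSON components:

iface = gr.Interface(
    fn=lambda path: predict(path, sampling_rate),  # bind the fixed sampling rate
    inputs=gr.Audio(type="filepath"),              # uploaded audio arrives as a file path
    outputs=gr.JSON(),                             # the list of {"Label": ..., "Score": ...} dicts
)
iface.launch()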