terry-li-hm committed on
Commit
96bcc68
·
1 Parent(s): 9ecefd1
Files changed (1) hide show
  1. app.py +0 -129
app.py CHANGED
@@ -16,135 +16,6 @@ import torchaudio
16
  from funasr import AutoModel
17
  from sv import clean_and_emoji_annotate_speech, process_audio
18
 
19
- model = "FunAudioLLM/SenseVoiceSmall"
20
- model = AutoModel(
21
- model=model,
22
- vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
23
- vad_kwargs={"max_single_segment_time": 30000},
24
- hub="hf",
25
- device="cuda",
26
- )
27
-
28
- import re
29
-
30
- emo_dict = {
31
- "<|HAPPY|>": "😊",
32
- "<|SAD|>": "😔",
33
- "<|ANGRY|>": "😡",
34
- "<|NEUTRAL|>": "",
35
- "<|FEARFUL|>": "😰",
36
- "<|DISGUSTED|>": "🤢",
37
- "<|SURPRISED|>": "😮",
38
- }
39
-
40
- event_dict = {
41
- "<|BGM|>": "🎼",
42
- "<|Speech|>": "",
43
- "<|Applause|>": "👏",
44
- "<|Laughter|>": "😀",
45
- "<|Cry|>": "😭",
46
- "<|Sneeze|>": "🤧",
47
- "<|Breath|>": "",
48
- "<|Cough|>": "🤧",
49
- }
50
-
51
- emoji_dict = {
52
- "<|nospeech|><|Event_UNK|>": "❓",
53
- "<|zh|>": "",
54
- "<|en|>": "",
55
- "<|yue|>": "",
56
- "<|ja|>": "",
57
- "<|ko|>": "",
58
- "<|nospeech|>": "",
59
- "<|HAPPY|>": "😊",
60
- "<|SAD|>": "😔",
61
- "<|ANGRY|>": "😡",
62
- "<|NEUTRAL|>": "",
63
- "<|BGM|>": "🎼",
64
- "<|Speech|>": "",
65
- "<|Applause|>": "👏",
66
- "<|Laughter|>": "😀",
67
- "<|FEARFUL|>": "😰",
68
- "<|DISGUSTED|>": "🤢",
69
- "<|SURPRISED|>": "😮",
70
- "<|Cry|>": "😭",
71
- "<|EMO_UNKNOWN|>": "",
72
- "<|Sneeze|>": "🤧",
73
- "<|Breath|>": "",
74
- "<|Cough|>": "😷",
75
- "<|Sing|>": "",
76
- "<|Speech_Noise|>": "",
77
- "<|withitn|>": "",
78
- "<|woitn|>": "",
79
- "<|GBG|>": "",
80
- "<|Event_UNK|>": "",
81
- }
82
-
83
- lang_dict = {
84
- "<|zh|>": "<|lang|>",
85
- "<|en|>": "<|lang|>",
86
- "<|yue|>": "<|lang|>",
87
- "<|ja|>": "<|lang|>",
88
- "<|ko|>": "<|lang|>",
89
- "<|nospeech|>": "<|lang|>",
90
- }
91
-
92
- emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
93
- event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷"}
94
-
95
-
96
- def format_str(s):
97
- for sptk in emoji_dict:
98
- s = s.replace(sptk, emoji_dict[sptk])
99
- return s
100
-
101
-
102
- def format_str_v2(s):
103
- sptk_dict = {}
104
- for sptk in emoji_dict:
105
- sptk_dict[sptk] = s.count(sptk)
106
- s = s.replace(sptk, "")
107
- emo = "<|NEUTRAL|>"
108
- for e in emo_dict:
109
- if sptk_dict[e] > sptk_dict[emo]:
110
- emo = e
111
- for e in event_dict:
112
- if sptk_dict[e] > 0:
113
- s = event_dict[e] + s
114
- s = s + emo_dict[emo]
115
-
116
- for emoji in emo_set.union(event_set):
117
- s = s.replace(" " + emoji, emoji)
118
- s = s.replace(emoji + " ", emoji)
119
- return s.strip()
120
-
121
-
122
- def format_str_v3(s):
123
- def get_emo(s):
124
- return s[-1] if s[-1] in emo_set else None
125
-
126
- def get_event(s):
127
- return s[0] if s[0] in event_set else None
128
-
129
- s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
130
- for lang in lang_dict:
131
- s = s.replace(lang, "<|lang|>")
132
- s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
133
- new_s = " " + s_list[0]
134
- cur_ent_event = get_event(new_s)
135
- for i in range(1, len(s_list)):
136
- if len(s_list[i]) == 0:
137
- continue
138
- if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) != None:
139
- s_list[i] = s_list[i][1:]
140
- # else:
141
- cur_ent_event = get_event(s_list[i])
142
- if get_emo(s_list[i]) != None and get_emo(s_list[i]) == get_emo(new_s):
143
- new_s = new_s[:-1]
144
- new_s += s_list[i].strip().lstrip()
145
- new_s = new_s.replace("The.", " ")
146
- return new_s.strip()
147
-
148
 
149
  @spaces.GPU
150
  def model_inference(input_wav, language, fs=16000):
 
16
  from funasr import AutoModel
17
  from sv import clean_and_emoji_annotate_speech, process_audio
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  @spaces.GPU
21
  def model_inference(input_wav, language, fs=16000):