calmgoose committed
Commit 98e0bae · 1 Parent(s): ce952ae

Create app.py

Files changed (1)
  1. app.py +202 -0
app.py ADDED
import os
import sys
import shutil
import tempfile
import uuid

import torch
import pytube as pt
import whisper_timestamped as whispertime
from pydub import AudioSegment
from spleeter.separator import Separator
from profanity_check import predict

import streamlit as st


# CORE #

MODEL_NAME = "openai/whisper-large-v2"

PROFANE_WORDS = ["falkona", "fuck"]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
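
# Note: MODEL_NAME is only displayed in the UI copy below; transcription is
# done with the whisper_timestamped checkpoint chosen by `model_size` in
# process_audio. Likewise, profanity_check only *reports* profanities --
# the hard-coded PROFANE_WORDS list is what actually gets muted.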

def create_tmp_copy_of_file(file, dir=None):
    """
    Creates a temporary copy of the file and returns the path to the copy.
    :param file: the path to the file, or a dict with a "path" key
    :param dir: optional directory to place the copy in
    :return: path to the temporary copy
    """
    if isinstance(file, dict):
        file_path = file["path"]
    else:
        file_path = file

    if dir is None:
        dir = tempfile.gettempdir()

    file_name = os.path.basename(file_path)
    tmp_path = os.path.join(dir, f"{uuid.uuid4()}_{file_name}")
    shutil.copy2(file_path, tmp_path)

    # return the plain path (the old json.dumps(...).strip('"') round-trip
    # mangled paths containing backslashes or quotes)
    return tmp_path
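
# Example (currently unused -- the call in process_audio is commented out):
#   create_tmp_copy_of_file("song_masked.wav")
#   -> something like "/tmp/<uuid>_song_masked.wav"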

def source_separation(input_file, output_folder="separated_audio"):
    """Split a track into vocal and instrumental stems with Spleeter."""
    separator = Separator('spleeter:2stems')
    separator.separate_to_file(input_file, output_folder)
    return f"{output_folder}/{os.path.splitext(os.path.basename(input_file))[0]}"
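
# Spleeter's 2stems model writes two stems under the returned directory:
#   <output_folder>/<track name>/vocals.wav
#   <output_folder>/<track name>/accompaniment.wav
# process_audio below depends on exactly this layout.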

def process_audio(input_file, model_size='tiny', verbose=False, play_output=False):
    if not os.path.isfile(input_file):
        sys.exit('Error: input file not found')

    stems_dir = source_separation(input_file)
    vocal_stem = os.path.join(stems_dir, 'vocals.wav')
    instr_stem = os.path.join(stems_dir, 'accompaniment.wav')

    model = whispertime.load_model(model_size, device=device)
    result = whispertime.transcribe(model, vocal_stem, language="en")

    if verbose:
        print('\nTranscribed text:')
        print(result['text'] + '\n')

    # profanity_check flags whole words; keep the flagged ones for reporting
    words = result["text"].split()
    profane_indices = predict(words)
    profanities = [word for word, is_profane in zip(words, profane_indices) if is_profane]
    if not profanities:
        print(f'No profanities detected in {input_file}')
        # sys.exit()
    if verbose:
        print('Profanities found in text:')
        print(profanities)

    vocals = AudioSegment.from_wav(vocal_stem)

    # mute every word whose (punctuation-stripped) text is in PROFANE_WORDS,
    # using the word-level timestamps from whisper_timestamped
    for segment in result["segments"]:
        for word in segment["words"]:
            if word["text"].lower().strip(" .,!?") in PROFANE_WORDS:
                start_time = int(word["start"] * 1000)  # pydub works in milliseconds
                end_time = int(word["end"] * 1000)
                silence = AudioSegment.silent(duration=end_time - start_time)
                vocals = vocals[:start_time] + silence + vocals[end_time:]

    # re-overlay the censored vocals onto the instrumental stem
    mix = AudioSegment.from_wav(instr_stem).overlay(vocals)
    outpath = input_file.replace('.mp3', '_masked.mp3').replace('.wav', '_masked.wav')
    # export in the format matching the output extension
    mix.export(outpath, format="mp3" if outpath.endswith('.mp3') else "wav")

    print(f'Mixed file written to: {outpath}')

    # out = create_tmp_copy_of_file(outpath)
    return outpath
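
# whisper_timestamped returns the usual Whisper dict plus word-level timing,
# roughly:
#   {"text": "...",
#    "segments": [{"words": [{"text": "word", "start": 1.23, "end": 1.57}, ...],
#                  ...}, ...]}
# Typical call (writes e.g. "song_masked.mp3" next to the input):
#   process_audio("song.mp3", model_size="tiny", verbose=True)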

def transcribe(microphone=None, file_upload=None):
    if (microphone is not None) and (file_upload is not None):
        print(
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded."
        )
    elif (microphone is None) and (file_upload is None):
        return "ERROR: You have to either use the microphone or upload an audio file"

    file = microphone if microphone is not None else file_upload
    processed_file = process_audio(file)
    print('File successfully processed:', processed_file)

    return processed_file
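
# Note: transcribe returns the *path* of the masked file; the Streamlit UI
# below reopens that path and streams the bytes back to the browser.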

def _return_yt_html_embed(yt_url):
    # take the ?v= id and drop any trailing query parameters
    video_id = yt_url.split("?v=")[-1].split("&")[0]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str

def yt_transcribe(yt_url):
    yt = pt.YouTube(yt_url)
    html_embed_str = _return_yt_html_embed(yt_url)
    stream = yt.streams.filter(only_audio=True)[0]
    stream.download(filename="audio.mp3")

    processed_file = process_audio("audio.mp3")

    # return the output *path* (not an AudioSegment) so the UI can reopen it
    return html_embed_str, processed_file
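
# Note: pytube's audio-only streams are usually MP4/WebM containers, so
# "audio.mp3" is a misnomer -- the download is not transcoded. That is fine
# here because Spleeter decodes via ffmpeg, which sniffs the real container
# format rather than trusting the file extension.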

# STREAMLIT #

st.title("Whisper Large V2: Transcribe Audio")

# bare expressions like this f-string are rendered by Streamlit's "magic"
f"""
Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the
checkpoint {MODEL_NAME} and 🤗 Transformers to transcribe audio files of arbitrary length.
"""

tab1, tab2 = st.tabs(["Transcribe Audio", "Transcribe YouTube"])

with tab1:  # file upload
    uploaded_file = st.file_uploader("Upload your audio file here", type=["mp3", "wav"], help="Drag and drop or click to choose file")
    if uploaded_file is not None:
        bytes_data = uploaded_file.read()

        st.write("Your uploaded file")
        st.audio(bytes_data)
        # format can be specified, default is wav
        # st.audio(bytes_data, format="audio/mp3")

        st.markdown("---")
        st.write("## Your processed file")
        with st.spinner("...is being processed"):
            # the uploaded file lives in RAM, so write it to disk to pass into `transcribe`
            with open(uploaded_file.name, "wb") as f:
                f.write(uploaded_file.getbuffer())

            processed_audio = transcribe(microphone=None, file_upload=uploaded_file.name)

            with open(processed_audio, 'rb') as audio_file:
                st.audio(audio_file.read())

with tab2:  # youtube
    link = st.text_input("Paste your YouTube link", placeholder="https://www.youtube.com/watch?v=EuEe3WKpbCo")
    if link != "":

        try:
            st.video(link)
        except Exception:
            st.warning("Not a video")
            st.stop()

        with st.spinner("YouTube link is being processed"):
            # the embed HTML is unused here since st.video already renders a player
            html_embed_str, audio = yt_transcribe(link)

        with open(audio, 'rb') as audio_file:
            st.audio(audio_file.read())
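
# Assumed (not pinned in this commit) dependencies: streamlit, torch, pytube,
# whisper-timestamped, pydub, spleeter and profanity-check, plus an ffmpeg
# binary on PATH for pydub and Spleeter.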