# File size: 12,515 Bytes | 54fe84c  (web-viewer header; the line-number gutter that followed it was removed)
# app.py
from utils import language_dict
import math
import torch
import gc
import time
import subprocess
from faster_whisper import WhisperModel
import os
import mimetypes
import shutil
import re
import uuid
from pydub import AudioSegment
import torch
def get_language_name(lang_code):
    """Return the language name whose ``lang_code`` equals *lang_code*.

    Scans the module-level ``language_dict`` (name -> details mapping) in
    iteration order and returns the first matching name, or ``None`` when no
    entry carries that code.
    """
    matches = (
        name
        for name, details in language_dict.items()
        if details["lang_code"] == lang_code
    )
    return next(matches, None)
def clean_file_name(file_path):
    """Return *file_path* with a sanitised, uniquified base name.

    The directory part and the extension are kept as they are.  In the stem,
    every run of characters not matched by ``[a-zA-Z\\d]`` becomes a single
    underscore, leading/trailing underscores are stripped, and a random
    6-character hex tag is appended so repeated uploads do not collide.
    """
    directory, base = os.path.split(file_path)
    stem, extension = os.path.splitext(base)

    # First pass replaces disallowed runs, second pass squeezes underscores.
    underscored = re.sub(r'[^a-zA-Z\d]+', '_', stem)
    slug = re.sub(r'_+', '_', underscored).strip('_')

    tag = uuid.uuid4().hex[:6]
    return os.path.join(directory, f"{slug}_{tag}{extension}")
def get_audio_file(uploaded_file):
    """Copy or extract the audio of *uploaded_file* into ``subtitle_audio``.

    The media kind is guessed from the file name with ``mimetypes``:

    * audio files are copied under a cleaned, unique name;
    * video files are copied to a short temporary name and converted to
      ``.mp3`` with ffmpeg (``-hwaccel cuda`` is added when CUDA is
      available); the temporary copy is always removed afterwards.

    Args:
        uploaded_file: Path of the uploaded audio or video file.

    Returns:
        ``(audio_file_path, duration_seconds)`` where the duration is read
        back from the stored file with pydub.

    Raises:
        ValueError: The file is neither audio nor video according to its
            guessed MIME type.
        RuntimeError: ffmpeg did not produce the expected audio file.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    mime_type, _ = mimetypes.guess_type(uploaded_file)

    audio_folder = f"{base_path}/subtitle_audio"
    os.makedirs(audio_folder, exist_ok=True)

    if mime_type and mime_type.startswith('audio'):
        # Audio input: keep the data as is, only normalise the file name.
        audio_file_path = clean_file_name(
            os.path.join(audio_folder, os.path.basename(uploaded_file))
        )
        shutil.copy(uploaded_file, audio_file_path)
    elif mime_type and mime_type.startswith('video'):
        stem = os.path.splitext(os.path.basename(uploaded_file))[0]
        audio_file_path = clean_file_name(os.path.join(audio_folder, stem + ".mp3"))

        # Work on a copy with a short random name and the original extension.
        file_extension = os.path.splitext(uploaded_file)[1]
        new_file_path = os.path.join(
            audio_folder, uuid.uuid4().hex[:6] + file_extension
        )
        shutil.copy(uploaded_file, new_file_path)

        # Argument list + no shell: paths are passed verbatim, so spaces or
        # shell metacharacters in a file name can neither break nor inject.
        command = ["ffmpeg"]
        if device == "cuda":
            command += ["-hwaccel", "cuda"]
        command += ["-i", new_file_path, audio_file_path, "-y"]
        try:
            result = subprocess.run(command)
        finally:
            # Remove the temporary video copy even if ffmpeg could not run.
            if os.path.exists(new_file_path):
                os.remove(new_file_path)
        if not os.path.exists(audio_file_path):
            raise RuntimeError(
                f"ffmpeg failed to extract audio (exit code {result.returncode})"
            )
    else:
        # Previously this fell through with an empty path and failed later
        # inside pydub with an unrelated-looking error.
        raise ValueError(
            f"Unsupported file type for {uploaded_file!r}: {mime_type!r}"
        )

    audio = AudioSegment.from_file(audio_file_path)
    duration_seconds = len(audio) / 1000.0  # pydub lengths are in milliseconds
    return audio_file_path, duration_seconds
def format_segments(segments):
    """Flatten transcription segments into sentence, word and text views.

    Args:
        segments: Iterable of objects exposing ``text``, ``start``, ``end``
            and ``words``; each word exposes ``word``, ``start`` and ``end``.
            ``words`` may be ``None`` or empty, in which case the sentence
            simply has no word entries.

    Returns:
        A tuple ``(sentence_timestamp, words_timestamp, speech_to_text)``:

        * ``sentence_timestamp`` - one dict per segment with ``id`` (0-based
          position), ``text``, ``start``, ``end`` and ``words``;
        * ``words_timestamp`` - every word dict of every segment, in order
          (the same dict objects that appear under ``words``);
        * ``speech_to_text`` - each stripped segment text followed by one
          space, concatenated.
    """
    sentence_timestamp = []
    words_timestamp = []
    texts = []

    # Iterate directly: no need to materialise the iterable first.
    for segment in segments:
        text = segment.text.strip()
        words = [
            {"word": word.word.strip(), "start": word.start, "end": word.end}
            for word in (segment.words or [])
        ]
        sentence_timestamp.append({
            "id": len(sentence_timestamp),
            "text": text,
            "start": segment.start,
            "end": segment.end,
            "words": words,
        })
        words_timestamp.extend(words)
        texts.append(text)

    # Every sentence, including the last one, is followed by a single space.
    speech_to_text = "".join(f"{text} " for text in texts)
    return sentence_timestamp, words_timestamp, speech_to_text
def combine_word_segments(words_timestamp, max_words_per_subtitle=8, min_silence_between_words=0.5):
    """Group word timestamps into subtitle entries.

    A subtitle is closed when any of the following holds:

    * the silence between the previous word's end and the next word's start
      exceeds ``min_silence_between_words`` (the next word starts a new
      subtitle);
    * it already holds ``max_words_per_subtitle`` words (the next word
      starts a new subtitle);
    * the word just added ends with ``.``, ``?`` or ``!`` - the punctuated
      word stays in the subtitle it terminates.

    Args:
        words_timestamp: Iterable of dicts with ``word``, ``start`` and
            ``end`` keys. Entries missing one of these keys are reported on
            stdout and skipped.
        max_words_per_subtitle: Maximum number of words per subtitle.
        min_silence_between_words: Gap in seconds that forces a new subtitle.

    Returns:
        Dict mapping 1-based subtitle numbers to
        ``{"text": ..., "start": ..., "end": ...}``.
    """
    subtitles = {}
    current_words = []
    start = None
    end = None
    last_end_time = None

    def _flush():
        # Store the subtitle being built (if any) under the next number.
        if current_words:
            subtitles[len(subtitles) + 1] = {
                "text": " ".join(current_words),
                "start": start,
                "end": end,
            }
            current_words.clear()

    for entry in words_timestamp:
        try:
            word = entry['word']
            word_start = entry['start']
            word_end = entry['end']
        except KeyError as e:
            print(f"KeyError: {e} - Skipping word")
            continue

        long_pause = (
            last_end_time is not None
            and word_start - last_end_time > min_silence_between_words
        )
        if long_pause or len(current_words) >= max_words_per_subtitle:
            _flush()

        if not current_words:
            start = word_start  # first word of a new subtitle
        current_words.append(word)
        end = word_end
        last_end_time = word_end

        # Sentence-ending punctuation closes the subtitle *after* the word,
        # so "world." ends its sentence instead of opening the next one.
        if word.endswith(('.', '?', '!')):
            _flush()

    _flush()  # whatever is left after the last word
    return subtitles
def convert_time_to_srt_format(seconds):
    """Convert a time in seconds to the SRT timestamp format ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond first and then split
    with integer arithmetic, so binary floating-point noise cannot drop a
    millisecond (2.3 -> ``00:00:02,300``, not ``...,299``).  Negative values
    are clamped to zero because SRT has no negative timestamps.

    Args:
        seconds: Time offset in seconds (int or float).

    Returns:
        The formatted timestamp string.
    """
    total_ms = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, milliseconds = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"
def write_subtitles_to_file(subtitles, filename="subtitles.srt"):
    """Write *subtitles* to *filename* as a UTF-8 SRT file.

    Args:
        subtitles: Mapping of cue number to a dict with ``text``, ``start``
            and ``end`` (times in seconds).
        filename: Destination path; an existing file is overwritten.

    Entries whose ``start`` or ``end`` is ``None`` are reported on stdout
    and skipped; nothing is written for them, so the file never contains a
    cue number without its timing line.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for cue_number, entry in subtitles.items():
            start, end = entry['start'], entry['end']
            if start is None or end is None:
                print(f"Skipping subtitle {cue_number}: missing start/end timestamp")
                continue
            start_time = convert_time_to_srt_format(start)
            end_time = convert_time_to_srt_format(end)
            f.write(f"{cue_number}\n")
            f.write(f"{start_time} --> {end_time}\n")
            f.write(f"{entry['text']}\n\n")
def word_level_srt(words_timestamp, srt_path="world_level_subtitle.srt"):
    """Write one SRT cue per word to *srt_path* (UTF-8, overwriting).

    Cues are numbered from 1 in the order of *words_timestamp*; each item
    supplies ``start``, ``end`` and ``word``.
    """
    def _cue(number, info):
        # One complete cue: number, timing line, text, blank line.
        begin = convert_time_to_srt_format(info['start'])
        finish = convert_time_to_srt_format(info['end'])
        return f"{number}\n{begin} --> {finish}\n{info['word']}\n\n"

    with open(srt_path, 'w', encoding='utf-8') as srt_file:
        srt_file.writelines(
            _cue(number, info)
            for number, info in enumerate(words_timestamp, start=1)
        )
def generate_srt_from_sentences(sentence_timestamp, srt_path="default_subtitle.srt"):
    """Write one SRT cue per sentence to *srt_path* (UTF-8, overwriting).

    Each item of *sentence_timestamp* supplies ``start``, ``end`` and
    ``text``; cues are numbered from 1 in list order.
    """
    with open(srt_path, 'w', encoding='utf-8') as srt_file:
        for cue_number, sentence in enumerate(sentence_timestamp, start=1):
            begin = convert_time_to_srt_format(sentence['start'])
            finish = convert_time_to_srt_format(sentence['end'])
            cue_lines = (
                f"{cue_number}",
                f"{begin} --> {finish}",
                f"{sentence['text']}",
            )
            # Cue lines are newline-separated and closed by a blank line.
            srt_file.write("\n".join(cue_lines) + "\n\n")
def whisper_subtitle(uploaded_file, Source_Language, max_words_per_subtitle=8):
    """Transcribe *uploaded_file* and write its subtitle files.

    Loads ``deepdml/faster-whisper-large-v3-turbo-ct2`` (CUDA/float16 when
    CUDA is available, otherwise CPU/int8), transcribes the audio with word
    timestamps and writes four files under ``{base_path}/generated_subtitle``.

    Args:
        uploaded_file: Path of the audio or video file to transcribe.
        Source_Language: ``"Automatic"`` to let the model detect the
            language, otherwise a key of ``language_dict``.
        max_words_per_subtitle: Word limit per cue of the customised SRT.

    Returns:
        ``(default_srt, customised_srt, word_level_srt, text_file)`` paths.

    Raises:
        KeyError: *Source_Language* is not ``"Automatic"`` and not a key of
            ``language_dict``.
    """
    if torch.cuda.is_available():
        device, compute_type = "cuda", "float16"
    else:
        device, compute_type = "cpu", "int8"

    faster_whisper_model = WhisperModel(
        "deepdml/faster-whisper-large-v3-turbo-ct2",
        device=device,
        compute_type=compute_type,
    )
    audio_path = None
    try:
        audio_path, _audio_duration = get_audio_file(uploaded_file)
        if Source_Language == "Automatic":
            segments, info = faster_whisper_model.transcribe(
                audio_path, word_timestamps=True
            )
            lang_code = info.language
            # Fall back to the raw code when the detected language is not in
            # language_dict, so the file name never contains "None".
            src_lang = get_language_name(lang_code) or lang_code
        else:
            lang = language_dict[Source_Language]['lang_code']
            segments, info = faster_whisper_model.transcribe(
                audio_path, word_timestamps=True, language=lang
            )
            src_lang = Source_Language
        sentence_timestamp, words_timestamp, text = format_segments(segments)
    finally:
        # Release the temporary audio and the model on failure as well.
        if audio_path and os.path.exists(audio_path):
            os.remove(audio_path)
        del faster_whisper_model
        gc.collect()
        torch.cuda.empty_cache()

    word_segments = combine_word_segments(
        words_timestamp,
        max_words_per_subtitle=max_words_per_subtitle,
        min_silence_between_words=0.5,
    )

    # Output names: "<first 30 chars of the upload's stem>_<language>" run
    # through clean_file_name.
    output_dir = f"{base_path}/generated_subtitle"
    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.basename(uploaded_file).rsplit('.', 1)[0][:30]
    original_srt_name = clean_file_name(f"{output_dir}/{base_name}_{src_lang}.srt")
    # Derive siblings from the stem; replacing ".srt" textually would also
    # rewrite a ".srt" that happens to occur earlier in the path.
    stem = os.path.splitext(original_srt_name)[0]
    original_txt_name = stem + ".txt"
    word_level_srt_name = stem + "_word_level.srt"
    default_srt_name = stem + "_default.srt"

    generate_srt_from_sentences(sentence_timestamp, srt_path=default_srt_name)
    word_level_srt(words_timestamp, srt_path=word_level_srt_name)
    write_subtitles_to_file(word_segments, filename=original_srt_name)
    with open(original_txt_name, 'w', encoding='utf-8') as f1:
        f1.write(text)
    return default_srt_name, original_srt_name, word_level_srt_name, original_txt_name
#@title Using Gradio Interface
def subtitle_maker(Audio_or_Video_File, Source_Language, max_words_per_subtitle):
    """Gradio callback: create subtitle files for an uploaded media file.

    Args:
        Audio_or_Video_File: Uploaded file, forwarded to ``whisper_subtitle``.
        Source_Language: ``"Automatic"`` or a language name.
        max_words_per_subtitle: Word limit per cue as delivered by the UI;
            it is coerced to an integer >= 1, and 8 is used when the value
            is missing or not a finite number.

    Returns:
        ``(default_srt, customised_srt, word_level_srt, text_file)`` paths,
        or four ``None`` values when processing fails.  The failure's
        traceback is printed so it is visible in the server log.
    """
    import traceback  # local import: only needed on the error path

    try:
        max_words = max(1, int(max_words_per_subtitle))
    except (TypeError, ValueError, OverflowError):
        max_words = 8

    try:
        default_srt_path, customize_srt_path, word_level_srt_path, text_path = whisper_subtitle(
            Audio_or_Video_File, Source_Language, max_words_per_subtitle=max_words
        )
    except Exception:
        # Keep the UI responsive (empty outputs) but do not hide the cause;
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        traceback.print_exc()
        default_srt_path, customize_srt_path, word_level_srt_path, text_path = None, None, None, None
    return default_srt_path, customize_srt_path, word_level_srt_path, text_path
import gradio as gr
import click
# Root directory under which the working and output folders are created.
base_path = "."

# Make sure the output folder for generated subtitle files exists.
_generated_subtitle_dir = f"{base_path}/generated_subtitle"
if not os.path.exists(_generated_subtitle_dir):
    os.makedirs(_generated_subtitle_dir, exist_ok=True)

# Choices for the language dropdown: automatic detection first, then every
# language name known to language_dict.
available_language = language_dict.keys()
source_lang_list = ['Automatic', *available_language]
# Command-line entry point: builds the Gradio interface around
# subtitle_maker and launches it.  --debug and --share are forwarded to
# launch().  (Documented with comments rather than a docstring because click
# would show a docstring as the command's --help text.)
@click.command()
@click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
@click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
def main(debug, share):
    # Inputs, in the positional order subtitle_maker expects them.
    upload_box = gr.File(label="Upload Audio or Video File")
    language_picker = gr.Dropdown(label="Language", choices=source_lang_list, value="Automatic")
    word_limit = gr.Number(label="Max Word Per Subtitle Segment", value=8)

    # Outputs, in the order subtitle_maker returns the four file paths.
    output_labels = (
        "Default SRT File",
        "Customize SRT File",
        "Word Level SRT File",
        "Text File",
    )
    result_files = [gr.File(label=caption, show_label=True) for caption in output_labels]

    app = gr.Interface(
        fn=subtitle_maker,
        inputs=[upload_box, language_picker, word_limit],
        outputs=result_files,
        title="Whisper-Large-V3-Turbo-Ct2 Subtitle Maker",
    )
    app.launch(debug=debug, share=share)
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()