Spaces:
Runtime error
Runtime error
Commit
·
df07f44
1
Parent(s):
1718583
added functions for video download, captions, translations, and audio
Browse files
app.py
CHANGED
@@ -1,4 +1,154 @@
|
|
1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
|
4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
+
import whisper
|
3 |
+
import os
|
4 |
+
import ffmpeg
|
5 |
+
import textwrap
|
6 |
+
from flask import Flask
|
7 |
+
from pytube import YouTube
|
8 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
9 |
+
from youtube_transcript_api.formatters import SRTFormatter
|
10 |
+
from deep_translator import GoogleTranslator
|
11 |
|
12 |
+
|
13 |
+
'''
|
14 |
+
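To run the API, run this file with Python (for example " python app.py "); the `app` object is a Flask (WSGI) application, so " uvicorn milestone-2:app ", which expects an ASGI application, cannot serve it as is.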
|
15 |
+
'''
|
16 |
+
|
17 |
+
def _safe_file_stem(title: str) -> str:
    """Return *title* with characters that are illegal in file names replaced by '_'."""
    forbidden = '\\/:*?"<>|'
    cleaned = "".join(
        "_" if ch in forbidden or ord(ch) < 32 else ch for ch in title
    ).strip()
    # An empty name would produce a file called just ".mp3".
    return cleaned or "audio"


def download_audio(url: str, download_path: str):
    """Download the first audio-only stream of a YouTube video.

    Args:
        url: Address of the YouTube video.
        download_path: Directory the audio file is written to.

    Returns:
        The path of the saved file (``<download_path>/<title>.mp3``) on
        success, or a ``(400, message)`` tuple describing the failure.

    NOTE(review): the stream is saved under a ``.mp3`` name without any
    transcoding; the real container is presumably whatever pytube serves
    for the selected stream -- confirm before relying on the extension.
    """
    try:
        yt = YouTube(url)
        audio = yt.streams.filter(only_audio=True).first()
        if audio is None:
            # No audio-only stream: report it explicitly instead of letting
            # the AttributeError on None surface through the generic handler.
            return 400, "Error: audio souce not avaliable or cannot be download"

        # The title is free text; strip characters that cannot be part of a
        # file name (a '/' would otherwise be read as a directory).
        file_name = _safe_file_stem(yt.title) + '.mp3'
        audio.download(output_path=download_path, filename=file_name)

    except KeyError:
        return 400, "Error: audio souce not avaliable or cannot be download"
    except ValueError:
        return 400, "Error: invalide URL"
    except Exception as e:
        return 400, "Error downloading video: " + str(e)

    return os.path.join(download_path, file_name)
|
34 |
+
|
35 |
+
|
36 |
+
def _extract_video_id(url: str) -> str:
    """Return the video id carried by a YouTube *url*.

    Raises:
        ValueError: if the URL carries no video id.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import parse_qs, urlparse

    parsed = urlparse(url)
    # parse_qs isolates the "v" parameter, so trailing parameters such as
    # "&t=10s" are no longer glued onto the id.
    candidates = parse_qs(parsed.query).get("v", [])
    if candidates and candidates[0]:
        return candidates[0]
    # Short links carry the id as the first path segment.
    if parsed.netloc.lower() in ("youtu.be", "www.youtu.be"):
        short_id = parsed.path.strip("/").split("/")[0]
        if short_id:
            return short_id
    raise ValueError("no video id in URL: %s" % url)


def download_captions(url: str, download_path: str):
    """Fetch a video's transcript and save it as ``<title>.srt``.

    Args:
        url: Address of the YouTube video.
        download_path: Directory the ``.srt`` file is written to.

    Returns:
        ``None`` on success, or a ``(400, message)`` tuple describing the
        failure.
    """
    try:
        # Raises ValueError for URLs without a video id, which is reported
        # by the "invalide URL" handler below.
        vid_id = _extract_video_id(url)
        yt = YouTube(url)
        caption = YouTubeTranscriptApi.get_transcript(vid_id)
        srt_formatted = SRTFormatter().format_transcript(caption)
        file_name = yt.title + '.srt'
        file_path = os.path.join(download_path, file_name)
        # The with-statement closes the file; no explicit close() is needed.
        with open(file_path, 'w', encoding='utf-8') as srt_file:
            srt_file.write(srt_formatted)
    except KeyError:
        return 400, "Error: video not avaliable or cannot be download"
    except ValueError:
        return 400, "Error: invalide URL"
    except Exception as e:
        return 400, "Error extracting transcript from: " + str(e)

    return None
|
57 |
+
|
58 |
+
def sep_audio(video: str, output_path):  # separates audio from video file
    """Extract the audio track of a video file into ``<name>.mp3``.

    Args:
        video: Path of the video file to read.
        output_path: Directory the ``.mp3`` file is written to.

    Returns:
        The path of the created audio file, or ``None`` when the video file
        is missing or ffmpeg fails (a message is printed in both cases).
    """
    # Check up front so a missing file is reported as such and the function
    # never continues with an undefined audio stream.
    if not os.path.isfile(video):
        print("%s file couldn't be accessed" % video)
        return None

    # basename/splitext keep dotted names intact ("a.b.mp4" -> "a.b.mp3")
    # and do not assume '/' as the path separator.
    stem = os.path.splitext(os.path.basename(video))[0]
    file_path = os.path.join(output_path, stem + '.mp3')

    try:
        audio = ffmpeg.input(video).audio.filter("anull")
        ffmpeg.output(audio, file_path).run()
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit still
        # propagate.
        print("error creating audio file")
        return None

    return file_path
|
76 |
+
|
77 |
+
|
78 |
+
def transcribe_audio(input_file: str, output_path: str):  # eventually add a check for if file is mp3
    """Transcribe an audio file with Whisper and save the text as ``<name>.txt``.

    The transcript is wrapped to 100 columns before it is written.

    Args:
        input_file: Path of the audio file to transcribe.
        output_path: Directory the ``.txt`` file is written to.

    Returns:
        The path of the written transcript, or ``None`` when the input file
        or the output directory cannot be accessed (a message is printed).
    """
    # Guard first: an upstream step may hand over None or a path that does
    # not exist, and continuing would only fail later on an unbound name.
    if not input_file or not os.path.isfile(input_file):
        print("%s file was not found " % input_file)
        return None

    try:
        model = whisper.load_model("base")
        result = model.transcribe(input_file)
    except FileNotFoundError:
        # Kept from the original: a FileNotFoundError raised by the Whisper
        # call is reported the same way as a missing input file.
        print("%s file was not found " % input_file)
        return None

    stem = os.path.splitext(os.path.basename(input_file))[0]
    file_path = os.path.join(output_path, stem + ".txt")
    wrapped_text = textwrap.fill(result["text"], width=100)

    try:
        # The with-statement closes the file; no explicit close() is needed.
        with open(file_path, 'w', encoding='utf-8') as out_file:
            out_file.write(wrapped_text)
    except FileNotFoundError:
        print("%s this dir can't be accessed " % output_path)
        return None

    return file_path
|
98 |
+
|
99 |
+
def translate_text(input_file: str, output_path: str, lang: str):
    """Translate a text file line by line from English into *lang*.

    The result is written to ``<name> translation.txt`` inside
    *output_path*, where ``<name>`` is the input file name without its
    extension.

    Args:
        input_file: Path of the text file to translate.
        output_path: Directory the translated file is written to.
        lang: Target language passed to ``GoogleTranslator``.

    Returns:
        ``None``. Failures to open either file are reported with a printed
        message.
    """
    # Guard first: an upstream step may hand over None instead of a path.
    if not input_file:
        print("%s file was not found " % input_file)
        return None

    try:  # try to open our caption file
        in_file = open(input_file, 'r', encoding="utf8")
    except FileNotFoundError:
        print("%s file was not found " % input_file)
        return None

    # From here on the with-blocks close both files even if a translation
    # request raises midway through.
    with in_file:
        translator = GoogleTranslator(source='english', target=lang)

        # basename/splitext handle absolute paths and dotted file names.
        stem = os.path.splitext(os.path.basename(input_file))[0]
        out_file_path = os.path.join(output_path, stem + ' translation.txt')

        try:  # try to create a new file to store translation
            out_file = open(out_file_path, 'w', encoding='utf8')
        except FileNotFoundError:
            print("%s this dir can't be accessed " % output_path)
            return None

        with out_file:
            for line in in_file:
                text = line.rstrip('\n')
                if not text.strip():
                    # Nothing to translate; keep the blank line as is.
                    out_file.write('\n')
                    continue
                translated_line = translator.translate(text)
                # Fall back to the source text if the translator hands back
                # nothing, so the concatenation below cannot fail on None.
                if translated_line is None:
                    translated_line = text
                out_file.write(translated_line + '\n')

    print('%s has be sucessfully translate' % input_file)
    return None
|
122 |
+
|
123 |
+
|
124 |
+
|
125 |
+
# Flask application object; the "/" route defined below is registered on it.
app = Flask(__name__)
|
126 |
+
|
127 |
+
|
128 |
+
@app.route("/")
def main():
    """Process the first video in ``./videos``: extract audio, transcribe, translate.

    Working directories are resolved against the current working directory:
    extracted audio goes to ``audio``, the Whisper transcript to
    ``whisper_transcripts`` and the German translation to the current
    directory itself.

    Returns:
        A ``(body, status)`` tuple, the order Flask expects from a view.
    """
    current_dir = os.getcwd()
    audio_dir_path = os.path.join(current_dir, 'audio')
    video_dir_path = os.path.join(current_dir, 'videos')
    whisper_dir_path = os.path.join(current_dir, 'whisper_transcripts')

    # exist_ok avoids the check-then-create race; the videos directory is
    # created too so listing it below cannot raise FileNotFoundError.
    for directory in (audio_dir_path, video_dir_path, whisper_dir_path):
        os.makedirs(directory, exist_ok=True)

    # Sorted so "first file" is deterministic; sub-directories are skipped.
    videos = sorted(
        name
        for name in os.listdir(video_dir_path)
        if os.path.isfile(os.path.join(video_dir_path, name))
    )
    if not videos:
        return "Error: no video files found in %s" % video_dir_path, 404
    in_file = os.path.join(video_dir_path, videos[0])

    audio_file = sep_audio(in_file, audio_dir_path)
    if audio_file is None:
        return "Error: audio could not be extracted from %s" % in_file, 500

    transcript_file = transcribe_audio(audio_file, whisper_dir_path)
    if transcript_file is None:
        return "Error: %s could not be transcribed" % audio_file, 500

    translate_text(transcript_file, current_dir, 'german')

    # Flask reads a tuple as (body, status); the reversed order is rejected.
    return "audio has been transcribed successfully", 200
|
152 |
+
|
153 |
+
if __name__ == "__main__":
    # debug=True turns on the Werkzeug interactive debugger, which lets
    # anyone who can reach the server execute arbitrary Python. It is
    # therefore opt-in through the FLASK_DEBUG environment variable rather
    # than hard-coded on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
        "1",
        "true",
        "yes",
        "on",
    }
    app.run(debug=debug_enabled)
|