Mei000 committed on
Commit
025c85b
1 Parent(s): 6ef2142

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -0
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """app.py
3
+ Automatically generated by Colaboratory.
4
+ Original file is located at
5
+ https://colab.research.google.com/drive/1sjpiZeNESzMVcNpywmkNPS6Wwu6pTfrE
6
+ """
7
+
8
+
9
+ import os
10
+ import gradio as gr
11
+ from transformers import pipeline
12
+ from pytube import YouTube
13
+ from datasets import Dataset, Audio
14
+ from moviepy.editor import AudioFileClip
15
+
16
# Hugging Face pipeline built once at import time; transcribe() reuses this
# module-level instance for every call.
_ASR_MODEL_ID = "Qinglinz/whisper-small-hi"
pipe = pipeline(model=_ASR_MODEL_ID)
17
+
18
def download_from_youtube(url):
    """
    Downloads the audio track of the given YouTube URL and returns the path to the audio file.

    :param url: URL of the YouTube video.
    :return: Path of the downloaded file, as returned by the stream's ``download()``.
    :raises ValueError: If the video has no audio-only mp4 stream.
    """
    streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')
    stream = streams.first()
    # first() yields None when the filter matches nothing; fail with a clear
    # message instead of an AttributeError on ``None.download()``.
    if stream is None:
        raise ValueError(f"No audio-only mp4 stream available for {url!r}")
    return stream.download()
25
+
26
def get_timestamp(seconds):
    """
    Formats a duration given in seconds as a zero-padded ``MM:SS`` string.

    Fractions of a second are truncated; minutes are not wrapped at 60.
    """
    whole_minutes = int(seconds / 60)
    leftover_seconds = int(seconds % 60)
    return f"{whole_minutes:02d}:{leftover_seconds:02d}"
33
+
34
def divide_into_30s_segments(audio_fpath, seconds_max):
    """
    Divides the audio file into 30s segments and returns the paths to the segments and the start times of the segments.

    Segments are written as ``segmented_audios/segment_<i>.wav``. A trailing
    segment of two seconds or less is dropped.

    :param audio_fpath: Path to the audio file.
    :param seconds_max: Maximum number of seconds to consider. If the audio file is longer than this, it will be truncated.
    :return: Tuple ``(segment_paths, segment_start_times)``; start times are in seconds.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("segmented_audios", exist_ok=True)

    segment_paths = []
    segment_start_times = []

    sound = AudioFileClip(audio_fpath)
    try:
        n_full_segments = int(sound.duration / 30)
        len_last_segment = sound.duration % 30

        max_segments = int(seconds_max / 30)
        if n_full_segments > max_segments:
            # Audio is longer than the limit: keep only full segments up to the cap.
            n_full_segments = max_segments
            len_last_segment = 0

        segments_available = n_full_segments + 1
        for i in range(min(segments_available, max_segments)):
            start = i * 30

            is_last_segment = i == n_full_segments
            if is_last_segment:
                # Skip last segment if it is not longer than two seconds
                if not len_last_segment > 2:
                    continue
                end = start + len_last_segment
            else:
                end = (i + 1) * 30

            segment_path = os.path.join("segmented_audios", f"segment_{i}.wav")
            sound.subclip(start, end).write_audiofile(segment_path)
            segment_paths.append(segment_path)
            segment_start_times.append(start)
    finally:
        # Release the resources held by the clip even if exporting a segment fails.
        sound.close()

    return segment_paths, segment_start_times
75
def get_translation(text):
    """
    Placeholder for translating the transcribed text to English.

    The argument is currently ignored and a fixed TODO message is returned.
    """
    placeholder = "TODO: Make API call to Google Translate to get English translation"
    return placeholder
80
+
81
def transcribe(audio, url, seconds_max):
    """
    Transcribes a YouTube video if a url is specified and returns the transcription.
    If no url is specified, it transcribes the audio file as passed by Gradio.

    :param audio: Audio file as passed by Gradio. Only used if no url is specified.
    :param url: YouTube URL to transcribe. ``None`` or whitespace-only counts as not specified.
    :param seconds_max: Maximum number of seconds to consider. If the audio file is longer than this, it will be truncated.
    :return: The transcription text, or a short hint when neither input was provided.
    """
    url = (url or "").strip()

    if not url:
        # Nothing recorded and no link given: return a hint instead of
        # handing None to the pipeline.
        if audio is None:
            return "Please record audio or enter a YouTube URL."
        return pipe(audio)["text"]

    fpath = download_from_youtube(url)
    segment_paths, segment_start_times = divide_into_30s_segments(fpath, seconds_max)

    audio_dataset = Dataset.from_dict({"audio": segment_paths}).cast_column("audio", Audio(sampling_rate=16000))
    pred = pipe(audio_dataset["audio"])

    n_segments = len(segment_start_times)
    # Collect one formatted chunk per segment and join once at the end.
    chunks = []
    for i, (seconds, output) in enumerate(zip(segment_start_times, pred), start=1):
        chunks.append(
            f"[Segment {i}/{n_segments}, start time {get_timestamp(seconds)}]\n"
            f"{output['text']}\n"
            f"[Translation]\n{get_translation(output['text'])}\n\n"
        )
    return "".join(chunks)
106
+
107
# Input widgets, listed in the positional order of transcribe(audio, url, seconds_max).
_input_components = [
    gr.Audio(source="microphone", type="filepath", label="Transcribe from Microphone"),
    gr.Text(max_lines=1, placeholder="Enter YouTube Link which has a Swedish video", label="Transcribe from YouTube URL"),
    gr.Slider(minimum=30, maximum=300, value=30, step=30, label="Number of seconds to transcribe from YouTube URL"),
]

# Wire transcribe() to the inputs above with a plain-text output.
block = gr.Interface(
    fn=transcribe,
    inputs=_input_components,
    outputs="text",
    title="Whisper Small Swedish",
    description="Realtime Swedish speech recognition",
)

block.launch()