Amrrs committed on
Commit
8caf80e
1 Parent(s): 02c7738

Create app.py

Files changed (1)
  1. app.py +73 -0
app.py ADDED
@@ -0,0 +1,73 @@
+ # Imports
+ from pytube import YouTube
+ from huggingsound import SpeechRecognitionModel
+ import torch
+ from transformers import pipeline
+ import os
+ import gradio as gr
+
+ # Constants
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ MODEL = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english", device=DEVICE)
+ summarizationPipeline = pipeline('summarization')  # Hugging Face's default summarization pipeline
+ SAMPLING_RATE = 16000
+ USE_ONNX = False  # set to True to load the ONNX version of the Silero VAD model
+ torch.set_num_threads(1)
+
+ def transcribeVideo(VIDEO_URL):
+     # Download the audio-only '.mp4' stream of the video, then convert it to a 16 kHz '.wav' file
+     ytVideo = YouTube(VIDEO_URL)
+     ytVideo.streams \
+         .filter(only_audio=True, file_extension='mp4') \
+         .first() \
+         .download(filename='ytaudio.mp4')
+
+     os.system("ffmpeg -i ytaudio.mp4 -acodec pcm_s16le -ar 16000 ytaudio.wav")
+
+     # Audio chunking with Silero VAD: split the recording into speech-only segments
+     # so that each segment can be transcribed separately
+     model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
+                                   model='silero_vad',
+                                   force_reload=True,
+                                   onnx=USE_ONNX)
+     (get_speech_timestamps,
+      save_audio,
+      read_audio,
+      VADIterator,
+      collect_chunks) = utils
+
+     # Read the '.wav' audio file
+     audioFile = read_audio('ytaudio.wav', sampling_rate=SAMPLING_RATE)
+     # Get speech timestamps from the full audio file
+     speechTimestamps = get_speech_timestamps(audioFile, model, sampling_rate=SAMPLING_RATE)
+
+     # Save the speech chunks as separate audio files
+     index = 0
+     for timestamp in speechTimestamps:
+         startTime = timestamp['start']
+         endTime = timestamp['end']
+         save_audio(f'speech-{index}.wav', audioFile[startTime:endTime], sampling_rate=SAMPLING_RATE)
+         index += 1
+
+     # Collect the paths of the separated audio chunks
+     audioChunksPath = []
+     for i in range(len(speechTimestamps)):
+         audioChunksPath.append(f'speech-{i}.wav')
+
+     # Generate individual transcriptions and concatenate them
+     transcriptions = MODEL.transcribe(audioChunksPath)
+
+     fullTranscript = ''
+     for transcript in transcriptions:
+         fullTranscript += transcript['transcription'] + ' '
+
+     return fullTranscript
+
+ def summarizeTranscription(VIDEO_URL):
+     fullTranscript = transcribeVideo(VIDEO_URL)
+
+     # Generate a summary from the full transcript
+     summarizedText = summarizationPipeline(fullTranscript, max_length=300, min_length=75, do_sample=False)
+     return summarizedText[0]['summary_text']
+
+ iface = gr.Interface(fn=summarizeTranscription, inputs=["text"], outputs=["textbox"], title='YouTube Video Summarizer').launch(inline=False)
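
A minimal usage sketch for trying the summarizer outside the Gradio UI, assuming ffmpeg and the imported packages (pytube, huggingsound, torch, transformers, gradio) are installed and that summarizeTranscription is available in the current session (importing app.py directly would also trigger the launch() call at the bottom). The URL below is a placeholder, not part of the commit:

    # Hypothetical smoke test (not part of the committed file)
    video_url = "https://www.youtube.com/watch?v=<video-id>"  # placeholder URL
    print(summarizeTranscription(video_url))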