# Imports
import os
import subprocess  # used to run shell commands for environment setup
from subprocess import STDOUT

from pytube import YouTube

# Reinstall SoundFile to get a clean, up-to-date copy
os.system('pip uninstall -y soundfile')
os.system('pip install --upgrade --force-reinstall SoundFile')

# Install libsndfile (assumes a Debian/Ubuntu host with apt-get and root access)
proc = subprocess.Popen(
    'apt-get update -y && apt-get install -y --no-install-recommends build-essential gcc libsndfile1',
    shell=True, stdin=None, stdout=open(os.devnull, 'wb'), stderr=STDOUT,
    executable='/bin/bash')
proc.wait()
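
# Sanity check: soundfile raises an OSError at import time if libsndfile is
# still missing, so failing here is better than failing mid-transcription
import soundfile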


from huggingsound import SpeechRecognitionModel
import torch
from transformers import pipeline
import gradio as gr

# Constants
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english", device=DEVICE)
summarizationPipeline = pipeline('summarization')  # Hugging Face's default summarization pipeline
SAMPLING_RATE = 16000
USE_ONNX = False  # whether Silero VAD should run via ONNX Runtime
torch.set_num_threads(1)

def transcribeVideo(VIDEO_URL):
  # Download the audio-only '.mp4' stream of the video
  ytVideo = YouTube(VIDEO_URL)
  ytVideo.streams \
    .filter(only_audio=True, file_extension='mp4') \
    .first() \
    .download(filename='ytaudio.mp4')

  # Convert it to a 16-bit PCM '.wav' file sampled at 16 kHz
  os.system("ffmpeg -i ytaudio.mp4 -acodec pcm_s16le -ar 16000 ytaudio.wav")

  # Audio Chunking with Silero VAD
  model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                model='silero_vad',
                                force_reload=True,
                                onnx=USE_ONNX)
  (get_speech_timestamps,
   save_audio,
   read_audio,
   VADIterator,
   collect_chunks) = utils

  # Read '.wav' audio file
  audioFile = read_audio('ytaudio.wav', sampling_rate=SAMPLING_RATE)
  # get speech timestamps from full audio file
  speechTimestamps = get_speech_timestamps(audioFile, model, sampling_rate=SAMPLING_RATE)
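  # Each timestamp is a dict of sample offsets, e.g. {'start': 0, 'end': 31999},
  # so slicing the waveform tensor with them yields one speech segment per entry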

  # Save each speech segment as a separate audio file
  for index, timestamp in enumerate(speechTimestamps):
    save_audio(f'speech-{index}.wav',
               audioFile[timestamp['start']:timestamp['end']],
               sampling_rate=SAMPLING_RATE)

  # Collect the paths of the separated audio chunks (saved to the working
  # directory above, so relative paths are used here as well)
  audioChunksPath = [f'speech-{i}.wav' for i in range(len(speechTimestamps))]

  # Generate individual transcriptions & concatenate them
  transcriptions = MODEL.transcribe(audioChunksPath)
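  # MODEL.transcribe returns one dict per input file; the recognized text is
  # under the 'transcription' key (other keys hold timing/probability details)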

  fullTranscript = ''
  for transcript in transcriptions:
    fullTranscript += transcript['transcription'] + ' '

  return fullTranscript

def summarizeTranscription(VIDEO_URL):
  fullTranscript = transcribeVideo(VIDEO_URL)

  # Generate summary from the full transcript
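  # Note: the default summarization model has a fixed maximum input length, so
  # a very long transcript may be truncated or rejected; chunking the transcript
  # before summarizing is a possible refinement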
  summarizedText = summarizationPipeline(fullTranscript, max_length=300, min_length=75, do_sample=False)
  return summarizedText[0]['summary_text']

iface = gr.Interface(fn=summarizeTranscription, inputs=["text"], outputs=["textbox"], title='YouTube Video Summarizer').launch(inline=False)
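
# A minimal sketch of calling the pipeline without the Gradio UI (the URL is a
# placeholder, not a real video):
#   print(summarizeTranscription('https://www.youtube.com/watch?v=<VIDEO_ID>'))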