# Imports
from pytube import YouTube
import os
import subprocess  # run processes in the OS
from subprocess import STDOUT, check_call  # OS process manipulation

os.system('pip uninstall -y soundfile')
os.system('pip install --upgrade --force-reinstall SoundFile')

# Install libsndfile on the Linux machine
proc = subprocess.Popen(
    'apt-get update -y && apt-get install -y --no-install-recommends build-essential gcc libsndfile1',
    shell=True,
    stdin=None,
    stdout=open(os.devnull, "wb"),
    stderr=STDOUT,
    executable="/bin/bash"
)
proc.wait()

from huggingsound import SpeechRecognitionModel
import torch
from transformers import pipeline
from IPython.display import Audio
from pprint import pprint
import gradio as gr

# Constants
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL = SpeechRecognitionModel("jonatasgrosman/wav2vec2-large-xlsr-53-english", device=DEVICE)
summarizationPipeline = pipeline('summarization')  # Hugging Face's default summarization pipeline
SAMPLING_RATE = 16000
USE_ONNX = False  # set to True to load the ONNX version of Silero VAD
torch.set_num_threads(1)


def transcribeVideo(VIDEO_URL):
    # Download the video's audio stream ('.mp4') and convert it to a '.wav' file
    ytVideo = YouTube(VIDEO_URL)
    ytVideo.streams \
        .filter(only_audio=True, file_extension='mp4') \
        .first() \
        .download(filename='ytaudio.mp4')
    os.system("ffmpeg -i ytaudio.mp4 -acodec pcm_s16le -ar 16000 ytaudio.wav")

    # Audio chunking with Silero VAD
    model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                  model='silero_vad',
                                  force_reload=True,
                                  onnx=USE_ONNX)
    (get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils

    # Read the '.wav' audio file
    audioFile = read_audio('ytaudio.wav', sampling_rate=SAMPLING_RATE)

    # Get speech timestamps from the full audio file
    speechTimestamps = get_speech_timestamps(audioFile, model, sampling_rate=SAMPLING_RATE)

    # Save the audio chunks as separate audio files
    for index, timestamp in enumerate(speechTimestamps):
        startTime = timestamp['start']
        endTime = timestamp['end']
        save_audio(f'speech-{index}.wav', audioFile[startTime:endTime], sampling_rate=SAMPLING_RATE)

    # Collect the paths of the separated audio chunks
    audioChunksPath = [f'/content/speech-{i}.wav' for i in range(len(speechTimestamps))]

    # Generate individual transcriptions and concatenate them
    transcriptions = MODEL.transcribe(audioChunksPath)
    fullTranscript = ''
    for transcript in transcriptions:
        fullTranscript += transcript['transcription'] + ' '

    return fullTranscript


def summarizeTranscription(VIDEO_URL):
    fullTranscript = transcribeVideo(VIDEO_URL)

    # Generate a summary from the full transcript
    summarizedText = summarizationPipeline(fullTranscript, max_length=300, min_length=75, do_sample=False)
    return summarizedText[0]['summary_text']


iface = gr.Interface(fn=summarizeTranscription,
                     inputs=["text"],
                     outputs=["textbox"],
                     title='YouTube Video Summarizer').launch(inline=False)
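
# A minimal usage sketch for testing the pipeline without the Gradio UI.
# This is an assumption for illustration only; the URL below is a hypothetical
# placeholder, not part of the original script:
#
#   summary = summarizeTranscription('https://www.youtube.com/watch?v=<VIDEO_ID>')
#   print(summary)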