import os
import shutil

import torch
import torchaudio
import gradio as gr
import yt_dlp
import pandas as pd
from tqdm import tqdm
from pydub import AudioSegment
from pyannote.audio import Pipeline
from moviepy.editor import VideoFileClip
from transformers import (
    pipeline,
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperProcessor,
)
MODEL_NAME = "nadsoft/hamsa-v0.2-beta"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600 # limit to 1 hour YouTube files
language = "arabic"
task = "transcribe"
device = 0 if torch.cuda.is_available() else "cpu"
auth_token = os.environ.get("auth_token")
file1 = 'meet.mp4'
file2 = 'audio.wav'
file3 = 'result.csv'
file4 = 'transcripts.csv'
# Remove leftovers from a previous run, if any.
for leftover in (file1, file2):
    if os.path.exists(leftover):
        os.remove(leftover)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, use_auth_token=auth_token)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME, language=language, task=task, use_auth_token=auth_token)
processor = WhisperProcessor.from_pretrained(MODEL_NAME, language=language, task=task, use_auth_token=auth_token)
feature_extractor = processor.feature_extractor
forced_decoder_ids = processor.get_decoder_prompt_ids(language=language, task=task)
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    chunk_length_s=30,
    device=device,
)
# Force Arabic transcription for every chunk (reuses the prompt ids computed above).
pipe.model.config.forced_decoder_ids = forced_decoder_ids
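# A quick sanity check could look like the line below (commented out;
# "sample.wav" is a hypothetical local file). The ASR pipeline returns a
# dict whose "text" key holds the transcription string.
# print(pipe("sample.wav")["text"])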
def speaker_diarization():
    # Build the diarization pipeline and move it to the GPU when available.
    diarization_pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=str(auth_token),
    )
    if torch.cuda.is_available():
        diarization_pipeline.to(torch.device("cuda"))
    waveform, sample_rate = torchaudio.load('audio.wav')
    diarization = diarization_pipeline({"waveform": waveform, "sample_rate": sample_rate})
    rows = []
    for turn, _, speaker in diarization.itertracks(yield_label=True):
        rows.append({'start': turn.start, 'stop': turn.end, 'speaker': speaker})
    df = pd.DataFrame(rows, columns=['start', 'stop', 'speaker'])
    # Merge consecutive turns that belong to the same speaker.
    merged = []
    for i in range(len(df)):
        if merged and df['speaker'][i] == merged[-1]['speaker']:
            merged[-1]['stop'] = df['stop'][i]
        else:
            merged.append({'start': df['start'][i], 'stop': df['stop'][i], 'speaker': df['speaker'][i]})
    new_df = pd.DataFrame(merged, columns=['start', 'stop', 'speaker'])
    new_df.to_csv('result.csv', index=False)
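# result.csv now holds one row per merged speaker turn: start/stop in seconds,
# plus the pyannote speaker label (e.g. "SPEAKER_00").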
def save_audio_chunks(data_path, new_df):
    # Load the full recording and export one wav file per diarized turn.
    audio = AudioSegment.from_wav('audio.wav')
    for i in tqdm(range(len(new_df))):
        start = int(new_df['start'][i] * 1000)  # pydub slices in milliseconds
        stop = int(new_df['stop'][i] * 1000)
        audio_chunk = audio[start:stop]
        audio_chunk.export(data_path + '/audio_' + new_df['speaker'][i] + '_' + str(i) + '.wav', format="wav")
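# Chunk filenames follow audio_<SPEAKER_LABEL>_<index>.wav (e.g. audio_SPEAKER_04_3.wav);
# random_response() below parses the speaker label and turn index back out of the name.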
def download(url):
    # Fetch the video via the yt_dlp Python API instead of shelling out,
    # which avoids quoting problems with the URL.
    with yt_dlp.YoutubeDL({'outtmpl': 'meet.mp4'}) as ydl:
        ydl.download([url])
def mp4_2_audio():
    # Extract the audio track from the downloaded video as a wav file.
    video = VideoFileClip("meet.mp4")
    video.audio.write_audiofile('audio.wav')
    video.close()
def random_response(message):
    # Clean up artifacts from any previous run.
    if os.path.exists('data/'):
        shutil.rmtree('data/')
    for leftover in (file1, file2, file3, file4):
        if os.path.exists(leftover):
            os.remove(leftover)
    download(message)
    mp4_2_audio()
    # Transcribe the whole recording in one pass.
    full_transcript = pipe('audio.wav')['text']
    print('full transcript:', full_transcript)
    data_path = 'data'
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    # Diarize, then cut the audio into one chunk per speaker turn.
    speaker_diarization()
    df = pd.read_csv('result.csv')
    save_audio_chunks(data_path, df)
    # Transcribe each chunk and record which speaker said it.
    all_audio_files = [audio for audio in os.listdir(data_path) if audio.endswith('.wav')]
    rows = []
    for audio in tqdm(all_audio_files):
        # Filenames look like audio_SPEAKER_04_3.wav: parts 1-2 form the
        # speaker label, part 3 (before the extension) is the turn index.
        parts = audio.split('_')
        speaker = parts[1] + '_' + parts[2]
        index = parts[3].split('.')[0]
        try:
            transcript = pipe(data_path + '/' + audio)['text']
        except Exception:
            transcript = "no_text"
        rows.append({'speaker': speaker, 'transcript': transcript, 'index': index})
    new_df = pd.DataFrame(rows, columns=['speaker', 'transcript', 'index'])
    new_df.to_csv('transcripts.csv', index=False)
    # Sort the turns back into chronological order before returning.
    new_df['index'] = new_df['index'].astype(int)
    return full_transcript, 'meet.mp4', new_df.sort_values(by=['index'])
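# The three return values map one-to-one onto the Interface outputs below:
# the full-transcript textbox, the video player, and the per-speaker dataframe.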
demo = gr.Interface(
    fn=random_response,
    inputs=[gr.Textbox(label='Tiktok Video URL')],
    outputs=[
        gr.Textbox(rtl=True, text_align='right', label='Full text transcript'),
        gr.Video(label='Tiktok Video'),
        gr.Dataframe(label='Speakers'),
    ],
    theme=gr.themes.Monochrome(),
)
if __name__ == "__main__":
demo.queue().launch(share=True)