# yt-chunks / transcriber.py
# archit11's picture
# Create transcriber.py
# 957fb6e verified
from typing import Generator

import pandas as pd
import yt_dlp as youtube_dl
def download_youtube_audio(url, output_path, preferred_quality="192"):
    """Download a video's audio track with yt-dlp and convert it to MP3.

    Args:
        url: Address of the video to fetch.
        output_path: Value used as yt-dlp's ``outtmpl`` (output template).
        preferred_quality: Passed as ``preferredquality`` to the
            ``FFmpegExtractAudio`` post-processor.

    Returns:
        ``output_path`` exactly as it was passed in when the download
        succeeds, or ``None`` when yt-dlp raises ``DownloadError``.

    NOTE(review): the return value is the caller-supplied template, not a
    path reported by yt-dlp -- confirm it matches the file that exists on
    disk after the MP3 conversion (e.g. when ``output_path`` has no
    ``.mp3`` suffix).
    """
    # Post-processing step: let FFmpeg pull the audio out as MP3.
    mp3_extractor = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': preferred_quality,
    }
    options = {
        'format': 'bestaudio/best',  # best audio-only stream, else best overall
        'postprocessors': [mp3_extractor],
        'outtmpl': output_path,
    }

    try:
        with youtube_dl.YoutubeDL(options) as downloader:
            # Metadata-only pass so the title can be announced before the
            # actual download starts.
            metadata = downloader.extract_info(url, download=False)
            video_title = metadata.get('title')
            print(f"Downloading audio for: {video_title}")

            downloader.download([url])
            print(f"Audio file saved as: {output_path}")
            return output_path
    except youtube_dl.utils.DownloadError as e:
        print(f"Error downloading audio: {e}")
        return None  # signals failure to the caller
def transcribe(path, model):
    """Transcribe an audio file with a Whisper model.

    Args:
        path: Audio file to transcribe; forwarded to
            ``WhisperModel.transcribe``.
        model: Model identifier handed to the ``WhisperModel`` constructor.

    Returns:
        The segments object returned by ``WhisperModel.transcribe``. The
        accompanying info value is discarded.
    """
    # Function-scope import: WhisperModel is not imported at module level,
    # and deferring it keeps the heavy dependency off the module import path.
    # NOTE(review): assumes the faster-whisper package, whose
    # ``transcribe`` returns a ``(segments, info)`` pair -- confirm.
    from faster_whisper import WhisperModel

    # Use a separate local instead of rebinding the ``model`` argument.
    whisper_model = WhisperModel(model)
    print(f"reading {path}")
    segments, _info = whisper_model.transcribe(path)
    return segments
def process_segments(segments: Generator) -> pd.DataFrame:
    """Collect transcription segments into a DataFrame, one row per segment.

    Args:
        segments: Iterable of objects exposing ``id``, ``start``, ``end``
            and ``text`` attributes. It is consumed exactly once.

    Returns:
        A DataFrame indexed by ``chunk_0``, ``chunk_1``, ... with the
        columns ``chunk_id``, ``chunk_length`` (``end - start``), ``text``,
        ``start_time`` and ``end_time``. When ``segments`` yields nothing
        the frame is empty but still carries these columns, so a CSV
        written from it keeps its header row.
    """
    columns = ['chunk_id', 'chunk_length', 'text', 'start_time', 'end_time']
    print("processing...")

    labels = []
    rows = []
    for i, segment in enumerate(segments):
        labels.append(f"chunk_{i}")
        rows.append({
            'chunk_id': segment.id,
            'chunk_length': segment.end - segment.start,
            'text': segment.text,
            'start_time': segment.start,
            'end_time': segment.end,
        })

    # Passing ``columns`` explicitly fixes the schema even for zero rows.
    return pd.DataFrame(rows, index=labels, columns=columns)
def gen_csv(
    url="https://www.youtube.com/watch?v=Sby1uJ_NFIY",
    path="audio.mp3",
    model="distil-large-v3",
    csv_path="alo.csv",
):
    """Download a video's audio, transcribe it and write the segments to CSV.

    Calling ``gen_csv()`` with no arguments uses the video URL, model name
    and CSV file name that were previously hard-coded.

    Args:
        url: Video to download.
        path: Where the downloaded audio is stored. This name was undefined
            in the earlier version, which made every call raise
            ``NameError``.
        model: Whisper model identifier passed to ``transcribe``.
        csv_path: Destination of the CSV file.

    Returns:
        None. If the download fails, a message is printed and no CSV is
        written.
    """
    audio_path = download_youtube_audio(url, path)
    if audio_path is None:
        # download_youtube_audio already reported the error; stop here
        # rather than handing None to the transcriber.
        print("Download failed; no CSV written.")
        return

    df = process_segments(transcribe(audio_path, model))
    df.to_csv(csv_path)