Spaces:
Runtime error
Runtime error
File size: 5,893 Bytes
8b6bb87 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 |
import streamlit as st
# Transcript
from youtube_transcript_api import YouTubeTranscriptApi
import os
# Summarization
from transformers import (
pipeline,
AutoModelForSpeechSeq2Seq,
AutoProcessor,
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
)
import torch
import re
def fetch_transcript(video_url):
    """Fetch the full transcript text for a YouTube video.

    Parameters
    ----------
    video_url : str
        A YouTube watch URL containing a ``v=`` query parameter.

    Returns
    -------
    str
        Transcript lines joined with newlines on success, or the exception
        message (``str(e)``) on any failure.  NOTE(review): callers cannot
        reliably tell an error message apart from a real transcript —
        consider returning ``None`` on failure in a future change.
    """
    try:
        # Extract the video ID and drop any trailing query parameters.
        # (Plain split("v=")[1] kept e.g. "&t=30s" attached to the ID,
        # which breaks lookups for URLs carrying extra parameters.)
        video_id = video_url.split("v=")[1].split("&")[0]
        # Fetch the transcript entries for the video.
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        # Each entry is a dict with a 'text' field; join into one string.
        return "\n".join(entry['text'] for entry in transcript)
    except Exception as e:
        # Preserve the original contract: the error message is returned
        # as a plain string.
        return str(e)
def clean_transcript(transcript):
    """Normalize a raw YouTube transcript into clean prose.

    Steps: strip bracketed annotations (e.g. "[Music]", "[00:01:02]"),
    collapse whitespace, remove common filler words, and expand common
    English contractions.

    Parameters
    ----------
    transcript : str
        Raw transcript text as returned by ``fetch_transcript``.

    Returns
    -------
    str
        Cleaned transcript with leading/trailing whitespace removed.
    """
    # Remove bracketed non-speech annotations and timestamps in one pass.
    transcript = re.sub(r'\[.*?\]', '', transcript)
    # Normalize line breaks and runs of whitespace to single spaces.
    transcript = transcript.replace('\n', ' ')
    transcript = re.sub(r'\s+', ' ', transcript)
    # Remove filler words using word boundaries so that words merely
    # containing a filler ("unlike", "dislike", "resort of...") are left
    # intact — plain str.replace() would corrupt them.
    filler_words = ['like', 'you know', 'sort of']  # extend as needed
    for filler in filler_words:
        transcript = re.sub(r'\b' + re.escape(filler) + r'\b', '', transcript)
    # Expand common contractions.  Irregular forms come first so "won't"
    # and "can't" are not mangled by the generic "n't" rule below.
    contractions = [
        ("won't", "will not"),
        ("can't", "cannot"),
        ("n't", " not"),
        ("'ll", " will"),
        ("'ve", " have"),
        ("'re", " are"),
        ("'d", " would"),
        ("'s", " is"),  # NOTE: also expands possessives ("John's" -> "John is")
    ]
    for short, expanded in contractions:
        transcript = transcript.replace(short, expanded)
    # Filler removal can leave doubled spaces; collapse them again.
    transcript = re.sub(r'\s+', ' ', transcript)
    return transcript.strip()  # Trim leading/trailing whitespace
def extract_video_id(url):
    """Return the YouTube video ID embedded in *url*, or None if absent."""
    found = re.search(r"(?<=v=)[\w-]+", url)
    return found.group(0) if found else None
def summarize_transcript(text, llama_pipeline):
    """Iteratively summarize *text* with a Llama-2 chat pipeline.

    The text is summarized in head chunks of ``input_len`` characters:
    each pass summarizes the current head, prepends that summary to the
    unread tail, and repeats until a pass produces a summary shorter than
    ``input_len``.

    Parameters
    ----------
    text : str
        The (cleaned) transcript to summarize.
    llama_pipeline : callable
        A transformers text-generation pipeline: called with a prompt
        string, returns ``[{"generated_text": ...}]``.

    Returns
    -------
    str
        The final summary (the completion after the ``[/INST]`` marker).
    """

    def _summarize_chunk(system_prompt, chunk):
        """Run one generation pass over *chunk* and return the completion."""
        # Wrap the chunk in Llama-2 chat special tokens.
        prompt = f"""
<s>[INST] <<SYS>>
{system_prompt}
<</SYS>>
{chunk}[/INST]
"""
        sequences = llama_pipeline(prompt)
        generated = sequences[0]["generated_text"]
        # The pipeline echoes the prompt; keep only the text after the
        # first [/INST] marker.
        return generated[generated.find('[/INST]') + len('[/INST]'):]

    input_len = 1000  # characters summarized per pass
    previous_len = None
    while True:
        summary = _summarize_chunk("", "Summarize the following: " + text[:input_len])
        # A summary shorter than one chunk means the whole text has been
        # compressed into a single pass — we are done.
        if len(summary) < input_len:
            return summary
        # Fold the summary back onto the unread tail for the next pass.
        text = summary + " " + text[input_len:]
        # Guard against an infinite loop: if the text stops shrinking the
        # model is not compressing it, so return the best summary we have.
        if previous_len is not None and len(text) >= previous_len:
            return summary
        previous_len = len(text)
# Load the model and tokenizer
# Load the model and tokenizer (cached across Streamlit reruns).
@st.cache_resource()
def load_model():
    """Build and cache the Llama-2 chat text-generation pipeline."""
    # Hugging Face repo ID of the chat-tuned Llama-2 checkpoint.
    model_name = "meta-llama/Llama-2-7b-chat-hf"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # NOTE(review): trust_remote_code=True executes code shipped with the
    # model repository — acceptable here only because the repo is pinned.
    generation_kwargs = dict(
        model=model_name,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    return pipeline("text-generation", **generation_kwargs)
def main():
    """Streamlit entry point: preview a YouTube video and show its transcript alongside a generated summary."""
    st.title("YouTube Video Preview")

    # Loading the 7B checkpoint is slow on a cold start; show a spinner.
    with st.spinner('Loading checkpoint shards of LLAMA-2'):
        pipeline_llama2 = load_model()
    st.success('Done!')

    # Ask the user for a video link and pull the ID out of it.
    youtube_url = st.text_input("Paste YouTube Video Link:")
    video_id = extract_video_id(youtube_url)

    # Guard clauses: no ID means either an empty box (say nothing) or a
    # malformed link (warn).
    if not video_id:
        if youtube_url:
            st.warning("Invalid YouTube Video Link")
        return

    video_url = f"https://www.youtube.com/watch?v={video_id}"
    st.video(video_url, format='video/mp4')

    video_transcript = clean_transcript(fetch_transcript(video_url))
    if not video_transcript:
        st.error("Failed to fetch video transcript. Please check the video ID or try again later.")
        return

    # Transcript on the left, generated summary on the right.
    col1, col2 = st.columns(2)
    with col1:
        st.subheader("Transcript:")
        st.text_area(" ", video_transcript, height=400)
    with col2:
        st.subheader("Summary:")
        video_summary = summarize_transcript(video_transcript, pipeline_llama2)
        st.text_area(" ", video_summary, height=400)
        print(f"Summary:{video_summary}")


if __name__ == "__main__":
    main()
|