Spaces:
Sleeping
Sleeping
Dhrumit1314
committed on
Commit
·
4e1967e
1
Parent(s):
be8e27d
Files Added
Browse files- Dockerfile +20 -0
- app.py +232 -0
- requirements.txt +11 -0
Dockerfile
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Use the official Python image as the base image
FROM python:3.9

# Set the working directory inside the container
WORKDIR /code

# Copy the requirements file on its own first so the dependency layer is
# cached independently of application source changes
COPY ./requirements.txt /code/requirements.txt

# Install the required Python packages
RUN pip install --no-cache-dir --upgrade -r requirements.txt

# Install the spaCy English pipeline that app.py loads with
# spacy.load('en_core_web_sm'); requirements.txt does not provide it
RUN python -m spacy download en_core_web_sm

# Copy the entire current directory into the container at /code
COPY . .

# Document that the application listens on port 5000
EXPOSE 5000

# Run the Flask application bound to all interfaces: the development server's
# default of 127.0.0.1 is not reachable from outside the container
CMD ["flask", "--app", "app", "run", "--host", "0.0.0.0", "--port", "5000"]
|
app.py
ADDED
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import time
|
4 |
+
from concurrent.futures import ThreadPoolExecutor
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
import moviepy.editor as mp
|
7 |
+
import requests
|
8 |
+
import spacy
|
9 |
+
import speech_recognition as sr
|
10 |
+
import tensorflow as tf
|
11 |
+
from flask import Flask, jsonify, request
|
12 |
+
from flask_cors import CORS
|
13 |
+
from io import BytesIO
|
14 |
+
from requests import get
|
15 |
+
from string import punctuation
|
16 |
+
from tqdm import tqdm
|
17 |
+
from transformers import BartTokenizer, T5ForConditionalGeneration, T5Tokenizer, TFBartForConditionalGeneration
|
18 |
+
from youtube_transcript_api import YouTubeTranscriptApi as yta
|
19 |
+
from wordcloud import WordCloud
|
20 |
+
from heapq import nlargest
|
21 |
+
from werkzeug.utils import secure_filename
|
22 |
+
from spacy.lang.en.stop_words import STOP_WORDS
|
23 |
+
|
24 |
+
# Run from the directory that contains this file so relative paths used below
# (e.g. the 'backend_videos' upload folder) resolve the same way on every
# machine, instead of depending on one developer's absolute Windows path.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Create the Flask app and enable cross-origin requests with flask-cors defaults
app = Flask(__name__)
CORS(app)
|
30 |
+
|
31 |
+
# Function to extract video ID from YouTube link
|
32 |
+
# Value of a "v=" query parameter, as found in standard watch URLs.
_WATCH_ID_PATTERN = re.compile(r'(?<=v=)[a-zA-Z0-9_-]+(?=&|\b|$)')
# ID carried in the URL path: youtu.be short links, embed, shorts and live URLs.
_PATH_ID_PATTERN = re.compile(
    r'(?:youtu\.be/|youtube(?:-nocookie)?\.com/(?:embed|shorts|live)/)'
    r'([a-zA-Z0-9_-]+)'
)


def extract_video_id(youtube_link):
    """Extract the video ID from a YouTube link.

    Args:
        youtube_link: URL text supplied by the client. May be ``None`` or
            empty when the form field is missing.

    Returns:
        The video ID string, or ``None`` when no ID can be found.
    """
    # A missing form field arrives as None; treat it like an unparsable link
    # instead of letting the regex search raise TypeError.
    if not youtube_link:
        return None

    # Try the "v=" form first so links that were already supported keep
    # returning exactly the same ID as before.
    match = _WATCH_ID_PATTERN.search(youtube_link)
    if match:
        return match.group()

    match = _PATH_ID_PATTERN.search(youtube_link)
    if match:
        return match.group(1)
    return None
|
39 |
+
|
40 |
+
# Route for uploading video files
|
41 |
+
@app.route('/upload_video', methods=['POST'])
def upload_video():
    """Accept an uploaded video, transcribe its audio and summarize the text.

    Reads the multipart file field ``video`` and the form field ``modelName``
    ('T5', 'BART', anything else selects the spaCy summarizer).

    Returns:
        A JSON response with ``message``, ``transcript``, ``summary`` and
        ``modelName`` on success, or a JSON object with an ``error`` key when
        the upload is rejected. Rejections keep the default 200 status, as
        before, so existing clients that inspect the ``error`` key still work.
    """
    start_time = time.time()
    if 'video' not in request.files:
        return jsonify({'error': 'No video file found in the request'})
    video = request.files['video']
    if video.mimetype.split('/')[0] != 'video':
        return jsonify({'error': 'The file uploaded is not a video'})

    # secure_filename() can reduce a missing or hostile name to an empty
    # string, which would make the save target the upload directory itself.
    filename = secure_filename(video.filename or '')
    if not filename:
        return jsonify({'error': 'The uploaded video has no usable file name'})

    model_name = request.form.get('modelName')
    print("MODEL:", model_name)

    backend_folder = 'backend_videos'
    # exist_ok avoids the check-then-create race between concurrent requests.
    os.makedirs(backend_folder, exist_ok=True)
    video_path = os.path.join(backend_folder, filename)
    video.save(video_path)

    transcript = transcribe_audio(video_path)

    if not transcript or not transcript.strip():
        # Nothing was recognised (e.g. a silent video); there is no text to
        # hand to a summarizer.
        summary = ""
    elif model_name == 'T5':
        summary = summarize_text_t5(transcript)
    elif model_name == 'BART':
        summary = summarize_text_bart(transcript)
    else:
        summary = summarizer(transcript)

    elapsed_time = time.time() - start_time
    print(f"Video saved successfully. Time taken: {elapsed_time} seconds")

    return jsonify({'message': 'successful', 'transcript': transcript, 'summary': summary, 'modelName': model_name})
|
74 |
+
|
75 |
+
# Route for uploading YouTube video links
|
76 |
+
@app.route('/youtube_upload_video', methods=['POST'])
def upload_youtube_video():
    """Fetch the transcript of a YouTube video and summarize it.

    Reads the form fields ``link`` (YouTube URL) and ``modelName``
    ('T5', 'BART', anything else selects the spaCy summarizer).

    Returns:
        A JSON response with ``message``, ``transcript``, ``summary`` and
        ``modelName``. Failures follow this endpoint's existing convention:
        ``message`` stays 'successful' and the problem is described in the
        ``transcript`` and ``summary`` fields.
    """
    start_time = time.time()

    model_name = request.form.get('modelName')
    youtube_link = request.form.get('link')
    print('link', youtube_link)

    # A request without a 'link' field yields None; treat it like a link with
    # no recognisable ID instead of passing None to the regex search.
    video_id = extract_video_id(youtube_link) if youtube_link else None
    if video_id is None:
        return jsonify({'message': 'successful', 'transcript': "error with youtube link", 'summary': "error with youtube link", 'modelName': model_name})

    transcript = generate_and_save_transcript_with_visuals(video_id)
    if not transcript:
        # The helper returns None when the transcript cannot be fetched;
        # summarizing None would raise inside the model helpers.
        return jsonify({'message': 'successful', 'transcript': "error fetching transcript", 'summary': "error fetching transcript", 'modelName': model_name})

    if model_name == 'T5':
        summary = summarize_text_t5(transcript)
    elif model_name == 'BART':
        summary = summarize_text_bart(transcript)
    else:
        summary = summarizer(transcript)

    elapsed_time = time.time() - start_time
    print(f"YouTube transcript processed. Time taken: {elapsed_time} seconds")

    return jsonify({'message': 'successful', 'transcript': transcript, 'summary': summary, 'modelName': model_name})
|
103 |
+
|
104 |
+
# Function to generate transcript and visuals for YouTube videos
|
105 |
+
def generate_and_save_transcript_with_visuals(video_id, file_name="yt_generated_transcript.txt"):
    """Download the transcript of a YouTube video as one string.

    Despite its name, this function neither writes a file nor produces
    visuals; ``file_name`` is accepted only for backward compatibility and is
    not used.

    Args:
        video_id: YouTube video ID passed to ``YouTubeTranscriptApi``.
        file_name: Unused.

    Returns:
        The transcript text with entries joined by single spaces, or ``None``
        when the transcript could not be retrieved (the error is printed).
    """
    try:
        data = yta.get_transcript(video_id)
        # Each entry is a mapping; only its 'text' value contributes.
        pieces = [
            entry['text']
            for entry in tqdm(data, desc="Downloading Transcript", unit=" lines")
            if 'text' in entry
        ]
        return ' '.join(pieces).strip()
    except Exception as e:
        # Best-effort boundary around the third-party API: report the problem
        # and signal failure to the caller explicitly.
        print(f"Error: {str(e)}")
        return None
|
117 |
+
|
118 |
+
# Transcribe audio from video
|
119 |
+
def transcribe_audio(file_path, chunk_duration=30, max_workers=8):
    """Transcribe the audio track of a video file.

    The audio is extracted to a temporary WAV file, split into fixed-length
    chunks and each chunk is sent to ``Recognizer.recognize_google`` from a
    thread pool. A word cloud of the result is displayed as a side effect.

    Args:
        file_path: Path of the video file to transcribe.
        chunk_duration: Chunk length in whole seconds.
        max_workers: Upper bound on concurrent recognition requests.

    Returns:
        The recognised text of all chunks, in playback order, joined by
        spaces. Chunks that could not be understood contribute nothing;
        chunks whose request failed contribute an "[Error: ...]" marker.
        Returns "" when the video has no audio track or no audio data.
    """
    import math
    import tempfile

    recognizer = sr.Recognizer()

    video = mp.VideoFileClip(file_path)
    try:
        if video.audio is None:
            # Video without an audio track: nothing to transcribe.
            return ""
        # A per-call temporary directory keeps concurrent requests from
        # overwriting each other's intermediate WAV file.
        with tempfile.TemporaryDirectory() as tmp_dir:
            wav_path = os.path.join(tmp_dir, "sample_audio.wav")
            video.audio.write_audiofile(wav_path, codec='pcm_s16le')
            with sr.AudioFile(wav_path) as source:
                audio = recognizer.record(source)
    finally:
        # Release the reader resources held by the clip.
        video.close()

    # frame_data is raw bytes, so the byte count must be divided by the bytes
    # per second (sample_rate * sample_width), not by the sample rate alone.
    total_duration = len(audio.frame_data) / (audio.sample_rate * audio.sample_width)
    chunk_starts = range(0, math.ceil(total_duration), chunk_duration)
    if not chunk_starts:
        return ""

    def transcribe_chunk(start):
        """Recognise the chunk beginning at ``start`` seconds; return its text."""
        chunk = audio.get_segment(start * 1000, (start + chunk_duration) * 1000)
        try:
            text = recognizer.recognize_google(chunk)
        except sr.UnknownValueError:
            return ""
        except sr.RequestError as e:
            return f"[Error: {e}]"
        print(f" Chunk {start}-{start+chunk_duration}: {text}")
        return text

    num_threads = max(1, min(len(chunk_starts), max_workers))
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # executor.map yields results in input order, so the transcript keeps
        # playback order regardless of which request finishes first.
        all_text = list(tqdm(executor.map(transcribe_chunk, chunk_starts),
                             total=len(chunk_starts), desc="Transcribing on multithreading: "))

    transcript = ' '.join(text for text in all_text if text)

    if transcript.strip():
        try:
            wordcloud = WordCloud(width=800, height=400, background_color="white").generate(transcript)
        except ValueError:
            # WordCloud raises when no plottable words remain; the
            # visualization is best-effort and must not fail the request.
            wordcloud = None
        if wordcloud is not None:
            fig = plt.figure(figsize=(10, 5))
            plt.imshow(wordcloud, interpolation='bilinear')
            plt.axis("off")
            # NOTE(review): with an interactive matplotlib backend show() blocks
            # until the window is closed — confirm this is wanted in a request.
            plt.show()
            # Close the figure so repeated requests do not accumulate figures.
            plt.close(fig)

    return transcript
|
157 |
+
|
158 |
+
# Load pre-trained models and tokenizers once, at import time, so the
# summarization functions below reuse the same instances on every call.
# NOTE(review): the tokenizers come from the base checkpoints while the model
# weights come from the "Dhrumit1314/*" repositories — confirm the fine-tuned
# models were trained with these same tokenizers.
tokenizer_bart = BartTokenizer.from_pretrained('facebook/bart-large')
tokenizer_t5 = T5Tokenizer.from_pretrained('t5-small')

# NOTE(review): T5ForConditionalGeneration is the non-"TF" transformers class,
# so this TensorFlow device scope presumably only affects the BART model —
# confirm.
with tf.device('/CPU:0'):
    model_t5 = T5ForConditionalGeneration.from_pretrained("Dhrumit1314/T5_TextSummary")
    model_bart = TFBartForConditionalGeneration.from_pretrained("Dhrumit1314/BART_TextSummary")
|
165 |
+
|
166 |
+
# Function to summarize text using T5 model
|
167 |
+
def summarize_text_t5(text):
    """Summarize ``text`` with the T5 model.

    Args:
        text: Text to summarize. ``None``, empty or whitespace-only input
            yields an empty summary without invoking the model.

    Returns:
        The generated summary string.
    """
    if not text or not text.strip():
        return ""

    start_time = time.time()
    t5_prepared_text = "summarize: " + text
    # Truncate explicitly: transcripts can be far longer than the 512-token
    # input length used here, and an unbounded input makes generation
    # needlessly slow and memory hungry.
    tokenized_text = tokenizer_t5.encode(
        t5_prepared_text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    summary_ids = model_t5.generate(
        tokenized_text,
        num_beams=4,
        no_repeat_ngram_size=2,
        min_length=256,
        max_length=512,
        early_stopping=True,
    )
    output = tokenizer_t5.decode(summary_ids[0], skip_special_tokens=True)
    end_time = time.time()
    print(f"Execution time for T5 Model: {end_time - start_time} seconds")
    return output
|
181 |
+
|
182 |
+
def summarize_text_bart(text):
    """Summarize ``text`` with the BART model.

    Args:
        text: Text to summarize. ``None``, empty or whitespace-only input
            yields an empty summary without invoking the model.

    Returns:
        The generated summary string.
    """
    if not text or not text.strip():
        return ""

    start_time = time.time()
    # max_length only takes effect reliably together with truncation=True;
    # state it explicitly so over-long transcripts are cut to 1024 tokens.
    inputs = tokenizer_bart([text], max_length=1024, truncation=True, return_tensors='tf')
    summary_ids = model_bart.generate(inputs['input_ids'], num_beams=4, max_length=256, early_stopping=True)
    output = [
        tokenizer_bart.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        for g in summary_ids
    ]
    end_time = time.time()
    print(f"Execution time for BART Model: {end_time - start_time} seconds")
    return output[0]
|
190 |
+
|
191 |
+
# Spacy summarizer
|
192 |
+
# Cached spaCy pipeline; loading it is expensive, so do it once per process.
_NLP_PIPELINE = None


def _load_nlp():
    """Return the spaCy English pipeline, loading it on first use only."""
    global _NLP_PIPELINE
    if _NLP_PIPELINE is None:
        _NLP_PIPELINE = spacy.load('en_core_web_sm')
    return _NLP_PIPELINE


def summarizer(rawdocs):
    """Build an extractive summary by ranking sentences on word frequency.

    Words that are neither stop words nor punctuation are counted, the counts
    are normalised by the highest count, and each sentence is scored by the
    sum of its words' normalised counts. The top 30% of sentences (at least
    one) are returned, highest score first.

    Args:
        rawdocs: Text to summarize. ``None``, empty or whitespace-only input
            yields an empty summary.

    Returns:
        The selected sentences joined by spaces, or "" when the text contains
        no scorable words.
    """
    if not rawdocs or not rawdocs.strip():
        return ""

    doc = _load_nlp()(rawdocs)

    word_freq = {}
    for token in doc:
        lowered = token.text.lower()
        # Whitespace-only tokens are not words and must not be scored.
        if not lowered.strip():
            continue
        if lowered not in STOP_WORDS and lowered not in punctuation:
            word_freq[token.text] = word_freq.get(token.text, 0) + 1

    if not word_freq:
        # Only stop words or punctuation: nothing to rank on.
        return ""

    max_freq = max(word_freq.values())
    word_freq = {word: count / max_freq for word, count in word_freq.items()}

    sentences = list(doc.sents)
    sent_scores = {}
    for sent in sentences:
        for token in sent:
            if token.text in word_freq:
                sent_scores[sent] = sent_scores.get(sent, 0) + word_freq[token.text]

    # Always keep at least one sentence; 30% of fewer than four sentences
    # would otherwise round down to an empty summary.
    select_len = max(1, int(len(sentences) * 0.3))
    top_sentences = nlargest(select_len, sent_scores, key=sent_scores.get)
    return ' '.join(sent.text for sent in top_sentences)
|
228 |
+
|
229 |
+
# Main run function
|
230 |
+
if __name__ == '__main__':
    # Resolve relative paths against this file's directory rather than a
    # machine-specific absolute path.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    app.run(
        # Bind to all interfaces by default so the server is reachable when
        # run inside a container; override with HOST for local-only use.
        host=os.environ.get('HOST', '0.0.0.0'),
        port=int(os.environ.get('PORT', '5000')),
        # Debug mode exposes an interactive debugger; keep it opt-in.
        debug=os.environ.get('FLASK_DEBUG', '0') == '1',
        use_reloader=False,
    )
|
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
matplotlib==3.8
moviepy==1.0
requests
spacy==3.7
speechrecognition==3.10
tensorflow==2.10
flask==3.0
flask-cors==4.0
transformers==4.38
youtube-transcript-api==0.6
wordcloud==1.9
# Imported directly by app.py
tqdm
# Needed by transformers for T5ForConditionalGeneration (PyTorch) and T5Tokenizer
torch
sentencepiece
|