reflection777 committed on
Commit
41d69fc
1 Parent(s): 877be9d

clickable links updated

Files changed (1): app.py (+792, -792)
app.py CHANGED
Only two lines changed (734 and 736); their previous versions were:

- gr.Markdown("<span style='color: gray'>The app is a work in progress. Output may be odd and some features are disabled. [Learn more]().</span>")
- gr.HTML("<p id='HFAPI' style='text-align: center;'>&nbsp; 🤗 Hugging Face Access Token [<a href='https://huggingface.co/docs/hub/security-tokens'>more</a>]</p>")

The updated app.py:
# 0 - libraries
import transformers
import gradio as gr

from youtube_transcript_api import YouTubeTranscriptApi
from huggingface_hub import InferenceClient
from pytube import YouTube
import pytube
import torch

# 1 - abstractive_summary
# 1.1 - initialize
import os

save_dir = os.path.join(os.getcwd(), "docs")
if not os.path.exists(save_dir):
    os.mkdir(save_dir)

transcription_model_id = "openai/whisper-large"
llm_model_id = "tiiuae/falcon-7b-instruct"
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# 1.2 - transcription
def get_yt_transcript(url):
    text = ""
    vid_id = pytube.extract.video_id(url)
    temp = YouTubeTranscriptApi.get_transcript(vid_id)
    for t in temp:
        text += t["text"] + " "
    return text

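# Usage sketch: get_yt_transcript("https://www.youtube.com/watch?v=P6FORpg0KVo")
# returns the caption segments joined into one space-separated string; it
# raises when the video has no transcript, which the caller below catches.
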
# 1.2.1 - locally_transcribe
def transcribe_yt_vid(url):
    # download YouTube video's audio
    yt = YouTube(str(url))
    audio = yt.streams.filter(only_audio=True).first()
    out_file = audio.download(filename="audio.mp3", output_path=save_dir)

    # defining an automatic-speech-recognition pipeline
    asr = transformers.pipeline(
        "automatic-speech-recognition",
        model=transcription_model_id,
        device_map="auto",
    )

    # setting model config parameters
    asr.model.config.forced_decoder_ids = asr.tokenizer.get_decoder_prompt_ids(
        language="en", task="transcribe"
    )

    # invoking the Whisper model
    temp = asr(out_file, chunk_length_s=20)
    text = temp["text"]

    # we can do this at the end to release GPU memory
    del asr
    torch.cuda.empty_cache()

    return text

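# Note: chunk_length_s=20 makes the pipeline transcribe the audio in 20-second
# windows internally, so videos longer than Whisper's 30-second input limit
# can be handled in a single call.
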
# 1.2.2 - api_transcribe
def transcribe_yt_vid_api(url, api_token):
    # download YouTube video's audio
    yt = YouTube(str(url))
    audio = yt.streams.filter(only_audio=True).first()
    out_file = audio.download(filename="audio.wav", output_path=save_dir)

    # Initialize client for the Whisper model
    client = InferenceClient(model=transcription_model_id, token=api_token)

    import librosa
    import soundfile as sf

    text = ""
    t = 25  # audio chunk length in seconds
    x, sr = librosa.load(out_file, sr=None)
    # This gives x as the audio in a numpy array and sr as the original sampling rate
    # The audio needs to be split into t-second chunks since the API call truncates long responses
    for i in range(len(x) // (t * sr) + 1):
        y = x[t * sr * i : t * sr * (i + 1)]
        split_path = os.path.join(save_dir, "audio_split.wav")
        sf.write(split_path, y, sr)
        text += client.automatic_speech_recognition(split_path)

    return text

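# Rough cost of the loop above: one Inference API round trip per 25-second
# chunk, e.g. a 10-minute video makes about 24 calls. The final slice may be
# shorter than 25 seconds; librosa/soundfile handle that without padding.
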
# 1.2.3 - transcribe locally or via API
def transcribe_youtube_video(url, force_transcribe=False, use_api=False, api_token=None):

    yt = YouTube(str(url))
    text = ""
    # get the transcript from YouTube if available
    try:
        text = get_yt_transcript(url)
    except Exception:  # no caption available; fall back to transcribing below
        pass

    # transcribe the video if YouTube did not provide a transcript,
    # or if force_transcribe is set
    if text == "" or force_transcribe:
        if use_api:
            text = transcribe_yt_vid_api(url, api_token=api_token)
            transcript_source = "The transcript was generated using {} via the Hugging Face Hub API.".format(
                transcription_model_id
            )
        else:
            text = transcribe_yt_vid(url)
            transcript_source = (
                "The transcript was generated using {} hosted locally.".format(
                    transcription_model_id
                )
            )
    else:
        transcript_source = "The transcript was downloaded from YouTube."

    return yt.title, text, transcript_source

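# Example flow (hypothetical token):
#   title, text, source = transcribe_youtube_video(url, use_api=True, api_token="hf_...")
# `source` is a human-readable sentence recording where the transcript came from.
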
# 1.3 - turn to paragraph or points
def turn_to_paragraph(text):
    # REMOVE HTML TAGS
    from bs4 import BeautifulSoup

    # Parse the HTML text
    soup = BeautifulSoup(text, "html.parser")
    # Get the text without HTML tags
    text = soup.get_text()

    # Remove leading and trailing whitespace
    text = text.strip()
    # Check if the string ends with "User" and remove it
    if text.endswith("User"):
        text = text[: -len("User")]
    # Strip stray dashes, backticks, and newlines; collapse extra whitespace
    text = (
        text.replace(" -", "")
        .replace("  ", " ")
        .replace("\n", " ")
        .replace("- ", "")
        .replace("`", "")
    )
    # text = text.replace("  ", "\n\n")  # to keep a second paragraph if it exists; sometimes worth turning on, but keep it off for now
    text = text.replace("  ", " ")  # turn this off if the line above is on

    return text


# 1.3.1
def turn_to_points(text):  # input must be from `turn_to_paragraph()`
    # text = text.replace(". ", ".\n-")  # to keep a second paragraph if it exists
    text_with_dashes = ".\n".join("- " + line.strip() for line in text.split(". "))
    text_with_dashes = text_with_dashes.replace("\n\n", "\n\n- ")  # for the first sentence of a new paragraph
    return text_with_dashes


# 1.3.2 - combined functions above for paragraph_or_points
def paragraph_or_points(text, pa_or_po):
    if pa_or_po == "Points":
        return turn_to_points(turn_to_paragraph(text))
    else:  # default is Paragraph
        return turn_to_paragraph(text)

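# Worked example:
#   paragraph_or_points("First point. Second point.", "Points")
# -> "- First point.\n- Second point."
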
# 1.4 - summarization
def summarize_text(title, text, temperature, words, use_api=False, api_token=None, do_sample=False, length="Short", pa_or_po="Paragraph"):

    from langchain.chains.llm import LLMChain
    from langchain.prompts import PromptTemplate
    from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
    from langchain.chains.combine_documents.stuff import StuffDocumentsChain
    import torch
    import transformers
    from transformers import BitsAndBytesConfig
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from langchain import HuggingFacePipeline

    model_kwargs1 = {
        "temperature": temperature,
        "do_sample": do_sample,
        "min_new_tokens": 200 - 25,
        "max_new_tokens": 200 + 25,
        "repetition_penalty": 20.0,
    }
    model_kwargs2 = {
        "temperature": temperature,
        "do_sample": do_sample,
        "min_new_tokens": words,
        "max_new_tokens": words + 100,
        "repetition_penalty": 20.0,
    }
    # greedy decoding does not take a temperature
    if not do_sample:
        del model_kwargs1["temperature"]
        del model_kwargs2["temperature"]

    if use_api:

        from langchain import HuggingFaceHub

        # os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_token
        llm = HuggingFaceHub(
            repo_id=llm_model_id,
            model_kwargs=model_kwargs1,
            huggingfacehub_api_token=api_token,
        )
        llm2 = HuggingFaceHub(
            repo_id=llm_model_id,
            model_kwargs=model_kwargs2,
            huggingfacehub_api_token=api_token,
        )
        summary_source = (
            "The summary was generated using {} via Hugging Face API.".format(
                llm_model_id
            )
        )

    else:
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
        )

        tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
        model = AutoModelForCausalLM.from_pretrained(
            llm_model_id,
            # quantization_config=quantization_config
        )
        model.to_bettertransformer()

        pipeline = transformers.pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            pad_token_id=tokenizer.eos_token_id,
            **model_kwargs1,
        )
        pipeline2 = transformers.pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            pad_token_id=tokenizer.eos_token_id,
            **model_kwargs2,
        )
        llm = HuggingFacePipeline(pipeline=pipeline)
        llm2 = HuggingFacePipeline(pipeline=pipeline2)

        summary_source = "The summary was generated using {} hosted locally.".format(
            llm_model_id
        )

    # Map
    map_template = """
    Summarize the following video in a clear way:\n
    ----------------------- \n
    TITLE: `{title}`\n
    TEXT:\n
    `{docs}`\n
    ----------------------- \n
    SUMMARY:\n
    """
    map_prompt = PromptTemplate(
        template=map_template, input_variables=["title", "docs"]
    )
    map_chain = LLMChain(llm=llm, prompt=map_prompt)

    # Reduce - Collapse
    collapse_template = """
    TITLE: `{title}`\n
    TEXT:\n
    `{doc_summaries}`\n
    ----------------------- \n
    Turn the text of a video above into a long essay:\n
    """

    collapse_prompt = PromptTemplate(
        template=collapse_template, input_variables=["title", "doc_summaries"]
    )
    collapse_chain = LLMChain(llm=llm, prompt=collapse_prompt)  # LLM 1 <-- LLM

    # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
    collapse_documents_chain = StuffDocumentsChain(
        llm_chain=collapse_chain, document_variable_name="doc_summaries"
    )

    # Final Reduce - Combine
    combine_template_short = """\n
    TITLE: `{title}`\n
    TEXT:\n
    `{doc_summaries}`\n
    ----------------------- \n
    Turn the text of a video above into a 3-sentence summary:\n
    """
    combine_template_medium = """\n
    TITLE: `{title}`\n
    TEXT:\n
    `{doc_summaries}`\n
    ----------------------- \n
    Turn the text of a video above into a long summary:\n
    """
    combine_template_long = """\n
    TITLE: `{title}`\n
    TEXT:\n
    `{doc_summaries}`\n
    ----------------------- \n
    Turn the text of a video above into a long essay:\n
    """
    # Short:  "Turn the text of a video above into a 3-sentence summary"
    # Medium: "Turn the text of a video above into a long summary"
    # Long:   "Turn the text of a video above into a long essay"
    if length == "Medium":
        combine_prompt = PromptTemplate(
            template=combine_template_medium,
            input_variables=["title", "doc_summaries", "words"],
        )
    elif length == "Long":
        combine_prompt = PromptTemplate(
            template=combine_template_long,
            input_variables=["title", "doc_summaries", "words"],
        )
    else:  # default is Short
        combine_prompt = PromptTemplate(
            template=combine_template_short,
            input_variables=["title", "doc_summaries", "words"],
        )
    combine_chain = LLMChain(llm=llm2, prompt=combine_prompt)  # LLM 2 <-- LLM2

    # Takes a list of documents, combines them into a single string, and passes this to an LLMChain
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=combine_chain, document_variable_name="doc_summaries"
    )

    # Combines and iteratively reduces the mapped documents
    reduce_documents_chain = ReduceDocumentsChain(
        # This is the final chain that is called.
        combine_documents_chain=combine_documents_chain,
        # If documents exceed the context for `StuffDocumentsChain`
        collapse_documents_chain=collapse_documents_chain,
        # The maximum number of tokens to group documents into.
        token_max=800,
    )

    # Combining documents by mapping a chain over them, then combining the results
    map_reduce_chain = MapReduceDocumentsChain(
        # Map chain
        llm_chain=map_chain,
        # Reduce chain
        reduce_documents_chain=reduce_documents_chain,
        # The variable name in the llm_chain to put the documents in
        document_variable_name="docs",
        # Do not return the results of the map steps in the output
        return_intermediate_steps=False,
    )

    from langchain.document_loaders import TextLoader
    from langchain.text_splitter import TokenTextSplitter

    with open(save_dir + "/transcript.txt", "w") as f:
        f.write(text)
    loader = TextLoader(save_dir + "/transcript.txt")
    doc = loader.load()
    text_splitter = TokenTextSplitter(chunk_size=800, chunk_overlap=100)
    docs = text_splitter.split_documents(doc)

    summary = map_reduce_chain.run(
        {"input_documents": docs, "title": title, "words": words}
    )

    # release references (and GPU memory in the local case)
    try:
        del (map_reduce_chain, reduce_documents_chain,
             combine_chain, collapse_documents_chain,
             map_chain, collapse_chain,
             llm, llm2,
             pipeline, pipeline2,
             model, tokenizer)
    except Exception:  # pipeline/model only exist in the local branch
        pass
    torch.cuda.empty_cache()

    summary = paragraph_or_points(summary, pa_or_po)

    return summary, summary_source

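# Pipeline recap: `map_chain` (llm) summarizes each ~800-token chunk,
# `collapse_chain` (also llm) compresses the partial summaries if they exceed
# token_max, and `combine_chain` (llm2) writes the final short/medium/long
# text, which is then reflowed into paragraphs or bullet points.
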
# 1.5 - complete function [DELETED]

# 2 - extractive/low-abstractive summary for Key Sentence Highlight
# 2.1 - chunking + hosted inference, summary [DELETED]

# 2.2 - add spaces before punctuation
import re


def add_space_before_punctuation(text):
    # Define a regular expression pattern to match punctuation
    punctuation_pattern = r"([.,!?;:])"

    # Use re.sub to add a space before each punctuation mark
    modified_text = re.sub(punctuation_pattern, r" \1", text)

    bracket_pattern = r"([()])"
    modified_text = re.sub(bracket_pattern, r" \1 ", modified_text)

    return modified_text

# 2.3 - highlight same words (yellow)
from difflib import ndiff


def highlight_text_with_diff(text1, text2):
    diff = list(ndiff(text1.split(), text2.split()))

    highlighted_diff = []
    for item in diff:
        if item.startswith(" "):
            highlighted_diff.append(
                '<span style="background-color: rgba(255, 255, 0, 0.25);">'
                + item
                + " </span>"
            )  # Unchanged words
        elif item.startswith("+"):
            highlighted_diff.append(item[2:] + " ")

    return "".join(highlighted_diff)  # output in string HTML format

# 2.4 - combined - `highlight_key_sentences`
# extractive/low-abstractive summarizer with facebook/bart-large-cnn
# highlight feature
def highlight_key_sentences(original_text, api_key):

    import requests

    API_TOKEN = api_key
    headers = {"Authorization": f"Bearer {API_TOKEN}"}
    API_URL = "https://api-inference.huggingface.co/models/facebook/bart-large-cnn"

    def query(payload):
        response = requests.post(API_URL, headers=headers, json=payload)
        return response.json()

    def chunk_text(text, chunk_size=1024):
        # Split the text into chunks
        chunks = [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]
        return chunks

    def summarize_long_text(long_text):
        # Split the long text into chunks
        text_chunks = chunk_text(long_text)

        # Summarize each chunk
        summaries = []
        for chunk in text_chunks:
            data = query(
                {
                    "inputs": f"{chunk}",
                    "parameters": {"do_sample": False},
                }
            )  # what if do_sample=True?
            summaries.append(data[0]["summary_text"])

        # Combine the summaries of all chunks
        full_summary = " ".join(summaries)
        return full_summary

    summarized_text = summarize_long_text(original_text)

    original_text = add_space_before_punctuation(original_text)
    summarized_text = add_space_before_punctuation(summarized_text)

    return highlight_text_with_diff(summarized_text, original_text)  # output in string HTML format

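# How the yellow highlight works: ndiff aligns the BART summary against the
# original transcript word by word; words present in both strings get wrapped
# in the translucent <span>, so the "key" sentences light up inside the
# original text.
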
# 3 - extract_keywords
# 3.1 - initialize & load pipeline
from transformers import (
    TokenClassificationPipeline,
    AutoModelForTokenClassification,
    AutoTokenizer,
)
from transformers.pipelines import AggregationStrategy
import numpy as np


# Define keyphrase extraction pipeline
class KeyphraseExtractionPipeline(TokenClassificationPipeline):
    def __init__(self, model, *args, **kwargs):
        super().__init__(
            model=AutoModelForTokenClassification.from_pretrained(model),
            tokenizer=AutoTokenizer.from_pretrained(model),
            *args,
            **kwargs,
        )

    def postprocess(self, all_outputs):
        results = super().postprocess(
            all_outputs=all_outputs,
            aggregation_strategy=AggregationStrategy.SIMPLE,
        )
        return np.unique([result.get("word").strip() for result in results])


# Load pipeline
model_name = "ml6team/keyphrase-extraction-kbir-inspec"
extractor = KeyphraseExtractionPipeline(model=model_name)

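# Quick check (illustrative output only; the actual phrases depend on the model):
#   extractor("Falcon is an open large language model.")
# might return e.g. array(['Falcon', 'large language model'], dtype=object)
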
# 3.2 - re-arrange keyword order
def rearrange_keywords(text, keywords):  # text: str, keywords: list
    # Find the position of each keyword in the text
    keyword_positions = {word: text.lower().index(word.lower()) for word in keywords}

    # Sort the keywords by their position in the text
    sorted_keywords = sorted(keywords, key=lambda x: keyword_positions[x])

    return sorted_keywords


# 3.3 - `keywords_extractor` functions
def keywords_extractor_list(summary):  # list : Flashcards
    keyphrases = extractor(summary)  # `extractor` from above; input could also be summary.replace("\n", " ")
    list_keyphrases = keyphrases.tolist()

    # rearrange first
    list_keyphrases = rearrange_keywords(summary, list_keyphrases)

    return list_keyphrases  # returns a list


def keywords_extractor_str(summary):  # str : Keywords Highlight & Fill in the Blank
    keyphrases = extractor(summary)  # `extractor` from above; input could also be summary.replace("\n", " ")
    list_keyphrases = keyphrases.tolist()

    # rearrange first
    list_keyphrases = rearrange_keywords(summary, list_keyphrases)

    # join the list elements into one string
    all_keyphrases = " ".join(list_keyphrases)

    return all_keyphrases  # returns one string

# 3.4 - keywords highlight
# 3.4.1 - highlight same words (green)
def highlight_green(text1, text2):  # keywords (str), text
    diff = list(ndiff(text1.split(), text2.split()))

    highlighted_diff = []
    for item in diff:
        if item.startswith(" "):
            highlighted_diff.append(
                '<span style="background-color: rgba(0, 255, 0, 0.25);">'
                + item
                + " </span>"
            )  # Unchanged words
        elif item.startswith("+"):
            highlighted_diff.append(item[2:] + " ")

    return "".join(highlighted_diff)  # output in string HTML format


# 3.4.2 - combined - keywords highlight
def keywords_highlight(text):
    keywords = keywords_extractor_str(text)  # keywords as one string
    text = add_space_before_punctuation(text)
    return highlight_green(keywords, text)  # output in string HTML format

# 3.5 - flashcards
# 3.5.1 - pair_keywords_sentences
def pair_keywords_sentences(text, search_words):  # text: str, search_words: list

    result_html = "<span style='text-align: center;'>"

    # Split the text into sentences
    sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text)

    # Create a dictionary to store the sentences for each keyword
    keyword_sentences = {word: [] for word in search_words}

    # Iterate through the sentences and search for keywords
    for sentence in sentences:
        for word in search_words:
            if re.search(
                r"\b{}\b".format(re.escape(word)), sentence, flags=re.IGNORECASE
            ):
                keyword_sentences[word].append(sentence)

    # Render the results as HTML
    for word, sentences in keyword_sentences.items():
        result_html += "<h2>" + word + "</h2> \n"

        for sentence in sentences:
            result_html += "<p>" + sentence + "</p> \n"

        result_html += "\n"

    result_html += "</span>"

    return result_html


# 3.5.2 - combined - flashcards
def flashcards(text):
    keywords = keywords_extractor_list(text)  # keywords as a list
    text = add_space_before_punctuation(text)
    return pair_keywords_sentences(text, keywords)  # output in string HTML format

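# Each flashcard is a keyphrase heading (<h2>) followed by every sentence of
# the summary that mentions the phrase, so a phrase that appears in three
# sentences yields a three-entry card.
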
# 3.6 - fill in the blank
# 3.6.1 - underline same words
def underline_keywords(text1, text2):  # keywords (str), text
    diff = list(ndiff(text1.split(), text2.split()))

    highlighted_diff = []
    for item in diff:
        if item.startswith(" "):
            highlighted_diff.append(
                "_______"
            )  # Unchanged words. Make the length independent of word length?
        elif item.startswith("+"):
            highlighted_diff.append(item[2:] + " ")

    return "".join(highlighted_diff)  # output in string HTML format


# 3.6.2 - combined - underline
def fill_in_blanks(text):
    keywords = keywords_extractor_str(text)  # keywords as one string
    text = add_space_before_punctuation(text)
    return underline_keywords(keywords, text)  # output in string HTML format

# 4 - misc
emptyTabHTML = "<br>\n<p style='color: gray; text-align: center'>Please generate a summary first.</p>\n<br>\n<br>\n<br>\n<br>\n<br>\n<br>\n<br>\n<br>\n<br>\n<br>\n<br>\n<br>\n<br>\n"


def empty_tab():
    return emptyTabHTML

# 5 - the app
with gr.Blocks() as demo:
    gr.Markdown("<br>")

    with gr.Row():
        with gr.Column():
            gr.Markdown("# ✍️ Summarizer for Learning")
        with gr.Column():
            gr.HTML("<div style='color: red; text-align: right'>Please use your <a href='#HFAPI' style='color: red'>Hugging Face Access Token.</a></div>")

    with gr.Row():
        with gr.Column():
            with gr.Tab("YouTube"):
                yt_link = gr.Textbox(show_label=False, placeholder="Insert YouTube link here ... (video needs to have captions)")
                yt_transcript = gr.Textbox(show_label=False, placeholder="Transcript will be shown here ...", lines=12)
            with gr.Tab("Article"):
                gr.Textbox(show_label=False, placeholder="WORK IN PROGRESS", interactive=False)
                gr.Textbox(show_label=False, placeholder="", lines=12, interactive=False)
            with gr.Tab("Text"):
                gr.Dropdown(["WORK IN PROGRESS", "Example 2"], show_label=False, value="WORK IN PROGRESS", interactive=False)
                gr.Textbox(show_label=False, placeholder="", lines=12, interactive=False)
            with gr.Row():
                clrButton = gr.ClearButton([yt_link, yt_transcript])
                subButton = gr.Button(variant="primary", value="Summarize")

            with gr.Accordion("Settings", open=False):
                length = gr.Radio(["Short", "Medium", "Long"], label="Length", value="Short", interactive=True)
                pa_or_po = gr.Radio(["Paragraphs", "Points"], label="Summarize to", value="Paragraphs", interactive=True)
                gr.Checkbox(label="Add headings", interactive=False)
                gr.Radio(["One section", "Few sections"], label="Summarize into", interactive=False)  # info="Only for 'Medium' or 'Long'"
                with gr.Row():
                    clrButtonSt1 = gr.ClearButton([length, pa_or_po], interactive=True)
                    setButtonSt1 = gr.Button(value="Set Current as Default", interactive=False)
                    shwButtonSt1 = gr.Button(value="Show Default", interactive=False)

            with gr.Accordion("Advanced Settings", open=False):
                with gr.Group(visible=False):
                    gr.HTML("<p style='text-align: center;'>&nbsp; YouTube transcription</p>")
                    force_transcribe_with_app = gr.Checkbox(
                        label="Always transcribe with app",
                        info="The app first checks whether a YouTube caption is available. If ticked, the app will transcribe the video itself, which is slower.",
                    )
                with gr.Group():
                    gr.HTML("<p style='text-align: center;'>&nbsp; Summarization</p>")
                    gr.Radio(["High Abstractive", "Low Abstractive", "Extractive"], label="Type of summarization", value="High Abstractive", interactive=False)
                    gr.Dropdown(
                        [
                            "tiiuae/falcon-7b-instruct",
                            "GPT2 (work in progress)",
                            "OpenChat 3.5 (work in progress)",
                        ],
                        label="Model",
                        value="tiiuae/falcon-7b-instruct",
                        interactive=False,
                    )
                    temperature = gr.Slider(
                        0.10, 0.30, step=0.05, label="Temperature", value=0.15,
                        info="Temperature is limited to the 0.1 ~ 0.3 window, as that range is shown to produce the best results.",
                        interactive=True,
                    )
                    do_sample = gr.Checkbox(
                        label="do_sample", value=True,
                        info="If ticked, do_sample produces more creative and diverse text; otherwise the app uses greedy decoding, which generates a more consistent and predictable summary.",
                    )

                with gr.Group():
                    gr.HTML("<p style='text-align: center;'>&nbsp; Highlight</p>")
                    check_key_sen = gr.Checkbox(label="Highlight key sentences", info="In original text", value=True, interactive=False)
                    gr.Checkbox(label="Highlight keywords", info="In summary", value=True, interactive=False)
                    gr.Checkbox(label="Turn text to paragraphs", interactive=False)

                with gr.Group():
                    gr.HTML("<p style='text-align: center;'>&nbsp; Quiz mode</p>")
                    gr.Checkbox(label="Fill in the blanks", value=True, interactive=False)
                    gr.Checkbox(label="Flashcards", value=True, interactive=False)
                    gr.Checkbox(label="Re-write summary", interactive=False)  # info="Only for 'Short'"

                with gr.Row():
                    clrButtonSt2 = gr.ClearButton(interactive=True)
                    setButtonSt2 = gr.Button(value="Set Current as Default", interactive=False)
                    shwButtonSt2 = gr.Button(value="Show Default", interactive=False)

        with gr.Column():
            with gr.Tab("Summary"):  # Output
                title = gr.Textbox(show_label=False, placeholder="Title")
                summary = gr.Textbox(lines=11, show_copy_button=True, label="", placeholder="Summarized output ...")
            with gr.Tab("Key sentences", render=True):
                key_sentences = gr.HTML(emptyTabHTML)
                showButtonKeySen = gr.Button(value="Generate")
            with gr.Tab("Keywords", render=True):
                keywords = gr.HTML(emptyTabHTML)
                showButtonKeyWor = gr.Button(value="Generate")
            with gr.Tab("Fill in the blank", render=True):
                blanks = gr.HTML(emptyTabHTML)
                showButtonFilBla = gr.Button(value="Generate")
            with gr.Tab("Flashcards", render=True):
                flashCrd = gr.HTML(emptyTabHTML)
                showButtonFlash = gr.Button(value="Generate")
            gr.Markdown("<span style='color: gray'>The app is a work in progress. Output may be odd and some features are disabled. [Learn more](https://huggingface.co/spaces/reflection777/summarizer-for-learning/blob/main/README.md).</span>")
            with gr.Group():
                gr.HTML("<p id='HFAPI' style='text-align: center;'>&nbsp; 🤗 Hugging Face Access Token [<a href='https://huggingface.co/settings/tokens'>more</a>]</p>")
                hf_access_token = gr.Textbox(
                    show_label=False,
                    placeholder="example: hf_******************************",
                    type="password",
                    info="The app does not store the token.",
                )
            with gr.Accordion("Info", open=False, visible=False):
                transcript_source = gr.Textbox(show_label=False, placeholder="transcript_source")
                summary_source = gr.Textbox(show_label=False, placeholder="summary_source")
                words = gr.Slider(minimum=100, maximum=500, value=250, label="Length of the summary")
                # words: what should the constant value be?
                use_api = gr.Checkbox(label="use_api", value=True)

    subButton.click(
        fn=transcribe_youtube_video,
        inputs=[yt_link, force_transcribe_with_app, use_api, hf_access_token],
        outputs=[title, yt_transcript, transcript_source],
        queue=True,
    ).then(
        fn=summarize_text,
        inputs=[title, yt_transcript, temperature, words, use_api, hf_access_token, do_sample, length, pa_or_po],
        outputs=[summary, summary_source],
        api_name="summarize_text",
        queue=True,
    )

    subButton.click(fn=empty_tab, outputs=[key_sentences])
    subButton.click(fn=empty_tab, outputs=[keywords])
    subButton.click(fn=empty_tab, outputs=[flashCrd])
    subButton.click(fn=empty_tab, outputs=[blanks])

    # Key sentences
    showButtonKeySen.click(
        fn=highlight_key_sentences,
        inputs=[yt_transcript, hf_access_token],
        outputs=[key_sentences],
        queue=True,
    )

    # Keywords
    showButtonKeyWor.click(fn=keywords_highlight, inputs=[summary], outputs=[keywords], queue=True)

    # Flashcards
    showButtonFlash.click(fn=flashcards, inputs=[summary], outputs=[flashCrd], queue=True)

    # Fill in the blanks
    showButtonFilBla.click(fn=fill_in_blanks, inputs=[summary], outputs=[blanks], queue=True)

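    # Wiring recap: the chained .click().then() first fills in the title and
    # transcript, then feeds them to summarize_text; the four parallel .click()
    # handlers above reset the study tabs so stale highlights never sit next to
    # a fresh summary.
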
    gr.Examples(
        examples=["https://www.youtube.com/watch?v=P6FORpg0KVo", "https://www.youtube.com/watch?v=bwEIqjU2qgk"],
        inputs=[yt_link],
    )

if __name__ == "__main__":
    demo.launch(show_api=False)
    # demo.launch(show_api=False, debug=True)
    # demo.launch(show_api=False, share=True)