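# YouTube-to-book-recommendation demo:
# transcribe a YouTube video with Whisper, summarize the transcript with a
# LangChain map-reduce chain over the OpenAI API, extract a single keyword,
# and query the Aladin book-search API for matching titles.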
import json
import os
import pickle
import re
import time
from io import BytesIO

import gradio as gr
import requests
import whisper
from PIL import Image
from pytube import YouTube

from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains.llm import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.schema.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
# The OpenAI API key is entered through the Gradio UI below;
# this placeholder is only used for local testing.
openai_api_key = ""
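# NOTE: pytube downloads the audio-only stream as-is; "test.mp3" is just the
# filename used here (the container is typically MP4 audio, not MP3). Whisper
# decodes the file through ffmpeg, so the extension does not matter.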
def youtube_text(link):
    """Download a YouTube video's audio, transcribe it with Whisper,
    and split the transcript into chunks for summarization."""
    yt = YouTube(link)
    yt.streams.filter(only_audio=True).first().download(output_path=".", filename="test.mp3")

    # Transcribe the downloaded audio and report how long it took
    start = time.time()
    model = whisper.load_model("base")
    text = model.transcribe("test.mp3")
    end = time.time()
    print(text["text"])
    print(f"{end - start:.2f}sec")

    # Split the transcript into overlapping chunks the LLM can handle
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=50,
        length_function=len,
    )
    full_docs = text["text"]
    split_docs = [Document(page_content=x) for x in text_splitter.split_text(full_docs)]

    # Cache the chunks so a failed run can be inspected without re-transcribing
    os.makedirs("temp", exist_ok=True)
    with open("temp/split_example_small.pkl", "wb") as f:
        pickle.dump(split_docs, f)
    return split_docs, full_docs
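# Map-reduce summarization: each transcript chunk is summarized independently
# (map), then the per-chunk summaries are merged into one Korean executive
# summary plus a single keyword (reduce).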
def youtube_sum(split_docs, full_docs, API_KEY):
    llm = ChatOpenAI(temperature=0.7, openai_api_key=API_KEY)

    # Map prompt: identify the main themes of each transcript chunk
    map_template = """The following is a set of documents
{docs}
Based on this list of video subtitles, please identify the main themes.
Helpful Answer:"""
    map_prompt = PromptTemplate.from_template(map_template)

    # Reduce prompt: merge the per-chunk themes into one summary plus a keyword
    reduce_template = """The following is a set of summaries:
{doc_summaries}
You need to output two things from the above video subtitles.
1. Write an executive summary
Read the following subtitles and write a summary that integrates them so the main topics of the video can be identified quickly.
Your summary should:
- Be written in Korean
- Be 1~2 paragraphs
- Be descriptive and detailed, so that what is being said is clear at a glance without watching the original video
- Cover no more than three main topics
- Briefly describe the overall content of the video
2. Choose one keyword
The keyword must satisfy the following conditions:
- Must be written in Korean
- Must be a single word
- Must be a noun
- Must be a word that appears in the video
- Must not be a stopword
- Must not be a proper noun
- Must not be a number
- Must not be a verb
- Must not be a pronoun
- Must not be a preposition
- Must not be a conjunction
- Must not be an interjection
- Must not be an adjective
- Must not be an adverb
- Must not be a determiner
- Must not be a particle
- Must not be a numeral
- Output only one keyword
Here is an example of the final output:
Summary: Summary of the video
Keyword: keyword
Don't output any other text outside of the given format.
Helpful Answer:"""
    reduce_prompt = PromptTemplate.from_template(reduce_template)
    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)

    # Stuff the mapped summaries into a single prompt for the reduce chain
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="doc_summaries"
    )

    # Combines and iteratively reduces the mapped documents
    reduce_documents_chain = ReduceDocumentsChain(
        # This is the final chain that is called
        combine_documents_chain=combine_documents_chain,
        # Used if documents exceed the context window of `StuffDocumentsChain`
        collapse_documents_chain=combine_documents_chain,
        # The maximum number of tokens to group documents into
        token_max=4000,
    )

    # Map chain
    map_chain = LLMChain(llm=llm, prompt=map_prompt)

    # Combine documents by mapping a chain over them, then combining the results
    map_reduce_chain = MapReduceDocumentsChain(
        # Map chain
        llm_chain=map_chain,
        # Reduce chain
        reduce_documents_chain=reduce_documents_chain,
        # The variable name in the llm_chain to put the documents in
        document_variable_name="docs",
        # Return the results of the map steps in the output
        return_intermediate_steps=False,
    )

    # Run the full map-reduce summarization over the transcript chunks
    result = map_reduce_chain.run(split_docs)
    print(result)

    # Persist the result; encode as UTF-8 since the summary is in Korean
    os.makedirs("temp", exist_ok=True)
    with open("temp/result.txt", "w", encoding="utf-8") as f:
        f.write(result)
    return result
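# The reduce prompt asks for a "Keyword: <word>" line; parse it out with a
# regex, trying the English label first and the Korean label ("키워드:") as a
# fallback in case the model answered entirely in Korean.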
def text_to_arr(result):
    text = result
    # Regular expression to find the keyword line in the model output
    match = re.search(r"Keyword:\s*(\w+)", text)
    if match:
        keyword = match.group(1)  # The keyword is in the first capturing group
        print("Keyword:", keyword)
    else:
        # Fall back to the Korean label
        match = re.search(r"키워드:\s*(\w+)", text)
        keyword = match.group(1)
        print("Keyword:", keyword)
    return keyword
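# Aladin ItemSearch API: the CategoryId values below (798, 987, 1, 170) are
# the category codes this app maps to society, science, fiction, and finance
# respectively; outofStockFilter=1 drops out-of-stock books, and
# Sort=SalesPoint ranks results by sales.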
def aladin_api(keyword, selected_option):
    aladin_key = 'ttbkangmj08250027001'
    all_data = []

    # Aladin category ID for each dropdown option
    category_ids = {"사회": 798, "과학": 987, "소설": 1, "금융": 170}
    category_id = category_ids[selected_option]
    print(keyword)

    # Send the search request: top 5 in-stock books for the keyword,
    # restricted to the selected category
    url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={keyword}&QueryType=Keyword&Cover=Big&MaxResults=5" \
          f"&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId={category_id}&outofStockFilter=1"
    response = requests.get(url)
    response_json = json.loads(response.text)
    all_data.append(response_json)

    # Persist the raw response for debugging
    all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
    os.makedirs("temp", exist_ok=True)
    with open("temp/book.json", "wb") as f:
        f.write(all_data.encode("utf-8"))
    print(type(all_data))
    print(all_data)
    return all_data
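# Turn the raw Aladin JSON into the flat (title, image, link) x 3 tuple the
# Gradio outputs below expect, downloading each cover image with requests.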
def book_output(book_json):
    data = json.loads(book_json)
    items = data[0].get('item', [])
    titles, images, links = [], [], []
    # Collect up to three results; pad with placeholders when the search
    # returns fewer books than that
    for i in range(3):
        if i < len(items):
            titles.append(items[i]['title'])
            links.append(items[i]['link'])
            response = requests.get(items[i]['cover'])
            images.append(Image.open(BytesIO(response.content)))
        else:
            titles.append("No Data")
            links.append("No Data")
            images.append(Image.open("NO DATA.jpeg"))
    return (titles[0], images[0], titles[1], images[1], titles[2], images[2],
            links[0], links[1], links[2])
def get_title(API_KEY, link, selected_option):
    # Transcribe and chunk the video, summarize it, extract the keyword,
    # then look up matching books
    split_docs, full_docs = youtube_text(link)
    result = youtube_sum(split_docs, full_docs, API_KEY)
    keyword = text_to_arr(result)
    all_data = aladin_api(keyword, selected_option)
    title1, image1, title2, image2, title3, image3, link1, link2, link3 = book_output(all_data)
    return result, title1, image1, title2, image2, title3, image3, link1, link2, link3
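# Example (hypothetical values), running the pipeline without the UI:
#   result, *books = get_title("sk-...", "https://youtu.be/VIDEO_ID", "사회")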
# Options for the category dropdown
options_list = ["사회", "과학", "소설", "금융"]

with gr.Blocks() as demo:
    gr.Markdown("Paste your YouTube link and get a book recommendation")
    with gr.Column():
        with gr.Row():
            inp1 = gr.Textbox(label="Your OpenAI KEY")
            inp2 = gr.Textbox(label="Input Link")
            inp3 = gr.Dropdown(choices=options_list, label="Select a category")
        btn = gr.Button("Find the book")
    with gr.Column():
        out1 = gr.Textbox(label="Summary")
        with gr.Row():
            out2 = gr.Textbox(label="Title1")
            out4 = gr.Textbox(label="Title2")
            out6 = gr.Textbox(label="Title3")
        with gr.Row():
            out3 = gr.Image(label="Image1")
            out5 = gr.Image(label="Image2")
            out7 = gr.Image(label="Image3")
        with gr.Row():
            out8 = gr.HTML(label="Book Link1")
            out9 = gr.HTML(label="Book Link2")
            out10 = gr.HTML(label="Book Link3")
    btn.click(fn=get_title, inputs=[inp1, inp2, inp3],
              outputs=[out1, out2, out3, out4, out5, out6, out7, out8, out9, out10])

demo.launch(share=True)