# raphael825's picture
# Update app.py
# a5d3b8f
import os
import requests
import json
import re
import gradio as gr
from pytube import YouTube
import whisper
import time
import pickle
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
from langchain.chat_models import ChatOpenAI
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from PIL import Image
from io import BytesIO
# Placeholder for an OpenAI API key; it is left empty here.
# NOTE(review): no function visible in this file reads this variable -- the
# key is passed explicitly to youtube_sum() as API_KEY. Confirm before removing.
openai_api_key = ""
# for API
def youtube_text(link):
    """Download a YouTube video's audio, transcribe it, and chunk the transcript.

    The first audio-only stream is saved as ``./test.mp3``, transcribed with
    the Whisper "base" model, and the transcript is split into chunks of at
    most 2000 characters with a 50 character overlap. The chunks are also
    pickled to ``temp/split_example_small.pkl``.

    Args:
        link: URL of the YouTube video.

    Returns:
        A ``(split_docs, full_docs)`` tuple: the list of chunk ``Document``
        objects and the full transcript string.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = YouTube(link)
    audio_stream = yt.streams.filter(only_audio=True).first()
    if audio_stream is None:
        # first() yields None on an empty query; fail with a clear message
        # instead of an AttributeError on ``None.download``.
        raise ValueError(f"No audio-only stream available for {link!r}")
    audio_stream.download(output_path=".", filename="test.mp3")

    # Time the model load plus transcription and report it on stdout.
    start = time.time()
    model = whisper.load_model("base")
    text = model.transcribe("test.mp3")
    end = time.time()
    print(text["text"])
    print(f"{end - start:.2f}sec")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=50,
        length_function=len,
    )
    full_docs = text["text"]
    docs = [Document(page_content=x) for x in text_splitter.split_text(full_docs)]
    split_docs = text_splitter.split_documents(docs)

    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs("temp", exist_ok=True)
    with open("temp/split_example_small.pkl", "wb") as f:
        pickle.dump(split_docs, f)
    return split_docs, full_docs
def youtube_sum(split_docs, full_docs, API_KEY):
    """Summarise transcript chunks with a map-reduce LLM chain.

    Each chunk is first mapped to its main themes, then the mapped outputs
    are reduced into a final answer that the prompt asks to be formatted as
    ``Summary: ...`` / ``Keyword: ...``. The result is printed and written to
    ``temp/result.txt``.

    Args:
        split_docs: Transcript chunks (documents) to summarise.
        full_docs: Full transcript; accepted for interface compatibility but
            not used by this function.
        API_KEY: OpenAI API key used to construct the chat model.

    Returns:
        The text produced by the reduce step.
    """
    openai_key = API_KEY
    llm = ChatOpenAI(temperature=0.7, openai_api_key=openai_key)

    # Map prompt
    map_template = """The following is a set of documents
{docs}
Based on this list of Video subtitles , please identify the main themes
Helpful Answer:"""
    map_prompt = PromptTemplate.from_template(map_template)

    # Reduce prompt
    reduce_template = """The following is set of summaries:
{doc_summaries}
You need to output two things from the above Video Subtitles.
1. Write an executive summary
Read the following subtitles and write a summary that integrates them to quickly identify the main topics of the Video.
Your summary should.
- Must be written in Korean
- Be a 1~2 paragraph
- Be descriptive and detailed so that you can tell at a glance what is being said without having to look at the original Video.
- There are no more than three main topics in the video.
- Please also briefly describe the overall content of the video
2. Choose your keyword
The keywords have the following conditions
- Must be written in Korean
- Must be a single word
- Must be a noun
- Must be a word that appears in the Video
- Must be a word that is not a stopword
- Must be a word that is not a proper noun
- Must be a word that is not a number
- Must be a word that is not a verb
- Must be a word that is not a pronoun
- Must be a word that is not a preposition
- Must be a word that is not a conjunction
- Must be a word that is not an interjection
- Must be a word that is not an adjective
- Must be a word that is not an adverb
- Must be a word that is not a determiner
- Must be a word that is not a particle
- Must be a word that is not a numeral
- Output only one keyword
Here is an example of the final output
Summary: Summary of The video
Keyword: keyword
Don't output any other text outside of the given format
Helpful Answer:"""
    reduce_prompt = PromptTemplate.from_template(reduce_template)
    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="doc_summaries"
    )

    # Combines and iteratively reduces the mapped documents
    reduce_documents_chain = ReduceDocumentsChain(
        # This is final chain that is called.
        combine_documents_chain=combine_documents_chain,
        # If documents exceed context for `StuffDocumentsChain`
        collapse_documents_chain=combine_documents_chain,
        # The maximum number of tokens to group documents into.
        token_max=4000,
    )

    # Map chain
    map_chain = LLMChain(llm=llm, prompt=map_prompt)

    # Combining documents by mapping a chain over them, then combining results
    map_reduce_chain = MapReduceDocumentsChain(
        # Map chain
        llm_chain=map_chain,
        # Reduce chain
        reduce_documents_chain=reduce_documents_chain,
        # The variable name in the llm_chain to put the documents in
        document_variable_name="docs",
        # Return the results of the map steps in the output
        return_intermediate_steps=False,
    )

    # Run
    result = map_reduce_chain.run(split_docs)
    print(result)

    # The output directory may not exist yet, and the prompt requests Korean
    # text, so write explicitly as UTF-8 instead of the platform default.
    os.makedirs("temp", exist_ok=True)
    with open("temp/result.txt", "w", encoding="utf-8") as f:
        f.write(result)
    return result
def text_to_arr(result):
    """Extract the single keyword from the summariser's output text.

    The text is searched for ``Keyword:`` first and, failing that, for the
    alternative label literal below (kept exactly as stored in this file;
    presumably the Korean word for "keyword" -- confirm the file encoding).
    The first run of word characters after the label is the keyword.

    Args:
        result: Text produced by the summarisation step.

    Returns:
        The keyword string.

    Raises:
        ValueError: If neither label is present in ``result``.
    """
    match = re.search(r"Keyword:\s*(\w+)", result)
    if match is None:
        match = re.search(r"ํ‚ค์›Œ๋“œ:\s*(\w+)", result)
    if match is None:
        # Previously this path crashed with AttributeError on ``None.group``.
        raise ValueError("No keyword label found in the summary text")
    keyword = match.group(1)
    print("Keyword:", keyword)
    return keyword
def aladin_api(keyword, selected_option):
    """Search the Aladin book API for ``keyword`` within the chosen category.

    The selected dropdown option is mapped to an Aladin ``CategoryId`` and up
    to five books, sorted by sales point, are requested. The parsed response
    is wrapped in a list, serialised to JSON, written to ``temp/book.json``
    and returned. An option that is not in the table issues no request and
    yields the JSON for an empty list.

    Args:
        keyword: Search term.
        selected_option: One of the category labels offered by the UI.

    Returns:
        A JSON string (UTF-8 friendly, indented) holding a list with zero or
        one API response objects.
    """
    # NOTE(review): API credential is hard-coded in source; consider moving
    # it to configuration or an environment variable.
    aladin_key = 'ttbkangmj08250027001'

    # Category label (exactly as offered by the dropdown) -> Aladin CategoryId.
    category_ids = {
        "์‚ฌํšŒ": 798,
        "๊ณผํ•™": 987,
        "์†Œ์„ค": 1,
        "๊ธˆ์œต": 170,
    }

    all_data = []
    category_id = category_ids.get(selected_option)
    if category_id is not None:
        print(keyword)
        # Passing the query through ``params`` lets requests percent-encode
        # the keyword, so characters such as '&' or spaces cannot corrupt the
        # query string.
        params = {
            "ttbkey": aladin_key,
            "Query": keyword,
            "QueryType": "Keyword",
            "Cover": "Big",
            "MaxResults": 5,
            "start": 1,
            "SearchTarget": "Book",
            "output": "js",
            "Sort": "SalesPoint",
            "Version": "20131101",
            "CategoryId": category_id,
            "outofStockFilter": 1,
        }
        # A timeout keeps the UI callback from hanging on a stalled server.
        response = requests.get(
            "http://www.aladin.co.kr/ttb/api/ItemSearch.aspx",
            params=params,
            timeout=10,
        )
        all_data.append(json.loads(response.text))

    # Send-side work is done; serialise what was collected.
    all_data = json.dumps(all_data, ensure_ascii=False, indent=4)

    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs("temp", exist_ok=True)
    with open("temp/book.json", "wb") as f:
        f.write(all_data.encode("utf-8"))
    print(type(all_data))
    print(all_data)
    return all_data
def _book_entry(items, index):
    """Return ``(title, link, cover_image)`` for ``items[index]``, or placeholders."""
    item = items[index] if index < len(items) else None
    if not item:
        # Missing or empty entry: fall back to the bundled placeholder image.
        return "No Data", "No Data", Image.open("NO DATA.jpeg")
    title = item['title']
    book_link = item['link']
    cover_link = item['cover']
    response = requests.get(cover_link, timeout=10)
    return title, book_link, Image.open(BytesIO(response.content))


def book_output(book_json):
    """Turn the book-search JSON into titles, cover images and links.

    The first three entries of ``data[0]['item']`` are used. When the search
    returned fewer than three books (or no ``item`` list at all), the missing
    slots are filled with ``"No Data"`` and the ``NO DATA.jpeg`` placeholder
    image instead of raising ``IndexError``/``KeyError``.

    Args:
        book_json: JSON string holding a list whose first element is the
            search response.

    Returns:
        A 9-tuple ``(title1, image1, title2, image2, title3, image3,
        book_link1, book_link2, book_link3)``.
    """
    data = json.loads(book_json)
    # ``data`` may be an empty list, and an error payload may lack 'item'.
    items = (data[0].get('item') or []) if data else []

    (title1, book_link1, image1), (title2, book_link2, image2), (title3, book_link3, image3) = (
        _book_entry(items, index) for index in range(3)
    )
    return title1, image1, title2, image2, title3, image3, book_link1, book_link2, book_link3
def get_title(API_KEY, link, selected_option):
    """Run the whole pipeline for one video and return the UI outputs.

    Transcribes the video behind ``link``, summarises it with the given
    OpenAI key, extracts the keyword, searches books for that keyword in
    ``selected_option`` and resolves titles, cover images and links.

    Returns:
        A 10-tuple: the summary text followed by ``title1, image1, title2,
        image2, title3, image3, link1, link2, link3``.
    """
    # youtube_text returns (chunks, full transcript), in that order.
    chunks, transcript = youtube_text(link)
    summary = youtube_sum(chunks, transcript, API_KEY)
    keyword = text_to_arr(summary)
    books_json = aladin_api(keyword, selected_option)
    book_fields = book_output(books_json)
    # Unpack to keep the same arity check the explicit assignment performed.
    title1, image1, title2, image2, title3, image3, link1, link2, link3 = book_fields
    return (summary, title1, image1, title2, image2, title3, image3, link1, link2, link3)
# Category labels offered by the dropdown; aladin_api() compares against
# these exact strings.
options_list = ["์‚ฌํšŒ", "๊ณผํ•™", "์†Œ์„ค", "๊ธˆ์œต"]

# NOTE(review): the nesting below is reconstructed from the layout calls;
# confirm it matches the intended page layout.
with gr.Blocks() as demo:
    gr.Markdown("Paste your Youtube Link and get the book recommendation")
    # Input column: API key, video link, category, and the trigger button.
    with gr.Column():
        with gr.Row():
            # Mask the secret so the API key is not displayed in clear text.
            inp1 = gr.Textbox(label="Your OpenAI KEY", type="password")
            inp2 = gr.Textbox(label="Input Link")
            inp3 = gr.Dropdown(choices=options_list, label="Select a category")
        btn = gr.Button("Find the book")
    # Output column: summary, then one row each of titles, covers and links.
    with gr.Column():
        out1 = gr.Textbox(label="Summary")
        with gr.Row():
            out2 = gr.Textbox(label="Title1")
            out4 = gr.Textbox(label="Title2")
            out6 = gr.Textbox(label="Title3")
        with gr.Row():
            out3 = gr.Image(label="Image1")
            out5 = gr.Image(label="Image2")
            out7 = gr.Image(label="Image3")
        with gr.Row():
            out8 = gr.HTML(label="Book Link1")
            out9 = gr.HTML(label="Book Link2")
            out10 = gr.HTML(label="Book Link3")
    # Output order mirrors get_title's return tuple:
    # summary, (title, image) x 3, then the three links.
    btn.click(fn=get_title, inputs=[inp1, inp2, inp3],
              outputs=[out1, out2, out3, out4, out5, out6, out7, out8, out9, out10])
demo.launch(share=True)