raphael825 committed
Commit · 3bd8090
1 Parent(s): 9378e58

Make the app.py

- app.py (+298 -17)
- requirements.txt (+1 -0)
app.py CHANGED
@@ -1,4 +1,6 @@
 import json
 import gradio as gr
 import Model
 from pytube import YouTube
@@ -7,23 +9,27 @@ import time
 import pickle
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.schema.document import Document


-
-
-
-    with open("book.json", "r", encoding='utf-8') as read_file:
-        test = json.load(read_file)
-    title = test[0]['item'][0]['title']
-    return link + title


-
-    model_test = Model.get_title(link)
-    model_link = Model.greet(link)
-    result = model_test + model_link
-    return result


 def youtube_text(link):
     yt = YouTube(link)
@@ -32,10 +38,10 @@ def youtube_text(link):

     start = time.time()
     model = whisper.load_model("small")
-
     end = time.time()

-    print(
     print(f"{end - start:.2f}sec")

     text_splitter = RecursiveCharacterTextSplitter(
@@ -43,15 +49,290 @@ def youtube_text(link):
         chunk_overlap=50,
         length_function=len, )

-

     split_docs = text_splitter.split_documents(docs)

     with open("split_example_small.pkl", "wb") as f:
         pickle.dump(split_docs, f)

-    return


-iface = gr.Interface(fn=
 iface.launch()
+import requests
 import json
+import re
 import gradio as gr
 import Model
 from pytube import YouTube
 import pickle
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.schema.document import Document
+from langchain.chains.mapreduce import MapReduceChain
+from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
+from langchain.chat_models import ChatOpenAI
+from langchain.chains.llm import LLMChain
+from langchain.prompts import PromptTemplate
+from langchain.chains.combine_documents.stuff import StuffDocumentsChain
+from PIL import Image
+from io import BytesIO
+from multiprocessing import Process, Queue, Pool
+import openai

+openai_api_key = "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"

+client = openai.OpenAI(
+    api_key=openai_api_key
+)


+# for API

+# # ==

 def youtube_text(link):
     yt = YouTube(link)

     start = time.time()
     model = whisper.load_model("small")
+    text = model.transcribe("test.mp3")
     end = time.time()

+    print(text["text"])
     print(f"{end - start:.2f}sec")

     text_splitter = RecursiveCharacterTextSplitter(
         chunk_overlap=50,
         length_function=len, )

+    full_docs = text["text"]
+    docs = [Document(page_content=x) for x in text_splitter.split_text(text["text"])]

     split_docs = text_splitter.split_documents(docs)

     with open("split_example_small.pkl", "wb") as f:
         pickle.dump(split_docs, f)

+    return split_docs, full_docs
+
+
+#
+# def youtube_summary(full_docs, openai_key):
+#
+#     prompt = """The following is a documents
+#     You need to output two things from the above Video.
+#     1. Write an executive summary
+#     Read the following documents and write a summary that integrates them to quickly identify the main topics of the Video.
+#     Your summary should.
+#     - Must be written in Korean
+#     - Be a single paragraph
+#     - Be descriptive and detailed so that you can tell at a glance what is being said without having to look at the original Video.
+#     2. Choose your keywords
+#     The keywords have the following conditions
+#     - Must be written in Korean
+#     - Must be a single word
+#     - Must be a word that appears in the Video
+#     - Must be a word that is not a stopword
+#     - Must be a word that is not a proper noun
+#     - Must be a word that is not a preposition
+#     - Must be a word that is not a conjunction
+#     - Must be a word that is not an interjection
+#     - Must be a word that is not an adjective
+#     - Must be a word that is not an adverb
+#     - Output as a Python array (ex: [keyword1,keyword2,keyword3] )
+#     - Output a total of 3 keywords
+#     - Choose words you might use to search for a book title !
+#     Here is an example of the final output
+#     요약: Document_summary
+#     키워드: [ Keyword1,Keyword2,Keyword3]
+#     """
+#
+#
+#     try:
+#
+#         response = client.chat.completions.create(
+#             messages={
+#                 "role": "system", "content": "You are a helpful assistant."
+#                 "role": "user", "content": prompt
+#             },
+#             temperature=0.7)
+#
+#         with open ("data/result_new.json", "w") as f:
+#             json.dump(response.choices[0].message['content'], f, indent=4)
+#         return response.choices[0].message['content']
+#     except Exception as e:
+#         print(e)
+#         return "Error"
+
+#
+# 1. Write an executive summary
+# Read the following documents and write a summary that integrates them to quickly identify the main topics of the Video.
+# Your summary should.
+# - Must be written in Korean
+# - Be a single paragraph
+# - Be descriptive and detailed so that you can tell at a glance what is being said without having to look at the original Video.
+# 2.
+
+
+def youtube_sum(split_docs, full_docs):
+    llm = ChatOpenAI(temperature=0.7, openai_api_key=openai_api_key)
+
+    # Map prompt
+    map_template = """The following is a set of documents
+    {docs}
+    Based on this list of docs, please identify the main themes
+    Helpful Answer:"""
+
+    map_prompt = PromptTemplate.from_template(map_template)
+
+    # Reduce prompt
+    reduce_template = """The following is set of summaries:
+    {doc_summaries}
+    You need to output Keyword from the above Video.
+    Choose your keywords
+    The keywords have the following conditions
+    - Must be written in Korean
+    - Must be a single word
+    - Must be a word that appears in the Video
+    - Must be a word that is not a stopword
+    - Must be a word that is not a proper noun
+    - Must be a word that is not a preposition
+    - Must be a word that is not a conjunction
+    - Must be a word that is not an interjection
+    - Must be a word that is not an adjective
+    - Must be a word that is not an adverb
+    - Output a total of 3 keywords
+    - Choose words you might use to search for a book title !
+    Here is an example of the final output
+    Keyword: Keyword1,Keyword2,Keyword3
+    Helpful Answer:"""
+
+    reduce_prompt = PromptTemplate.from_template(reduce_template)
+
+    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)
+    combine_documents_chain = StuffDocumentsChain(
+        llm_chain=reduce_chain, document_variable_name="doc_summaries"
+    )
+
+    # Combines and iteratively reduces the mapped documents
+    reduce_documents_chain = ReduceDocumentsChain(
+        # This is the final chain that is called.
+        combine_documents_chain=combine_documents_chain,
+        # If documents exceed context for `StuffDocumentsChain`
+        collapse_documents_chain=combine_documents_chain,
+        # The maximum number of tokens to group documents into.
+        token_max=4000,
+    )
+
+    # 2. Map chain
+    map_chain = LLMChain(llm=llm, prompt=map_prompt)
+
+    # Combining documents by mapping a chain over them, then combining results
+    map_reduce_chain = MapReduceDocumentsChain(
+        # Map chain
+        llm_chain=map_chain,
+        # Reduce chain
+        reduce_documents_chain=reduce_documents_chain,
+        # The variable name in the llm_chain to put the documents in
+        document_variable_name="docs",
+        # Return the results of the map steps in the output
+        return_intermediate_steps=False,
+    )
+    # Run
+    result = map_reduce_chain.run(split_docs)
+    print(result)
+    with open("result.txt", "w") as f:
+        f.write(result)
+    return result
+
+
+def text_to_arr(result):
+    parts = re.split(r'Keyword:', result, flags=re.IGNORECASE)
+    # Take the last part (the actual keywords), strip whitespace, and split by commas
+    keywords = parts[-1].strip().split(", ")
+    # Now 'keywords' is an array (list in Python) containing the extracted keywords
+    print(keywords)
+
+    return keywords
+
+
+def aladin_api(keywords, selected_option):
+    aladin_key = 'ttbkangmj08250027001'
+    all_data = []
+    title = []
+    keyword = keywords
+    if selected_option == "사회":
+        for key in keyword:
+            print(key)
+            url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={key}&QueryType=Keyword&Cover=Big&MaxResults=5" \
+                  "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId=90853&outofStockFilter=1"
+            response = requests.get(url)
+            response_json = json.loads(response.text)
+            all_data.append(response_json)
+        # send the request
+        all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
+        with open("book.json", "wb") as f:
+            f.write(all_data.encode("utf-8"))
+
+
+    elif selected_option == "과학":
+        for key in keyword:
+            print(key)
+            url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={key}&QueryType=Keyword&Cover=Big&MaxResults=5" \
+                  "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId=987&outofStockFilter=1"
+            response = requests.get(url)
+            response_json = json.loads(response.text)
+            all_data.append(response_json)
+        # send the request
+        all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
+        with open("book.json", "wb") as f:
+            f.write(all_data.encode("utf-8"))
+
+    elif selected_option == "소설":
+        for key in keyword:
+            print(key)
+            url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={key}&QueryType=Keyword&Cover=Big&MaxResults=5" \
+                  "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId=1&outofStockFilter=1"
+            response = requests.get(url)
+            response_json = json.loads(response.text)
+            all_data.append(response_json)
+        # send the request
+        all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
+        with open("book.json", "wb") as f:
+            f.write(all_data.encode("utf-8"))
+
+    elif selected_option == "경제경영":
+        for key in keyword:
+            print(key)
+            url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={key}&QueryType=Keyword&Cover=Big&MaxResults=5" \
+                  "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId=170&outofStockFilter=1"
+            response = requests.get(url)
+            response_json = json.loads(response.text)
+            all_data.append(response_json)
+        # send the request
+        all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
+        with open("book.json", "wb") as f:
+            f.write(all_data.encode("utf-8"))
+
+    print(all_data)
+    return all_data
+
+
+def book_output(book_json):
+    data = json.loads(book_json)
+
+    if len(data[0]['item']) != 0:
+        title1 = data[0]['item'][0]['title']
+        book_link1 = data[0]['item'][0]['link']
+        cover_link1 = data[0]['item'][0]['cover']
+        response1 = requests.get(cover_link1)
+        image1 = Image.open(BytesIO(response1.content))
+    else:
+        title1 = "No Data"
+        book_link1 = "No Data"
+        image1 = "No Data"
+
+    if len(data[1]['item']) != 0:
+        title2 = data[1]['item'][0]['title']
+        book_link2 = data[1]['item'][0]['link']
+        cover_link2 = data[1]['item'][0]['cover']
+        response2 = requests.get(cover_link2)
+        image2 = Image.open(BytesIO(response2.content))
+    else:
+        title2 = "No Data"
+        book_link2 = "No Data"
+        image2 = "No Data"
+
+    if len(data[2]['item']) != 0:
+        title3 = data[2]['item'][0]['title']
+        book_link3 = data[2]['item'][0]['link']
+        cover_link3 = data[2]['item'][0]['cover']
+        response3 = requests.get(cover_link3)
+        image3 = Image.open(BytesIO(response3.content))
+    else:
+        title3 = "No Data"
+        book_link3 = "No Data"
+        image3 = "No Data"
+
+    return title1, image1, title2, image2, title3, image3
+
+
+def process_selection(input_list):
+    # Your processing logic here for the selected option
+    API_KEY = input_list[0]
+    link = input_list[1]
+    selected_option = input_list[2]
+    result = f"You selected: {selected_option}"
+    print(result)
+    return API_KEY, link, selected_option
+
+
+def get_title(API_KEY, link, selected_option):
+    docs, split_docs = youtube_text(link)
+    result = youtube_sum(docs, split_docs)
+    keywords = text_to_arr(result)
+    all_data = aladin_api(keywords, selected_option)
+    title1, image1, title2, image2, title3, image3 = book_output(all_data)
+    return result, title1, image1, title2, image2, title3, image3
+

+# Define the list of options for the Dropdown
+options_list = ["사회", "과학", "소설", "경제경영"]

+iface = gr.Interface(fn=get_title, inputs=[gr.Textbox(label="Your OpenAI KEY"),
+                                           gr.Textbox(label="Input Link"),
+                                           gr.Dropdown(choices=options_list, label="Select a category")],
+                     outputs=[
+                         gr.Textbox(label="Keywords"),
+                         gr.Textbox(label="Title1"),
+                         gr.Image(label="Image1"),
+                         gr.Textbox(label="Title2"),
+                         gr.Image(label="Image2"),
+                         gr.Textbox(label="Title3"),
+                         gr.Image(label="Image3"),
+                     ])
 iface.launch()
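
The new text_to_arr depends on the reduce prompt ending its answer with a "Keyword:" label followed by comma-separated words. A minimal sketch of that parsing contract (the sample string below is an assumption for illustration, not real chain output):

import re

def parse_keywords(result):
    # Split on the "Keyword:" label the reduce prompt asks the model to emit
    parts = re.split(r'Keyword:', result, flags=re.IGNORECASE)
    # Keep the trailing segment and break it into individual keywords
    return parts[-1].strip().split(", ")

sample = "Helpful Answer:\nKeyword: 경제, 투자, 금리"
print(parse_keywords(sample))  # ['경제', '투자', '금리']
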
requirements.txt CHANGED
@@ -2,3 +2,4 @@ openai==1.3.7
 openai-whisper==20231117
 pytube==15.0.0
 tiktoken==0.5.1
+
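
requirements.txt keeps the existing pins and gains one new line. A quick environment sanity check (a sketch, not part of the commit; assumes the pinned packages are already installed):

import importlib.metadata as md

# Distribution names as they appear in requirements.txt
for pkg in ["openai", "openai-whisper", "pytube", "tiktoken"]:
    # Raises PackageNotFoundError if the package is missing
    print(pkg, md.version(pkg))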