raphael825 committed on
Commit 3bd8090 · 1 Parent(s): 9378e58

Make the app.py

Files changed (2)
  1. app.py +298 -17
  2. requirements.txt +1 -0
app.py CHANGED
@@ -1,4 +1,6 @@
+import requests
 import json
+import re
 import gradio as gr
 import Model
 from pytube import YouTube
@@ -7,23 +9,27 @@ import time
 import pickle
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.schema.document import Document
+from langchain.chains.mapreduce import MapReduceChain
+from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
+from langchain.chat_models import ChatOpenAI
+from langchain.chains.llm import LLMChain
+from langchain.prompts import PromptTemplate
+from langchain.chains.combine_documents.stuff import StuffDocumentsChain
+from PIL import Image
+from io import BytesIO
+from multiprocessing import Process, Queue, Pool
+import openai

+openai_api_key = "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"

+client = openai.OpenAI(
+    api_key=openai_api_key
+)
-# # ==
-
-def get_title(link):
-    with open("book.json", "r", encoding='utf-8') as read_file:
-        test = json.load(read_file)
-    title = test[0]['item'][0]['title']
-    return link + title


-def greet(link):
-    model_test = Model.get_title(link)
-    model_link = Model.greet(link)
-    result = model_test + model_link
-    return result
+# for API

+# # ==

 def youtube_text(link):
     yt = YouTube(link)
@@ -32,10 +38,10 @@ def youtube_text(link):

     start = time.time()
     model = whisper.load_model("small")
-    result = model.transcribe("test.mp3")
+    text = model.transcribe("test.mp3")
     end = time.time()

-    print(result["text"])
+    print(text["text"])
     print(f"{end - start:.2f}sec")

     text_splitter = RecursiveCharacterTextSplitter(
@@ -43,15 +49,290 @@ def youtube_text(link):
         chunk_overlap=50,
         length_function=len, )

-    docs = [Document(page_content=x) for x in text_splitter.split_text(result["text"])]
+    full_docs = text["text"]
+    docs = [Document(page_content=x) for x in text_splitter.split_text(text["text"])]

     split_docs = text_splitter.split_documents(docs)

     with open("split_example_small.pkl", "wb") as f:
         pickle.dump(split_docs, f)

-    return docs[0]
+    return split_docs, full_docs
+
+
+#
+# def youtube_summary(full_docs, openai_key):
+#
+#     prompt = """The following is a documents
+#     You need to output two things from the above Video.
+#     1. Write an executive summary
+#     Read the following documents and write a summary that integrates them to quickly identify the main topics of the Video.
+#     Your summary should.
+#     - Must be written in Korean
+#     - Be a single paragraph
+#     - Be descriptive and detailed so that you can tell at a glance what is being said without having to look at the original Video.
+#     2. Choose your keywords
+#     The keywords have the following conditions
+#     - Must be written in Korean
+#     - Must be a single word
+#     - Must be a word that appears in the Video
+#     - Must be a word that is not a stopword
+#     - Must be a word that is not a proper noun
+#     - Must be a word that is not a preposition
+#     - Must be a word that is not a conjunction
+#     - Must be a word that is not an interjection
+#     - Must be a word that is not an adjective
+#     - Must be a word that is not an adverb
+#     - Output as a Python array (ex: [keyword1,keyword2,keyword3] )
+#     - Output a total of 3 keywords
+#     - Choose words you might use to search for a book title !
+#     Here is an example of the final output
+#     요약: Document_summary
+#     키워드: [ Keyword1,Keyword2,Keyword3]
+#     """
+#
+#
+#     try:
+#
+#         response = client.chat.completions.create(
+#             messages={
+#                 "role": "system", "content": "You are a helpful assistant."
+#                 "role": "user", "content": prompt
+#             },
+#             temperature=0.7)
+#
+#         with open ("data/result_new.json", "w") as f:
+#             json.dump(response.choices[0].message['content'], f, indent=4)
+#         return response.choices[0].message['content']
+#     except Exception as e:
+#         print(e)
+#         return "Error"
+
+#
+# 1. Write an executive summary
+# Read the following documents and write a summary that integrates them to quickly identify the main topics of the Video.
+# Your summary should.
+# - Must be written in Korean
+# - Be a single paragraph
+# - Be descriptive and detailed so that you can tell at a glance what is being said without having to look at the original Video.
+# 2.
+
+
+def youtube_sum(split_docs, full_docs):
+    llm = ChatOpenAI(temperature=0.7, openai_api_key=openai_api_key)
+
+    # Map prompt
+    map_template = """The following is a set of documents
+    {docs}
+    Based on this list of docs, please identify the main themes
+    Helpful Answer:"""
+
+    map_prompt = PromptTemplate.from_template(map_template)
+
+    # Reduce prompt
+    reduce_template = """The following is set of summaries:
+    {doc_summaries}
+    You need to output Keyword from the above Video.
+    Choose your keywords
+    The keywords have the following conditions
+    - Must be written in Korean
+    - Must be a single word
+    - Must be a word that appears in the Video
+    - Must be a word that is not a stopword
+    - Must be a word that is not a proper noun
+    - Must be a word that is not a preposition
+    - Must be a word that is not a conjunction
+    - Must be a word that is not an interjection
+    - Must be a word that is not an adjective
+    - Must be a word that is not an adverb
+    - Output a total of 3 keywords
+    - Choose words you might use to search for a book title !
+    Here is an example of the final output
+    Keyword: Keyword1,Keyword2,Keyword3
+    Helpful Answer:"""
+
+    reduce_prompt = PromptTemplate.from_template(reduce_template)
+
+    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)
+    combine_documents_chain = StuffDocumentsChain(
+        llm_chain=reduce_chain, document_variable_name="doc_summaries"
+    )
+
+    # Combines and iteravely reduces the mapped documents
+    reduce_documents_chain = ReduceDocumentsChain(
+        # This is final chain that is called.
+        combine_documents_chain=combine_documents_chain,
+        # If documents exceed context for `StuffDocumentsChain`
+        collapse_documents_chain=combine_documents_chain,
+        # The maximum number of tokens to group documents into.
+        token_max=4000,
+    )
+
+    # 2. Map chain
+    map_chain = LLMChain(llm=llm, prompt=map_prompt)
+
+    # Combining documents by mapping a chain over them, then combining results
+    map_reduce_chain = MapReduceDocumentsChain(
+        # Map chain
+        llm_chain=map_chain,
+        # Reduce chain
+        reduce_documents_chain=reduce_documents_chain,
+        # The variable name in the llm_chain to put the documents in
+        document_variable_name="docs",
+        # Return the results of the map steps in the output
+        return_intermediate_steps=False,
+    )
+    # Run
+    result = map_reduce_chain.run(split_docs)
+    print(result)
+    with open("result.txt", "w") as f:
+        f.write(result)
+    return result
+
+
+def text_to_arr(result):
+    parts = re.split(r'Keyword:', result, flags=re.IGNORECASE)
+    # Take the last part (the actual keywords), strip whitespace, and split by commas
+    keywords = parts[-1].strip().split(", ")
+    # Now 'keywords' is an array (list in Python) containing the extracted keywords
+    print(keywords)
+
+    return keywords
+
+
+def aladin_api(keywords, selected_option):
+    aladin_key = 'ttbkangmj08250027001'
+    all_data = []
+    title = []
+    keyword = keywords
+    if selected_option == "사회":
+        for key in keyword:
+            print(key)
+            url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={key}&QueryType=Keyword&Cover=Big&MaxResults=5" \
+                  "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId=90853&outofStockFilter=1"
+            response = requests.get(url)
+            response_json = json.loads(response.text)
+            all_data.append(response_json)
+        # request 보내기
+        all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
+        with open("book.json", "wb") as f:
+            f.write(all_data.encode("utf-8"))
+
+
+    elif selected_option == "과학":
+        for key in keyword:
+            print(key)
+            url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={key}&QueryType=Keyword&Cover=Big&MaxResults=5" \
+                  "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId=987&outofStockFilter=1"
+            response = requests.get(url)
+            response_json = json.loads(response.text)
+            all_data.append(response_json)
+        # request 보내기
+        all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
+        with open("book.json", "wb") as f:
+            f.write(all_data.encode("utf-8"))
+
+    elif selected_option == "소설":
+        for key in keyword:
+            print(key)
+            url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={key}&QueryType=Keyword&Cover=Big&MaxResults=5" \
+                  "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId=1&outofStockFilter=1"
+            response = requests.get(url)
+            response_json = json.loads(response.text)
+            all_data.append(response_json)
+        # request 보내기
+        all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
+        with open("book.json", "wb") as f:
+            f.write(all_data.encode("utf-8"))
+
+    elif selected_option == "경제경영":
+        for key in keyword:
+            print(key)
+            url = f"http://www.aladin.co.kr/ttb/api/ItemSearch.aspx?ttbkey={aladin_key}&Query={key}&QueryType=Keyword&Cover=Big&MaxResults=5" \
+                  "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101&CategoryId=170&outofStockFilter=1"
+            response = requests.get(url)
+            response_json = json.loads(response.text)
+            all_data.append(response_json)
+        # request 보내기
+        all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
+        with open("book.json", "wb") as f:
+            f.write(all_data.encode("utf-8"))
+
+    print(all_data)
+    return all_data
+
+
+def book_output(book_json):
+    data = json.loads(book_json)
+
+    if len(data[0]['item']) != 0:
+        title1 = data[0]['item'][0]['title']
+        book_link1 = data[0]['item'][0]['link']
+        cover_link1 = data[0]['item'][0]['cover']
+        response1 = requests.get(cover_link1)
+        image1 = Image.open(BytesIO(response1.content))
+    else:
+        title1 = "No Data"
+        book_link1 = "No Data"
+        image1 = "No Data"
+
+    if len(data[1]['item']) != 0:
+        title2 = data[1]['item'][0]['title']
+        book_link2 = data[1]['item'][0]['link']
+        cover_link2 = data[1]['item'][0]['cover']
+        response2 = requests.get(cover_link2)
+        image2 = Image.open(BytesIO(response2.content))
+    else:
+        title2 = "No Data"
+        book_link2 = "No Data"
+        image2 = "No Data"
+
+    if len(data[2]['item']) != 0:
+        title3 = data[2]['item'][0]['title']
+        book_link3 = data[2]['item'][0]['link']
+        cover_link3 = data[2]['item'][0]['cover']
+        response3 = requests.get(cover_link3)
+        image3 = Image.open(BytesIO(response3.content))
+    else:
+        title3 = "No Data"
+        book_link3 = "No Data"
+        image3 = "No Data"
+
+    return title1, image1, title2, image2, title3, image3
+
+
+def process_selection(input_list):
+    # Your processing logic here for the selected option
+    API_KEY = input_list[0]
+    link = input_list[1]
+    selected_option = input_list[2]
+    result = f"You selected: {selected_option}"
+    print(result)
+    return API_KEY, link, selected_option
+
+
+def get_title(API_KEY, link, selected_option):
+    docs, split_docs = youtube_text(link)
+    result = youtube_sum(docs, split_docs)
+    keywords = text_to_arr(result)
+    all_data = aladin_api(keywords, selected_option)
+    title1, image1, title2, image2, title3, image3 = book_output(all_data)
+    return result, title1, image1, title2, image2, title3, image3
+

+# Define the list of options for the Dropdown
+options_list = ["사회", "과학", "소설", "경제경영"]

-iface = gr.Interface(fn=youtube_text, inputs='text', outputs='text')
+iface = gr.Interface(fn=get_title, inputs=[gr.Textbox(label="Your OpenAI KEY"),
+                                           gr.Textbox(label="Input Link"),
+                                           gr.Dropdown(choices=options_list, label="Select a category")],
+                     outputs=[
+                         gr.Textbox(label="Keywords"),
+                         gr.Textbox(label="Title1"),
+                         gr.Image(label="Image1"),
+                         gr.Textbox(label="Title2"),
+                         gr.Image(label="Image2"),
+                         gr.Textbox(label="Title3"),
+                         gr.Image(label="Image3"),
+                     ])
 iface.launch()
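For reference, a minimal sketch of how the functions added in this commit chain together when driven directly, without the Gradio interface. It assumes the definitions from the new app.py above are in scope; the YouTube link and the category value below are placeholder inputs, not values taken from the commit.

# Illustrative sketch only -- assumes youtube_text, youtube_sum, text_to_arr,
# aladin_api and book_output from the new app.py are already defined in scope.
link = "https://youtu.be/placeholder"              # placeholder input, not from the commit

split_docs, full_docs = youtube_text(link)         # pytube audio + Whisper transcript -> chunked Documents
keyword_text = youtube_sum(split_docs, full_docs)  # LangChain map-reduce chain -> "Keyword: a,b,c"
keywords = text_to_arr(keyword_text)               # split the chain's answer into a list of keywords
books_json = aladin_api(keywords, "과학")          # Aladin ItemSearch per keyword in the chosen category
title1, image1, title2, image2, title3, image3 = book_output(books_json)
print(keyword_text, title1, title2, title3)

This mirrors what get_title does for the Gradio interface. Note that youtube_text returns (split_docs, full_docs), so the local names docs and split_docs inside get_title are swapped relative to what they hold, although the values still line up with youtube_sum(split_docs, full_docs); also, the API_KEY textbox value passed to get_title is not used inside the function, which reads the module-level openai_api_key instead.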
requirements.txt CHANGED
@@ -2,3 +2,4 @@ openai==1.3.7
 openai-whisper==20231117
 pytube==15.0.0
 tiktoken==0.5.1
+