File size: 10,490 Bytes
a5d3b8f
3bd8090
3c69fe4
3bd8090
bc0dbe0
60c2359
 
 
 
 
 
3bd8090
 
 
 
 
 
 
 
c43db93
9d6f9cb
bc0dbe0
60c2359
3bd8090
60c2359
3bd8090
60c2359
 
 
a5d3b8f
60c2359
 
a5d3b8f
3bd8090
60c2359
 
3bd8090
60c2359
 
 
 
 
 
 
3bd8090
 
60c2359
 
 
a5d3b8f
60c2359
 
3bd8090
 
 
a5d3b8f
 
 
3bd8090
 
 
 
a5d3b8f
3bd8090
 
 
 
 
 
a5d3b8f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3bd8090
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a5d3b8f
3bd8090
 
 
 
 
a5d3b8f
3bd8090
a5d3b8f
 
3bd8090
a5d3b8f
 
 
 
 
 
 
3bd8090
a5d3b8f
 
 
 
3bd8090
a5d3b8f
3bd8090
 
a5d3b8f
 
 
 
 
 
 
3bd8090
 
a5d3b8f
 
 
 
 
 
 
3bd8090
 
a5d3b8f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3bd8090
a5d3b8f
 
 
 
3bd8090
 
 
 
 
 
 
a5d3b8f
3bd8090
 
 
 
 
 
 
 
a5d3b8f
3bd8090
a5d3b8f
 
 
 
3bd8090
 
 
 
 
a5d3b8f
3bd8090
a5d3b8f
 
 
 
3bd8090
 
 
 
 
a5d3b8f
3bd8090
a5d3b8f
3bd8090
 
 
 
a5d3b8f
3bd8090
 
a5d3b8f
 
3bd8090
bc0dbe0
3bd8090
a5d3b8f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
import os
import requests
import json
import re
import gradio as gr
from pytube import YouTube
import whisper
import time
import pickle
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.chains.mapreduce import MapReduceChain
from langchain.chains import ReduceDocumentsChain, MapReduceDocumentsChain
from langchain.chat_models import ChatOpenAI
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from PIL import Image
from io import BytesIO

# Placeholder only: the actual OpenAI key is entered by the user in the
# Gradio UI and passed to get_title() on each click — this global is unused.
openai_api_key = ""


# for API

# # ==

def youtube_text(link):
    """Download a YouTube video's audio, transcribe it, and chunk the transcript.

    Args:
        link: YouTube video URL.

    Returns:
        Tuple ``(split_docs, full_docs)`` where ``split_docs`` is a list of
        langchain ``Document`` chunks (<=2000 chars, 50-char overlap) and
        ``full_docs`` is the full transcript string.

    Side effects:
        Writes ``test.mp3`` to the working directory and pickles the chunks
        to ``temp/split_example_small.pkl``.
    """
    yt = YouTube(link)
    yt.streams.filter(only_audio=True).first().download(output_path=".", filename="test.mp3")

    start = time.time()
    model = whisper.load_model("base")
    text = model.transcribe("test.mp3")
    end = time.time()

    print(text["text"])
    print(f"{end - start:.2f}sec")

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=2000,
        chunk_overlap=50,
        length_function=len,
    )

    full_docs = text["text"]
    # Build Documents directly from the split text. The original code ran
    # split_documents() over these already-split Documents a second time,
    # which was redundant — each chunk is already within chunk_size.
    split_docs = [Document(page_content=x) for x in text_splitter.split_text(full_docs)]

    # Ensure the scratch directory exists before pickling (it is not
    # guaranteed to be present on a fresh checkout).
    os.makedirs("temp", exist_ok=True)
    with open("temp/split_example_small.pkl", "wb") as f:
        pickle.dump(split_docs, f)

    return split_docs, full_docs


def youtube_sum(split_docs, full_docs, API_KEY):
    """Summarize transcript chunks with a map-reduce LLM chain.

    Args:
        split_docs: List of langchain ``Document`` transcript chunks.
        full_docs: Full transcript string (currently unused; kept so the
            caller-facing signature is unchanged).
        API_KEY: OpenAI API key supplied by the user.

    Returns:
        The model's combined "Summary / Keyword" string; also written to
        ``temp/result.txt``.
    """
    llm = ChatOpenAI(temperature=0.7, openai_api_key=API_KEY)

    # Map prompt: extract themes from each chunk independently.
    map_template = """The following is a set of documents
    {docs}
    Based on this list of Video subtitles , please identify the main themes
    Helpful Answer:"""

    map_prompt = PromptTemplate.from_template(map_template)

    # Reduce prompt: merge per-chunk themes into one summary plus a keyword.
    reduce_template = """The following is set of summaries:
    {doc_summaries}
    You need to output two things from the above Video Subtitles. 
    1. Write an executive summary 
    Read the following subtitles and write a summary that integrates them to quickly identify the main topics of the Video.
    Your summary should. 
    - Must be written in Korean 
    - Be a 1~2 paragraph
    - Be descriptive and detailed so that you can tell at a glance what is being said without having to look at the original Video. 
    - There are no more than three main topics in the video. 
    - Please also briefly describe the overall content of the video 
    2. Choose your keyword 
    The keywords have the following conditions 
    - Must be written in Korean 
    - Must be a single word 
    - Must be a noun 
    - Must be a word that appears in the Video 
    - Must be a word that is not a stopword 
    - Must be a word that is not a proper noun 
    - Must be a word that is not a number 
    - Must be a word that is not a verb 
    - Must be a word that is not a pronoun 
    - Must be a word that is not a preposition 
    - Must be a word that is not a conjunction 
    - Must be a word that is not an interjection 
    - Must be a word that is not an adjective 
    - Must be a word that is not an adverb 
    - Must be a word that is not a determiner 
    - Must be a word that is not a particle 
    - Must be a word that is not a numeral 
    - Output only one keyword

    Here is an example of the final output
    Summary: Summary of The video
    Keyword: keyword
    Don't output any other text outside of the given format
    Helpful Answer:"""

    reduce_prompt = PromptTemplate.from_template(reduce_template)

    reduce_chain = LLMChain(llm=llm, prompt=reduce_prompt)
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_chain, document_variable_name="doc_summaries"
    )

    # Combines and iteratively reduces the mapped documents.
    reduce_documents_chain = ReduceDocumentsChain(
        # This is the final chain that is called.
        combine_documents_chain=combine_documents_chain,
        # Used if documents exceed the context for `StuffDocumentsChain`.
        collapse_documents_chain=combine_documents_chain,
        # The maximum number of tokens to group documents into.
        token_max=4000,
    )

    # Map chain: applied to every chunk.
    map_chain = LLMChain(llm=llm, prompt=map_prompt)

    # Combine documents by mapping a chain over them, then reducing the results.
    map_reduce_chain = MapReduceDocumentsChain(
        llm_chain=map_chain,
        reduce_documents_chain=reduce_documents_chain,
        # The variable name in the llm_chain to put the documents in.
        document_variable_name="docs",
        return_intermediate_steps=False,
    )

    result = map_reduce_chain.run(split_docs)
    print(result)
    # Write as UTF-8 explicitly: the summary is Korean and the platform
    # default encoding (e.g. cp949/ascii) could raise UnicodeEncodeError.
    os.makedirs("temp", exist_ok=True)
    with open("temp/result.txt", "w", encoding="utf-8") as f:
        f.write(result)
    return result


def text_to_arr(result):
    """Extract the recommendation keyword from the LLM summary output.

    Looks for an English ``Keyword:`` label first, then the Korean label,
    mirroring the two formats the reduce prompt can produce.

    Args:
        result: Raw summary text returned by the LLM.

    Returns:
        The keyword string, or ``None`` when neither label is present.
        (The original code called ``match.group(1)`` on a possibly-None
        second match, raising AttributeError.)
    """
    match = re.search(r"Keyword:\s*(\w+)", result)
    if match is None:
        match = re.search(r"ํ‚ค์›Œ๋“œ:\s*(\w+)", result)
    if match is None:
        print("Keyword: not found")
        return None

    keyword = match.group(1)
    print("Keyword:", keyword)
    return keyword


def aladin_api(keyword, selected_option):
    """Search the Aladin book API for *keyword* within a category.

    Args:
        keyword: Search keyword extracted from the video summary.
        selected_option: UI category label; unknown labels yield an empty
            result list (same as the original's fall-through behavior).

    Returns:
        JSON-encoded string of the API response list; also written to
        ``temp/book.json``.
    """
    # NOTE(review): hard-coded API key — should come from env/config.
    aladin_key = 'ttbkangmj08250027001'

    # UI category label -> Aladin CategoryId. Replaces four copy-pasted
    # branches that differed only in this number.
    category_ids = {
        "์‚ฌํšŒ": 798,
        "๊ณผํ•™": 987,
        "์†Œ์„ค": 1,
        "๊ธˆ์œต": 170,
    }

    all_data = []
    category_id = category_ids.get(selected_option)
    if category_id is not None:
        print(keyword)
        url = (
            "http://www.aladin.co.kr/ttb/api/ItemSearch.aspx"
            f"?ttbkey={aladin_key}&Query={keyword}&QueryType=Keyword&Cover=Big&MaxResults=5"
            "&start=1&SearchTarget=Book&output=js&Sort=SalesPoint&Version=20131101"
            f"&CategoryId={category_id}&outofStockFilter=1"
        )
        response = requests.get(url)
        all_data.append(json.loads(response.text))

    all_data = json.dumps(all_data, ensure_ascii=False, indent=4)
    # Ensure the scratch directory exists before writing the cache file.
    os.makedirs("temp", exist_ok=True)
    with open("temp/book.json", "wb") as f:
        f.write(all_data.encode("utf-8"))
    print(type(all_data))
    print(all_data)
    return all_data


def book_output(book_json):
    """Unpack up to three book results from the Aladin JSON payload.

    Args:
        book_json: JSON string as returned by ``aladin_api`` — a list whose
            first element holds an ``item`` list of book dicts.

    Returns:
        ``(title1, image1, title2, image2, title3, image3, link1, link2, link3)``.
        Any missing slot falls back to ``"No Data"`` and the placeholder image.

    The original indexed ``data[0]['item'][i]`` unconditionally, raising
    IndexError whenever the API returned fewer than three books; its
    ``len(item[i]) != 0`` guard tested the dict, never the list length.
    """
    data = json.loads(book_json)
    items = data[0].get('item', []) if data else []

    def _slot(i):
        # One (title, image, link) triple; placeholder when slot i is absent.
        if i < len(items) and items[i]:
            entry = items[i]
            response = requests.get(entry['cover'])
            image = Image.open(BytesIO(response.content))
            return entry['title'], image, entry['link']
        return "No Data", Image.open("NO DATA.jpeg"), "No Data"

    title1, image1, book_link1 = _slot(0)
    title2, image2, book_link2 = _slot(1)
    title3, image3, book_link3 = _slot(2)

    return title1, image1, title2, image2, title3, image3, book_link1, book_link2, book_link3


def get_title(API_KEY, link, selected_option):
    """End-to-end pipeline: transcribe the video, summarize it, extract a
    keyword, and fetch matching book recommendations for the UI."""
    chunks, transcript = youtube_text(link)
    summary = youtube_sum(chunks, transcript, API_KEY)
    keyword = text_to_arr(summary)
    books_json = aladin_api(keyword, selected_option)
    (title1, image1, title2, image2, title3, image3,
     link1, link2, link3) = book_output(books_json)
    return summary, title1, image1, title2, image2, title3, image3, link1, link2, link3


# Category labels offered in the dropdown (matched by aladin_api).
options_list = ["์‚ฌํšŒ", "๊ณผํ•™", "์†Œ์„ค", "๊ธˆ์œต"]

# Gradio front-end: inputs (key, link, category) on top, results below.
with gr.Blocks() as demo:
    gr.Markdown("Paste your Youtube Link and get the book recommandation")

    with gr.Column():
        with gr.Row():
            api_key_box = gr.Textbox(label="Your OpenAI KEY")
            link_box = gr.Textbox(label="Input Link")
        category_dd = gr.Dropdown(choices=options_list, label="Select a category")
        search_btn = gr.Button("Find the book")

    with gr.Column():
        summary_box = gr.Textbox(label="Summary")
        with gr.Row():
            title_1 = gr.Textbox(label="Title1")
            title_2 = gr.Textbox(label="Title2")
            title_3 = gr.Textbox(label="Title3")
        with gr.Row():
            cover_1 = gr.Image(label="Image1")
            cover_2 = gr.Image(label="Image2")
            cover_3 = gr.Image(label="Image3")
        with gr.Row():
            link_1 = gr.HTML(label="Book Link1")
            link_2 = gr.HTML(label="Book Link2")
            link_3 = gr.HTML(label="Book Link3")

    # Output order must match get_title's return tuple:
    # summary, (title, image) x3, then the three links.
    search_btn.click(
        fn=get_title,
        inputs=[api_key_box, link_box, category_dd],
        outputs=[summary_box, title_1, cover_1, title_2, cover_2,
                 title_3, cover_3, link_1, link_2, link_3],
    )

demo.launch(share=True)