AIRider committed on
Commit
5870494
·
verified ·
1 Parent(s): 754ff35

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -206
app.py CHANGED
@@ -1,212 +1,71 @@
1
- import gradio as gr
2
- from gradio_client import Client
3
- import json
4
- import logging
5
- import openai
6
  import os
 
 
 
7
  import re
8
- import html
9
-
10
- # 로깅 설정
11
- logging.basicConfig(filename='youtube_script_extractor.log', level=logging.DEBUG,
12
- format='%(asctime)s - %(levelname)s - %(message)s')
13
-
14
- openai.api_key = os.getenv("OPENAI_API_KEY")
15
-
16
- def parse_api_response(response):
17
- try:
18
- if isinstance(response, str):
19
- response = json.loads(response)
20
- if isinstance(response, list) and len(response) > 0:
21
- response = response[0]
22
- if not isinstance(response, dict):
23
- raise ValueError(f"μ˜ˆμƒμΉ˜ λͺ»ν•œ 응닡 ν˜•μ‹μž…λ‹ˆλ‹€. 받은 데이터 νƒ€μž…: {type(response)}")
24
- return response
25
- except Exception as e:
26
- logging.error(f"API 응닡 νŒŒμ‹± μ‹€νŒ¨: {str(e)}")
27
- raise ValueError(f"API 응닡 νŒŒμ‹± μ‹€νŒ¨: {str(e)}")
28
-
29
- def get_youtube_script(url):
30
- logging.info(f"슀크립트 μΆ”μΆœ μ‹œμž‘: URL = {url}")
31
- client = Client("whispersound/YT_Ts_R")
32
- try:
33
- result = client.predict(youtube_url=url, api_name="/predict")
34
- parsed_result = parse_api_response(result)
35
-
36
- if 'data' not in parsed_result or not parsed_result['data']:
37
- raise ValueError("API 응닡에 μœ νš¨ν•œ 데이터가 μ—†μŠ΅λ‹ˆλ‹€.")
38
-
39
- data = parsed_result["data"][0]
40
- title = data.get("title", "제λͺ© μ—†μŒ")
41
- description = data.get("description", "μ„€λͺ… μ—†μŒ")
42
- transcription_text = data.get("transcriptionAsText", "")
43
- thumbnails = data.get("thumbnails", [])
44
-
45
- if not transcription_text:
46
- raise ValueError("μΆ”μΆœλœ μŠ€ν¬λ¦½νŠΈκ°€ μ—†μŠ΅λ‹ˆλ‹€.")
47
-
48
- logging.info("슀크립트 μΆ”μΆœ μ™„λ£Œ")
49
- return title, description, transcription_text, thumbnails
50
- except Exception as e:
51
- logging.exception("슀크립트 μΆ”μΆœ 쀑 였λ₯˜ λ°œμƒ")
52
- raise
53
-
54
- def call_api(prompt, max_tokens, temperature, top_p):
55
- try:
56
- response = openai.ChatCompletion.create(
57
- model="gpt-4o-mini",
58
- messages=[{"role": "user", "content": prompt}],
59
- max_tokens=max_tokens,
60
- temperature=temperature,
61
- top_p=top_p
62
- )
63
- return response['choices'][0]['message']['content']
64
- except Exception as e:
65
- logging.exception("LLM API 호좜 쀑 였λ₯˜ λ°œμƒ")
66
- raise
67
-
68
- def summarize_text(title, description, text):
69
- prompt = f"""
70
- [유튜브 μš”μ•½ κ·œμΉ™]
71
- 1. λ„ˆλŠ” 유튜브 μ˜μƒ μ „λ¬Έ ν•΄μ„€κ°€λ‘œμ„œ 지침에 맞게 이 글을 μž‘μ„±ν•˜λΌ
72
- 2. μ•„λž˜μ˜ 제λͺ©κ³Ό μ„€λͺ…은 이 유튜브 μ˜μƒμ˜ 원본 메타데이터이닀.
73
- 3. λ°˜λ“œμ‹œ 제λͺ©κ³Ό μ„€λͺ…μœΌλ‘œ μ£Όμ œμ™€ λ¬Έλ§₯, 철자(Spelling)을 λ¨Όμ € νŒŒμ•…ν•˜κ³ , μ•„λž˜μ˜ λŒ€λ³Έμ„ λ°˜λ“œμ‹œ 지침에 맞게 μƒμ„Έν•˜κ²Œ μš”μ•½ν•˜λΌ
74
- - λ°˜λ“œμ‹œ 주어진 제λͺ©, μ„€λͺ…에 μžˆλŠ” 철자(Spelling)λ₯Ό μš”μ•½μ— λ°˜μ˜ν•˜λΌ(원문 λŒ€λ³Έμ—λŠ” μ˜€νƒˆμžκ°€ μžˆμ„ 수 μžˆλ‹€)
75
- 4. λ°˜λ“œμ‹œ ν•œκΈ€λ‘œ μž‘μ„±ν•˜λΌ
76
- 5. λ°˜λ“œμ‹œ '이 유튜브 λŒ€λ³Έμ€', '이 μ˜μƒμ€', '이 μœ νŠœλΈŒλŠ”'λ“±μ˜ μ†Œκ°œμ‹ ν‘œν˜„μ€ μ œμ™Έν•˜λΌ
77
- 6. μš”μ•½λ¬Έλ§ŒμœΌλ‘œλ„ μ˜μƒμ„ 직접 μ‹œμ²­ν•œ 것과 λ™μΌν•œ μˆ˜μ€€μœΌλ‘œ λ‚΄μš©μ„ 이해할 수 μžˆλ„λ‘ μƒμ„Ένžˆ μž‘μ„±
78
- 7. 글을 λ„ˆλ¬΄ μ••μΆ•ν•˜κ±°λ‚˜ ν•¨μΆ•ν•˜μ§€ 말고, μ€‘μš”ν•œ λ‚΄μš©κ³Ό 세뢀사항을 λͺ¨λ‘ 포함
79
- 8. λ°˜λ“œμ‹œ λŒ€λ³Έμ˜ 흐름과 논리 ꡬ쑰λ₯Ό μœ μ§€
80
- 9. λŒ€λ³Έμ˜ λͺ©μ μ΄λ‚˜ μ˜λ„λ₯Ό νŒŒμ•…ν•˜κ³ , 이λ₯Ό μš”μ•½μ— λ°˜λ“œμ‹œ 반영
81
- 10. λ°˜λ“œμ‹œ μ‹œκ°„ μˆœμ„œλ‚˜ μ‚¬κ±΄μ˜ μ „κ°œ 과정을 λͺ…ν™•ν•˜κ²Œ 반영
82
- 11. λ“±μž₯인물, μž₯μ†Œ, 사건 λ“± μ€‘μš”ν•œ μš”μ†Œλ₯Ό μ •ν™•ν•˜κ²Œ μž‘μ„±
83
- 12. λŒ€λ³Έμ—μ„œ μ „λ‹¬ν•˜λŠ” κ°μ •μ΄λ‚˜ λΆ„μœ„κΈ°λ„ 포함
84
- 13. λ°˜λ“œμ‹œ 기술적 μš©μ–΄λ‚˜ μ „λ¬Έ μš©μ–΄κ°€ μžˆμ„ 경우, 이λ₯Ό μ •ν™•ν•˜κ²Œ μ‚¬μš©
85
- 14. λ°˜λ“œμ‹œ 핡심 μ„Ήμ…˜(μ†Œμ£Όμ œ)λ₯Ό νŒŒμ•…ν•˜μ—¬ μ„Ήμ…˜μ— 맞게 글을 μš”μ•½ν•˜λΌ(κΈ€μ˜ 양을 κ³ λ €ν•˜μ—¬ μ„Ήμ…˜μ˜ 개수λ₯Ό 탄λ ₯적으둜 μ„€μ •)
86
- 15. 각 μ„Ήμ…˜μ˜ 제λͺ©(μ†Œμ£Όμ œ)μ—λŠ” λ‚΄μš©κ³Ό μ–΄μšΈλ¦¬λŠ” μ μ ˆν•œ 이λͺ¨μ§€λ‘œ μ†Œμ£Όμ œλ₯Ό μ‹œμž‘ν•˜λΌ
87
- 16. 각 μ„Ήμ…˜μ˜ λ‚΄μš©μ€ Bullet Pointλ₯Ό μ‚¬μš©ν•˜μ—¬ 가독성을 높여라(λ¬Έμž₯ λ‹¨μœ„λ‘œ ꡬ뢄)
88
- [μ˜ˆμ‹œ]
89
- (λ³€κ²½μ „)
90
- - 유튜브λ₯Ό 처음 μ‹œμž‘ν•˜λŠ” μ‚¬λžŒλ“€μ€ κ΅¬λ…μž μˆ˜μ™€ μ‘°νšŒμˆ˜μ— 큰 관심을 두고 맀일 유튜브 μŠ€νŠœλ””μ˜€λ₯Ό ν™•μΈν•˜κ²Œ λœλ‹€. κ·ΈλŸ¬λ‚˜ κ΅¬λ…μžκ°€ 100λͺ…, 1,000λͺ…에 λ„λ‹¬ν•˜λŠ” κ²ƒλ§ŒμœΌλ‘œλŠ” 지속적인 μ„±μž₯에 도움이 λ˜μ§€ μ•ŠλŠ”λ‹€. κ΅¬λ…μž μˆ˜κ°€ λŠ˜μ–΄λ‚œ 후에도 유튜브 채널 μš΄μ˜μ— λŒ€ν•œ 감을 μž‘μ§€ λͺ»ν•΄ ν¬κΈ°ν•˜λŠ” κ²½μš°κ°€ λ§Žλ‹€.
91
- (λ³€κ²½ν›„)
92
- - 유튜브λ₯Ό 처음 μ‹œμž‘ν•˜λŠ” μ‚¬λžŒλ“€μ€ κ΅¬λ…μž μˆ˜μ™€ μ‘°νšŒμˆ˜μ— 큰 관심을 두고 맀일 유튜브 μŠ€νŠœλ””μ˜€λ₯Ό ν™•μΈν•˜κ²Œ λœλ‹€.
93
- - κ·ΈλŸ¬λ‚˜ κ΅¬λ…μžκ°€ 100λͺ…, 1,000λͺ…에 λ„λ‹¬ν•˜λŠ” κ²ƒλ§ŒμœΌλ‘œλŠ” 지속적인 μ„±μž₯에 도움이 λ˜μ§€ μ•ŠλŠ”λ‹€.
94
- - κ΅¬λ…μž μˆ˜κ°€ λŠ˜μ–΄λ‚œ ��에도 유튜브 채널 μš΄μ˜μ— λŒ€ν•œ 감을 μž‘μ§€ λͺ»ν•΄ ν¬κΈ°ν•˜λŠ” κ²½μš°κ°€ λ§Žλ‹€.
95
- 17. 각 μ„Ήμ…˜μ˜ λ‚΄μš©μ„ λ°˜λ“œμ‹œ μΆ©μ‹€ν•˜κ²Œ μž‘μ„±
96
 
97
- 제λͺ©: {title}
98
- μ„€λͺ…: {description}
99
-
100
- λŒ€λ³Έ:
101
- {text}
102
- """
103
- return call_api(prompt, max_tokens=8000, temperature=0.35, top_p=0.95)
104
-
105
- def split_sentences(text):
106
- sentences = re.split(r"(λ‹ˆλ‹€|μ—μš”|κ΅¬λ‚˜|ν•΄μš”|κ΅°μš”|κ² μ–΄μš”|μ‹œμ˜€|해라|μ˜ˆμš”|μ•„μš”|λ°μš”|λŒ€μš”|μ„Έμš”|μ–΄μš”|κ²Œμš”|κ΅¬μš”|κ³ μš”|λ‚˜μš”|ν•˜μ£ )(?![\w])", text)
107
- combined_sentences = []
108
- current_sentence = ""
109
- for i in range(0, len(sentences), 2):
110
- if i + 1 < len(sentences):
111
- sentence = sentences[i] + sentences[i + 1]
112
- else:
113
- sentence = sentences[i]
114
- if len(current_sentence) + len(sentence) > 100: # 100자λ₯Ό μ΄ˆκ³Όν•  경우
115
- combined_sentences.append(current_sentence.strip())
116
- current_sentence = sentence.strip()
117
- else:
118
- current_sentence += sentence
119
- if sentence.endswith(('.', '?', '!')):
120
- combined_sentences.append(current_sentence.strip())
121
- current_sentence = ""
122
- if current_sentence:
123
- combined_sentences.append(current_sentence.strip())
124
- return combined_sentences
125
-
126
- def display_script(title, script):
127
- script_sentences = split_sentences(script)
128
- formatted_script = "\n\n".join(script_sentences)
129
- return f"""<div class="script-box">
130
- <details>
131
- <summary>ν΄λ¦­ν•˜μ—¬ 펼치기</summary>
132
- <div class="output-title">{title}</div>
133
- <p style="white-space: pre-wrap;">{formatted_script}</p>
134
- </details>
135
- </div>"""
136
-
137
- def display_summary(title, summary):
138
- return f"""<div class="script-box">
139
- <div class="output-title">{title}</div>
140
- {summary}
141
- </div>"""
142
-
143
- def get_thumbnail_url(thumbnails):
144
- for thumbnail in thumbnails:
145
- if thumbnail.get("width") == 640 and thumbnail.get("height") == 480:
146
- return thumbnail.get("url")
147
- return "640x480 크기의 썸넀일을 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
148
-
149
- def analyze(url):
150
- # 스크립트 추출
151
- yield "슀크립트 μΆ”μΆœ 쀑...", "슀크립트 μΆ”μΆœ 쀑...", ""
152
- title, description, script, thumbnails = get_youtube_script(url)
153
- script_content = display_script(title, script)
154
- thumbnail_url = get_thumbnail_url(thumbnails)
155
-
156
- # 원문 스크립트 표시 및 요약 시작
157
- yield script_content, "μš”μ•½ 생성 쀑...", thumbnail_url
158
 
159
- # 요약 생성
160
- summary = summarize_text(title, description, script)
161
 
162
- # HTML둜 λ³€ν™˜ (convert_to_html λ‘œμ§μ„ 직접 톡합)
163
- lines = summary.split('\n')
164
- formatted_lines = []
165
- for line in lines:
166
- line = line.strip()
167
- if line.startswith('####'):
168
- formatted_lines.append(f"<h4>{html.escape(line[4:].strip())}</h4>")
169
- elif line.startswith('###'):
170
- formatted_lines.append(f"<h3>{html.escape(line[3:].strip())}</h3>")
171
- elif line.startswith('##'):
172
- formatted_lines.append(f"<h2>{html.escape(line[2:].strip())}</h2>")
173
- elif line.startswith('#'):
174
- formatted_lines.append(f"<h1>{html.escape(line[1:].strip())}</h1>")
175
- elif line.startswith('- '): # 리슀트 μ•„μ΄ν…œ
176
- content = html.escape(line[2:])
177
- bold_content = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', content)
178
- formatted_lines.append(f"<li>{bold_content}</li>")
179
- elif line: # 일반 ν…μŠ€νŠΈ (빈 쀄 μ œμ™Έ)
180
- content = html.escape(line)
181
- bold_content = re.sub(r'\*\*(.*?)\*\*', r'<strong>\1</strong>', content)
182
- formatted_lines.append(f"<p>{bold_content}</p>")
183
- else: # 빈 쀄
184
- formatted_lines.append("<br>")
185
-
186
- formatted_summary = '\n'.join(formatted_lines)
187
-
188
- summary_content = f"""<div class="script-box">
189
- <div class="output-title">{html.escape(title)}</div>
190
- {formatted_summary}
191
- </div>"""
192
 
193
- # 최종 결과 표시
194
- yield script_content, summary_content, thumbnail_url
195
-
196
- # Gradio 인터페이스
197
- with gr.Blocks() as demo:
198
- gr.Markdown("## YouTube 슀크립트 μΆ”μΆœ 및 μš”μ•½ 도ꡬ")
199
- youtube_url_input = gr.Textbox(label="YouTube URL μž…λ ₯")
200
- analyze_button = gr.Button("λΆ„μ„ν•˜κΈ°")
201
- script_output = gr.HTML(label="원문 슀크립트")
202
- summary_output = gr.HTML(label="μš”μ•½")
203
- thumbnail_output = gr.Textbox(label="썸넀일 URL (640x480)") # 이 쀄 μΆ”κ°€
204
-
205
- analyze_button.click(
206
- analyze,
207
- inputs=[youtube_url_input],
208
- outputs=[script_output, summary_output, thumbnail_output] # thumbnail_output μΆ”κ°€
209
- )
210
-
211
- if __name__ == "__main__":
212
- demo.launch()
 
 
 
 
 
 
1
  import os
2
+ import requests
3
+ import json
4
+ import gradio as gr
5
  import re
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ # Hugging Face 환경 변수로부터 RapidAPI 키와 호스트 가져오기
8
+ AA_KEY = os.getenv("AA_KEY")
9
+ AA_HOST = "youtube-transcriptor.p.rapidapi.com"
10
+
11
+ # 유튜브 URL에서 비디오 ID를 추출하는 함수
12
+ def get_video_id(youtube_url):
13
+ # 유튜브 URL 또는 youtu.be 단축 URL에서 video_id 추출
14
+ video_id_match = re.search(r"(?<=v=)[^#&?]*", youtube_url) or re.search(r"(?<=youtu.be/)[^#&?]*", youtube_url)
15
+ return video_id_match.group(0) if video_id_match else None
16
+
17
+ # 자막 언어 우선순위 리스트
18
+ LANGUAGE_PRIORITY = ['ko', 'en', 'ja', 'zh']
19
+
20
+ # 유튜브 자막을 요청하는 함수 (언어 우선순위를 적용하여 시도)
21
+ def get_youtube_transcript(youtube_url):
22
+ # 비디오 ID 추출
23
+ video_id = get_video_id(youtube_url)
24
+ if video_id is None:
25
+ return {"error": "잘못된 유튜브 URL입니다. 비디오 ID를 찾을 수 없습니다."}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
+ url = "https://youtube-transcriptor.p.rapidapi.com/transcript"
 
28
 
29
+ headers = {
30
+ "x-rapidapi-key": AA_KEY,
31
+ "x-rapidapi-host": AA_HOST
32
+ }
33
+
34
+ # 언어 우선순위에 따라 순차적으로 요청을 시도
35
+ for lang in LANGUAGE_PRIORITY:
36
+ querystring = {"video_id": video_id, "lang": lang}
37
+ response = requests.get(url, headers=headers, params=querystring)
38
+
39
+ # 상태 코드 확인 및 전체 응답 반환
40
+ if response.status_code == 200:
41
+ try:
42
+ data = response.json()
43
+
44
+ # 전체 응닡 데이터λ₯Ό κ·ΈλŒ€λ‘œ λ°˜ν™˜
45
+ return {"language": lang, "data": data}
46
+
47
+ except json.JSONDecodeError as e:
48
+ return {"error": f"JSON 디코딩 오류 발생: {str(e)}"}
49
+
50
+ # 모든 언어에서 자막을 찾지 못한 경우
51
+ return {"error": "우선순위 언어로 자막을 찾을 수 없습니다."}
52
+
53
+ # Gradio 인터페이스 정의
54
+ def youtube_transcript_interface(youtube_url):
55
+ # 자막 데이터 가져오기
56
+ transcript_data = get_youtube_transcript(youtube_url)
 
 
57
 
58
+ # 결과 출력
59
+ return json.dumps(transcript_data, ensure_ascii=False, indent=2)
60
+
61
+ # Gradio 인터페이스 생성
62
+ interface = gr.Interface(
63
+ fn=youtube_transcript_interface,
64
+ inputs="text",
65
+ outputs="text",
66
+ title="YouTube 자막 추출기",
67
+ description="유튜브 URL을 입력하세요."
68
+ )
69
+
70
+ # Gradio 인터페이스 실행
71
+ interface.launch()