# AIRider's picture
# Update app.py
# 1966659 verified
# raw
# history blame
# 13.2 kB
import os
import random
import time
import re
import json
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import openai
import gradio as gr
from fpdf import FPDF as FPDF2
from datetime import datetime
from zoneinfo import ZoneInfo
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from langchain.callbacks import get_openai_callback
import sys
# API key configuration: read from the environment (never hard-coded).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Configure the openai SDK globally; will be None if the env var is unset.
openai.api_key = OPENAI_API_KEY
def setup_session():
    """Create a requests Session that retries transient gateway errors.

    Retries up to 5 times with exponential backoff on HTTP 502/503/504.
    Returns None on failure — callers (fetch_references) check for that.
    """
    try:
        session = requests.Session()
        retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
        adapter = HTTPAdapter(max_retries=retries)
        # Mount on both schemes so retries still apply if a crawl follows a
        # redirect to a plain-http URL (original covered only https://).
        session.mount('https://', adapter)
        session.mount('http://', adapter)
        return session
    except Exception:
        return None
def generate_naver_search_url(query):
    """Build a Naver blog-tab search URL for *query*.

    The parameters are percent-encoded with urlencode so queries containing
    spaces or reserved characters ('&', '=', '#', ...) produce a valid URL.
    The original concatenated raw values, which silently corrupted the URL
    for such inputs.
    """
    from urllib.parse import urlencode  # stdlib, used only here
    base_url = "https://search.naver.com/search.naver?"
    params = {"ssc": "tab.blog.all", "sm": "tab_jum", "query": query}
    return base_url + urlencode(params)
def crawl_blog_content(url, session):
    """Fetch a Naver blog page and return its cleaned main text.

    Looks for the 'se-main-container' div that holds the post body.
    Best-effort: returns "" on any HTTP error or parsing failure.
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Referer": "https://search.naver.com/search.naver",
        }
        # Randomized politeness delay between requests.
        time.sleep(random.uniform(1, 2))
        response = session.get(url, headers=headers)
        if response.status_code != 200:
            return ""
        soup = BeautifulSoup(response.content, "html.parser")
        main = soup.find("div", attrs={'class': 'se-main-container'})
        return clean_text(main.get_text()) if main else ""
    except Exception:
        return ""
def crawl_naver_search_results(url, session):
    """Scrape up to 10 blog hits from a Naver search-results page.

    Returns a list of {"์ œ๋ชฉ": title, "๋งํฌ": link} dicts. blog.naver links
    are rewritten to the m. host — presumably to match the markup
    crawl_blog_content parses (verify against the blog crawler).
    Best-effort: returns [] on any failure.
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Referer": "https://search.naver.com/search.naver",
        }
        response = session.get(url, headers=headers)
        if response.status_code != 200:
            return []
        soup = BeautifulSoup(response.content, "html.parser")
        hits = []
        for item in soup.find_all("li", class_=re.compile("bx.*")):
            for detail in item.find_all("div", class_="detail_box"):
                for title_area in detail.find_all("div", class_="title_area"):
                    title = title_area.text.strip()
                    for anchor in title_area.find_all("a", href=True):
                        link = anchor["href"]
                        if "blog.naver" in link:
                            link = link.replace("https://", "https://m.")
                        hits.append({"์ œ๋ชฉ": title, "๋งํฌ": link})
                        # Hard cap of 10 results (replaces the original's
                        # cascade of break statements).
                        if len(hits) >= 10:
                            return hits
        return hits
    except Exception:
        return []
def clean_text(text):
    """Collapse every run of whitespace to a single space and trim the ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
def fetch_references(topic):
    """Return 3 reference texts ("์ œ๋ชฉ: ...\n๋‚ด์šฉ: ...") for *topic*.

    Crawls a Naver blog search, samples 3 random hits, and fetches each
    post body. On failure, returns 3 copies of a Korean error message.
    """
    search_url = generate_naver_search_url(topic)
    session = setup_session()
    if session is None:
        return ["์„ธ์…˜ ์„ค์ • ์‹คํŒจ"] * 3
    hits = crawl_naver_search_results(search_url, session)
    if len(hits) < 3:
        return ["์ถฉ๋ถ„ํ•œ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๋ฅผ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค."] * 3
    picked = random.sample(hits, 3)
    return [
        f"์ œ๋ชฉ: {hit['์ œ๋ชฉ']}\n๋‚ด์šฉ: {crawl_blog_content(hit['๋งํฌ'], session)}"
        for hit in picked
    ]
def fetch_crawl_results(query):
    """Return the three crawled reference texts for *query* as separate values."""
    first, second, third = fetch_references(query)[:3]
    return first, second, third
def generate_blog_post(query, prompt_template):
    """Generate a Korean product-review blog post for *query*.

    Crawls three Naver blog posts as references, prompts gpt-4o-mini with
    *prompt_template* plus the references, and retries once if the output
    is too short or contains banned closing phrases.

    Returns a 4-tuple (post_text, ref1, ref2, ref3) — exactly matching the
    four Gradio output components wired to this callback. (The original
    returned a 5th length value with no corresponding output, which breaks
    the Gradio event handler.) On error, the first element carries the
    error message and the references are empty strings.
    """
    try:
        target_length = 1500  # desired character count for the body
        max_attempts = 2  # one initial generation + at most one retry
        ref1, ref2, ref3 = fetch_references(query)
        chat = ChatOpenAI(
            model_name="gpt-4o-mini",
            temperature=0.85,
            max_tokens=10000,
            top_p=0.9,
            frequency_penalty=0.5,
            presence_penalty=0,
            n=1,
            request_timeout=60
        )
        base_template = prompt_template + """
์ฃผ์ œ: {query}
์ฐธ๊ณ ๊ธ€1: {ref1}
์ฐธ๊ณ ๊ธ€2: {ref2}
์ฐธ๊ณ ๊ธ€3: {ref3}
๋‹ค์Œ ํ‘œํ˜„์€ ์‚ฌ์šฉํ•˜์ง€ ๋งˆ์„ธ์š”: ์—ฌ๋Ÿฌ๋ถ„, ๋งˆ์ง€๋ง‰์œผ๋กœ, ๊ฒฐ๋ก ์ ์œผ๋กœ, ๊ฒฐ๊ตญ, ์ข…ํ•ฉ์ ์œผ๋กœ, ๋”ฐ๋ผ์„œ, ๋งˆ๋ฌด๋ฆฌ, ์š”์•ฝ
์•ฝ {target_length}์ž๋กœ ์ž‘์„ฑํ•ด์ฃผ์„ธ์š”.
"""
        # Banned closing phrases ("dear readers", "finally", "in conclusion", ...).
        unwanted_patterns = [
            r'\b์—ฌ๋Ÿฌ๋ถ„[,.]?\s*',
            r'\b(๋งˆ์ง€๋ง‰์œผ๋กœ|๊ฒฐ๋ก ์ ์œผ๋กœ|๊ฒฐ๊ตญ|์ข…ํ•ฉ์ ์œผ๋กœ|๋”ฐ๋ผ์„œ|๋งˆ๋ฌด๋ฆฌ|์š”์•ฝ)[,.]?\s*'
        ]
        generated_post = ""
        for attempt in range(max_attempts):
            # Rebuild the prompt/chain every attempt: ChatPromptTemplate has
            # no mutable `.template` attribute (the original's
            # `prompt.template += ...` never reached the model), so retry
            # feedback must be folded into the template string before
            # construction.
            prompt = ChatPromptTemplate.from_template(base_template)
            chain = LLMChain(llm=chat, prompt=prompt)
            with get_openai_callback() as cb:
                result = chain.run(query=query, ref1=ref1, ref2=ref2, ref3=ref3, target_length=target_length)
            generated_post = result.strip()
            too_short = len(generated_post) < target_length
            has_unwanted = any(
                re.search(pattern, generated_post, re.IGNORECASE)
                for pattern in unwanted_patterns
            )
            # Stop as soon as the post is long enough and clean.
            if not too_short and not has_unwanted:
                break
            if attempt == 0:
                if too_short:
                    base_template += f"\n\nํ˜„์žฌ ๊ธ€์ž์ˆ˜๋Š” {len(generated_post)}์ž์ž…๋‹ˆ๋‹ค. ์•ฝ {target_length - len(generated_post)}์ž๋ฅผ ์ถ”๊ฐ€๋กœ ์ž‘์„ฑํ•˜์—ฌ ์ด {target_length}์ž๊ฐ€ ๋˜๋„๋ก ํ•ด์ฃผ์„ธ์š”."
                if has_unwanted:
                    base_template += "\n\n์›์น˜ ์•Š๋Š” ํ‘œํ˜„์ด ํฌํ•จ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค. ํ•ด๋‹น ํ‘œํ˜„์„ ์ œ๊ฑฐํ•˜๊ณ  ์ž์—ฐ์Šค๋Ÿฝ๊ฒŒ ๊ธ€์„ ๋‹ค์‹œ ์ž‘์„ฑํ•ด์ฃผ์„ธ์š”."
        final_post = f"์ฃผ์ œ: {query}\n\n{generated_post}"
        return final_post, ref1, ref2, ref3
    except Exception as e:
        return f"๋ธ”๋กœ๊ทธ ๊ธ€ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}", "", "", ""
# PDF ํด๋ž˜์Šค ๋ฐ ๊ด€๋ จ ํ•จ์ˆ˜ ์ •์˜
class PDF(FPDF2):
def __init__(self):
super().__init__()
current_dir = os.path.dirname(__file__)
self.add_font("NanumGothic", "", os.path.join(current_dir, "NanumGothic.ttf"))
self.add_font("NanumGothic", "B", os.path.join(current_dir, "NanumGothicBold.ttf"))
self.add_font("NanumGothicExtraBold", "", os.path.join(current_dir, "NanumGothicExtraBold.ttf"))
self.add_font("NanumGothicLight", "", os.path.join(current_dir, "NanumGothicLight.ttf"))
def header(self):
self.set_font('NanumGothic', '', 10)
def footer(self):
self.set_y(-15)
self.set_font('NanumGothic', '', 8)
self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')
def save_to_pdf(blog_post, user_topic):
    """Render *blog_post* to a PDF file and return its filename.

    The first line of *blog_post* is treated as the title, the rest as the
    body. The filename is prefixed with a Korea-Standard-Time timestamp and
    derived from *user_topic* via format_filename().
    """
    pdf = PDF()
    pdf.add_page()
    lines = blog_post.split('\n')
    title = lines[0].strip()
    content = '\n'.join(lines[1:]).strip()
    # Timestamp in Korea Standard Time for the filename prefix.
    now = datetime.now(ZoneInfo("Asia/Seoul"))
    date_str = now.strftime("%y%m%d")
    time_str = now.strftime("%H%M")
    filename = f"{date_str}_{time_str}_{format_filename(user_topic)}.pdf"
    pdf.set_font("NanumGothic", 'B', size=14)
    pdf.cell(0, 10, title, ln=True, align='C')
    pdf.ln(10)
    pdf.set_font("NanumGothic", '', size=11)
    pdf.multi_cell(0, 5, content)
    # Fixed: the log line previously printed a literal placeholder instead
    # of the generated filename.
    print(f"Saving PDF as: {filename}")
    pdf.output(filename)
    return filename
def format_filename(text):
    """Make *text* filename-safe: drop everything except word chars,
    whitespace and hyphens, then cap at 50 characters and trim."""
    safe = re.sub(r'[^\w\s-]', '', text)
    return safe[:50].strip()
def save_content_to_pdf(blog_post, user_topic):
    """Gradio callback: delegate PDF rendering to save_to_pdf()."""
    return save_to_pdf(blog_post=blog_post, user_topic=user_topic)
# ๊ธฐ๋ณธ ํ”„๋กฌํ”„ํŠธ ํ…œํ”Œ๋ฆฟ
DEFAULT_PROMPT_TEMPLATE = """
[๋ธ”๋กœ๊ทธ ๊ธ€ ์ž‘์„ฑ ๊ธฐ๋ณธ ๊ทœ์น™]
1. ๋ฐ˜๋“œ์‹œ ํ•œ๊ธ€๋กœ ์ž‘์„ฑํ•˜๋ผ
2. ์ฃผ์–ด์ง„ ์ฐธ๊ณ ๊ธ€์„ ๋ฐ”ํƒ•์œผ๋กœ 1๊ฐœ์˜ ์ƒํ’ˆ๋ฆฌ๋ทฐํ˜•(Product Review) ๋ธ”๋กœ๊ทธ๋ฅผ ์ž‘์„ฑ
3. ์ฃผ์ œ์™€ ์ œ๋ชฉ์„ ์ œ์™ธํ•œ ๊ธ€์ด 1500๋‹จ์–ด ์ด์ƒ์ด ๋˜๋„๋ก ์ž‘์„ฑ
4. ๊ธ€์˜ ์ œ๋ชฉ์„ ์ƒํ’ˆ๋ฆฌ๋ทฐํ˜• ๋ธ”๋กœ๊ทธ ํ˜•ํƒœ์— ๋งž๋Š” ์ ์ ˆํ•œ ์ œ๋ชฉ์œผ๋กœ ์ถœ๋ ฅ
- ์ฐธ๊ณ ๊ธ€์˜ ์ œ๋ชฉ๋„ ์ฐธ๊ณ ํ•˜๋˜, ๋™์ผํ•˜๊ฒŒ ์ž‘์„ฑํ•˜์ง€ ๋ง ๊ฒƒ
5. ๋ฐ˜๋“œ์‹œ ๋งˆํฌ๋‹ค์šด ํ˜•์‹์ด ์•„๋‹Œ ์ˆœ์ˆ˜ํ•œ ํ…์ŠคํŠธ๋กœ๋งŒ ์ถœ๋ ฅํ•˜๋ผ
6. ๋‹ค์‹œํ•œ๋ฒˆ ์ฐธ๊ณ ๊ธ€์„ ๊ฒ€ํ† ํ•˜์—ฌ ๋‚ด์šฉ์„ ์ถฉ๋ถ„ํžˆ ๋ฐ˜์˜ํ•˜๋˜, ์ฐธ๊ณ ๊ธ€์˜ ๊ธ€์„ ๊ทธ๋Œ€๋กœ ์žฌ์ž‘์„ฑํ•˜์ง€๋Š” ๋ง ๊ฒƒ
[๋ธ”๋กœ๊ทธ ๊ธ€ ์ž‘์„ฑ ์„ธ๋ถ€ ๊ทœ์น™]
1. ์‚ฌ์šฉ์ž๊ฐ€ ์ž…๋ ฅํ•œ ์ฃผ์ œ์™€ ์ฃผ์–ด์ง„ ์ฐธ๊ณ ๊ธ€ 3๊ฐœ๋ฅผ ๋ฐ”ํƒ•์œผ๋กœ ์ƒํ’ˆ๋ฆฌ๋ทฐํ˜• ๋ธ”๋กœ๊ทธ ๊ธ€ 1๊ฐœ๋ฅผ ์ž‘์„ฑํ•˜๋ผ
2. ์ฃผ์–ด์ง„ ๋ชจ๋“  ๊ธ€์„ ๋ถ„์„ํ•˜์—ฌ ํ•˜๋‚˜์˜ ๋Œ€์ฃผ์ œ๋ฅผ ์„ ์ •ํ•˜๋ผ(1๊ฐœ์˜ ์ฐธ๊ณ ๊ธ€์— ์น˜์šฐ์น˜์ง€ ๋ง๊ณ  ๋‹ค์–‘ํ•œ ๋‚ด์šฉ์„ ๋‹ด์„๊ฒƒ)
3. ์—ฌ๋Ÿฌ๊ฐ€์ง€ ์ƒํ’ˆ์ด๋ผ๋ฉด ์ƒํ’ˆ 1๊ฐœ์— ์น˜์šฐ์นœ ๋ฆฌ๋ทฐ๋ฅผ ์ž‘์„ฑํ•˜์ง€ ๋ง ๊ฒƒ.
4. ๋Œ€์ฃผ์ œ์— ๋งž๊ฒŒ ๊ธ€์˜ ๋งฅ๋ฝ์„ ์œ ์ง€ํ•˜๋ผ
5. ์ฐธ๊ณ ๊ธ€์— ์ž‘์„ฑ๋œ ์ƒํ’ˆ๊ณผ ๊ธฐ๋Šฅ์— ์ง‘์ค‘ํ•˜์—ฌ ์ž‘์„ฑํ•˜๋ผ
6. ์‹ค์ œ ๋‚ด๊ฐ€ ์‚ฌ์šฉํ•ด๋ณด๊ณ  ๊ฒฝํ—˜ํ•œ ๋‚ด์šฉ์„ ์ž‘์„ฑํ•œ ๋ฆฌ๋ทฐ ํ˜•ํƒœ๋กœ ๊ธ€์„ ์ž‘์„ฑ
7. ๋‚ด์šฉ์€ ๊ธ์ •์ ์œผ๋กœ ์ž‘์„ฑํ•˜๋˜, ์ƒํ’ˆ์ด ๋‹๋ณด์ด๋„๋ก ์ž‘์„ฑ(์ œํ’ˆ์ด ์—ฌ๋Ÿฌ๊ฐœ์ผ ๊ฒฝ์šฐ, ํ•˜๋‚˜์˜ ์ƒํ’ˆ์— ์น˜์šฐ์น˜์ง€ ๋ง ๊ฒƒ)
8. ์ƒํ’ˆ์˜ ๊ฐ€์น˜๋ฅผ ๊ณ ๊ฐ์—๊ฒŒ ์–ดํ•„ํ•˜๋ผ.
9. ๊ธ€์˜ ์•ž, ๋’ค ๋ฌธ์žฅ์ด ์ž์—ฐ์Šค๋Ÿฝ๊ฒŒ ์ด์–ด์ง€๋„๋ก ์ž‘์„ฑ
10. ์–ดํˆฌ๋Š” ์ฃผ์–ด์ง„ ์ฐธ๊ณ ๊ธ€ 3๊ฐ€์ง€์˜ ์–ดํˆฌ๋ฅผ ์ ์ ˆํžˆ ๋ฐ˜์˜ํ•˜๋ผ
- ํŠนํžˆ ๋ฌธ์žฅ์˜ ๋ ๋ถ€๋ถ„์„ ์ ์ ˆํžˆ ๋ฐ˜์˜(๊ฐ€๊ธ‰์  '~์š”'๋กœ ๋๋‚˜๋„๋ก ์ž‘์„ฑ)
- ๋„ˆ๋ฌด ๋”ฑ๋”ฑํ•˜์ง€ ์•Š๊ฒŒ ํŽธ์•ˆํ•˜๊ฒŒ ์ฝ์„ ์ˆ˜ ์žˆ๋„๋ก ์ž์—ฐ์Šค๋Ÿฌ์šด ๋Œ€ํ™”์ฒด๋ฅผ ๋ฐ˜์˜
- ๋‹จ์–ด ์„ ํƒ์€ ์‰ฌ์šด ํ•œ๊ตญ์–ด ์–ดํœ˜๋ฅผ ์‚ฌ์šฉํ•˜๊ณ  ์‚ฌ์ „์‹ํ‘œํ˜„, ์˜ค๋ž˜๋œ ํ‘œํ˜„์€ ์ œ์™ธํ•˜๋ผ
[์ œ์™ธ ๊ทœ์น™]
1. ๋ฐ˜๋“œ์‹œ ์ฐธ๊ณ ๊ธ€์˜ ํฌํ•จ๋œ ๋งํฌ(URL)๋Š” ์ œ์™ธ
2. ์ฐธ๊ณ ๊ธ€์—์„œ '๋งํฌ๋ฅผ ํ™•์ธํ•ด์ฃผ์„ธ์š”'์™€ ๊ฐ™์€ ๋งํฌ ์ด๋™์˜ ๋ฌธ๊ตฌ๋Š” ์ œ์™ธ
3. ์ฐธ๊ณ ๊ธ€์— ์žˆ๋Š” ์ž‘์„ฑ์ž, ํ™”์ž, ์œ ํŠœ๋ฒ„, ๊ธฐ์ž(Writer, speaker, YouTuber, reporter)์˜ ์ด๋ฆ„, ์• ์นญ, ๋‹‰๋„ค์ž„(Name, Nkickname)์€ ๋ฐ˜๋“œ์‹œ ์ œ์™ธ
4. '์—…์ฒด๋กœ ๋ถ€ํ„ฐ ์ œ๊ณต ๋ฐ›์•„์„œ ์ž‘์„ฑ', '์ฟ ํŒก ํŒŒํŠธ๋„ˆ์Šค'๋“ฑ์˜ ํ‘œํ˜„์„ ๋ฐ˜๋“œ์‹œ ์ œ์™ธํ•˜๋ผ.
5. ๊ธ€์˜ ๊ตฌ์กฐ๊ฐ€ ๋“œ๋Ÿฌ๋‚˜๊ฒŒ ์ž‘์„ฑํ•˜์ง€ ๋ง ๊ฒƒ(์‹œ์ž‘, ๋์— ๋Œ€ํ•œ ํ‘œํ˜„)
- ์—ฌ๋Ÿฌ๋ถ„,
- ๋งˆ์ง€๋ง‰์œผ๋กœ, ๊ฒฐ๋ก ์ ์œผ๋กœ, ๊ฒฐ๊ตญ, ์ข…ํ•ฉ์ ์œผ๋กœ, ๋”ฐ๋ผ์„œ, ๋งˆ๋ฌด๋ฆฌ, ์š”์•ฝ,
"""
# Gradio ์•ฑ ์ƒ์„ฑ
with gr.Blocks() as iface:
gr.Markdown("# ๋ธ”๋กœ๊ทธ ๊ธ€ ์ž‘์„ฑ๊ธฐ_๋ฆฌ๋ทฐ_๊ธฐ๋Šฅ์ง‘์ค‘ํ˜•")
gr.Markdown("์ฃผ์ œ๋ฅผ ์ž…๋ ฅํ•˜๊ณ  ๋ธ”๋กœ๊ทธ ๊ธ€ ์ƒ์„ฑ ๋ฒ„ํŠผ์„ ๋ˆ„๋ฅด๋ฉด ์ž๋™์œผ๋กœ ๋ธ”๋กœ๊ทธ ๊ธ€์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.")
query_input = gr.Textbox(lines=1, placeholder="๋ธ”๋กœ๊ทธ ๊ธ€์˜ ์ฃผ์ œ๋ฅผ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”...", label="์ฃผ์ œ")
prompt_input = gr.Textbox(lines=10, value=DEFAULT_PROMPT_TEMPLATE, label="ํ”„๋กฌํ”„ํŠธ ํ…œํ”Œ๋ฆฟ", visible=True)
generate_button = gr.Button("๋ธ”๋กœ๊ทธ ๊ธ€ ์ƒ์„ฑ")
output_text = gr.Textbox(label="์ƒ์„ฑ๋œ ๋ธ”๋กœ๊ทธ ๊ธ€")
ref1_text = gr.Textbox(label="์ฐธ๊ณ ๊ธ€ 1", lines=10, visible=True)
ref2_text = gr.Textbox(label="์ฐธ๊ณ ๊ธ€ 2", lines=10, visible=True)
ref3_text = gr.Textbox(label="์ฐธ๊ณ ๊ธ€ 3", lines=10, visible=True)
save_pdf_button = gr.Button("PDF๋กœ ์ €์žฅ")
pdf_output = gr.File(label="์ƒ์„ฑ๋œ PDF ํŒŒ์ผ")
generate_button.click(
generate_blog_post,
inputs=[query_input, prompt_input],
outputs=[output_text, ref1_text, ref2_text, ref3_text],
show_progress=True
)
save_pdf_button.click(
save_content_to_pdf,
inputs=[output_text, query_input],
outputs=[pdf_output],
show_progress=True
)
# Gradio ์•ฑ ์‹คํ–‰
if __name__ == "__main__":
iface.launch()