myezrag

Running

App Files Files Community

myezrag / app-in-apikey.py

ginipick

Rename app.py to app-in-apikey.py

d5a415c verified 4 months ago

raw

history blame contribute delete

35 kB

	import gradio as gr
	from huggingface_hub import InferenceClient
	import os
	import pandas as pd
	from typing import List, Dict, Tuple
	import json
	import io
	import traceback
	import csv
	# HuggingFace 클라이언트 대신 OpenAI 클라이언트 사용
	from openai import OpenAI
	import os

	# Inference API client setup: a Hugging Face InferenceClient for the Cohere
	# Command R+ model, authenticated with the HF_TOKEN environment variable.
	# NOTE(review): `hf_client` is not referenced anywhere else in the visible part of this file.
	hf_client = InferenceClient(
	"CohereForAI/c4ai-command-r-plus-08-2024", token=os.getenv("HF_TOKEN")
	)

	def load_code(filename: str) -> str:
	    """Return the full contents of ``filename`` decoded as UTF-8.

	    This function does not raise for read failures. Instead it returns a
	    Korean error message in place of the file contents: a "file not found"
	    message when the path does not exist, or a generic read-error message
	    (with the exception text appended) for any other failure.
	    """
	    try:
	        with open(filename, 'r', encoding='utf-8') as file:
	            return file.read()
	    except FileNotFoundError:
	        return f"{filename} 파일을 찾을 수 없습니다."
	    except Exception as e:
	        return f"파일을 읽는 중 오류가 발생했습니다: {str(e)}"

	def load_parquet(filename: str) -> str:
	    """Return a Markdown table previewing the first 10 rows of a Parquet file.

	    The file is read with the pyarrow engine. This function does not raise
	    for read failures: a Korean "file not found" message is returned when
	    the path does not exist, and a generic read-error message (with the
	    exception text appended) is returned for any other failure.
	    """
	    try:
	        df = pd.read_parquet(filename, engine='pyarrow')
	        return df.head(10).to_markdown(index=False)
	    except FileNotFoundError:
	        return f"{filename} 파일을 찾을 수 없습니다."
	    except Exception as e:
	        return f"파일을 읽는 중 오류가 발생했습니다: {str(e)}"


	# OpenAI client setup; the API key is read from the OPEN_AI environment variable.
	client = OpenAI(api_key=os.getenv("OPEN_AI"))

	# Revised respond function.
	# NOTE: a later `def respond` in this module rebinds the name, so this earlier
	# definition is shadowed once the module has finished loading.
	def respond(message: str, history: List[Dict[str, str]], system_message: str = "", max_tokens: int = 4000, temperature: float = 0.5, top_p: float = 0.9, parquet_data: str = None) -> str:
	    """Stream an answer to ``message`` from the OpenAI chat API.

	    This is a generator: each yielded value is the cumulative response so far,
	    passed through ``clean_response``. On failure a single error string is
	    yielded instead of raising.

	    Args:
	        message: The current user question.
	        history: Prior chat turns as ``{"role", "content"}`` dicts; only the
	            last three entries are forwarded to the model.
	        system_message: Accepted for interface compatibility; not used by
	            this implementation.
	        max_tokens, temperature, top_p: Forwarded to the completion call.
	        parquet_data: Optional JSON string of the uploaded dataset. When it
	            parses, ``DataFrame.describe(include='all')`` is appended to the
	            system prompt; a parse failure is printed and otherwise ignored.
	    """
	    # System prompt.
	    system_prefix = """반드시 한글로 답변할 것. 너는 업로드된 데이터를 기반으로 질문에 답변하는 역할을 한다.

	주요 지침:
	1. 질문과 직접 관련된 내용만 간단명료하게 답변할 것
	2. 이전 답변과 중복되는 내용은 제외할 것
	3. 불필요한 예시나 부연 설명은 하지 말 것
	4. 동일한 내용을 다른 표현으로 반복하지 말 것
	5. 핵심 정보만 전달할 것
	"""

	    if parquet_data:
	        try:
	            df = pd.read_json(io.StringIO(parquet_data))
	            data_summary = df.describe(include='all').to_string()
	            system_prefix += f"\n\n데이터 요약:\n{data_summary}"
	        except Exception as e:
	            # Best effort: answer without the data summary if it cannot be built.
	            print(f"데이터 로드 오류: {str(e)}")

	    # Build the conversation, keeping only the most recent context.
	    messages = [{"role": "system", "content": system_prefix}]
	    recent_history = history[-3:] if history else []
	    messages.extend({"role": chat["role"], "content": chat["content"]} for chat in recent_history)
	    messages.append({"role": "user", "content": message})

	    try:
	        # OpenAI API call.
	        response = client.chat.completions.create(
	            model="gpt-4o-mini",
	            messages=messages,
	            max_tokens=max_tokens,
	            temperature=temperature,
	            top_p=top_p,
	            stream=True
	        )

	        full_response = ""
	        for chunk in response:
	            # A streamed chunk may carry an empty `choices` list; skip it
	            # instead of failing on `choices[0]`.
	            if chunk.choices and chunk.choices[0].delta.content:
	                full_response += chunk.choices[0].delta.content
	                # Clean the accumulated text before showing it.
	                yield clean_response(full_response)

	    except Exception as e:
	        error_message = f"추론 오류: {str(e)}"
	        print(error_message)
	        yield error_message

	def clean_response(text: str) -> str:
	    """Remove repeated sentences from ``text`` and ensure it ends with a period.

	    A sentence boundary is whitespace that directly follows a period, so
	    periods inside tokens such as ``3.14`` are not treated as boundaries.
	    Sentences are compared case-insensitively with whitespace collapsed and
	    any trailing period ignored; only the first occurrence is kept. The
	    whitespace that originally preceded a kept sentence (including newlines)
	    is preserved. An empty or whitespace-only input returns ``""``.
	    """
	    import re  # function-scope import: `re` is not imported at module level

	    # With one capture group, re.split alternates
	    # [sentence, separator, sentence, separator, ..., sentence].
	    pieces = re.split(r'(?<=\.)(\s+)', text.strip())
	    sentences = pieces[0::2]
	    separators = [''] + pieces[1::2]  # separators[i] precedes sentences[i]

	    kept = []
	    seen = set()
	    for separator, sentence in zip(separators, sentences):
	        # Normalise for comparison only; the original sentence text is what is kept.
	        normalized = ' '.join(sentence.rstrip('.').lower().split())
	        if not normalized or normalized in seen:
	            continue
	        seen.add(normalized)
	        if kept:
	            kept.append(separator)
	        kept.append(sentence)

	    cleaned_text = ''.join(kept)
	    if cleaned_text and not cleaned_text.endswith('.'):
	        cleaned_text += '.'

	    return cleaned_text

	def remove_duplicates(text: str) -> str:
	    """Return ``text`` with repeated sentences removed.

	    The text is split on ``.``; each fragment is stripped, empty fragments are
	    dropped, and only the first occurrence of each fragment (exact,
	    case-sensitive match) is kept. The survivors are joined with ``". "``;
	    no trailing period is added.
	    """
	    stripped = (fragment.strip() for fragment in text.split('.'))
	    # dict.fromkeys keeps first-seen order while discarding repeats.
	    unique_sentences = dict.fromkeys(fragment for fragment in stripped if fragment)
	    return '. '.join(unique_sentences)

	def upload_csv(file_path: str) -> Tuple[str, str]:
	    """Convert an uploaded CSV file into a snappy-compressed Parquet file.

	    The CSV must contain the columns ``id``, ``text``, ``label`` and
	    ``metadata``. Duplicate rows are dropped, missing values are replaced by
	    ``''``, and the columns are cast to int32 / string / category / string.
	    The Parquet file is written to the current working directory under the
	    CSV's base name with a ``.parquet`` extension.

	    Returns:
	        ``(status_message, parquet_filename)``. On any failure (including
	        missing required columns) the filename is ``""`` and the message
	        describes the problem; this function does not raise for those
	        failures.
	    """
	    try:
	        # Read the CSV file.
	        df = pd.read_csv(file_path, sep=',')

	        # Check the required columns.
	        required_columns = {'id', 'text', 'label', 'metadata'}
	        missing_columns = required_columns - set(df.columns)
	        if missing_columns:
	            # sorted(): set iteration order is arbitrary, so sort for a stable message.
	            return f"CSV 파일에 다음 필수 컬럼이 누락되었습니다: {', '.join(sorted(missing_columns))}", ""

	        # Data cleansing.
	        df = df.drop_duplicates().fillna('')
	        # Optimise the column types.
	        df = df.astype({'id': 'int32', 'text': 'string', 'label': 'category', 'metadata': 'string'})

	        # Convert to a Parquet file.
	        parquet_filename = os.path.splitext(os.path.basename(file_path))[0] + '.parquet'
	        df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
	        return f"{parquet_filename} 파일이 성공적으로 업로드되고 변환되었습니다.", parquet_filename
	    except Exception as e:
	        return f"CSV 파일 업로드 및 변환 중 오류가 발생했습니다: {str(e)}", ""

	def upload_parquet(file_path: str) -> Tuple[str, str, str]:
	    """Load a Parquet file and build a Markdown summary plus a JSON payload.

	    The summary lists the record count and column names, per-column
	    statistics (mean/min/max for numeric columns; unique count and, when
	    there are fewer than 10 unique values, the top 5 values for text-like
	    columns), and a 10-row Markdown preview.

	    Returns:
	        ``(status_message, markdown_summary, records_json)``. On failure the
	        last two items are ``""`` and the message contains the exception
	        text; this function does not raise for those failures.
	    """
	    try:
	        # Read the Parquet file.
	        df = pd.read_parquet(file_path, engine='pyarrow')

	        # Basic dataset information.
	        summary = [
	            "### 데이터셋 기본 정보:",
	            f"- 총 레코드 수: {len(df)}",
	            # map(str, ...): column labels are not guaranteed to be strings.
	            f"- 컬럼 목록: {', '.join(map(str, df.columns))}",
	            "\n### 컬럼별 정보:",
	        ]

	        # Per-column statistics.
	        for col in df.columns:
	            series = df[col]
	            # Use dtype predicates rather than matching 'int64'/'float64' by
	            # name, so other widths (e.g. the int32 ids written by upload_csv)
	            # are recognised. Booleans are excluded: describe() on a bool
	            # column has no 'mean' entry.
	            is_numeric = (
	                pd.api.types.is_numeric_dtype(series)
	                and not pd.api.types.is_bool_dtype(series)
	            )
	            is_text = (
	                pd.api.types.is_object_dtype(series)
	                or pd.api.types.is_string_dtype(series)
	                or isinstance(series.dtype, pd.CategoricalDtype)
	            )
	            if is_numeric:
	                stats = series.describe()
	                summary.append(f"\n{col} (수치형):")
	                summary.append(f"- 평균: {stats['mean']:.2f}")
	                summary.append(f"- 최소: {stats['min']}")
	                summary.append(f"- 최대: {stats['max']}")
	            elif is_text:
	                unique_count = series.nunique()
	                summary.append(f"\n{col} (텍스트):")
	                summary.append(f"- 고유값 수: {unique_count}")
	                if unique_count < 10:  # only list values when there are few of them
	                    summary.append("- 상위 5개 값:")
	                    for val, count in series.value_counts().head(5).items():
	                        summary.append(f" • {val}: {count}개")

	        # Preview.
	        summary.append("\n### 데이터 미리보기:")
	        summary.append(df.head(10).to_markdown(index=False))

	        parquet_content = "\n".join(summary)

	        # Serialise the DataFrame to a JSON string (used by the Q&A step).
	        parquet_json = df.to_json(orient='records', force_ascii=False)

	        return "Parquet 파일이 성공적으로 업로드되었습니다.", parquet_content, parquet_json
	    except Exception as e:
	        return f"Parquet 파일 업로드 중 오류가 발생했습니다: {str(e)}", "", ""


	def respond(message: str, history: List[Dict[str, str]], system_message: str = "", max_tokens: int = 4000, temperature: float = 0.5, top_p: float = 0.9, parquet_data: str = None) -> str:
	    """Stream an answer to ``message`` from the OpenAI chat API.

	    This is a generator: each yielded value is the cumulative response so far,
	    passed through ``clean_response``. Any exception is caught, printed with
	    its traceback, and reported by yielding a single error string.

	    Args:
	        message: The current user question.
	        history: Prior chat turns as ``{"role", "content"}`` dicts. Only the
	            last three are forwarded. If the final entry is already the
	            current user message, it is dropped so that the message is not
	            sent twice.
	        system_message: System prompt used when no dataset is supplied
	            (falls back to a default advisor prompt when empty).
	        max_tokens, temperature, top_p: Forwarded to the completion call.
	        parquet_data: Optional JSON string of the uploaded dataset. When
	            given, the system prompt is built from per-column statistics and
	            the first 20 rows, and ``system_message`` is not used.
	    """
	    try:
	        if parquet_data:
	            # Convert the JSON string back into a DataFrame.
	            df = pd.read_json(io.StringIO(parquet_data))

	            # Build the dataset context.
	            columns_info = []
	            for col in df.columns:
	                series = df[col]
	                is_numeric = (
	                    pd.api.types.is_numeric_dtype(series)
	                    and not pd.api.types.is_bool_dtype(series)
	                )
	                if is_numeric:
	                    col_type = "수치형"
	                    stats = series.describe()
	                    col_info = f"- {col} ({col_type}): 평균={stats['mean']:.2f}, 최소={stats['min']}, 최대={stats['max']}"
	                else:
	                    col_type = "텍스트"
	                    unique_count = series.nunique()
	                    col_info = f"- {col} ({col_type}): 고유값 {unique_count}개"
	                columns_info.append(col_info)

	            data_context = f"""
	현재 업로드된 데이터셋 정보:
	- 총 {len(df)} 개의 레코드
	- 컬럼 정보:
	{chr(10).join(columns_info)}

	샘플 데이터:
	{df.head(20).to_string()}
	"""
	            system_prompt = f"""당신은 업로드된 데이터셋을 분석하고 질문에 답변하는 AI 어시스턴트입니다.

	주요 지침:
	1. 반드시 한글로 답변할 것
	2. 데이터셋의 실제 내용을 기반으로 정확하게 답변할 것
	3. 데이터에 없는 내용은 추측하지 말 것
	4. 답변은 간단명료하게 할 것
	5. 데이터 프라이버시를 고려하여 답변할 것

	데이터셋 구조 설명:
	{chr(10).join(columns_info)}

	참고할 데이터 샘플:
	{data_context}
	"""
	        else:
	            system_prompt = system_message or "너는 AI 조언자 역할이다."

	        messages = [{"role": "system", "content": system_prompt}]

	        # Add the recent conversation. A caller may already have appended the
	        # current user turn to `history`; drop it so it is not sent twice.
	        prior_turns = list(history or [])
	        if (
	            prior_turns
	            and prior_turns[-1].get("role") == "user"
	            and prior_turns[-1].get("content") == message
	        ):
	            prior_turns.pop()
	        for chat in prior_turns[-3:]:
	            messages.append({"role": chat["role"], "content": chat["content"]})

	        messages.append({"role": "user", "content": message})

	        # OpenAI API call.
	        response = client.chat.completions.create(
	            model="gpt-4-0125-preview",
	            messages=messages,
	            max_tokens=max_tokens,
	            temperature=temperature,
	            top_p=top_p,
	            stream=True
	        )

	        full_response = ""
	        for chunk in response:
	            # A streamed chunk may carry an empty `choices` list; skip it
	            # instead of failing on `choices[0]`.
	            if chunk.choices and chunk.choices[0].delta.content:
	                full_response += chunk.choices[0].delta.content
	                yield clean_response(full_response)

	    except Exception as e:
	        error_message = f"응답 생성 중 오류 발생: {str(e)}"
	        print(f"{error_message}\n{traceback.format_exc()}")
	        yield error_message

	def text_to_parquet(text: str) -> Tuple[str, str, str]:
	    """Parse ``id,text,label,metadata`` lines and write them to a Parquet file.

	    Each non-empty line is parsed on its own as one CSV record, so quoted
	    fields may contain commas. A line is used only when it has at least four
	    fields and its first field is a run of digits; any fields beyond the
	    third are re-joined with ``,`` to form the metadata. Other lines (for
	    example a header row) are skipped.

	    The result is written to ``text_to_parquet.parquet`` in the current
	    working directory.
	    NOTE(review): the output filename is fixed, so each call overwrites the
	    previous file.

	    Returns:
	        ``(status_message, markdown_preview, parquet_filename)``. When no
	        line could be parsed, or on any error, the last two items are ``""``;
	        this function does not raise for those failures.
	    """
	    try:
	        data = []

	        for raw_line in text.split('\n'):
	            line = raw_line.strip()
	            if not line:
	                continue
	            try:
	                # Parse with the csv module rather than a comma regex so that
	                # a quoted field such as "Hello, world" stays in one piece.
	                fields = next(csv.reader([line], skipinitialspace=True))
	                if len(fields) < 4 or not fields[0].strip().isdigit():
	                    continue

	                data.append({
	                    'id': int(fields[0].strip()),
	                    'text': fields[1].strip(),
	                    'label': fields[2].strip(),
	                    'metadata': ','.join(fields[3:]).strip()
	                })
	            except (ValueError, csv.Error, StopIteration) as e:
	                print(f"라인 파싱 오류: {line}\n{str(e)}")
	                continue

	        if not data:
	            return "변환할 데이터가 없습니다.", "", ""

	        # Build the DataFrame and set the column types.
	        df = pd.DataFrame(data).astype({
	            'id': 'int32',
	            'text': 'string',
	            'label': 'string',
	            'metadata': 'string'
	        })

	        # Convert to a Parquet file.
	        parquet_filename = 'text_to_parquet.parquet'
	        df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')

	        # Preview.
	        preview = df.to_markdown(index=False)

	        return (
	            f"{parquet_filename} 파일이 성공적으로 변환되었습니다. 총 {len(df)}개의 레코드가 처리되었습니다.",
	            preview,
	            parquet_filename
	        )

	    except Exception as e:
	        error_message = f"텍스트 변환 중 오류가 발생했습니다: {str(e)}"
	        print(f"{error_message}\n{traceback.format_exc()}")
	        return error_message, "", ""

	# preprocess_text_with_llm revised as well.
	# NOTE: the definition of the same name that follows in this module rebinds it,
	# so this earlier definition is shadowed once the module has finished loading.
	def preprocess_text_with_llm(input_text: str) -> str:
	    """Ask the LLM to convert free text into ``id,text,label,metadata`` CSV.

	    Returns the model output with surrounding whitespace removed, or a
	    Korean message when the input is blank, the output does not contain any
	    CSV row with at least four fields, or the API call fails. This function
	    does not raise for those failures.
	    """
	    if not input_text.strip():
	        return "입력 텍스트가 비어있습니다."

	    system_prompt = """반드시 한글(한국어)로 답변하시오. 당신은 데이터 전처리 전문가입니다. 입력된 텍스트를 CSV 데이터셋 형식으로 변환하세요.

	규칙:
	1. 출력 형식: id,text,label,metadata
	2. id: 1부터 시작하는 순차적 번호
	3. text: 의미 있는 단위로 분리된 텍스트
	4. label: 텍스트의 주제나 카테고리를 아래 기준으로 정확하게 한 개만 선택
	- Historical_Figure (역사적 인물)
	- Military_History (군사 역사)
	- Technology (기술)
	- Politics (정치)
	- Culture (문화)
	5. metadata: 날짜, 출처 등 추가 정보"""

	    invalid_csv_message = "LLM이 올바른 CSV 형식을 생성하지 못했습니다. 다시 시도해주세요."

	    try:
	        response = client.chat.completions.create(
	            model="gpt-4-0125-preview",
	            messages=[
	                {"role": "system", "content": system_prompt},
	                {"role": "user", "content": input_text}
	            ],
	            max_tokens=4000,
	            temperature=0.1,
	            stream=True
	        )

	        # Skip chunks with an empty `choices` list or no text delta.
	        full_response = "".join(
	            chunk.choices[0].delta.content
	            for chunk in response
	            if chunk.choices and chunk.choices[0].delta.content
	        )

	        # Only trim whitespace: sentence-level clean-up would rewrite the
	        # periods and line breaks that are part of the CSV payload.
	        processed_text = full_response.strip()

	        # CSV validation. csv.reader is lazy, so the rows must actually be
	        # read for malformed input to be detected.
	        try:
	            rows = [row for row in csv.reader(io.StringIO(processed_text)) if row]
	        except csv.Error:
	            return invalid_csv_message
	        if not any(len(row) >= 4 for row in rows):
	            return invalid_csv_message
	        return processed_text

	    except Exception as e:
	        error_message = f"전처리 중 오류가 발생했습니다: {str(e)}"
	        print(error_message)
	        return error_message

	# preprocess_text_with_llm revised as well.
	def preprocess_text_with_llm(input_text: str) -> str:
	    """Ask the LLM to convert free text into ``id,text,label,metadata`` CSV.

	    Returns the model output with surrounding whitespace removed, or a
	    Korean message when the input is blank, the output does not contain any
	    CSV row with at least four fields, or the API call fails. This function
	    does not raise for those failures.
	    """
	    if not input_text.strip():
	        return "입력 텍스트가 비어있습니다."

	    system_prompt = """반드시 한글(한국어)로 답변하시오. 당신은 데이터 전처리 전문가입니다. 입력된 텍스트를 CSV 데이터셋 형식으로 변환하세요.

	규칙:
	1. 출력 형식: id,text,label,metadata
	2. id: 1부터 시작하는 순차적 번호
	3. text: 의미 있는 단위로 분리된 텍스트
	4. label: 텍스트의 주제나 카테고리를 아래 기준으로 정확하게 한 개만 선택
	- Historical_Figure (역사적 인물)
	- Military_History (군사 역사)
	- Technology (기술)
	- Politics (정치)
	- Culture (문화)
	5. metadata: 날짜, 출처 등 추가 정보"""

	    invalid_csv_message = "LLM이 올바른 CSV 형식을 생성하지 못했습니다. 다시 시도해주세요."

	    try:
	        response = client.chat.completions.create(
	            model="gpt-4o-mini",
	            messages=[
	                {"role": "system", "content": system_prompt},
	                {"role": "user", "content": input_text}
	            ],
	            max_tokens=4000,
	            temperature=0.1,
	            stream=True
	        )

	        # Skip chunks with an empty `choices` list or no text delta.
	        full_response = "".join(
	            chunk.choices[0].delta.content
	            for chunk in response
	            if chunk.choices and chunk.choices[0].delta.content
	        )

	        # Only trim whitespace: sentence-level clean-up would rewrite the
	        # periods and line breaks that are part of the CSV payload.
	        processed_text = full_response.strip()

	        # CSV validation. csv.reader is lazy, so the rows must actually be
	        # read for malformed input to be detected.
	        try:
	            rows = [row for row in csv.reader(io.StringIO(processed_text)) if row]
	        except csv.Error:
	            return invalid_csv_message
	        if not any(len(row) >= 4 for row in rows):
	            return invalid_csv_message
	        return processed_text

	    except Exception as e:
	        error_message = f"전처리 중 오류가 발생했습니다: {str(e)}"
	        print(error_message)
	        return error_message

	# CSS passed to gr.Blocks(css=css) below: hides the footer, fixes the height of
	# the chatbot containers, and styles the text inputs, the Parquet upload area
	# and the introductory description.
	css = """
	footer {
	visibility: hidden;
	}
	#chatbot-container, #chatbot-data-upload {
	height: 700px;
	overflow-y: scroll;
	}
	#chatbot-container .message, #chatbot-data-upload .message {
	font-size: 14px;
	}
	/* 입력창 배경색 및 글자색 변경 */
	textarea, input[type="text"] {
	background-color: #ffffff; /* 흰색 배경 */
	color: #000000; /* 검정색 글자 */
	}
	/* 파일 업로드 영역 높이 조절 */
	#parquet-upload-area {
	max-height: 150px;
	overflow-y: auto;
	}
	/* 초기 설명 글씨 크기 조절 */
	#initial-description {
	font-size: 14px;
	}
	"""

	# Gradio Blocks interface definition.
	with gr.Blocks(css=css) as demo:
	    gr.Markdown("# MyEzRAG: LLM이 나만의 데이터로 학습한 콘텐츠 생성/답변", elem_id="initial-description")
	    gr.Markdown(
	        "### '사용 방법' 탭을 통해 자세한 이용 방법을 참고하세요.\n"
	        "### Tip) '예제'를 통해 다양한 활용 방법을 체험하고 응용해 보세요, 데이터셋 업로드시 미리보기는 10건만 출력",
	        elem_id="initial-description"
	    )

	    # Tab 1: chat with the LLM about the uploaded dataset.
	    with gr.Tab("My 데이터셋+LLM"):
	        gr.Markdown("### LLM과 대화하기")
	        chatbot_data_upload = gr.Chatbot(label="챗봇", type="messages", elem_id="chatbot-data-upload")
	        msg_data_upload = gr.Textbox(label="메시지 입력", placeholder="여기에 메시지를 입력하세요...")
	        send_data_upload = gr.Button("전송")

	        with gr.Accordion("시스템 프롬프트 및 옵션 설정", open=False):
	            system_message = gr.Textbox(label="System Message", value="너는 AI 조언자 역할이다.")
	            max_tokens = gr.Slider(minimum=1, maximum=8000, value=1000, label="Max Tokens")
	            temperature = gr.Slider(minimum=0, maximum=1, value=0.7, label="Temperature")
	            top_p = gr.Slider(minimum=0, maximum=1, value=0.9, label="Top P")

	        # Holds the JSON string of the uploaded dataset between events.
	        parquet_data_state = gr.State()

	        def handle_message_data_upload(message: str, history: List[Dict[str, str]], system_message: str, max_tokens: int, temperature: float, top_p: float, parquet_data: str):
	            """Stream the assistant's reply into the chatbot.

	            Generator yielding ``(chat_history, textbox_value)`` pairs; the
	            textbox value is always ``""`` so the input box is cleared.
	            A question identical (ignoring case and surrounding whitespace)
	            to one of the user turns in the last three history entries is
	            answered with a fixed notice instead of calling the model.
	            """
	            history = history or []

	            # Duplicate-question check.
	            recent_questions = [chat['content'].strip().lower() for chat in history[-3:] if chat['role'] == 'user']
	            if message.strip().lower() in recent_questions:
	                yield history + [{"role": "assistant", "content": "동일한 질문이 최근에 있었습니다. 다른 질문을 해주세요."}], ""
	                return

	            try:
	                # Snapshot the history BEFORE appending the new user turn:
	                # respond() appends `message` itself, so passing the updated
	                # history would send the question to the model twice.
	                prior_history = list(history)
	                history.append({"role": "user", "content": message})
	                response_gen = respond(
	                    message,
	                    prior_history,
	                    system_message,
	                    max_tokens,
	                    # A fixed low temperature is used here; the Temperature
	                    # slider value is received but not forwarded.
	                    temperature=0.3,
	                    top_p=top_p,
	                    parquet_data=parquet_data
	                )

	                partial_response = ""
	                for partial in response_gen:
	                    partial_response = partial
	                    display_history = history + [{"role": "assistant", "content": partial_response}]
	                    yield display_history, ""

	                history.append({"role": "assistant", "content": partial_response})
	            except Exception as e:
	                response = f"오류 발생: {str(e)}"
	                history.append({"role": "assistant", "content": response})
	                yield history, ""

	        send_data_upload.click(
	            handle_message_data_upload,
	            inputs=[
	                msg_data_upload,
	                chatbot_data_upload,
	                system_message,
	                max_tokens,
	                temperature,
	                top_p,
	                parquet_data_state,  # passes the uploaded dataset to the handler
	            ],
	            outputs=[chatbot_data_upload, msg_data_upload],
	            queue=True
	        )

	        # Example prompts.
	        with gr.Accordion("예제", open=False):
	            gr.Examples(
	                examples=[
	                    ["업로드된 데이터셋에 대해 요약 설명하라."],
	                    ["업로드된 데이터셋 파일을 학습 데이터로 활용하여, 본 서비스를 SEO 최적화하여 블로그 포스트(개요, 배경 및 필요성, 기존 유사 제품/서비스와 비교하여 특장점, 활용처, 가치, 기대효과, 결론을 포함)로 4000 토큰 이상 작성하라"],
	                    ["업로드된 데이터셋 파일을 학습 데이터로 활용하여, 사용 방법과 차별점, 특징, 강점을 중심으로 4000 토큰 이상 유튜브 영상 스크립트 형태로 작성하라"],
	                    ["업로드된 데이터셋 파일을 학습 데이터로 활용하여, 제품 상세 페이지 형식의 내용을 4000 토큰 이상 자세히 설명하라"],
	                    ["업로드된 데이터셋 파일을 학습 데이터로 활용하여, FAQ 20건을 상세하게 작성하라. 4000토큰 이상 사용하라."],
	                    ["업로드된 데이터셋 파일을 학습 데이터로 활용하여, 특허 출원에 활용할 기술 및 비즈니스 모델 측면을 포함하여 특허 출원서 구성에 맞게 혁신적인 창의 발명 내용을 중심으로 4000 토큰 이상 작성하라."],
	                ],
	                inputs=msg_data_upload,
	                label="예제 선택",
	            )

	        # Parquet upload section, placed at the bottom of the tab.
	        gr.Markdown("### Parquet 파일 업로드")
	        with gr.Row():
	            with gr.Column():
	                parquet_upload = gr.File(
	                    label="Parquet 파일 업로드", type="filepath", elem_id="parquet-upload-area"
	                )
	                parquet_upload_button = gr.Button("업로드")
	                parquet_upload_status = gr.Textbox(label="업로드 상태", interactive=False)
	                parquet_preview_chat = gr.Markdown(label="Parquet 파일 미리보기")

	        def handle_parquet_upload(file_path: str):
	            """Load the uploaded Parquet file; return (status, preview, JSON state)."""
	            message, parquet_content, parquet_json = upload_parquet(file_path)
	            if parquet_json:
	                return message, parquet_content, parquet_json
	            # No usable data: clear the preview and the stored dataset.
	            return message, "", ""

	        parquet_upload_button.click(
	            handle_parquet_upload,
	            inputs=parquet_upload,
	            outputs=[parquet_upload_status, parquet_preview_chat, parquet_data_state]
	        )

	    # Tab 2: CSV to Parquet conversion.
	    with gr.Tab("CSV to My 데이터셋"):
	        gr.Markdown("### CSV 파일 업로드 및 Parquet 변환")
	        with gr.Row():
	            with gr.Column():
	                csv_file = gr.File(label="CSV 파일 업로드", type="filepath")
	                upload_button = gr.Button("업로드 및 변환")
	                upload_status = gr.Textbox(label="업로드 상태", interactive=False)
	                parquet_preview = gr.Markdown(label="Parquet 파일 미리보기")
	                download_button = gr.File(label="Parquet 파일 다운로드", interactive=False)

	        def handle_csv_upload(file_path: str):
	            """Convert the uploaded CSV; return (status, preview, download path)."""
	            message, parquet_filename = upload_csv(file_path)
	            if parquet_filename:
	                parquet_content = load_parquet(parquet_filename)
	                return message, parquet_content, parquet_filename
	            return message, "", None

	        upload_button.click(
	            handle_csv_upload,
	            inputs=csv_file,
	            outputs=[upload_status, parquet_preview, download_button]
	        )

	    # Tab 3: text -> CSV -> Parquet conversion.
	    with gr.Tab("Text to My 데이터셋"):
	        gr.Markdown("### 텍스트를 입력하면 CSV로 변환 후 Parquet으로 자동 전환됩니다.")
	        with gr.Row():
	            with gr.Column():
	                text_input = gr.Textbox(
	                    label="텍스트 입력 (각 행은 `id,text,label,metadata` 형식으로 입력)",
	                    lines=10,
	                    placeholder='예: 1,"이순신","장군","거북선"\n2,"원균","장군","모함"\n3,"선조","왕","시기"\n4,"도요토미 히데요시","왕","침략"'
	                )
	                convert_button = gr.Button("변환 및 다운로드")
	                convert_status = gr.Textbox(label="변환 상태", interactive=False)
	                parquet_preview_convert = gr.Markdown(label="Parquet 파일 미리보기")
	                download_parquet_convert = gr.File(label="Parquet 파일 다운로드", interactive=False)

	        def handle_text_to_parquet(text: str):
	            """Convert the typed rows; return (status, preview, download path)."""
	            message, parquet_content, parquet_filename = text_to_parquet(text)
	            if parquet_filename:
	                return message, parquet_content, parquet_filename
	            return message, "", None

	        convert_button.click(
	            handle_text_to_parquet,
	            inputs=text_input,
	            outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
	        )

	    # Tab 4: LLM-based text preprocessing.
	    with gr.Tab("Text Preprocessing with LLM"):
	        gr.Markdown("### 텍스트를 입력하면 LLM이 데이터셋 형식에 맞게 전처리하여 출력합니다.")
	        with gr.Row():
	            with gr.Column():
	                raw_text_input = gr.Textbox(
	                    label="텍스트 입력",
	                    lines=15,
	                    placeholder="여기에 전처리할 텍스트를 입력하세요..."
	                )

	                with gr.Row():
	                    preprocess_button = gr.Button("전처리 실행", variant="primary")
	                    clear_button = gr.Button("초기화")

	                preprocess_status = gr.Textbox(
	                    label="전처리 상태",
	                    interactive=False,
	                    value="대기 중..."
	                )

	                processed_text_output = gr.Textbox(
	                    label="전처리된 데이터셋 출력",
	                    lines=15,
	                    interactive=False
	                )

	                # Parquet conversion and download section.
	                convert_to_parquet_button = gr.Button("Parquet으로 변환")
	                download_parquet = gr.File(label="변환된 Parquet 파일 다운로드")

	        def handle_text_preprocessing(input_text: str):
	            """Run LLM preprocessing; generator yielding (status, output) pairs."""
	            if not input_text or not input_text.strip():
	                # This function is a generator, so the message has to be
	                # yielded: a value passed to `return` would never reach the UI.
	                yield "입력 텍스트가 없습니다.", ""
	                return

	            try:
	                yield "전처리를 시작합니다...", ""

	                processed_text = preprocess_text_with_llm(input_text)

	                if processed_text:
	                    yield "전처리가 완료되었습니다.", processed_text
	                else:
	                    yield "전처리 결과가 없습니다.", ""

	            except Exception as e:
	                yield f"처리 중 오류가 발생했습니다: {str(e)}", ""

	        def clear_inputs():
	            """Reset the input box, the status box and the output box."""
	            return "", "대기 중...", ""

	        def convert_to_parquet_file(processed_text: str):
	            """Convert the preprocessed text; return (status, download path)."""
	            if not processed_text.strip():
	                return "변환할 텍스트가 없습니다.", None

	            try:
	                message, parquet_content, parquet_filename = text_to_parquet(processed_text)
	                if parquet_filename:
	                    return message, parquet_filename
	                return message, None
	            except Exception as e:
	                return f"Parquet 변환 중 오류 발생: {str(e)}", None

	        # Event handler wiring.
	        preprocess_button.click(
	            handle_text_preprocessing,
	            inputs=[raw_text_input],
	            outputs=[preprocess_status, processed_text_output],
	            queue=True
	        )

	        clear_button.click(
	            clear_inputs,
	            outputs=[raw_text_input, preprocess_status, processed_text_output]
	        )

	        convert_to_parquet_button.click(
	            convert_to_parquet_file,
	            inputs=[processed_text_output],
	            outputs=[preprocess_status, download_parquet]
	        )

	        # Example texts.
	        with gr.Accordion("예제 텍스트", open=False):
	            gr.Examples(
	                examples=[
	                    ["이순신은 조선 중기의 무신이다. 그는 임진왜란 당시 해군을 이끌었다. 거북선을 만들어 왜군과 싸웠다."],
	                    ["인공지능은 컴퓨터 과학의 한 분야이다. 기계학습은 인공지능의 하위 분야이다. 딥러닝은 기계학습의 한 방법이다."]
	                ],
	                inputs=raw_text_input,
	                label="예제 선택"
	            )

	    # Tab 5: usage guide.
	    with gr.Tab("📚 사용 방법"):
	        gr.Markdown("""
	# MyEzRAG 사용 가이드

	## 1️⃣ My 데이터셋+LLM 탭
	![Tab1](https://your-image-url.com/tab1.png)
	### 기능
	- 업로드된 Parquet 데이터셋을 기반으로 LLM과 대화
	- 데이터셋의 내용을 활용한 콘텐츠 생성

	### 사용 방법
	1. Parquet 파일 업로드 섹션에서 데이터셋 파일을 업로드
	2. 채팅창에 원하는 질문이나 요청사항 입력
	3. 예제 버튼을 활용하여 다양한 활용 사례 체험

	### 팁
	- 시스템 프롬프트 설정으로 응답 스타일 조정 가능
	- 상세한 질문일수록 더 정확한 답변 제공

	---

	## 2️⃣ CSV to My 데이터셋 탭
	![Tab2](https://your-image-url.com/tab2.png)
	### 기능
	- CSV 파일을 Parquet 형식으로 변환
	- 데이터 최적화 및 정제

	### 사용 방법
	1. CSV 파일 준비 (필수 컬럼: id, text, label, metadata)
	2. 파일 업로드 후 '업로드 및 변환' 버튼 클릭
	3. 변환된 Parquet 파일 다운로드

	### 주의사항
	- CSV 파일은 반드시 필수 컬럼을 포함해야 함
	- 인코딩은 UTF-8 권장

	---

	## 3️⃣ Text to My 데이터셋 탭
	![Tab3](https://your-image-url.com/tab3.png)
	### 기능
	- 텍스트 형식의 데이터를 Parquet으로 변환
	- 수동 데이터 입력 지원

	### 사용 방법
	1. 지정된 형식으로 텍스트 입력
	```
	1,"이순신","장군","거북선"
	2,"원균","장군","모함"
	```
	2. '변환 및 다운로드' 버튼 클릭
	3. 변환된 파일 확인 및 다운로드

	### 입력 형식
	- id: 순차적 번호
	- text: 실제 텍스트 내용
	- label: 분류 라벨
	- metadata: 부가 정보

	---

	## 4️⃣ Text Preprocessing with LLM 탭
	![Tab4](https://your-image-url.com/tab4.png)
	### 기능
	- LLM을 활용한 자동 텍스트 전처리
	- 구조화된 데이터셋 생성

	### 사용 방법
	1. 원문 텍스트 입력
	2. '전처리 실행' 버튼 클릭
	3. 결과 확인 후 필요시 Parquet 변환

	### 특징
	- 자동 레이블링
	- 문장 단위 분리
	- 중복 제거
	- 데이터 정규화

	## 💡 일반적인 팁
	- 각 탭의 예제를 참고하여 사용법 익히기
	- 데이터 품질이 좋을수록 더 나은 결과 제공
	- 오류 발생 시 입력 데이터 형식 확인
	- 대용량 처리 시 적절한 청크 크기로 분할 처리

	## ⚠️ 주의사항
	- 민감한 개인정보 포함하지 않기
	- 데이터 백업 권장
	- 네트워크 상태 확인
	- 브라우저 캐시 주기적 정리

	## 🔍 문제 해결
	- 오류 발생 시 입력 데이터 형식 확인
	- 파일 업로드 실패 시 파일 크기 및 형식 확인
	- 변환 실패 시 데이터 인코딩 확인
	- 응답이 느릴 경우 데이터 크기 조정
	""")


	    gr.Markdown("### Arxivgpt@gmail.com", elem_id="initial-description")

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch(share=True)