UNIVA-Bllossom/DeepSeek-llama3.1-Bllossom-8B

import os
from threading import Thread
from typing import Iterator
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import re

# CUDA 사용 가능 여부 확인
if not torch.cuda.is_available():
    raise RuntimeError("CUDA GPU를 찾을 수 없습니다. GPU가 필요합니다.")

print(f"사용 가능한 GPU: {torch.cuda.get_device_name(0)}")
print(f"CUDA 버전: {torch.version.cuda}")

# GPU 메모리 캐시 초기화
torch.cuda.empty_cache()

MAX_INPUT_TOKEN_LENGTH = 8192
DEFAULT_MAX_NEW_TOKENS = 4096

# 모델과 토크나이저 로딩
print("모델을 로딩중입니다...")
model = AutoModelForCausalLM.from_pretrained(
    "UNIVA-Bllossom/DeepSeek-llama3.1-Bllossom-8B",
    torch_dtype=torch.bfloat16, 
    device_map="auto",
    trust_remote_code=True
)

print("토크나이저를 로딩중입니다...")
tokenizer = AutoTokenizer.from_pretrained(
    "UNIVA-Bllossom/DeepSeek-llama3.1-Bllossom-8B",  # 예제랑은 다르게 토크나이저 맞춤 UNIVA-Bllossom/DeepSeek-llama3.3-Bllossom-70B
    trust_remote_code=True
)
tokenizer.use_default_system_prompt = False

system_prompt = '''You are a highly capable assistant. For every user question, follow these instructions exactly:
    1. First, think through the problem step-by-step in English. Enclose all of your internal reasoning between <think> and </think> tags. This chain-of-thought should detail your reasoning process.
    2. After the closing </think> tag, provide your final answer in Korean.
    3. Do not include any additional text or commentary outside of this format.
    4. Your output should strictly follow this structure:

<think>
[Your detailed step-by-step reasoning in English]
</think>
<answer>
[Your final answer in Korean]
</answer>'''

def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 4096,
    temperature: float = 0.7,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    try:
        print("\n사용자:", message)
        conversation = []
        conversation.append({"role": "system", "content": system_prompt})
        for user, assistant in chat_history:
            conversation.extend([
                {"role": "user", "content": user},
                {"role": "assistant", "content": assistant}
            ])
        conversation.append({"role": "user", "content": message})

        inputs = tokenizer.apply_chat_template(
            conversation,
            return_tensors="pt",
            add_generation_prompt=True
        )

        print("\nAI 응답:")
        streamer = TextIteratorStreamer(
            tokenizer,
            timeout=120.0,
            skip_prompt=True,
            skip_special_tokens=False
        )
        
        generate_kwargs = dict(
            input_ids=inputs.to(model.device),
            attention_mask=torch.ones_like(inputs).to(model.device),
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

        t = Thread(target=model.generate, kwargs=generate_kwargs)
        t.start()

        outputs = []
        for text in streamer:
            if text is None:
                continue
            if "</s>" in text:
                text = text.replace("</s>", "")
            outputs.append(text)
            current_response = "".join(outputs)
            print(text, end="", flush=True)  # CLI에 실시간으로 한 글자씩 출력
            yield current_response
        
        print("\n" + "-" * 50)

    except Exception as e:
        error_msg = f"Error in generate: {str(e)}"
        print(error_msg)
        yield "죄송합니다. 응답 생성 중 오류가 발생했습니다. 다시 시도해 주세요."

# Gradio 인터페이스 수정
with gr.Blocks(css="""
    .message-wrap {margin-bottom: 10px;}
    details {margin: 10px 0;}
    summary {cursor: pointer; padding: 5px;}
    summary:hover {background-color: #f5f5f5;}
""") as demo:
    gr.Markdown("## DeepSeek Bllossom 챗봇")
    
    chatbot = gr.Chatbot(
        label="DeepSeek Bllossom 챗봇",
        height=600,
        bubble_full_width=False,
        render_markdown=True,
        show_label=False
    )
    
    with gr.Row():
        msg = gr.Textbox(
            label="메시지 입력",
            placeholder="메시지를 입력하세요...",
            lines=2,
            scale=9
        )
        submit = gr.Button("전송", variant="primary", scale=1)
    
    with gr.Accordion("고급 설정", open=False):
        max_new_tokens = gr.Slider(
            label="최대 토큰 수",
            minimum=1,
            maximum=8192,
            step=1,
            value=4096
        )
        temperature = gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=2.0,
            step=0.1,
            value=0.7
        )
        top_p = gr.Slider(
            label="Top-p",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9
        )
    
    clear = gr.Button("대화 내용 지우기")

    # 이벤트 핸들러
    def user(message, history):
        return "", history + [[message, None]]

    def bot(history, max_new_tokens, temperature, top_p):
        try:
            message = history[-1][0]
            history[-1][1] = ""
            for content in generate(
                message,
                history[:-1],
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p
            ):
                history[-1][1] = content
                yield history
        except Exception as e:
            print(f"Error in bot: {str(e)}")
            history[-1][1] = "죄송합니다. 응답 생성 중 오류가 발생했습니다. 다시 시도해 주세요."
            yield history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, [chatbot, max_new_tokens, temperature, top_p], chatbot
    )
    submit.click(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, [chatbot, max_new_tokens, temperature, top_p], chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

    # 초기 인사말
    demo.load(lambda: [[None, "안녕하세요! 무엇을 도와드릴까요?"]], None, chatbot)

demo.queue(max_size=20).launch(share=True)
UNIVA-Bllossom
/

DeepSeek-llama3.1-Bllossom-8B

chat, stream code