import gradio as gr
import requests
import os
import json
from collections import deque

# Read the API token from the environment
TOKEN = os.getenv("HUGGINGFACE_API_TOKEN")
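# (Set it before launching, e.g. `export HUGGINGFACE_API_TOKEN=...` in the shell,
# or as a secret in the hosting environment.)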

# Fail fast if the token is missing
if not TOKEN:
    raise ValueError("API token is not set. Please set the HUGGINGFACE_API_TOKEN environment variable.")

# Queue that holds the conversation history (keeps at most the 10 most recent turns)
memory = deque(maxlen=10)
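# Note: this deque is module-level, so all concurrent sessions share a single
# history; per-session isolation would need something like gr.State instead.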

def respond(
    message,
    history: list[tuple[str, str]],  # supplied by Gradio but unused; `memory` tracks context instead
    system_message="AI Assistant Role",
    max_tokens=512,
    temperature=0.7,
    top_p=0.95,
):
    # Prepend a fixed prefix to the user-configurable system message
    system_prefix = "System: Respond in the same language as the input (English, Korean, Chinese, Japanese, etc.)."
    full_system_message = f"{system_prefix} {system_message}"

    # Record the new user message; the assistant's reply is filled in as it streams back
    memory.append((message, None))

    messages = [{"role": "system", "content": full_system_message}]

    # λ©”λͺ¨λ¦¬μ—μ„œ λŒ€ν™” 기둝을 가져와 λ©”μ‹œμ§€ λͺ©λ‘μ— μΆ”κ°€
    for val in memory:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    headers = {
        "Authorization": f"Bearer {TOKEN}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "stream": True,  # request server-sent events so tokens arrive incrementally
        "messages": messages
    }

    response = requests.post(
        "https://api-inference.huggingface.co/v1/chat/completions",
        headers=headers, json=payload, stream=True,
    )

    response_text = ""
    for line in response.iter_lines():
        if not line:
            continue
        decoded = line.decode("utf-8")
        # Each SSE event is prefixed with "data: "; the stream ends with "data: [DONE]"
        if not decoded.startswith("data: "):
            continue
        data = decoded[len("data: "):]
        if data.strip() == "[DONE]":
            break
        chunk_json = json.loads(data)
        # Streamed chunks carry the token increment under "delta", not "message"
        delta = chunk_json["choices"][0].get("delta", {})
        content = delta.get("content")
        if content:
            response_text += content
            # Keep the accumulated reply paired with the user message in memory
            memory[-1] = (message, response_text)
            yield response_text

theme = "Nymbo/Nymbo_Theme"

# Gradio ChatInterface setup
demo = gr.ChatInterface(
    fn=respond,
    theme=theme,
    additional_inputs=[
        gr.Textbox(value="AI Assistant Role", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    demo.queue(concurrency_limit=20).launch(max_threads=20)