ehristoforu committed · verified
Commit 9aa0a43 · Parent: 1f93ba0

Update app.py

Files changed (1): app.py (+126, -109)
app.py CHANGED
@@ -1,126 +1,143 @@
- #!/usr/bin/env python

- import os
- from collections.abc import Iterator
- from threading import Thread

- import gradio as gr
- import spaces
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

- DESCRIPTION = "# FluentlyLM Prinum"

- MAX_MAX_NEW_TOKENS = 2048
- DEFAULT_MAX_NEW_TOKENS = 1024
- MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

- model_id = "fluently-lm/FluentlyLM-Prinum"
- tokenizer = AutoTokenizer.from_pretrained(model_id)
- model = AutoModelForCausalLM.from_pretrained(
-     model_id,
-     device_map="auto",
-     torch_dtype=torch.bfloat16,
- )
- model.eval()

- @spaces.GPU(duration=120)
- def generate(
-     message: str,
-     chat_history: list[dict],
-     system_prompt: str = "",
-     max_new_tokens: int = 1024,
-     temperature: float = 0.7,
-     top_p: float = 0.8,
-     top_k: int = 20,
-     repetition_penalty: float = 1.05,
- ) -> Iterator[str]:
-     messages = []
-     if system_prompt:
-         messages.append({"role": "system", "content": system_prompt})
-     messages.extend(chat_history.copy())
-     messages.append({"role": "user", "content": message})

-     input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
-     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-     input_ids = input_ids.to(model.device)

-     streamer = TextIteratorStreamer(tokenizer, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
-     generate_kwargs = dict(
-         {"input_ids": input_ids},
-         streamer=streamer,
-         max_new_tokens=max_new_tokens,
-         do_sample=True,
-         top_p=top_p,
-         top_k=top_k,
-         temperature=temperature,
-         num_beams=1,
-         repetition_penalty=repetition_penalty,
    )
-     t = Thread(target=model.generate, kwargs=generate_kwargs)
-     t.start()

-     outputs = []
-     for text in streamer:
-         outputs.append(text)
-         yield "".join(outputs)

- demo = gr.ChatInterface(
-     fn=generate,
-     additional_inputs=[
-         gr.Textbox(label="System Prompt", value="You are FluentlyLM, created by Project Fluently. You are a helpful assistant."),
-         gr.Slider(
-             label="Max new tokens",
-             minimum=1,
-             maximum=MAX_MAX_NEW_TOKENS,
-             step=1,
-             value=DEFAULT_MAX_NEW_TOKENS,
-         ),
-         gr.Slider(
-             label="Temperature",
-             minimum=0.1,
-             maximum=4.0,
-             step=0.1,
-             value=0.65,
-         ),
-         gr.Slider(
-             label="Top-p (nucleus sampling)",
-             minimum=0.05,
-             maximum=1.0,
-             step=0.05,
-             value=0.8,
-         ),
-         gr.Slider(
-             label="Top-k",
-             minimum=1,
-             maximum=1000,
-             step=1,
-             value=20,
-         ),
-         gr.Slider(
-             label="Repetition penalty",
-             minimum=1.0,
-             maximum=2.0,
-             step=0.05,
-             value=1.05,
-         ),
-     ],
-     stop_btn=None,
-     examples=[
-         ["Hi! How are you?"],
-     ],
-     cache_examples=False,
-     type="messages",
-     description=DESCRIPTION,
-     css_paths="style.css",
-     fill_height=True,
- )

if __name__ == "__main__":
    demo.launch()
 
+ import spaces
+ import json
+ import subprocess
+ from llama_cpp import Llama
+ from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
+ from llama_cpp_agent.providers import LlamaCppPythonProvider
+ from llama_cpp_agent.chat_history import BasicChatHistory
+ from llama_cpp_agent.chat_history.messages import Roles
+ import gradio as gr
+ from huggingface_hub import hf_hub_download

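+ # Download the quantized FluentlyLM Prinum weights (Q4_K_M GGUF) into ./models at startup.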
+ hf_hub_download(
+     repo_id="fluently-lm/FluentlyLM-Prinum-Q4_K_M-GGUF",
+     filename="fluentlylm-prinum-q4_k_m.gguf",
+     local_dir="./models"
+ )

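+ # Streaming chat handler; @spaces.GPU allocates GPU time (up to 90 s) for each call.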
+ @spaces.GPU(duration=90)
+ def respond(
+     message,
+     history: list[tuple[str, str]],
+     model,
+     system_message,
+     max_tokens,
+     temperature,
+     top_p,
+     top_k,
+     repeat_penalty,
+ ):
+     chat_template = MessagesFormatterType.GEMMA_2

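+     # Load the GGUF model with llama-cpp-python (flash attention, GPU offload, 8K context).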
+     llm = Llama(
+         model_path=f"models/{model}",
+         flash_attn=True,
+         n_gpu_layers=81,
+         n_batch=1024,
+         n_ctx=8192,
+     )
+     provider = LlamaCppPythonProvider(llm)

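+     # Wrap the model in a LlamaCppAgent with the user-provided system prompt and message formatter.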
+     agent = LlamaCppAgent(
+         provider,
+         system_prompt=f"{system_message}",
+         predefined_messages_formatter_type=chat_template,
+         debug_output=True
+     )

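+     # Map the UI sliders onto the provider's sampling settings and enable streaming.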
+     settings = provider.get_provider_default_settings()
+     settings.temperature = temperature
+     settings.top_k = top_k
+     settings.top_p = top_p
+     settings.max_tokens = max_tokens
+     settings.repeat_penalty = repeat_penalty
+     settings.stream = True

+     messages = BasicChatHistory()

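+     # Replay the Gradio (user, assistant) history pairs into the agent's chat history.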
+     for msn in history:
+         user = {
+             'role': Roles.user,
+             'content': msn[0]
+         }
+         assistant = {
+             'role': Roles.assistant,
+             'content': msn[1]
+         }
+         messages.add_message(user)
+         messages.add_message(assistant)

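+     # Ask the agent for a streaming generator and yield the growing reply to the UI.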
+     stream = agent.get_chat_response(
+         message,
+         llm_sampling_settings=settings,
+         chat_history=messages,
+         returns_streaming_generator=True,
+         print_output=False
+     )

+     outputs = ""
+     for output in stream:
+         outputs += output
+         yield outputs

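+ # Build the Gradio ChatInterface that exposes the sampling parameters as additional inputs.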
+ def create_interface(model_name, description):
+     return gr.ChatInterface(
+         respond,
+         additional_inputs=[
+             gr.Textbox(value=model_name, label="Model", interactive=False),
+             gr.Textbox(value="You are Fluently, a helpful assistant. You should think step-by-step. You should answer all questions by prioritizing the principle of openness", label="System message"),
+             gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
+             gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+             gr.Slider(
+                 minimum=0.1,
+                 maximum=1.0,
+                 value=0.95,
+                 step=0.05,
+                 label="Top-p",
+             ),
+             gr.Slider(
+                 minimum=0,
+                 maximum=100,
+                 value=40,
+                 step=1,
+                 label="Top-k",
+             ),
+             gr.Slider(
+                 minimum=0.0,
+                 maximum=2.0,
+                 value=1.1,
+                 step=0.1,
+                 label="Repetition penalty",
+             ),
+         ],
+         retry_btn="Retry",
+         undo_btn="Undo",
+         clear_btn="Clear",
+         submit_btn="Send",
+         title=f"{model_name}",
+         description=description,
+         examples=[
+             ["Hi! How are you?"],
+             ["Write a short story about a scary island."],
+             ["Prove that the force of gravity applies to all bodies in the Universe."],
+             ["Give examples of how a quantum computer works."],
+         ],
+         chatbot=gr.Chatbot(
+             label=None,
+             scale=1,
+             likeable=True,
+             show_copy_button=True
+         )
    )
 
 
+ description = """<h2 align="center"><b>FluentlyLM Prinum</b> Demo</h2>"""
+ interface = create_interface('fluentlylm-prinum-q4_k_m.gguf', description)

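+ # Mount the chat interface inside a top-level Blocks app.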
+ demo = gr.Blocks()

+ with demo:
+     interface.render()

if __name__ == "__main__":
    demo.launch()