abdfajar707 committed on
Commit 31793e1
1 Parent(s): 395e9da

Update app.py

Files changed (1)
  1. app.py +54 -140
app.py CHANGED
@@ -1,101 +1,58 @@
 import torch
 import gradio as gr
-import os
-from threading import Thread
-from typing import Iterator
-from transformers import (
-    AutoModelForCausalLM,
-    BitsAndBytesConfig,
-    GenerationConfig,
-    AutoTokenizer,
-    TextIteratorStreamer,
-)
-from peft import AutoPeftModelForCausalLM
-
-
-#deklarasi
 max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
 dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
 load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
-#alpaca_prompt = """Berikut adalah instruksi yang deskripsikan tugas dan sepasang input dan konteksnya. Tulis response sesuai dengan permintaan.
 ### Instruction:
 {}
 ### Input:
 {}
 ### Response:
-#{}"""
-
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-MAX_MAX_NEW_TOKENS = 2048
-DEFAULT_MAX_NEW_TOKENS = 1024
-MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
-
-model_id = "abdfajar707/llama3_8B_lora_model_rkp_pn2025_v3"
-#tokenizer = LlamaTokenizer.from_pretrained(model_id)
-#model, tokenizer = AutoModelForCausalLM.from_pretrained(
-#    model_id,
-#    device_map="auto",
-#    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
-#)
-model = AutoPeftModelForCausalLM.from_pretrained(
-    model_id, # YOUR MODEL YOU USED FOR TRAINING
     load_in_4bit = load_in_4bit,
 )
-tokenizer = AutoTokenizer.from_pretrained(
-    model_id,
-    quantization_config=BitsAndBytesConfig(load_in_8bit=True)
-)
-
-model.config.sliding_window = 4096
-model.eval()
-
-#@spaces.GPU(duration=90)
-def generate(
-    message: str,
-    chat_history: list[tuple[str, str]],
-    max_new_tokens: int = 1024,
-    temperature: float = 0.6,
-    top_p: float = 0.9,
-    top_k: int = 50,
-    repetition_penalty: float = 1.2,
-) -> Iterator[str]:
-    conversation = []
-    for user, assistant in chat_history:
-        conversation.extend(
-            [
-                {"role": "user", "content": user},
-                {"role": "assistant", "content": assistant},
-            ]
-        )
-    conversation.append({"role": "user", "content": message})

-    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
-    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
-        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
-        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-    input_ids = input_ids.to(model.device)

-    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
-    generate_kwargs = dict(
-        {"input_ids": input_ids},
-        streamer=streamer,
-        max_new_tokens=max_new_tokens,
-        do_sample=True,
-        top_p=top_p,
-        top_k=top_k,
-        temperature=temperature,
-        num_beams=1,
-        repetition_penalty=repetition_penalty,
     )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()

-    outputs = []
-    for text in streamer:
-        outputs.append(text)
-        yield "".join(outputs)

 DESCRIPTION = '''
 <div style="padding: 5px; text-align: left; display: flex; flex-direction: column; align-items: left;">
@@ -131,64 +88,21 @@ h1 {
 border-radius: 100vh;
 }
 """
-chatbot=gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Interlinked Sytem ChatInterface')
-
-
-chat_interface = gr.ChatInterface(
-    fn=generate,
-    chatbot=chatbot,
-    additional_inputs=[
-        gr.Slider(
-            label="Max new tokens",
-            minimum=1,
-            maximum=MAX_MAX_NEW_TOKENS,
-            step=1,
-            value=DEFAULT_MAX_NEW_TOKENS,
-        ),
-        gr.Slider(
-            label="Temperature",
-            minimum=0.1,
-            maximum=4.0,
-            step=0.1,
-            value=0.6,
-        ),
-        gr.Slider(
-            label="Top-p (nucleus sampling)",
-            minimum=0.05,
-            maximum=1.0,
-            step=0.05,
-            value=0.9,
-        ),
-        gr.Slider(
-            label="Top-k",
-            minimum=1,
-            maximum=1000,
-            step=1,
-            value=50,
-        ),
-        gr.Slider(
-            label="Repetition penalty",
-            minimum=1.0,
-            maximum=2.0,
-            step=0.05,
-            value=1.2,
-        ),
-    ],
-    stop_btn=None,
-    examples=[
-        ["Apa yang dimaksud dengan RPJMN"],
-        ["Jelaskan tentang RPJMN 2020-2024"],
-        ["Apa peran RKP 2021 dan 20211 dalam RPJM 2020-2024"],
-        ["Apa saja program prioritas RPJMN 2020-2024"],
-    ],
-)
-
-
-
-with gr.Blocks(css=css, fill_height=True) as demo:
-    gr.Markdown(DESCRIPTION)
-    #gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
-    chat_interface.render()

-if __name__ == "__main__":
-    demo.queue(max_size=20).launch()
 
+from unsloth import FastLanguageModel
 import torch
 import gradio as gr

 max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
 dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
 load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
+alpaca_prompt = """Berikut adalah instruksi yang deskripsikan tugas dan sepasang input dan konteksnya. Tulis response sesuai dengan permintaan.
 ### Instruction:
 {}
 ### Input:
 {}
 ### Response:
+{}"""
+
+if True:
+    from unsloth import FastLanguageModel
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name = "abdfajar707/llama3_8B_lora_model_orpo_v1", # YOUR MODEL YOU USED FOR TRAINING
+        max_seq_length = max_seq_length,
+        dtype = dtype,
         load_in_4bit = load_in_4bit,
     )
+    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

+# Function to generate a response
+def generate_response(prompt, max_length=1000):
+    inputs = tokenizer(
+    [
+        alpaca_prompt.format(
+            prompt, # instruction
+            "", # input
+            "", # output - leave this blank for generation!
        )
+    ], return_tensors = "pt").to("cuda")
+    outputs = model.generate(**inputs, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    return response
+
+history = []
+def wrapper_chat_history(chat_history, history):
+    chat_history = history[1:]
+    return chat_history
+
+def converse(message, chat_history):
+    response = generate_response(message)
+    print(response)
+    user_msg = {"role": "user", "content": message}
+    history.append(user_msg)
+    ai_msg = {"role": "assistant", "content": response}
+    history.append(ai_msg)
+    return history[-1]["content"]

 DESCRIPTION = '''
 <div style="padding: 5px; text-align: left; display: flex; flex-direction: column; align-items: left;">
 
 border-radius: 100vh;
 }
 """
+chatbot=gr.Chatbot(height=500, placeholder=PLACEHOLDER, label='Interlinked System ChatInterface')
+
+with gr.Blocks(css=css) as interface:
+    chatbot=chatbot,
+    with gr.Row():
+        with gr.Column(scale=1):
+            gr.HTML('<img src="https://datahub.data.go.id/data/static/Kementerian%20PPN%20Bappenas%20Tanpa%20Teks.png" width="100px" alt="Image" style="max-width: 100%;">')
+    with gr.Row():
+        with gr.Column(scale=1, elem_id='col'):
+            gr.ChatInterface(fn=converse, title=("""
+            <center>
+            <h1>KemenPPN/Bappenas</h1>
+            <b>AI-Interlinked System/Bappenas GPT</b>
+            </center>
+            """
+            ))

+interface.launch()
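
Note on the new inference path: generate_response() above decodes the entire generated sequence, so the string handed back to the Gradio chat still contains the Alpaca template and the user's instruction, and max_length caps prompt and completion together. Below is a minimal sketch of how the answer alone could be extracted, assuming the model, tokenizer, and alpaca_prompt defined in the new app.py are already loaded; the helper name and the max_new_tokens value are illustrative and not part of this commit.

def extract_response_only(prompt, max_new_tokens=512):
    # Build the same Alpaca-style prompt used by generate_response().
    inputs = tokenizer(
        [alpaca_prompt.format(prompt, "", "")],  # instruction, empty input, empty output
        return_tensors="pt",
    ).to("cuda")
    # max_new_tokens counts only newly generated tokens, unlike max_length.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Slice off the prompt tokens so only the model's continuation is decoded.
    generated = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)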