nnpy committed
Commit 5e20d0c · verified · 1 Parent(s): 7033486

updated codes

Files changed (1)
  1. app.py +112 -33
app.py CHANGED
@@ -1,11 +1,19 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from peft import PeftModel
+import torch
 
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("nnpy/unsloth-qwen2.5-3b-chat-ft")
+# Model loading parameters
+BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
+ADAPTER_MODEL = "nnpy/unsloth-qwen2.5-3b-chat-ft" # Your fine-tuned adapter
 
+# Load model and tokenizer (only once at startup)
+print("Loading model and tokenizer...")
+model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map='auto')
+model = PeftModel.from_pretrained(model, ADAPTER_MODEL)
+tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
+model.eval()
+print("Model loaded successfully!")
 
 def respond(
     message,
@@ -14,36 +22,106 @@ def respond(
     temperature,
     top_p,
 ):
-    messages = [{"role": "system", "content": """you are a girl named Samyuktha (sam shortly & lil mo by your bf).
+    # System message
+    system_msg = """you are a girl named Samyuktha (sam shortly & lil mo by your bf).
 you are working in a company named 'coffee inc' as a enablement engineer. also your boyfriend working in the same company as a AI/ML engineer.
-you bf, prasanna (called as eppy by you) staying in hostel. you guys started loving from jan 11th 2025. you love him so much and he does too."""}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
+you bf, prasanna (called as eppy by you) staying in hostel. you guys started loving from jan 11th 2025. you love him so much and he does too."""
+
+    # Build conversation history in the format expected by the tokenizer
+    messages = [{"role": "system", "content": system_msg}]
+
+    for user_msg, assistant_msg in history:
+        if user_msg:
+            messages.append({"role": "user", "content": user_msg})
+        if assistant_msg:
+            messages.append({"role": "assistant", "content": assistant_msg})
+
+    # Add the current message
     messages.append({"role": "user", "content": message})
-
+
+    # Apply chat template to create the prompt
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False)
+
+    # Tokenize the prompt
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+
+    # Generate response
     response = ""
+
+    # Set up generation parameters
+    gen_kwargs = {
+        "max_new_tokens": max_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "do_sample": temperature > 0,
+        "repetition_penalty": 1.1,
+        "streamer": None # We'll handle streaming manually
+    }
+
+    # For streaming in gradio, we need to yield progressively
+    with torch.no_grad():
+        # Start with the input ids
+        generated_ids = inputs.input_ids
+
+        # Track past_key_values for faster generation
+        past = None
+
+        # Keep generating one token at a time
+        for _ in range(max_tokens):
+            with torch.no_grad():
+                if past is None:
+                    outputs = model(**inputs, use_cache=True)
+                else:
+                    # When we have past_key_values, we just need to provide the next token
+                    outputs = model(
+                        input_ids=generated_ids[:, -1:],
+                        past_key_values=past,
+                        use_cache=True
+                    )
+
+            past = outputs.past_key_values
+            next_token_logits = outputs.logits[:, -1, :]
+
+            # Apply temperature and top_p sampling
+            if temperature > 0:
+                scaled_logits = next_token_logits / temperature
+                if top_p < 1.0:
+                    # Apply top_p filtering
+                    sorted_logits, sorted_indices = torch.sort(scaled_logits, descending=True)
+                    cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+
+                    # Remove tokens with cumulative probability above the threshold
+                    sorted_indices_to_remove = cumulative_probs > top_p
+                    # Shift the indices to the right to keep the first token above the threshold
+                    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+                    sorted_indices_to_remove[..., 0] = 0
+
+                    # Create a sparse mask to scatter the indices
+                    indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
+                    scaled_logits[indices_to_remove] = -float('Inf')
+
+                # Sample from the filtered distribution
+                probs = torch.softmax(scaled_logits, dim=-1)
+                next_token = torch.multinomial(probs, num_samples=1)
+            else:
+                # Greedy decoding
+                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
+
+            # Append the new token
+            generated_ids = torch.cat([generated_ids, next_token], dim=-1)
+
+            # Decode the new token
+            new_token_text = tokenizer.decode(next_token[0], skip_special_tokens=True)
+            response += new_token_text
+
+            # Yield the updated response for streaming
+            yield response
+
+            # If EOS token is generated, stop
+            if next_token[0, 0].item() == tokenizer.eos_token_id:
+                break
 
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
+# Create the Gradio interface
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
@@ -57,8 +135,9 @@ demo = gr.ChatInterface(
            label="Top-p (nucleus sampling)",
        ),
    ],
+    title="Samyuktha AI Chat",
+    description="Chat with Samyuktha, an enablement engineer at Coffee Inc."
 )
 
-
 if __name__ == "__main__":
-    demo.launch()
+    demo.launch()
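
The new respond() streams a reply by running the model one token at a time with past_key_values and sampling manually. The same streaming behaviour can usually be obtained by letting model.generate do the sampling and reading decoded text from transformers.TextIteratorStreamer. The sketch below is not part of the commit: it assumes the model and tokenizer globals loaded in this app.py, uses a placeholder SYSTEM_MSG for the persona string, and additionally passes add_generation_prompt=True to apply_chat_template so the Qwen chat template ends with an assistant header and the model starts a fresh reply.

# Not part of the commit: a minimal streaming sketch built on TextIteratorStreamer.
# Assumes the `model` and `tokenizer` globals from app.py; SYSTEM_MSG is a
# placeholder for the persona prompt used there.
from threading import Thread

from transformers import TextIteratorStreamer

SYSTEM_MSG = "..."  # persona prompt from app.py


def respond_streaming(message, history, max_tokens, temperature, top_p):
    # Rebuild the chat history the same way the committed respond() does.
    messages = [{"role": "system", "content": SYSTEM_MSG}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # add_generation_prompt=True ends the prompt with an assistant header so the
    # model answers the last user turn instead of continuing it.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # The streamer yields decoded text as generate() produces tokens.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=temperature > 0,
        streamer=streamer,
    )

    # Run generation in a background thread and stream partial text to Gradio.
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    response = ""
    for new_text in streamer:
        response += new_text
        yield response

Hooking a function like this into gr.ChatInterface in place of respond would keep the same additional inputs and streaming output while leaving the sampling details to generate().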