KGSAGAR committed
Commit e5d4b35 · verified · 1 Parent(s): c81028d

Update app.py

Files changed (1)
  1. app.py +70 -22
app.py CHANGED
@@ -18,40 +18,88 @@ peft_model = PeftModel.from_pretrained(base_model, "KGSAGAR/Sarvam-1-text-normal
peft_model = peft_model.merge_and_unload()


-client = InferenceClient(peft_model)
+# client = InferenceClient(peft_model)


+import re
+import torch
+from transformers import AutoTokenizer
+
def respond(
    message,
-    history: list[tuple[str, str]],
+    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
+    peft_model,
+    tokenizer_name='your-tokenizer-name',
+    device='cuda'  # or 'cpu' based on your setup
):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
+    """
+    Generates a response based on the user message and history using the provided PEFT model.
+
+    Args:
+        message (str): The user's input message.
+        history (list of tuples): A list containing tuples of (user_message, assistant_response).
+        system_message (str): The system's initial message or prompt.
+        max_tokens (int): The maximum number of tokens to generate.
+        temperature (float): The temperature parameter for generation.
+        top_p (float): The top_p parameter for nucleus sampling.
+        peft_model: The pre-trained fine-tuned model for generation.
+        tokenizer_name (str): The name or path of the tokenizer.
+        device (str): The device to run the model on ('cuda' or 'cpu').
+
+    Yields:
+        str: The generated response up to the current token.
+    """
+    # Load the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+
+    # Construct the prompt
+    prompt = system_message
+    for user_msg, assistant_msg in history:
+        if user_msg:
+            prompt += f"<user>{user_msg}</user>"
+        if assistant_msg:
+            prompt += f"<assistant>{assistant_msg}</assistant>"
+    prompt += f"<user>{message}</user>"
+
+    # Tokenize the input prompt
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to(device)
+
+    # Generate the output
+    outputs = peft_model.generate(
+        **inputs,
+        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
+        do_sample=True  # Enable sampling for more diverse outputs
+    )
+
+    # Decode the generated tokens
+    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+    # Extract content between <user>...</user> tags
+    def extract_user_content(text):
+        """
+        Extracts and returns content between <user>...</user> tags in the given text.
+        If multiple such sections exist, their contents are concatenated.
+        """
+        pattern = r'<user>(.*?)</user>'
+        matches = re.findall(pattern, text, re.DOTALL)
+        extracted_content = '\n'.join(match.strip() for match in matches)
+        return extracted_content
+
+    # Extract the normalized text
+    normalized_text = extract_user_content(generated_text)
+
+    # Stream the response token by token
+    response = ""
+    for token in normalized_text.split():
+        response += token + " "
+        yield response.strip()

-        response += token
-        yield response


"""