JCai committed on
Commit 60eea81
Parent: 58c8e18

modify app code for case study 3

Files changed (1):
  app.py  +75 -58
app.py CHANGED
@@ -2,11 +2,18 @@ import gradio as gr
 from huggingface_hub import InferenceClient
 import torch
 from transformers import pipeline
+from prometheus_client import start_http_server, Counter, Summary
 
 from typing import Iterable
 from gradio.themes.base import Base
 from gradio.themes.utils import colors, fonts, sizes
 
+# Prometheus metrics
+REQUEST_COUNTER = Counter('app_requests_total', 'Total number of requests')
+SUCCESSFUL_REQUESTS = Counter('app_successful_requests_total', 'Total number of successful requests')
+FAILED_REQUESTS = Counter('app_failed_requests_total', 'Total number of failed requests')
+REQUEST_DURATION = Summary('app_request_duration_seconds', 'Time spent processing request')
+
 # import os
 # from dotenv import load_dotenv
 # load_dotenv()
@@ -34,65 +41,74 @@ def respond(
     system_message += " You also love puns and add 'meow' at the end of every response."
     global stop_inference
     stop_inference = False # Reset cancellation flag
-
-    # Initialize history if it's None
-    if history is None:
-        history = []
-
-    if use_local_model:
-        # local inference
-        messages = [{"role": "system", "content": system_message}]
-        for val in history:
-            if val[0]:
-                messages.append({"role": "user", "content": val[0]})
-            if val[1]:
-                messages.append({"role": "assistant", "content": val[1]})
-        messages.append({"role": "user", "content": message})
-
-        response = ""
-        for output in pipe(
-            messages,
-            max_new_tokens=max_tokens,
-            temperature=temperature,
-            do_sample=True,
-            top_p=top_p,
-        ):
-            if stop_inference:
-                response = "Inference cancelled."
-                yield history + [(message, response)]
-                return
-            token = output['generated_text'][-1]['content']
-            response += token
-            yield history + [(message, response)] # Yield history + new response
-
-    else:
-        # API-based inference
-        messages = [{"role": "system", "content": system_message}]
-        for val in history:
-            if val[0]:
-                messages.append({"role": "user", "content": val[0]})
-            if val[1]:
-                messages.append({"role": "assistant", "content": val[1]})
-        messages.append({"role": "user", "content": message})
-
-        response = ""
-        for message_chunk in client.chat_completion(
-            messages,
-            max_tokens=max_tokens,
-            stream=True,
-            temperature=temperature,
-            top_p=top_p,
-        ):
-            if stop_inference:
-                response = "Inference cancelled."
-                yield history + [(message, response)]
-                return
-            if stop_inference:
-                response = "Inference cancelled."
-                break
-            token = message_chunk.choices[0].delta.content
-            response += token
-            yield history + [(message, response)] # Yield history + new response
+    REQUEST_COUNTER.inc() # Increment request counter
+    request_timer = REQUEST_DURATION.time() # Start timing the request
+
+    try:
+        # Initialize history if it's None
+        if history is None:
+            history = []
+
+        if use_local_model:
+            # local inference
+            messages = [{"role": "system", "content": system_message}]
+            for val in history:
+                if val[0]:
+                    messages.append({"role": "user", "content": val[0]})
+                if val[1]:
+                    messages.append({"role": "assistant", "content": val[1]})
+            messages.append({"role": "user", "content": message})
+
+            response = ""
+            for output in pipe(
+                messages,
+                max_new_tokens=max_tokens,
+                temperature=temperature,
+                do_sample=True,
+                top_p=top_p,
+            ):
+                if stop_inference:
+                    response = "Inference cancelled."
+                    yield history + [(message, response)]
+                    return
+                token = output['generated_text'][-1]['content']
+                response += token
+                yield history + [(message, response)] # Yield history + new response
+
+        else:
+            # API-based inference
+            messages = [{"role": "system", "content": system_message}]
+            for val in history:
+                if val[0]:
+                    messages.append({"role": "user", "content": val[0]})
+                if val[1]:
+                    messages.append({"role": "assistant", "content": val[1]})
+            messages.append({"role": "user", "content": message})
+
+            response = ""
+            for message_chunk in client.chat_completion(
+                messages,
+                max_tokens=max_tokens,
+                stream=True,
+                temperature=temperature,
+                top_p=top_p,
+            ):
+                if stop_inference:
+                    response = "Inference cancelled."
+                    yield history + [(message, response)]
+                    return
+                if stop_inference:
+                    response = "Inference cancelled."
+                    break
+                token = message_chunk.choices[0].delta.content
+                response += token
+                yield history + [(message, response)] # Yield history + new response
+        SUCCESSFUL_REQUESTS.inc() # Increment successful request counter
+    except Exception as e:
+        FAILED_REQUESTS.inc() # Increment failed request counter
+        yield history + [(message, f"Error: {str(e)}")]
+    finally:
+        request_timer.observe_duration() # Stop timing the request
 
 
 def cancel_inference():
@@ -244,5 +260,6 @@ with gr.Blocks(css=custom_css) as demo:
     cancel_button.click(cancel_inference)
 
 if __name__ == "__main__":
+    start_http_server(8000) # Expose metrics on port 8000
     demo.launch(share=False) # Remove share=True because it's not supported on HF Spaces
 
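For context, the instrumentation added in this commit follows the usual prometheus_client pattern: Counter metrics only ever increase (Prometheus derives rates from them at query time), the Summary exposes matching _count and _sum series for request latency, and start_http_server(8000) serves everything in the Prometheus text format on a port separate from the Gradio UI. The snippet below is a minimal standalone sketch, not part of this commit, that exercises the same metric names outside the app and reads them back over HTTP; it uses the context-manager form of Summary.time() and an illustrative sleep in place of real inference work.

import time
import urllib.request

from prometheus_client import start_http_server, Counter, Summary

# Same metric names as in app.py; redefined here only for this standalone sketch.
REQUEST_COUNTER = Counter('app_requests_total', 'Total number of requests')
REQUEST_DURATION = Summary('app_request_duration_seconds', 'Time spent processing request')

if __name__ == "__main__":
    start_http_server(8000)  # same port the commit exposes metrics on

    REQUEST_COUNTER.inc()  # record one request
    with REQUEST_DURATION.time():  # time a stand-in for the inference call
        time.sleep(0.1)

    # The exporter answers in the Prometheus text exposition format.
    metrics = urllib.request.urlopen("http://localhost:8000/metrics").read().decode()
    print(metrics)  # includes app_requests_total and app_request_duration_seconds_count/_sum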