nguyenbh committed
Commit 38c463d · 1 Parent(s): 88c6033

Use Azure endpoint
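
Replace the local transformers pipeline with calls to a Phi-4-mini deployment through the azure-ai-inference SDK, add parameter sliders and example prompts to the Gradio UI, and trim requirements.txt accordingly.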

Files changed (2)
  1. app.py +107 -88
  2. requirements.txt +2 -4
app.py CHANGED
@@ -1,108 +1,127 @@
-import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 import os
+import gradio as gr
+from azure.ai.inference import ChatCompletionsClient
+from azure.core.credentials import AzureKeyCredential
 
-
-hf_token = os.getenv("YOUR_HF_TOKEN")
-
-# Load model and tokenizer
-print("Loading model and tokenizer...")
-model_path = "microsoft/Phi-4-mini-instruct"  # Can be changed to local path "./Phi-4-Mini-Instruct"
-
-tokenizer = AutoTokenizer.from_pretrained(
-    model_path,
-    padding_side="left",
-    token=hf_token,
-    trust_remote_code=True
-)
-
-model = AutoModelForCausalLM.from_pretrained(
-    model_path,
-    device_map="auto",
-    attn_implementation="eager",  # "flash_attention_2"
-    torch_dtype="auto",
-    token=hf_token,
-    trust_remote_code=True
+# Azure Inference setup
+url = os.getenv("Azure_Endpoint")
+api_key = AzureKeyCredential(os.getenv("Azure_API_KEY"))
+
+# Initialize the ChatCompletionsClient
+client = ChatCompletionsClient(
+    endpoint=url,
+    credential=api_key,
+    stream=True
 )
 
-# Create pipeline for easier inference
-pipe = pipeline(
-    "text-generation",
-    model=model,
-    tokenizer=tokenizer,
-)
-
-print("Model and tokenizer loaded successfully!")
-
-# Format chat history to messages format
-def format_chat_history(message, history):
-    messages = [
-        {"role": "system", "content": "You are a helpful AI assistant."}
-    ]
-
-    # Add chat history
-    for user_msg, assistant_msg in history:
-        messages.append({"role": "user", "content": user_msg})
-        messages.append({"role": "assistant", "content": assistant_msg})
-
-    # Add current message
+# Get and print model information (optional)
+try:
+    model_info = client.get_model_info()
+    print("Model name:", model_info.model_name)
+    print("Model type:", model_info.model_type)
+    print("Model provider name:", model_info.model_provider_name)
+except Exception as e:
+    print("Could not get model info:", str(e))
+
+# Configuration parameters
+default_temperature = 0.8
+default_max_tokens = 2048
+default_top_p = 0.1
+
+# Example prompts that users can try
+example_prompts = [
+    "I have $20,000 in my savings account, where I receive a 4% profit per year and payments twice a year. Can you please tell me how long it will take for me to become a millionaire?",
+    "I have a total of $500. Create a plan with travel and food.",
+    "I have $1000 and 5 years. Is it better to invest in a stock paying $15 quarterly dividends or in a 5% annual savings account?"
+]
+
+def get_azure_response(message, chat_history, temperature, max_tokens, top_p):
+    """
+    Get a response from the Azure-hosted Phi-4 model
+    """
+    # Prepare conversation history in the format expected by Azure
+    messages = [{"role": "system", "content": "You are a helpful AI assistant."}]
+
+    # Add conversation history
+    for human, assistant in chat_history:
+        messages.append({"role": "user", "content": human})
+        if assistant:  # Only add non-empty assistant messages
+            messages.append({"role": "assistant", "content": assistant})
+
+    # Add the current message
     messages.append({"role": "user", "content": message})
 
-    return messages
-
-# Streaming response generator
-def predict(message, history):
-    messages = format_chat_history(message, history)
-
-    generation_args = {
-        "max_new_tokens": 1024,
-        "return_full_text": False,
-        "temperature": 0.001,
-        "top_p": 1.0,
-        "do_sample": True,
-        "streamer": None,  # Will be set in the generator
+    # Prepare the payload
+    payload = {
+        "messages": messages,
+        "max_tokens": max_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "presence_penalty": 0,
+        "frequency_penalty": 0,
     }
 
-    # Initialize an empty response
-    partial_message = ""
-    history_with_message = history + [[message, partial_message]]
-
-    # Create a TextIteratorStreamer for streaming generation
-    from transformers import TextIteratorStreamer
-    from threading import Thread
-
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    generation_args["streamer"] = streamer
-
-    # Start a separate thread for generation
-    thread = Thread(target=pipe, args=(messages,), kwargs=generation_args)
-    thread.start()
-
-    # Stream the response
-    for new_text in streamer:
-        partial_message += new_text
-        yield history + [[message, partial_message]]
-
-# Create the Gradio interface
-css = """
-.chatbot-container {max-width: 800px; margin: auto;}
-.chat-header {text-align: center; margin-bottom: 20px;}
-"""
-
-with gr.Blocks(css=css) as demo:
-    gr.HTML("<div class='chat-header'><h1>Phi-4 Mini Chatbot</h1></div>")
-
-    with gr.Column(elem_classes="chatbot-container"):
-        chatbot = gr.Chatbot(height=400)
-        msg = gr.Textbox(placeholder="Type your message here...", label="Input")
-        clear = gr.Button("Clear Conversation")
-
-        msg.submit(predict, [msg, chatbot], [chatbot], queue=True, api_name="chat").then(
-            lambda: "", None, [msg]
-        )
-        clear.click(lambda: None, None, chatbot, queue=False)
-
-
+    # Get response
+    try:
+        print("Sending request to Azure...")
+        response = client.complete(payload)
+        reply = response.choices[0].message.content
+
+        # Print usage statistics
+        print(f"Usage - Prompt tokens: {response.usage.prompt_tokens}, "
+              f"Completion tokens: {response.usage.completion_tokens}, "
+              f"Total tokens: {response.usage.total_tokens}")
+
+        return reply
+    except Exception as e:
+        print(f"Error getting response: {str(e)}")
+        return f"Error: {str(e)}"
+
+# Create the Gradio interface
+with gr.Blocks(title="Phi-4-mini Chatbot") as demo:
+    gr.Markdown("Chat with the Phi-4 mini model hosted on Azure AI")
+
+    # Create a chatbot component
+    chatbot = gr.Chatbot(height=300)
+    msg = gr.Textbox(label="Type your message here", placeholder="Ask me anything...", lines=1)
+    clear = gr.Button("Clear Conversation")
+
+    # Add examples section
+    with gr.Accordion("Try these examples", open=True):
+        examples = gr.Examples(
+            examples=example_prompts,
+            inputs=msg
+        )
+
+    # Add model parameter controls
+    with gr.Accordion("Model Parameters", open=False):
+        temp_slider = gr.Slider(minimum=0.0, maximum=1.0, value=default_temperature, step=0.1,
+                                label="Temperature (higher = more creative, lower = more focused)")
+        max_tokens_slider = gr.Slider(minimum=100, maximum=4096, value=default_max_tokens, step=100,
+                                      label="Max Tokens (maximum length of response)")
+        top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=default_top_p, step=0.1,
+                                 label="Top P (diversity of response)")
+
+    # Simplified chat function that handles both sending and receiving messages
+    def chat(message, history, temperature, max_tokens, top_p):
+        if not message.strip():
+            return "", history
+
+        # Get response from Azure
+        response = get_azure_response(message, history, temperature, max_tokens, top_p)
+
+        # Add the exchange to history
+        history.append((message, response))
+
+        return "", history  # Clear the input field after sending
+
+    # Function to clear the conversation
+    def clear_conversation():
+        return [], default_temperature, default_max_tokens, default_top_p
+
+    # Set up event handlers - simplified approach
+    msg.submit(chat, [msg, chatbot, temp_slider, max_tokens_slider, top_p_slider], [msg, chatbot])
+    clear.click(clear_conversation, None, [chatbot, temp_slider, max_tokens_slider, top_p_slider])
 
 # Launch the app
-demo.launch(share=True)  # Set share=False if you don't want a public link
+demo.launch(debug=True)  # Set share=True to generate a public URL for testing
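
Review note: the new code constructs `ChatCompletionsClient` with `stream=True`, but `client.complete(payload)` still returns a full response, so this commit trades the old token-by-token streaming for whole-message replies. In `azure-ai-inference`, streaming is requested per call via `complete(..., stream=True)`. Below is a minimal sketch of how the handler could stream again, assuming the same `Azure_Endpoint` / `Azure_API_KEY` variables; the helper name `stream_azure_response` is illustrative, not part of the commit:

```python
import os

from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential

# Same environment variables as app.py
client = ChatCompletionsClient(
    endpoint=os.getenv("Azure_Endpoint"),
    credential=AzureKeyCredential(os.getenv("Azure_API_KEY")),
)

def stream_azure_response(messages, temperature=0.8, max_tokens=2048, top_p=0.1):
    """Yield the growing reply as update chunks arrive."""
    response = client.complete(
        messages=messages,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        stream=True,  # streaming is a per-request option on complete()
    )
    partial = ""
    for update in response:
        # Some updates (e.g. the initial role header) carry no content
        if update.choices and update.choices[0].delta.content:
            partial += update.choices[0].delta.content
            yield partial
```

Wired into Gradio, `chat()` would then yield `("", history)` with the last assistant message updated on each chunk, so the chatbot fills in as tokens arrive, much like the removed `TextIteratorStreamer` path did.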
requirements.txt CHANGED
@@ -1,4 +1,2 @@
-smolagents==1.9.2
-transformers==4.49.0
-torch
-accelerate
+azure-ai-inference==1.0.0b9
+azureml-inference-server-http==1.0.0
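
With transformers, torch, and accelerate dropped, the Space no longer loads weights locally; everything goes through the pinned `azure-ai-inference` client (`azureml-inference-server-http` appears unused by app.py itself). A quick smoke test for the new dependency set, assuming the same environment variables as app.py:

```python
# Smoke test: verify the pinned client can reach the deployment
# before launching the UI (same env vars as app.py).
import os

from azure.ai.inference import ChatCompletionsClient
from azure.core.credentials import AzureKeyCredential

client = ChatCompletionsClient(
    endpoint=os.getenv("Azure_Endpoint"),
    credential=AzureKeyCredential(os.getenv("Azure_API_KEY")),
)

response = client.complete(
    messages=[{"role": "user", "content": "Reply with the single word: pong"}],
    max_tokens=16,
)
print(response.choices[0].message.content)
```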