nananie143 committed on
Commit bdcefa0 · verified · 1 Parent(s): d582d65

Update app.py

Files changed (1)
  1. app.py +167 -25
app.py CHANGED
@@ -13,6 +13,7 @@ import time
from threading import Lock
from pathlib import Path
from huggingface_hub import hf_hub_download, list_repo_files
+from contextlib import asynccontextmanager

# Configure logging
logging.basicConfig(level=logging.INFO)
@@ -30,7 +31,6 @@ def get_model_filename():
    try:
        logger.info("Listing repository files...")
        files = list_repo_files("G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF")
-        # Filter for GGUF files
        gguf_files = [f for f in files if f.endswith('.gguf')]
        if not gguf_files:
            raise ValueError("No GGUF model files found in repository")
@@ -44,23 +44,18 @@ def download_model_from_hf():
    """Download the model file from Hugging Face."""
    try:
        logger.info("Downloading model from Hugging Face Hub...")
-
-        # Create models directory if it doesn't exist
        model_dir = Path("models")
        model_dir.mkdir(exist_ok=True)

-        # Get the correct filename
        model_filename = get_model_filename()
        logger.info(f"Using model file: {model_filename}")

-        # Download the model using huggingface_hub
        local_path = hf_hub_download(
            repo_id="G17c21ds/Qwen2.5-14B-Instruct-Uncensored-Q8_0-GGUF",
            filename=model_filename,
            local_dir=model_dir,
            local_dir_use_symlinks=False
        )
-
        return Path(local_path)
    except Exception as e:
        logger.error(f"Error downloading model: {str(e)}")
@@ -70,22 +65,18 @@ class QwenModel:
    def __init__(self):
        """Initialize the Qwen model with automatic device detection."""
        try:
-            # Check for GPU availability
            self.has_gpu = torch.cuda.is_available()
            self.device_count = torch.cuda.device_count() if self.has_gpu else 0
            logger.info(f"GPU available: {self.has_gpu}, Device count: {self.device_count}")

-            # Download or get the model
            model_path = download_model_from_hf()
            logger.info(f"Model path: {model_path}")

-            # Configure model parameters based on available hardware
            n_gpu_layers = 40 if self.has_gpu else 0
            logger.info(f"Using {'GPU' if self.has_gpu else 'CPU'} for inference")

-            # Adjust memory settings for CPU
-            n_batch = 512 if self.has_gpu else 64  # Reduced batch size for CPU
-            n_ctx = 2048 if not self.has_gpu else 4096  # Reduced context for CPU
+            n_batch = 512 if self.has_gpu else 64
+            n_ctx = 2048 if not self.has_gpu else 4096

            self.llm = LlamaCpp(
                model_path=str(model_path),
@@ -100,19 +91,166 @@ class QwenModel:
                f16_kv=self.has_gpu,
                use_mlock=True,
                use_mmap=True,
-                seed=42,  # For reproducibility
-                repeat_penalty=1.1,  # Prevent repetitive outputs
-                rope_scaling={"type": "linear", "factor": 1.0},  # RoPE scaling for better long-context handling
+                seed=42,
+                repeat_penalty=1.1,
+                rope_scaling={"type": "linear", "factor": 1.0},
            )

-            # Thread lock for concurrent API requests
            self.lock = Lock()

        except Exception as e:
            logger.error(f"Failed to initialize model: {str(e)}")
            raise

-    # ... [rest of the QwenModel class methods remain the same] ...
+    def generate_cot_prompt(self, messages: List[Dict[str, str]]) -> str:
+        """Generate a chain-of-thought prompt from message history."""
+        conversation = []
+        for msg in messages:
+            role = msg.get("role", "")
+            content = msg.get("content", "")
+
+            if role == "system":
+                conversation.append(f"System: {content}")
+            elif role == "user":
+                conversation.append(f"Human: {content}")
+            elif role == "assistant":
+                conversation.append(f"Assistant: {content}")
+
+        last_user_msg = next((msg["content"] for msg in reversed(messages)
+                              if msg["role"] == "user"), None)
+
+        if not last_user_msg:
+            raise ValueError("No user message found in the conversation")
+
+        cot_template = f"""Previous conversation:
+{chr(10).join(conversation)}
+
+Let's approach the latest question step-by-step:
+
+1. Understanding the question:
+{last_user_msg}
+
+2. Breaking down components:
+- Key elements to consider
+- Specific information requested
+- Relevant constraints
+
+3. Reasoning process:
+- Systematic approach
+- Applicable knowledge
+- Potential challenges
+
+4. Step-by-step solution:
+
+"""
+        return cot_template
+
+    def process_response(self, response: str) -> str:
+        """Process and format the model's response."""
+        try:
+            response = response.strip()
+            if not response.startswith("Step"):
+                response = "Step-by-step solution:\n" + response
+            return response
+        except Exception as e:
+            logger.error(f"Error processing response: {str(e)}")
+            return "Error processing response"
+
+    def generate_response(self,
+                          messages: List[Dict[str, str]],
+                          temperature: float = 0.7,
+                          max_tokens: int = 2048) -> Dict[str, Any]:
+        """Generate a response using chain-of-thought reasoning."""
+        try:
+            with self.lock:
+                full_prompt = self.generate_cot_prompt(messages)
+
+                start_time = time.time()
+                response = self.llm(
+                    full_prompt,
+                    temperature=temperature,
+                    max_tokens=max_tokens
+                )
+                end_time = time.time()
+
+                processed_response = self.process_response(response)
+
+                return {
+                    "id": f"chatcmpl-{int(time.time()*1000)}",
+                    "object": "chat.completion",
+                    "created": int(time.time()),
+                    "model": "qwen-2.5-14b",
+                    "choices": [{
+                        "index": 0,
+                        "message": {
+                            "role": "assistant",
+                            "content": processed_response
+                        },
+                        "finish_reason": "stop"
+                    }],
+                    "usage": {
+                        "prompt_tokens": len(full_prompt.split()),
+                        "completion_tokens": len(processed_response.split()),
+                        "total_tokens": len(full_prompt.split()) + len(processed_response.split())
+                    },
+                    "system_info": {
+                        "device": "gpu" if self.has_gpu else "cpu",
+                        "processing_time": round(end_time - start_time, 2)
+                    }
+                }
+        except Exception as e:
+            logger.error(f"Error generating response: {str(e)}")
+            raise HTTPException(status_code=500, detail=str(e))
+
+def create_gradio_interface(model: QwenModel):
+    """Create and configure the Gradio interface."""
+
+    def predict(message: str,
+                temperature: float,
+                max_tokens: int) -> str:
+        messages = [{"role": "user", "content": message}]
+        response = model.generate_response(
+            messages,
+            temperature=temperature,
+            max_tokens=max_tokens
+        )
+        return response["choices"][0]["message"]["content"]
+
+    iface = gr.Interface(
+        fn=predict,
+        inputs=[
+            gr.Textbox(
+                label="Input",
+                placeholder="Enter your question or task here...",
+                lines=5
+            ),
+            gr.Slider(
+                minimum=0.1,
+                maximum=1.0,
+                value=0.7,
+                label="Temperature",
+                info="Higher values make the output more random"
+            ),
+            gr.Slider(
+                minimum=64,
+                maximum=4096,
+                value=2048,
+                step=64,
+                label="Max Tokens",
+                info="Maximum length of the generated response"
+            )
+        ],
+        outputs=gr.Textbox(label="Response", lines=10),
+        title="Qwen 2.5 14B Instruct Model",
+        description="""This is a Qwen 2.5 14B model interface with chain-of-thought prompting.
+        The model will break down complex problems and solve them step by step.""",
+        examples=[
+            ["Explain how photosynthesis works", 0.7, 2048],
+            ["Solve the quadratic equation: x² + 5x + 6 = 0", 0.7, 1024],
+            ["What are the implications of Moore's Law for future computing?", 0.8, 2048]
+        ]
+    )
+    return iface

# Initialize FastAPI with lifespan
app = FastAPI(title="Qwen 2.5 API")
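As an aside, here is a minimal sketch of how the QwenModel methods added in this hunk can be called directly. It assumes app.py is importable as a module; the example messages and parameter values are illustrative only and are not part of the commit.

from app import QwenModel

# Constructing QwenModel downloads the GGUF weights and loads them into
# llama.cpp, so this is a heavyweight operation on first run.
model = QwenModel()

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is 17 * 24?"},
]
result = model.generate_response(messages, temperature=0.7, max_tokens=512)

# generate_response() returns an OpenAI-style chat.completion dict:
print(result["choices"][0]["message"]["content"])  # step-by-step answer text
print(result["usage"])        # rough whitespace-based token counts, not true token counts
print(result["system_info"])  # {"device": "cpu" or "gpu", "processing_time": ...}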
@@ -129,29 +267,33 @@ async def lifespan(app: FastAPI):
        logger.info("Model initialized successfully")
        yield
    finally:
-        # Cleanup code (if needed)
        pass

app = FastAPI(lifespan=lifespan)

-# ... [rest of the FastAPI routes remain the same] ...
+@app.post("/v1/chat/completions")
+async def create_chat_completion(request: ChatCompletionRequest):
+    """OpenAI-compatible chat completions endpoint."""
+    try:
+        response = model.generate_response(
+            request.messages,
+            temperature=request.temperature,
+            max_tokens=request.max_tokens
+        )
+        return JSONResponse(content=response)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))

def main():
    """Main function to initialize and launch the application."""
    try:
        global model
-
-        # Initialize the model if not already initialized
        if model is None:
            model = QwenModel()

-        # Create and launch the Gradio interface
        interface = create_gradio_interface(model)
-
-        # Mount FastAPI app to Gradio
        app.mount("/", interface.app)

-        # Launch with uvicorn
        uvicorn.run(
            app,
            host="0.0.0.0",
 