JJ94 committed on
Commit 817f664 · verified · 1 Parent(s): 0e6877a

Update app.py

Files changed (1)
  app.py +19 -28
app.py CHANGED
@@ -1,14 +1,14 @@
-from flask import Flask, request, Response, jsonify
+from flask import Flask, render_template, request, Response, stream_with_context
 from llama_cpp import Llama
+import time
 
 app = Flask(__name__)
 
-# Load the model
-print("🔄 Loading model...")
+# Load the Llama model
+print("🚀 Loading model...")
 llm = Llama.from_pretrained(
     repo_id="bartowski/google_gemma-3-1b-it-GGUF",
     filename="google_gemma-3-1b-it-IQ4_XS.gguf",
-    n_ctx=2048
 )
 print("✅ Model loaded!")
 
@@ -17,37 +17,28 @@ def home():
     print("📒 Serving index.html")
     return render_template("index.html")
 
+@app.route("/chat", methods=["POST"])
+def chat():
+    user_input = request.json.get("message", "")
+    print(f"💬 Received message: {user_input}")
 
-def generate_response(user_input):
-    """Generator function to stream model output"""
-    try:
+    def generate_response():
+        print("🤖 Generating response...")
         response = llm.create_chat_completion(
             messages=[{"role": "user", "content": user_input}],
-            stream=True  # Enable streaming
+            stream=True  # Enable streaming response
         )
-
+
         for chunk in response:
-            if "choices" in chunk and len(chunk["choices"]) > 0:
-                token = chunk["choices"][0]["delta"].get("content", "")
-                if token:
-                    print(f"📝 Token: {token}", flush=True)  # Debugging
-                    yield token
-
-    except Exception as e:
-        print(f"❌ Error generating response: {e}")
-        yield "[Error occurred]"
-
-
-@app.route("/chat", methods=["POST"])
-def chat():
-    user_input = request.json.get("message", "")
-    if not user_input:
-        return jsonify({"error": "Empty input"}), 400
-
-    return Response(generate_response(user_input), content_type="text/plain")
+            token = chunk.get("choices", [{}])[0].get("delta", {}).get("content", "")
+            if token:
+                print(f"📝 Token: {token}", end="", flush=True)
+                yield token  # Send token to the client
+                time.sleep(0.05)  # Simulate a more natural delay
 
+    return Response(stream_with_context(generate_response()), content_type="text/plain")
 
 if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=7860, debug=True)
+    app.run(debug=True, host="0.0.0.0", port=7860)
 
 
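For context (not part of this commit), a minimal client sketch that consumes the new streaming /chat endpoint could look like the following; it assumes the app is running locally on port 7860 and that the requests library is available:

# Hypothetical client for the streaming /chat route (not part of this commit).
# Assumes the Flask app above is reachable at http://localhost:7860.
import requests

resp = requests.post(
    "http://localhost:7860/chat",
    json={"message": "Hello, Gemma!"},
    stream=True,  # keep the connection open and read tokens as they arrive
)
resp.raise_for_status()

for token in resp.iter_content(chunk_size=None, decode_unicode=True):
    print(token, end="", flush=True)  # print each token as the server yields it
print()

Because the route returns Response(stream_with_context(...)) with content_type="text/plain", the client sees the raw token stream rather than a single JSON payload, so reading the body incrementally (stream=True above) is what makes the output appear token by token.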