from flask import Flask, request, jsonify
from transformers import AutoModelForCausalLM, AutoTokenizer

app = Flask(__name__)

# Load the VinaLlama-7B chat model once at startup; device_map="auto"
# places the weights on a GPU when one is available.
model_name = "vilm/vinallama-7b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")


@app.route("/generate", methods=["POST"])
def generate():
    data = request.get_json()
    prompt = data.get("prompt", "")

    # Tokenize the prompt and move the tensors to the model's device.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    inputs = inputs.to(model.device)

    # Use max_new_tokens to bound the continuation: the original
    # max_length=512 counted the prompt too, so any prompt longer than
    # 512 tokens would leave no room for generated text.
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=512,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Slice off the prompt tokens so only the model's reply is returned.
    generated = outputs[0][inputs.input_ids.shape[1]:]
    response = tokenizer.decode(generated, skip_special_tokens=True)
    return jsonify({"response": response})


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
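
# Example client call (a minimal sketch; assumes the server above is running
# locally on port 7860 and that the `requests` package is installed):
#
#   import requests
#   r = requests.post(
#       "http://localhost:7860/generate",
#       json={"prompt": "Xin chào"},
#   )
#   print(r.json()["response"])
#
# Or from the shell:
#   curl -X POST http://localhost:7860/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Xin chào"}'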