import logging

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the quantized MiniChat-3B weights from the Hugging Face Hub
# (cached locally after the first run) and load them with llama.cpp.
model_file = "minichat-3b.q8_0.gguf"
mdlpath = hf_hub_download(repo_id="afrideva/MiniChat-3B-GGUF", filename=model_file)

lcpp_model = Llama(model_path=mdlpath)


def m3b_talk(text):
    # Wrap the user message in MiniChat's chat template.
    formatted_query = " [|User|]" + text + " [|Assistant|]"
    # Run a single non-streaming completion; llama-cpp-python returns a dict.
    resp = lcpp_model(formatted_query, stop=["[|User|]", "\n"], echo=True)
    # echo=True includes the prompt in the output, so strip it back off.
    return resp["choices"][0]["text"].replace(formatted_query, "")


def main():
    logging.basicConfig(level=logging.INFO)

    # Minimal Gradio UI: a message box, a send button, and a response box.
    with gr.Blocks() as demo:
        with gr.Row(variant="panel"):
            gr.Markdown("## Talk to MiniChat-3B\n\nTalk to MiniChat-3B.")
        with gr.Row(variant="panel"):
            with gr.Column(variant="panel"):
                m3b_talk_input = gr.Textbox(label="Message", placeholder="Type something here...")
            with gr.Column(variant="panel"):
                m3b_talk_output = gr.Textbox(label="Response")
                m3b_talk_btn = gr.Button("Send")

        m3b_talk_btn.click(m3b_talk, inputs=m3b_talk_input, outputs=m3b_talk_output, api_name="talk_m3b")

    # Handle one request at a time so concurrent calls don't hit the model simultaneously.
    demo.queue(concurrency_count=1).launch()


if __name__ == "__main__":
    main()