import gradio as gr
from llama_cpp import Llama

# Download the GGUF model from the Hugging Face Hub and load it.
llm = Llama.from_pretrained(
    repo_id="igor-im/flux_prompt_expander",
    filename="unsloth.Q8_0.gguf",
    # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
    # seed=1337,        # Uncomment to set a specific seed
    # n_ctx=2048,       # Uncomment to increase the context window
)


def response(prompt):
    """Run the prompt through the model and return the completed text."""
    llm_response = llm(
        prompt,          # Prompt to expand
        max_tokens=200,  # Generate up to 200 tokens; set to None to generate to the end of the context window
        echo=True,       # Echo the prompt back in the output
    )
    return llm_response["choices"][0]["text"]


interface = gr.Interface(fn=response, inputs="textbox", outputs="textbox")
interface.launch()
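
# Optional: a streaming variant (a sketch, not part of the original app).
# llama-cpp-python accepts stream=True, which yields completion chunks, and
# gr.Interface can take a generator function so the textbox updates live.
# To try it, move this definition above the gr.Interface call and pass
# fn=response_stream instead of fn=response; response_stream is a
# hypothetical helper introduced here for illustration.
def response_stream(prompt):
    """Yield the growing completion so Gradio can stream partial output."""
    text = prompt  # prepend the prompt manually, mirroring echo=True above
    for chunk in llm(prompt, max_tokens=200, stream=True):
        text += chunk["choices"][0]["text"]
        yield text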