# Hugging Face Space: FLUX prompt expander (status: Running)
# NOTE(review): the original lines here were web-page chrome ("Spaces: / Running")
# captured by scraping; converted to a comment so the file is valid Python.
import os
from threading import Thread
from typing import Iterator

import gradio as gr
import torch
from gradio.components import textbox  # NOTE(review): unused and likely wrong — gradio exports `Textbox` (capital T); confirm before relying on it
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LlamaTokenizer,
    TextIteratorStreamer,
)
# Download (on first run) and load the GGUF-quantized prompt-expander model
# from the Hugging Face Hub via llama-cpp-python. Runs on CPU by default.
llm = Llama.from_pretrained(
    repo_id="igor-im/flux_prompt_expander",
    filename="unsloth.Q8_0.gguf",
    # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
    # seed=1337,        # Uncomment to set a specific seed
    # n_ctx=2048,       # Uncomment to increase the context window
)
def response(prompt):
    """Expand *prompt* with the local llama.cpp model and return the text.

    Parameters
    ----------
    prompt : str
        The user-supplied prompt to expand.

    Returns
    -------
    str
        The generated completion; includes the echoed prompt because
        ``echo=True`` is passed below.
    """
    llm_response = llm(
        prompt,
        max_tokens=200,  # cap generation at 200 tokens; None would run to the end of the context window
        echo=True,       # include the prompt itself in the returned text
    )
    # llama-cpp-python returns an OpenAI-style completion dict; index directly
    # so a malformed response raises a clear KeyError/IndexError instead of
    # an AttributeError on None from chained .get() calls.
    return llm_response["choices"][0]["text"]
# Wire the expander into a minimal text-in / text-out Gradio UI.
interface = gr.Interface(fn=response, inputs='textbox', outputs='textbox')

# Launch the web server only when executed as a script (Spaces runs app.py
# as __main__), so importing this module does not start a server.
if __name__ == "__main__":
    interface.launch()