import copy

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Download the quantized GGUF weights from the Hugging Face Hub and load
# them with llama-cpp-python for CPU inference.
zefiro = Llama(
    model_path=hf_hub_download(
        repo_id="giux78/zefiro-7b-beta-ITA-v0.1-GGUF",
        filename="zefiro-7b-beta-ITA-v0.1-q4_0.gguf",
    ),
    n_ctx=4086,
)
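
# Quick sanity check (illustrative only, not part of the Space): the Llama
# object is callable and returns a completion dict, so a one-off prompt in
# the model's turn format would look roughly like:
#   out = zefiro("[|Umano|] Ciao!\n[|Assistente|]", max_tokens=32)
#   print(out["choices"][0]["text"])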
def generate_text(message, history):
    """Stream a reply from zefiro, replaying the chat history in the
    [|Umano|]/[|Assistente|] turn format the model expects."""
    temp = ""
    input_prompt = "Chiedi a zefiro"
    for interaction in history:
        input_prompt += "[|Umano|] " + interaction[0] + "\n"
        input_prompt += "[|Assistente|]" + interaction[1]
    input_prompt += "[|Umano|] " + message + "\n[|Assistente|]"
    print(input_prompt)

    output = zefiro(
        input_prompt,
        temperature=0.15,
        top_p=0.1,
        top_k=40,
        repeat_penalty=1.1,
        max_tokens=1024,
        stop=["[|Umano|]", "[|Assistente|]"],
        stream=True,
    )
    # Accumulate streamed tokens and yield the growing answer so the
    # Gradio UI updates incrementally.
    for out in output:
        stream = copy.deepcopy(out)
        temp += stream["choices"][0]["text"]
        yield temp
with gr.Blocks() as demo:
    with gr.Tab('zefiro'):
        gr.ChatInterface(
            generate_text,
            title="zefiro-7b-v01 running on CPU (quantized Q4_0)",
            description="This is a quantized version of zefiro-7b-v01 running on CPU (very slow). It is less capable than the original model, but it can run even on the free tier of Hugging Face.",
            examples=[
                "Dammi 3 idee di ricette che posso fare con i pistacchi",
                "Prepara un piano di esercizi da poter fare a casa",
                "Scrivi una poesia su una giornata di pioggia",
            ],
            cache_examples=False,
            retry_btn=None,
            undo_btn="Delete Previous",
            clear_btn="Clear",
        )
# Queue requests one at a time: a single CPU worker cannot serve
# concurrent generations.
demo.queue(concurrency_count=1, max_size=5)
demo.launch()
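
# To run this Space locally (assuming the file is saved as app.py, the
# Spaces convention, and that these dependencies are installed):
#   pip install llama-cpp-python gradio huggingface_hub
#   python app.py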