import gradio as gr from llama_cpp import Llama import datetime import os import datetime from huggingface_hub import hf_hub_download #MODEL SETTINGS also for DISPLAY convHistory = '' modelfile = hf_hub_download( repo_id=os.environ.get("REPO_ID", "RichardErkhov/scb10x_-_llama-3-typhoon-v1.5-8b-instruct-gguf"), filename=os.environ.get("MODEL_FILE", "llama-3-typhoon-v1.5-8b-instruct.Q4_K_M.gguf"), ) repetitionpenalty = 1.15 contextlength=8192 logfile = 'typhoon-v1.5-8b-instruct_logs.txt' print("loading model...") stt = datetime.datetime.now() # Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system. llm = Llama( model_path=modelfile, # Download the model file first n_ctx=contextlength, # The max sequence length to use - note that longer sequence lengths require much more resources n_threads=2, # The number of CPU threads to use, tailor to your system and the resulting performance ) dt = datetime.datetime.now() - stt print(f"Model loaded in {dt}") def writehistory(text): with open(logfile, 'a') as f: f.write(text) f.write('\n') f.close() """ gr.themes.Base() gr.themes.Default() gr.themes.Glass() gr.themes.Monochrome() gr.themes.Soft() """ def combine(a, b, c, d,e,f): global convHistory import datetime SYSTEM_PROMPT = f"""{a} """ temperature = c max_new_tokens = d repeat_penalty = f top_p = e prompt = f"<|user|>\n{b}<|endoftext|>\n<|assistant|>" # prompt = [ # {"role": "system", "content": SYSTEM_PROMPT} , # {"role": "user", "content": b}, # ] prompt = f"""{prompt}""" start = datetime.datetime.now() generation = "" delta = "" prompt_tokens = f"Prompt Tokens: {len(llm.tokenize(bytes(prompt,encoding='utf-8')))}" generated_text = "" answer_tokens = '' total_tokens = '' for character in llm(prompt, max_tokens=max_new_tokens, #stop=["<|eot_id|>"], temperature = temperature, repeat_penalty = repeat_penalty, top_p = top_p, # Example stop token - not necessarily correct for this specific model! Please check before using. echo=False, stream=True): generation += character["choices"][0]["text"] answer_tokens = f"Out Tkns: {len(llm.tokenize(bytes(generation,encoding='utf-8')))}" total_tokens = f"Total Tkns: {len(llm.tokenize(bytes(prompt,encoding='utf-8'))) + len(llm.tokenize(bytes(generation,encoding='utf-8')))}" delta = datetime.datetime.now() - start yield generation, delta, prompt_tokens, answer_tokens, total_tokens print(f"Response: {generation}") timestamp = datetime.datetime.now() logger = f"""time: {timestamp}\n Temp: {temperature} - MaxNewTokens: {max_new_tokens} - RepPenalty: 1.5 \nPROMPT: \n{prompt}\nStableZephyr3B: {generation}\nGenerated in {delta}\nPromptTokens: {prompt_tokens} Output Tokens: {answer_tokens} Total Tokens: {total_tokens}\n\n---\n\n""" writehistory(logger) convHistory = convHistory + prompt + "\n" + generation + "\n" print(convHistory) return generation, delta, prompt_tokens, answer_tokens, total_tokens #return generation, delta # MAIN GRADIO INTERFACE with gr.Blocks(theme='Medguy/base2') as demo: #theme=gr.themes.Glass() #theme='remilia/Ghostly' #TITLE SECTION with gr.Row(variant='compact'): with gr.Column(scale=10): gr.HTML("