# NOTE: web-viewer residue (file-size banner, commit hashes, line-number gutter) was removed here; it is not part of the program.
# gradio app for the LLM model --> use the retr environment
# Run the script and open the link in the browser.
import os
import pandas as pd
import datasets
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# Model checkpoint: trained from scratch with the LatBERT tokenizer.
# A previous value (apparently a local checkpoint directory) was
#   'scratch_2-nodes_tokenizer_latbert-original_packing_fcocchi/'
# and is superseded by the repository id below.
CHECKPOINT_PATH = 'itserr/scratch_2-nodes_tokenizer_latbert-original_packing_fcocchi'

print(f"Loading model from: {CHECKPOINT_PATH}")

# Read token used for both the model repository and the preference dataset.
hf_read_token = os.environ['HF_TOKEN_READ']
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_PATH, token=hf_read_token)
model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_PATH, token=hf_read_token)

# Preference dataset: loaded once at start-up (bypassing the local cache) and
# kept in memory as a pandas DataFrame; handle_preference() appends to it.
preference_dataset_name = "itserr/latin_gpt_preferences"
dataset_hf = datasets.load_dataset(
    preference_dataset_name,
    token=hf_read_token,
    download_mode='force_redownload',
)['train'].to_pandas()
# Demo description and usage instructions (Markdown text).
# NOTE(review): the string delimiters around this text were missing, leaving
# bare prose at module level. The variable name `description` is assumed;
# nothing in the visible code reads it -- confirm against the original file.
description = """
This is a Latin Language Model (LLM) based on GPT-2 and it was trained on a large corpus of Latin texts and can generate text in Latin. \n
Demo instructions:
- Enter a prompt in Latin in the Input Text box.
- Select the temperature value to control the randomness of the generated text (higher value produce a more creative and unstable answer).
- Click the 'Generate Text' button to trigger model generation.
- (Optional) insert a Feedback text in the box.
- Click the 'Like' or 'Dislike' button to judge the generation correctness.
"""
# (L<sup>2</sup>) - Latin Language Model

# Page title rendered as the centered <h1> heading of the demo.
title = "LatinGPT"
article = "hello world ..."
# Latin prompts offered as clickable examples under the input box.
examples = [
    'Accidere ex una scintilla',
    'Audacter calumniare,',
    'Consolatium misero comites',
    'Errare humanum est,',
    'Excusatio non petita,',
]
# Image file shown at the top of the page (styled through the "#logo" CSS rule).
logo_image = 'ITSERR_row_logo.png'
def generate_text(prompt, slider):
    """Generate text from *prompt* with the loaded causal language model.

    Args:
        prompt: Text entered in the "Input Text" box, passed to the
            text-generation pipeline.
        slider: Sampling temperature chosen with the "Temperature" slider.

    Returns:
        The ``generated_text`` field of the first result returned by the
        text-generation pipeline.
    """
    # Pick the GPU when one is available and fall back to the CPU otherwise.
    # (Without the ``else`` the CPU assignment ran unconditionally.)
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
        print("No GPU available")

    print("***** Generate *****")
    text_generator = pipeline(
        "text-generation", model=model, tokenizer=tokenizer, device=device
    )
    # Sampled generation capped at 50 tokens; the repetition penalty
    # discourages the model from looping on the same words.
    generated_text = text_generator(
        prompt,
        max_length=50,
        do_sample=True,
        temperature=slider,
        repetition_penalty=2.0,
        truncation=True,
    )
    return generated_text[0]['generated_text']
# Function to handle user preferences
def handle_preference(preference, input, output, feedback, temp_value):
    """Record one like/dislike vote and push the updated dataset to the Hub.

    Every vote is appended as a new row of the in-memory preference table
    (``dataset_hf``) with the columns
    ``input_text, generated_text, feedback, temperature, like, dislike,
    count_like, count_dislike``:

    - input text
    - output generated text (the part that follows the prompt)
    - user feedback
    - float temperature value
    - like/dislike flags for this vote and the running totals

    Args:
        preference: Either ``"like"`` or ``"dislike"``.
        input: Prompt from the "Input Text" box. The name shadows the builtin
            but is kept so existing callers are unaffected.
        output: Content of the "Output Text" box.
        feedback: Optional free-text feedback typed by the user.
        temp_value: Temperature slider value; stored as ``float``.

    Returns:
        A confirmation message to display to the user.

    Raises:
        ValueError: If ``preference`` is neither ``"like"`` nor ``"dislike"``.
    """
    global dataset_hf

    if preference not in ("like", "dislike"):
        raise ValueError(f"Unknown preference: {preference!r}")

    # Split the output box content into prompt and generated continuation.
    if input == output:
        # Nothing beyond the prompt: store empty texts, counters stay as-is.
        inp_text, out_text = "", ""
    elif not input:
        # An empty separator makes str.split raise ValueError, so keep the
        # whole output as the generated text.
        inp_text, out_text = "", output
    else:
        inp_text, out_text = input, output.split(input)[-1]

    # Running totals continue from the most recent row; an empty table
    # starts from zero instead of failing on iloc[-1].
    if dataset_hf.empty:
        count_like, count_dislike = 0, 0
    else:
        last_row = dataset_hf.iloc[-1]
        count_like = last_row['count_like']
        count_dislike = last_row['count_dislike']

    # Per-vote flags.
    # NOTE(review): 1/0 encoding is assumed -- the original definitions of
    # `like`/`dislike` were missing; confirm against existing dataset rows.
    like = int(preference == "like")
    dislike = int(preference == "dislike")

    # Only votes on an actual generation move the counters.
    if out_text != "":
        count_like = count_like + like
        count_dislike = count_dislike + dislike

    new_data = pd.DataFrame([{
        'input_text': inp_text,
        'generated_text': out_text,
        'feedback': feedback,
        'temperature': float(temp_value),
        'like': like,
        'dislike': dislike,
        'count_like': count_like,
        'count_dislike': count_dislike,
    }])
    dataset_hf = pd.concat([dataset_hf, new_data], ignore_index=True)

    # Publish the whole table as the "train" split of the preference dataset.
    hf_dataset = datasets.Dataset.from_pandas(dataset_hf)
    dataset_dict = datasets.DatasetDict({"train": hf_dataset})
    dataset_dict.push_to_hub(preference_dataset_name, token=os.environ['HF_TOKEN_WRITE'])

    # print dataset statistics
    print(f"Admin log: like: {count_like} and dislike: {count_dislike}")
    return f"You select '{preference}' as answer of the model generation. Thank you for your time!"
# CSS passed to gr.Blocks: centers the element with id "logo" horizontally
# and gives it a fixed size. (The closing brace and string delimiter were
# missing, which left the literal unterminated.)
custom_css = """
#logo {
    display: block;
    margin-left: auto;
    margin-right: auto;
    width: 280px;
    height: 140px;
}
"""
# Build the demo page: logo and title, prompt/output boxes, examples,
# temperature control, generation button and the like/dislike feedback row.
# NOTE(review): no demo.launch() call is visible in this section -- confirm
# that it follows this block in the full file.
with gr.Blocks(css=custom_css) as demo:
    # Temperature used as the slider's initial value and for cached examples.
    DEFAULT_TEMPERATURE = 1.0

    gr.Image(logo_image, elem_id="logo")
    gr.Markdown(f"<h1 style='text-align: center;'>{title}</h1>")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(lines=5, placeholder="Enter latin text here...", label="Input Text")
        with gr.Column():
            output_text = gr.Textbox(lines=5, placeholder="Output text will appear here...", label="Output Text")

    # Examples provide only the prompt, while generate_text() also needs a
    # temperature; bind the default so pre-computing the cached outputs does
    # not fail with a missing-argument TypeError.
    gr.Examples(
        examples=examples,
        inputs=input_text,
        outputs=output_text,
        fn=lambda prompt: generate_text(prompt, DEFAULT_TEMPERATURE),
        cache_examples=True,
    )

    temperature_slider = gr.Slider(minimum=0.1, maximum=5.0, step=0.1, value=DEFAULT_TEMPERATURE, label="Temperature")

    # Generation: prompt + temperature in, generated text out.
    clean_button = gr.Button("Generate Text")
    clean_button.click(generate_text, inputs=[input_text, temperature_slider], outputs=output_text)

    feedback_output = gr.Textbox(lines=1, placeholder="If you want to provide a feedback, please fill this box ...", label="Feedback")

    with gr.Row():
        like_button = gr.Button("Like")
        dislike_button = gr.Button("Dislike")

    button_output = gr.Textbox(lines=1, placeholder="Please submit your choice", label="Latin Language Model Demo")

    # Both votes send the same four values to handle_preference(); only the
    # preference label differs.
    preference_inputs = [input_text, output_text, feedback_output, temperature_slider]
    like_button.click(
        fn=lambda x, y, z, v: handle_preference("like", x, y, z, v),
        inputs=preference_inputs,
        outputs=button_output,
    )
    dislike_button.click(
        fn=lambda x, y, z, v: handle_preference("dislike", x, y, z, v),
        inputs=preference_inputs,
        outputs=button_output,
    )