import streamlit as st
import torch
import os
import sys
import time
import json
from typing import List
import datasets
import csv
from transformers import LlamaTokenizer, LlamaForCausalLM
import tqdm

base_model_name = "EthioNLP/Amharic-llama-base-model"
adapters_name = 'EthioNLP/Amharic-LLAMA-all-data'

BASE_PROMPT = """Below is an interaction between a human and an AI fluent in English and Amharic, providing reliable and informative answers. The AI is supposed to answer test questions from the human with short responses saying just the answer and nothing else. |
|
|
|
Human: {instruction} |
|
|
|
Assistant [Amharic] : """ |
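
# The {instruction} placeholder above is filled via str.format; the instruction
# and any accompanying input text are joined with a newline before substitution,
# e.g. BASE_PROMPT.format(instruction=f"{instruction}\n{input_text}"),
# exactly as done in main() and predict() below.
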
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaConfig

def load_model(model_name, quantization):
    # Note: 8-bit loading goes through bitsandbytes, which generally requires a
    # CUDA device; with device_map='cpu' you may need to pass quantization=False.
    model = LlamaForCausalLM.from_pretrained(
        model_name,
        return_dict=True,
        load_in_8bit=quantization,
        device_map='cpu',
        low_cpu_mem_usage=True,
    )
    return model

def load_peft_model(model, peft_model):
    peft_model = PeftModel.from_pretrained(model, peft_model, offload_folder='./')
    return peft_model

def load_llama_from_config(config_path):
    # Build an (uninitialized) LlamaForCausalLM from a config alone.
    model_config = LlamaConfig.from_pretrained(config_path)
    model = LlamaForCausalLM(config=model_config)
    return model

def main(
    model,
    tokenizer,
    datasource,
    csv_file_path,
    max_new_tokens=100,
    seed=42,
    do_sample=True,
    min_length=None,
    use_cache=True,
    top_p=1.0,
    temperature=1.0,
    top_k=5,
    repetition_penalty=5.0,
    length_penalty=1,
    enable_azure_content_safety=False,
    enable_sensitive_topics=False,
    enable_saleforce_content_safety=False,
    **kwargs
):
    print("*** Ensure that you have replaced the default tokenizer with the appropriate one for your use case.")

    model.eval()

    # `datasource` is expected to be a DatasetDict (e.g. from datasets.load_dataset)
    # with a 'test' split whose rows carry 'instruction', 'input', 'output' and
    # 'datasource' columns.
    dataset = datasource['test']

    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Instruction', 'Input Text', 'Datasource', 'response', 'gold_label'])

        for item in tqdm.tqdm(dataset):
            instruction = item['instruction']
            input_text = item['input']
            item_datasource = item['datasource']
            gold_label = item['output']

            user_prompt = BASE_PROMPT.format(instruction=f"{instruction}\n{input_text}")

            batch = tokenizer(user_prompt, return_tensors="pt")
            batch = {k: v.to(model.device) for k, v in batch.items()}

            start = time.perf_counter()

            with torch.no_grad():
                outputs = model.generate(
                    **batch,
                    max_new_tokens=max_new_tokens,
                    do_sample=do_sample,
                    top_p=top_p,
                    temperature=temperature,
                    min_length=min_length,
                    use_cache=use_cache,
                    top_k=top_k,
                    repetition_penalty=repetition_penalty,
                    length_penalty=length_penalty,
                    **kwargs)

            e2e_inference_time = (time.perf_counter() - start) * 1000

            # Strip the prompt prefix so only the model's answer is written out.
            output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)[len(user_prompt):]

            writer.writerow([instruction, input_text, item_datasource, output_text, gold_label])
            torch.cuda.empty_cache()

# Load the base model and the tokenizer shipped with the adapter repo (it carries
# the extended Amharic vocabulary the adapter was trained with).
model = load_model(base_model_name, quantization=True)

tokenizer = LlamaTokenizer.from_pretrained(adapters_name)
embedding_size = model.get_input_embeddings().weight.shape[0]

# Grow the embedding matrix if the extended tokenizer has more tokens than the base model.
if len(tokenizer) != embedding_size:
    print("Resizing the token embeddings to match the tokenizer vocabulary size.")
    model.resize_token_embeddings(len(tokenizer))

# Attach the fine-tuned adapter weights on top of the base model.
model.load_adapter(adapters_name)
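
# A minimal sketch of how the batch-evaluation helper main() above could be
# driven. The dataset name below is a placeholder, not a real repo: substitute
# the instruction dataset you actually evaluate on (it needs a 'test' split with
# 'instruction', 'input', 'output' and 'datasource' columns).
#
#   hf_dataset = datasets.load_dataset("your-org/your-amharic-eval-dataset")
#   main(model, tokenizer, hf_dataset, csv_file_path='evaluation_results.csv')
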
# Generation hyperparameters used by predict(); they mirror main()'s defaults.
max_new_tokens = 100
seed = 42
do_sample = True
min_length = None
use_cache = True
top_p = 1.0
temperature = 1.0
top_k = 5
repetition_penalty = 5.0
length_penalty = 1
enable_azure_content_safety = False
enable_sensitive_topics = False
enable_saleforce_content_safety = False

def predict(instruction, input_text=" "):
    user_prompt = BASE_PROMPT.format(instruction=f"{instruction}\n{input_text}")

    batch = tokenizer(user_prompt, return_tensors="pt")
    batch = {k: v.to(model.device) for k, v in batch.items()}

    start = time.perf_counter()

    with torch.no_grad():
        outputs = model.generate(
            **batch,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            top_p=top_p,
            temperature=temperature,
            min_length=min_length,
            use_cache=use_cache,
            top_k=top_k,
            repetition_penalty=repetition_penalty,
            length_penalty=length_penalty)

    e2e_inference_time = (time.perf_counter() - start) * 1000

    # Drop the prompt prefix so only the model's answer is returned.
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)[len(user_prompt):]

    return output_text

st.title('LLM Interaction Interface')

user_input = st.text_input("Ask a question:")

if user_input:
    response = predict(user_input)
    st.text_area("Response:", value=response, height=300)
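
# Launch this app with Streamlit's CLI, e.g. `streamlit run <this_script>.py`
# (substitute whatever filename you saved this script under).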