import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download


# Load a user-specified model
def load_user_model(repo_id, model_file):
    print(f"Downloading model {model_file} from repository {repo_id}...")
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"Model downloaded to: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)


# Generate a response using the specified model and prompt
def generate_response(model, prompt):
    response = model(prompt, max_tokens=512, temperature=0.5, top_p=0.95)
    return response["choices"][0]["text"]


# Evaluate responses using the LoRA evaluation model
def evaluate_responses(prompt, repo_a, model_a, repo_b, model_b, evaluation_criteria):
    if len(evaluation_criteria) > 3:
        return "Error: Please select up to 3 evaluation criteria only.", "", ""

    # Load models
    model_a_instance = load_user_model(repo_a, model_a)
    model_b_instance = load_user_model(repo_b, model_b)

    # Generate responses
    response_a = generate_response(model_a_instance, prompt)
    response_b = generate_response(model_b_instance, prompt)

    # Display generated responses
    print(f"Response A: {response_a}")
    print(f"Response B: {response_b}")

    # Format the evaluation prompt
    criteria_list = ", ".join(evaluation_criteria)
    evaluation_prompt = f"""
Prompt: {prompt}

Response A: {response_a}

Response B: {response_b}

Evaluation Criteria: {criteria_list}

Please evaluate the responses based on the selected criteria. For each criterion, rate both responses on a scale from 1 to 4 and provide a justification. Finally, declare the winner (or 'draw' if they are equal).
"""

    # Use the LoRA model to evaluate the responses
    evaluation_response = lora_model.create_completion(
        prompt=evaluation_prompt,
        max_tokens=512,
        temperature=0.5,
        top_p=0.95,
    )
    evaluation_results = evaluation_response["choices"][0]["text"]

    return response_a, response_b, evaluation_results


# Load the LoRA evaluation model
def load_lora_model():
    repo_id = "KolumbusLindh/LoRA-6150"
    model_file = "unsloth.F16.gguf"
    print(f"Downloading LoRA evaluation model from repository {repo_id}...")
    local_path = hf_hub_download(repo_id=repo_id, filename=model_file)
    print(f"LoRA evaluation model downloaded to: {local_path}")
    return Llama(model_path=local_path, n_ctx=2048, n_threads=8)


lora_model = load_lora_model()
print("LoRA evaluation model loaded successfully!")

# Gradio interface
with gr.Blocks(title="LLM as a Judge") as demo:
    gr.Markdown("## LLM as a Judge 𐄷")

    # Model inputs
    repo_a_input = gr.Textbox(label="Model A Repository", placeholder="KolumbusLindh/LoRA-6150")
    model_a_input = gr.Textbox(label="Model A File Name", placeholder="unsloth.F16.gguf")
    repo_b_input = gr.Textbox(label="Model B Repository", placeholder="forestav/LoRA-2000")
    model_b_input = gr.Textbox(label="Model B File Name", placeholder="unsloth.F16.gguf")

    # Prompt and criteria inputs
    prompt_input = gr.Textbox(label="Enter Prompt", placeholder="Enter the prompt here...", lines=3)
    criteria_dropdown = gr.CheckboxGroup(
        label="Select Evaluation Criteria (Max 3)",
        choices=["Clarity", "Completeness", "Accuracy"]  # Restricted criteria
    )

    # Button and outputs
    evaluate_button = gr.Button("Evaluate Models")
    response_a_output = gr.Textbox(
        label="Response A",
        placeholder="Response from Model A will appear here...",
        lines=10,
        interactive=False
    )
    response_b_output = gr.Textbox(
        label="Response B",
        placeholder="Response from Model B will appear here...",
        lines=10,
        interactive=False
    )
    evaluation_output = gr.Textbox(
        label="Evaluation Results",
placeholder="The evaluation analysis will appear here...", lines=20, interactive=False ) # Link evaluation function evaluate_button.click( fn=evaluate_responses, inputs=[prompt_input, repo_a_input, model_a_input, repo_b_input, model_b_input, criteria_dropdown], outputs=[response_a_output, response_b_output, evaluation_output] ) # Launch app if __name__ == "__main__": demo.launch()