import re
import json
import concurrent.futures
from collections import Counter

import requests
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import textstat
import torch

import tensorflow as tf
from keras.layers import Layer
from transformers import DebertaV2Tokenizer, TFAutoModel

import streamlit as st
from google import genai

# Workaround: Streamlit's file watcher can crash while inspecting torch.classes,
# so its __path__ is cleared before the app starts.
torch.classes.__path__ = []

# Tokenizer data required by nltk.word_tokenize.
nltk.download('punkt', quiet=True)


def clean_response(text: str) -> str:
    # Strip common Markdown punctuation and collapse whitespace so the cleaned
    # text can be scored by the readability/diversity metrics below.
    text = re.sub(r"[*_`#>\-\[\]()]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
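
# Illustrative behaviour (hypothetical input, not executed by the app):
#   clean_response("**Bold** `code` - item")  ->  "Bold code item"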
# ---------------------------------------------------------------------------
# LLM backends
# ---------------------------------------------------------------------------

def get_response_from_gemini(prompt: str, key: str) -> str:
    gemini_client = genai.Client(api_key=key)
    response = gemini_client.models.generate_content(
        model="gemini-2.5-pro-exp-03-25",
        contents=prompt,
    )
    return response.text.strip()


def get_response_from_deepseek(prompt: str, key: str) -> str:
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={"Authorization": f"Bearer {key}"},
        json={
            "model": "deepseek/deepseek-r1:free",
            "messages": [{"role": "user", "content": prompt}],
        },
    )
    return response.json()["choices"][0]["message"]["content"]


def get_response_from_llamafourscout(prompt: str, key: str) -> str:
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={"Authorization": f"Bearer {key}"},
        json={
            "model": "meta-llama/llama-4-scout:free",
            "messages": [{"role": "user", "content": prompt}],
        },
    )
    return response.json()["choices"][0]["message"]["content"]


def get_response_from_mistralsmall(prompt: str, key: str) -> str:
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={"Authorization": f"Bearer {key}"},
        json={
            "model": "mistralai/mistral-small-3.1-24b-instruct:free",
            "messages": [{"role": "user", "content": prompt}],
        },
    )
    return response.json()["choices"][0]["message"]["content"]


# Maps the display name shown in the sidebar to the function that queries it.
MODEL_MAP = {
    "Gemini": get_response_from_gemini,
    "DeepSeek": get_response_from_deepseek,
    "LLaMA 4 Scout": get_response_from_llamafourscout,
    "Mistral Small": get_response_from_mistralsmall,
}
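
# Illustrative dispatch (a sketch, not executed by the app): the sidebar's
# display name selects the backend, and every backend shares the (prompt, key)
# signature.
#
#   fetch = MODEL_MAP["Mistral Small"]
#   answer = fetch("Explain type-token ratio in one sentence.", api_key)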
# ---------------------------------------------------------------------------
# Text statistics used as numerical features (-999999 is a sentinel returned
# when a metric cannot be computed)
# ---------------------------------------------------------------------------

def calculate_entropy(text: str) -> float:
    """Shannon entropy (base 2) of the word distribution."""
    try:
        tokens = [token.lower() for token in word_tokenize(text) if token.isalnum()]
        if not tokens:
            return -999999
        freq_dist = Counter(tokens)
        total_words = len(tokens)
        probabilities = [count / total_words for count in freq_dist.values()]
        return -sum(p * np.log2(p) for p in probabilities)
    except Exception:
        return -999999


def calculate_ttr(text: str) -> float:
    """Type-token ratio: unique words divided by total words."""
    try:
        tokens = [token.lower() for token in word_tokenize(text) if token.isalnum()]
        return len(set(tokens)) / len(tokens) if tokens else -999999
    except Exception:
        return -999999


def get_fk_score(text: str) -> float:
    """Flesch-Kincaid grade level."""
    try:
        return textstat.flesch_kincaid_grade(text)
    except Exception:
        return -999999


def get_dc_score(text: str) -> float:
    """Dale-Chall readability score."""
    try:
        return textstat.dale_chall_readability_score(text)
    except Exception:
        return -999999
# ---------------------------------------------------------------------------
# DeBERTa embedding layer and the pre-trained hybrid classifier
# ---------------------------------------------------------------------------

tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")


class DebertaEmbeddingLayer(Layer):
    """Keras layer wrapping the DeBERTa-v3 encoder; returns its last hidden
    state (one 768-dimensional vector per token)."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.deberta = TFAutoModel.from_pretrained("microsoft/deberta-v3-base")

    def call(self, inputs):
        input_ids, attention_mask = inputs
        outputs = self.deberta(input_ids, attention_mask=tf.cast(attention_mask, dtype=tf.int32))
        return outputs.last_hidden_state

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], 768)


# The saved model contains the custom layer above, so it must be registered
# via custom_objects when loading.
model = tf.keras.models.load_model(
    "models/hybrid_lstm_model.keras",
    custom_objects={"DebertaEmbeddingLayer": DebertaEmbeddingLayer},
)
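
# Interface of the loaded hybrid model, inferred from how it is called below
# (not a guarantee of the saved architecture):
#   inputs  -> [input_ids (1, max_length), attention_mask (1, max_length), metrics (1, 8)]
#   output  -> scores over three classes {0: model A preferred, 1: model B preferred, 2: tie}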

def preprocess_inputs(prompt: str, response_a: str, response_b: str, tokenizer, max_length=512):
    """Tokenize prompt + both responses as one sequence and assemble the eight
    numerical readability/diversity features expected by the model."""
    combined_text = prompt + " " + response_a + " " + response_b
    encoded = tokenizer(
        [combined_text],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="tf"
    )
    metrics = np.array([
        get_fk_score(response_a),
        get_fk_score(response_b),
        get_dc_score(response_a),
        get_dc_score(response_b),
        calculate_ttr(response_a),
        calculate_ttr(response_b),
        calculate_entropy(response_a),
        calculate_entropy(response_b)
    ]).reshape(1, -1).astype(np.float32)
    return encoded["input_ids"], encoded["attention_mask"], metrics
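
# Illustrative call (values are hypothetical; shapes follow from max_length=512
# and the eight metrics above):
#   ids, mask, feats = preprocess_inputs("Q?", "answer A", "answer B", tokenizer)
#   ids.shape == (1, 512); mask.shape == (1, 512); feats.shape == (1, 8)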

st.set_page_config(page_title="LMSYS Demo", layout="wide")

st.markdown(
    """
    <style>
    * {
        font-family: 'Georgia', serif !important;
    }
    .stButton>button {
        background-color: #C2B280;
        color: #3B2F2F;
        border-radius: 8px;
        border: 1px solid #7C3E2E;
    }
    .stButton>button:hover {
        background-color: #A67B5B;
        color: white;
    }
    .stTextInput>div>div>input {
        background-color: #fdf6e3;
        color: #3B2F2F;
        border-radius: 4px;
    }
    </style>
    """,
    unsafe_allow_html=True
)

st.title("Predicting Human Preference: LLM Battleground")
st.write("In this demo, any two of the following SOTA LLMs compete on a prompt entered through the sidebar: [Gemini 2.5 Pro](https://deepmind.google/technologies/gemini/pro/), [DeepSeek R1](https://api-docs.deepseek.com/news/news250120), [Mistral Small 3.1](https://mistral.ai/news/mistral-small-3-1) and [LLaMA 4 Scout](https://ai.meta.com/blog/llama-4-multimodal-intelligence/).")
st.write("Using our proposed hybrid model, we predict which response a human user is more likely to prefer.")

st.sidebar.title("Ask a Question!")
model_choices = list(MODEL_MAP.keys())
model_a_name = st.sidebar.selectbox("Choose Model A", model_choices, index=0)
model_b_name = st.sidebar.selectbox("Choose Model B", model_choices, index=1)
question = st.sidebar.text_area("Enter your question:", key="prompt_input")

if "generated" not in st.session_state:
    st.session_state["generated"] = False
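
# Streamlit reruns this script on every interaction; keeping responses in
# st.session_state lets the "Predict Winner" step reuse them without
# re-querying the LLMs.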

if st.sidebar.button("Generate Responses") and question:
    with st.spinner("Generating LLM responses..."):

        def fetch_model_response(model_name):
            # Gemini is called directly; the other models go through OpenRouter.
            api_key = st.secrets["GEMINI_API_KEY"] if model_name == "Gemini" else st.secrets["OPENROUTER_API_KEY"]
            return MODEL_MAP[model_name](question, api_key)

        # Query both models concurrently instead of waiting on them sequentially.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_a = executor.submit(fetch_model_response, model_a_name)
            future_b = executor.submit(fetch_model_response, model_b_name)
            raw_a = future_a.result()
            raw_b = future_b.result()

        st.session_state.update({
            "response_a_raw": raw_a,
            "response_b_raw": raw_b,
            "response_a_clean": clean_response(raw_a),
            "response_b_clean": clean_response(raw_b),
            "generated": True,
            "prediction": None,
            "model_a_name": model_a_name,
            "model_b_name": model_b_name
        })

if st.session_state["generated"]:
    tab1, tab2, tab3 = st.tabs(["Predictions", "Model Architecture", "Metric Curves"])

    with tab1:
        st.subheader("Model Responses")
        col1, col2 = st.columns(2)
        with col1:
            st.markdown(f"#### {st.session_state['model_a_name']}")
            st.markdown(st.session_state["response_a_raw"])
        with col2:
            st.markdown(f"#### {st.session_state['model_b_name']}")
            st.markdown(st.session_state["response_b_raw"])

        if st.button("Predict Winner"):
            with st.spinner("Running model..."):
                input_ids, attention_mask, num_features = preprocess_inputs(
                    question,
                    st.session_state["response_a_clean"],
                    st.session_state["response_b_clean"],
                    tokenizer
                )
                predictions = model.predict([input_ids, attention_mask, num_features], verbose=0)
                predicted_class = np.argmax(predictions, axis=-1)[0]
                label_map = {0: f"{st.session_state['model_a_name']}!", 1: f"{st.session_state['model_b_name']}!", 2: "Tie!"}
                st.session_state["prediction"] = label_map[predicted_class]

        if st.session_state.get("prediction"):
            st.success(f"🤖 Model Prediction: {st.session_state['prediction']}")

    with tab2:
        st.subheader("Model Architecture")
        st.image("images/arch.png", caption="Dual-LSTM + Attention + Numerical Features")

    with tab3:
        st.subheader("Training vs Validation Metrics")

        st.markdown("### RNN")
        col1, col2 = st.columns(2)
        with col1:
            st.image("images/plots/rnn_baseline_acc.png", caption="Accuracy - RNN", use_container_width=True)
        with col2:
            st.image("images/plots/rnn_baseline_loss.png", caption="Log Loss - RNN", use_container_width=True)

        st.markdown("### LSTM")
        col1, col2 = st.columns(2)
        with col1:
            st.image("images/plots/lstm_baseline_acc.png", caption="Accuracy - LSTM", use_container_width=True)
        with col2:
            st.image("images/plots/lstm_baseline_loss.png", caption="Log Loss - LSTM", use_container_width=True)

        st.markdown("### Bi-LSTM")
        col1, col2 = st.columns(2)
        with col1:
            st.image("images/plots/bilstm_baseline_acc.png", caption="Accuracy - Bi-LSTM", use_container_width=True)
        with col2:
            st.image("images/plots/bilstm_baseline_loss.png", caption="Log Loss - Bi-LSTM", use_container_width=True)

        st.markdown("### Hybrid (Dual-LSTM)")
        col1, col2 = st.columns(2)
        with col1:
            st.image("images/plots/duallstm_hybrid_acc.png", caption="Accuracy - Hybrid (Dual-LSTM)", use_container_width=True)
        with col2:
            st.image("images/plots/duallstm_hybrid_loss.png", caption="Log Loss - Hybrid (Dual-LSTM)", use_container_width=True)

        st.markdown("### Hybrid (Bi-LSTM)")
        col1, col2 = st.columns(2)
        with col1:
            st.image("images/plots/bilstm_hybrid_acc.png", caption="Accuracy - Hybrid (Bi-LSTM)", use_container_width=True)
        with col2:
            st.image("images/plots/bilstm_hybrid_loss.png", caption="Log Loss - Hybrid (Bi-LSTM)", use_container_width=True)