import json
import re
from collections import Counter

import nltk
import numpy as np
import requests
import streamlit as st
import tensorflow as tf
import textstat
import torch
from google import genai
from keras.layers import Layer
from nltk.tokenize import word_tokenize
from transformers import DebertaV2Tokenizer, TFAutoModel

# Workaround: stop Streamlit's file watcher from introspecting torch.classes,
# which otherwise raises a RuntimeError at startup.
torch.classes.__path__ = []

nltk.download('punkt', quiet=True)
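# NOTE: newer NLTK releases resolve word_tokenize via the 'punkt_tab' resource;
# if tokenization raises a LookupError, also run nltk.download('punkt_tab', quiet=True).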


def clean_response(text: str) -> str:
    """Strip Markdown punctuation and collapse whitespace to plain text."""
    text = re.sub(r"[*_`#>\-\[\]()]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def get_response_from_gemini(prompt: str, key: str) -> str:
    """Query Gemini via the google-genai client and return the response text."""
    gemini_client = genai.Client(api_key=key)
    response = gemini_client.models.generate_content(
        model="gemini-2.5-pro-exp-03-25",
        contents=prompt,
    )
    return response.text.strip()


def get_response_from_deepseek(prompt: str, key: str) -> str:
    """Query DeepSeek R1 through the OpenRouter chat-completions endpoint."""
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={"Authorization": f"Bearer {key}"},
        data=json.dumps({
            "model": "deepseek/deepseek-r1:free",
            "messages": [{"role": "user", "content": prompt}],
        }),
    )
    return response.json()["choices"][0]["message"]["content"]
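
# NOTE: the line above assumes a successful request; a non-200 response would
# surface as a KeyError when indexing "choices". Calling response.raise_for_status()
# first would fail faster with a clearer error.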


def calculate_entropy(text: str) -> float:
    """Shannon entropy (in bits) of the token distribution of `text`."""
    try:
        tokens = [token.lower() for token in word_tokenize(text) if token.isalnum()]
        if not tokens:
            return -999999
        freq_dist = Counter(tokens)
        total_words = len(tokens)
        probabilities = [count / total_words for count in freq_dist.values()]
        return -sum(p * np.log2(p) for p in probabilities)
    except Exception:
        return -999999


def calculate_ttr(text: str) -> float:
    """Type-token ratio: unique tokens divided by total tokens."""
    try:
        tokens = [token.lower() for token in word_tokenize(text) if token.isalnum()]
        return len(set(tokens)) / len(tokens) if tokens else -999999
    except Exception:
        return -999999


def get_fk_score(text: str) -> float:
    """Flesch-Kincaid grade level of `text`."""
    try:
        return textstat.flesch_kincaid_grade(text)
    except Exception:
        return -999999


def get_dc_score(text: str) -> float:
    """Dale-Chall readability score of `text`."""
    try:
        return textstat.dale_chall_readability_score(text)
    except Exception:
        return -999999
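
# Worked example for calculate_entropy: "the cat sat on the mat" tokenizes to
# [the, cat, sat, on, the, mat], so p(the) = 2/6 and p = 1/6 for the rest;
# H = -(2/6)*log2(2/6) - 4*(1/6)*log2(1/6) ≈ 2.25 bits.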


tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")


class DebertaEmbeddingLayer(Layer):
    """A Keras layer that embeds token ids with DeBERTa-v3-base."""

    def __init__(self, **kwargs):
        super(DebertaEmbeddingLayer, self).__init__(**kwargs)
        self.deberta = TFAutoModel.from_pretrained("microsoft/deberta-v3-base")

    def call(self, inputs):
        input_ids, attention_mask = inputs
        outputs = self.deberta(input_ids, attention_mask=tf.cast(attention_mask, dtype=tf.int32))
        return outputs.last_hidden_state

    def compute_output_shape(self, input_shape):
        # 768 is the hidden size of DeBERTa-v3-base.
        return (input_shape[0][0], input_shape[0][1], 768)
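
# The custom layer has to be registered via custom_objects below, since Keras
# cannot deserialize user-defined layers from a saved model on its own.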
model = tf.keras.models.load_model(
    "models/hybrid_lstm_model.keras",
    custom_objects={"DebertaEmbeddingLayer": DebertaEmbeddingLayer},
)


def preprocess_inputs(prompt: str, response_a: str, response_b: str, tokenizer, max_length=512):
    """Tokenize the prompt/response triple and compute the numeric feature vector."""
    combined_text = prompt + " " + response_a + " " + response_b
    encoded = tokenizer(
        [combined_text],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="tf"
    )
    metrics = np.array([
        get_fk_score(response_a),
        get_fk_score(response_b),
        get_dc_score(response_a),
        get_dc_score(response_b),
        calculate_ttr(response_a),
        calculate_ttr(response_b),
        calculate_entropy(response_a),
        calculate_entropy(response_b)
    ]).reshape(1, -1).astype(np.float32)
    return encoded["input_ids"], encoded["attention_mask"], metrics
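
# NOTE: the eight numeric features must keep this exact order (FK, DC, TTR,
# entropy, each for response A then B), presumably the order the hybrid model
# was trained with; reordering them would silently skew predictions.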


st.set_page_config(page_title="LMSYS Demo", layout="wide")

st.markdown(
    """
    <style>
    * {
        font-family: 'Georgia', serif !important;
    }
    .stButton>button {
        background-color: #C2B280;
        color: #3B2F2F;
        border-radius: 8px;
        border: 1px solid #7C3E2E;
    }
    .stButton>button:hover {
        background-color: #A67B5B;
        color: white;
    }
    .stTextInput>div>div>input {
        background-color: #fdf6e3;
        color: #3B2F2F;
        border-radius: 4px;
    }
    </style>
    """,
    unsafe_allow_html=True
)

st.title("Predicting Human Preference: Gemini vs DeepSeek")
st.write("This demo pits two SOTA LLMs, [Gemini 2.5 Pro](https://deepmind.google/technologies/gemini/pro/) and [DeepSeek R1](https://api-docs.deepseek.com/news/news250120), against each other on a prompt of your choice, entered through the sidebar.")
st.write("Our proposed hybrid model then predicts which response a human user is more likely to prefer.")

st.sidebar.title("Ask a Question!")
question = st.sidebar.text_area("Enter your question:", key="prompt_input")

if "generated" not in st.session_state:
    st.session_state["generated"] = False

if st.sidebar.button("Generate Responses") and question:
    with st.spinner("Generating LLM responses..."):
        raw_a = get_response_from_gemini(question, st.secrets["GEMINI_API_KEY"])
        raw_b = get_response_from_deepseek(question, st.secrets["OPENROUTER_API_KEY"])

    # Keep the raw text for display and the cleaned text for the model.
    st.session_state["response_a_raw"] = raw_a
    st.session_state["response_b_raw"] = raw_b
    st.session_state["response_a_clean"] = clean_response(raw_a)
    st.session_state["response_b_clean"] = clean_response(raw_b)

    st.session_state["generated"] = True
    st.session_state["prediction"] = None

if st.session_state["generated"]:
    tab1, tab2, tab3 = st.tabs(["Predictions", "Model Architecture", "📈 Metric Curves"])

    with tab1:
        st.subheader("Model Responses")
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("#### Gemini")
            st.markdown(st.session_state["response_a_raw"])
        with col2:
            st.markdown("#### DeepSeek")
            st.markdown(st.session_state["response_b_raw"])

        if st.button("Predict Winner"):
            with st.spinner("Running model..."):
                input_ids, attention_mask, num_features = preprocess_inputs(
                    question,
                    st.session_state["response_a_clean"],
                    st.session_state["response_b_clean"],
                    tokenizer
                )
                predictions = model.predict([input_ids, attention_mask, num_features], verbose=0)
                predicted_class = np.argmax(predictions, axis=-1)[0]
                label_map = {0: "Gemini!", 1: "DeepSeek!", 2: "Tie!"}
                st.session_state["prediction"] = label_map[predicted_class]
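
                # NOTE (assumption): the 0/1/2 mapping mirrors the label encoding
                # used at training time (A wins / B wins / tie).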

        if st.session_state.get("prediction"):
            st.success(f"🤖 Model Prediction: {st.session_state['prediction']}")

    with tab2:
        st.subheader("Model Architecture")
        st.image("images/arch.png", caption="Dual-LSTM + Attention + Numerical Features")

    with tab3:
        st.subheader("Training vs Validation Metrics")

        st.markdown("### RNN")
        col1, col2 = st.columns(2)
        with col1:
            st.image("images/plots/rnn_baseline_acc.png", caption="Accuracy - RNN", use_column_width=True)
        with col2:
            st.image("images/plots/rnn_baseline_loss.png", caption="Log Loss - RNN", use_column_width=True)

        st.markdown("### LSTM")
        col1, col2 = st.columns(2)
        with col1:
            st.image("images/plots/lstm_baseline_acc.png", caption="Accuracy - LSTM", use_column_width=True)
        with col2:
            st.image("images/plots/lstm_baseline_loss.png", caption="Log Loss - LSTM", use_column_width=True)

        st.markdown("### Bi-LSTM")
        col1, col2 = st.columns(2)
        with col1:
            st.image("images/plots/bilstm_baseline_acc.png", caption="Accuracy - Bi-LSTM", use_column_width=True)
        with col2:
            st.image("images/plots/bilstm_baseline_loss.png", caption="Log Loss - Bi-LSTM", use_column_width=True)

        st.markdown("### Hybrid (Dual-LSTM)")
        col1, col2 = st.columns(2)
        with col1:
            st.image("images/plots/duallstm_hybrid_acc.png", caption="Accuracy - Hybrid (Dual-LSTM)", use_column_width=True)
        with col2:
            st.image("images/plots/duallstm_hybrid_loss.png", caption="Log Loss - Hybrid (Dual-LSTM)", use_column_width=True)

        st.markdown("### Hybrid (Bi-LSTM)")
        col1, col2 = st.columns(2)
        with col1:
            st.image("images/plots/bilstm_hybrid_acc.png", caption="Accuracy - Hybrid (Bi-LSTM)", use_column_width=True)
        with col2:
            st.image("images/plots/bilstm_hybrid_loss.png", caption="Log Loss - Hybrid (Bi-LSTM)", use_column_width=True)