import re
import nltk
import torch
import numpy as np
from collections import Counter
from nltk.tokenize import word_tokenize
import textstat
import json
import requests
import tensorflow as tf
from tensorflow.keras.layers import Layer
from transformers import DebertaV2Tokenizer, TFAutoModel
import streamlit as st
from google import genai
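# Workaround for a known Streamlit/PyTorch incompatibility: Streamlit's file
# watcher walks module __path__ attributes and can crash on torch.classes, so
# we blank it out before Streamlit starts watching.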
torch.classes.__path__ = []
# Download tokenizer data once
nltk.download('punkt', quiet=True)
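# Note: newer NLTK releases may also require the 'punkt_tab' resource for
# word_tokenize; if tokenization fails, try nltk.download('punkt_tab').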
# === Cleaning Function ===
def clean_response(text: str) -> str:
    # Simple markdown cleaner: strip formatting characters, collapse whitespace
    text = re.sub(r"[*_`#>\-\[\]()]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()
# === Gemini API ===
def get_response_from_gemini(prompt: str, key) -> str:
    gemini_client = genai.Client(api_key=key)
    response = gemini_client.models.generate_content(
        model="gemini-2.5-pro-exp-03-25",
        contents=prompt,
    )
    return response.text.strip()
# === DeepSeek API ===
def get_response_from_deepseek(prompt: str, key) -> str:
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={"Authorization": f"Bearer {key}"},
        data=json.dumps({
            "model": "deepseek/deepseek-r1:free",
            "messages": [{"role": "user", "content": prompt}]
        })
    )
    response.raise_for_status()  # surface HTTP errors instead of a cryptic KeyError below
    return response.json()["choices"][0]["message"]["content"]
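# OpenRouter responses follow the OpenAI chat-completions schema, e.g.
#   {"choices": [{"message": {"role": "assistant", "content": "..."}}], ...}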
# === Metrics ===
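# Shannon entropy of the token distribution, H = -sum(p * log2(p)): higher
# values indicate a more varied vocabulary. TTR (type-token ratio) is
# unique_tokens / total_tokens. All metrics return the sentinel -999999 on
# failure so the downstream feature vector keeps a fixed shape.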
def calculate_entropy(text: str) -> float:
    try:
        tokens = [token.lower() for token in word_tokenize(text) if token.isalnum()]
        if not tokens:
            return -999999
        freq_dist = Counter(tokens)
        total_words = len(tokens)
        probabilities = [count / total_words for count in freq_dist.values()]
        return -sum(p * np.log2(p) for p in probabilities)
    except Exception:
        return -999999

def calculate_ttr(text: str) -> float:
    try:
        tokens = [token.lower() for token in word_tokenize(text) if token.isalnum()]
        return len(set(tokens)) / len(tokens) if tokens else -999999
    except Exception:
        return -999999

def get_fk_score(text: str) -> float:
    try:
        return textstat.flesch_kincaid_grade(text)
    except Exception:
        return -999999

def get_dc_score(text: str) -> float:
    try:
        return textstat.dale_chall_readability_score(text)
    except Exception:
        return -999999
# === Model Setup ===
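# Note: DeBERTa-v3 ships a SentencePiece vocabulary, so the `sentencepiece`
# package must be installed for DebertaV2Tokenizer to load.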
tokenizer = DebertaV2Tokenizer.from_pretrained("microsoft/deberta-v3-base")
class DebertaEmbeddingLayer(Layer):
    """Keras layer that embeds token ids with microsoft/deberta-v3-base."""
    def __init__(self, **kwargs):
        super(DebertaEmbeddingLayer, self).__init__(**kwargs)
        self.deberta = TFAutoModel.from_pretrained("microsoft/deberta-v3-base")

    def call(self, inputs):
        input_ids, attention_mask = inputs
        outputs = self.deberta(input_ids, attention_mask=tf.cast(attention_mask, dtype=tf.int32))
        return outputs.last_hidden_state

    def compute_output_shape(self, input_shape):
        # (batch, seq_len, hidden_size) -- hidden_size is 768 for deberta-v3-base
        return (input_shape[0][0], input_shape[0][1], 768)
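# The saved model references the custom layer above, so it must be registered
# via custom_objects for Keras to deserialize it.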
model = tf.keras.models.load_model("models/hybrid_lstm_model.keras", custom_objects={"DebertaEmbeddingLayer": DebertaEmbeddingLayer})
# === Preprocessing ===
def preprocess_inputs(prompt: str, response_a: str, response_b: str, tokenizer, max_length=512):
    combined_text = prompt + " " + response_a + " " + response_b
    encoded = tokenizer(
        [combined_text],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="tf"
    )
    metrics = np.array([
        get_fk_score(response_a),
        get_fk_score(response_b),
        get_dc_score(response_a),
        get_dc_score(response_b),
        calculate_ttr(response_a),
        calculate_ttr(response_b),
        calculate_entropy(response_a),
        calculate_entropy(response_b)
    ]).reshape(1, -1).astype(np.float32)
    return encoded["input_ids"], encoded["attention_mask"], metrics
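# Rough usage sketch (hypothetical inputs), assuming the tokenizer and model
# above loaded successfully:
#   ids, mask, feats = preprocess_inputs("Q?", "answer a", "answer b", tokenizer)
#   probs = model.predict([ids, mask, feats])  # one row of class scores: Gemini / DeepSeek / Tie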
# === Streamlit UI ===
st.set_page_config(page_title="LMSYS Demo", layout="wide")
# Optional styling (vintage theme)
st.markdown(
    """
    <style>
    * {
        font-family: 'Georgia', serif !important;
    }
    .stButton>button {
        background-color: #C2B280;
        color: #3B2F2F;
        border-radius: 8px;
        border: 1px solid #7C3E2E;
    }
    .stButton>button:hover {
        background-color: #A67B5B;
        color: white;
    }
    .stTextInput>div>div>input {
        background-color: #fdf6e3;
        color: #3B2F2F;
        border-radius: 4px;
    }
    </style>
    """,
    unsafe_allow_html=True
)
st.title("Predicting Human Preference: Gemini vs DeepSeek")
st.write("This demo pits two SOTA LLMs, [Gemini 2.5 Pro](https://deepmind.google/technologies/gemini/pro/) and [DeepSeek R1](https://api-docs.deepseek.com/news/news250120), against each other on a prompt of your choice (entered through the sidebar).")
st.write("Our proposed hybrid model then predicts which response a human user is more likely to prefer.")
st.sidebar.title("Ask a Question!")
question = st.sidebar.text_area("Enter your question:", key="prompt_input")
# Init session state
if "generated" not in st.session_state:
st.session_state["generated"] = False
# Generate responses
if st.sidebar.button("Generate Responses") and question:
with st.spinner("Generating LLM responses..."):
raw_a = get_response_from_gemini(question, st.secrets["GEMINI_API_KEY"])
raw_b = get_response_from_deepseek(question, st.secrets["OPENROUTER_API_KEY"])
st.session_state["response_a_raw"] = raw_a
st.session_state["response_b_raw"] = raw_b
st.session_state["response_a_clean"] = clean_response(raw_a)
st.session_state["response_b_clean"] = clean_response(raw_b)
st.session_state["generated"] = True
st.session_state["prediction"] = None
# Display and interact
if st.session_state["generated"]:
tab1, tab2, tab3 = st.tabs(["Predictions","Model Architecture", "📈 Metric Curves"])
with tab1:
st.subheader("Model Responses")
col1, col2 = st.columns(2)
with col1:
st.markdown("#### Gemini")
st.markdown(st.session_state["response_a_raw"])
with col2:
st.markdown("#### DeepSeek")
st.markdown(st.session_state["response_b_raw"])
if st.button("Predict Winner"):
with st.spinner("Running model..."):
input_ids, attention_mask, num_features = preprocess_inputs(
question,
st.session_state["response_a_clean"],
st.session_state["response_b_clean"],
tokenizer
)
predictions = model.predict([input_ids, attention_mask, num_features], verbose=0)
predicted_class = np.argmax(predictions, axis=-1)[0]
label_map = {0: "Gemini!", 1: "DeepSeek!", 2: "Tie!"}
st.session_state["prediction"] = label_map[predicted_class]
if st.session_state.get("prediction"):
st.success(f"🤖 Model Prediction: {st.session_state['prediction']}")
    with tab2:
        st.subheader("Model Architecture")
        st.image("images/arch.png", caption="Dual-LSTM + Attention + Numerical Features")
    with tab3:
        st.subheader("Training vs Validation Metrics")
        st.markdown("### RNN")
        col1, col2 = st.columns(2)
        with col1:
            st.image("images/plots/rnn_baseline_acc.png", caption="Accuracy - RNN", use_column_width=True)
        with col2:
            st.image("images/plots/rnn_baseline_loss.png", caption="Log Loss - RNN", use_column_width=True)
        st.markdown("### LSTM")
        col1, col2 = st.columns(2)
        with col1:
            st.image("images/plots/lstm_baseline_acc.png", caption="Accuracy - LSTM", use_column_width=True)
        with col2:
            st.image("images/plots/lstm_baseline_loss.png", caption="Log Loss - LSTM", use_column_width=True)
        st.markdown("### Bi-LSTM")
        col1, col2 = st.columns(2)
        with col1:
            st.image("images/plots/bilstm_baseline_acc.png", caption="Accuracy - Bi-LSTM", use_column_width=True)
        with col2:
            st.image("images/plots/bilstm_baseline_loss.png", caption="Log Loss - Bi-LSTM", use_column_width=True)
        st.markdown("### Hybrid (Dual-LSTM)")
        col1, col2 = st.columns(2)
        with col1:
            st.image("images/plots/duallstm_hybrid_acc.png", caption="Accuracy - Hybrid (Dual-LSTM)", use_column_width=True)
        with col2:
            st.image("images/plots/duallstm_hybrid_loss.png", caption="Log Loss - Hybrid (Dual-LSTM)", use_column_width=True)
        st.markdown("### Hybrid (Bi-LSTM)")
        col1, col2 = st.columns(2)
        with col1:
            st.image("images/plots/bilstm_hybrid_acc.png", caption="Accuracy - Hybrid (Bi-LSTM)", use_column_width=True)
        with col2:
            st.image("images/plots/bilstm_hybrid_loss.png", caption="Log Loss - Hybrid (Bi-LSTM)", use_column_width=True)