retrieval-compression / analysis.py
ethanrom's picture
Upload 6 files
fc850f2
from typing import List
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st
import matplotlib.pyplot as plt
def calculate_word_overlaps(documents: List[str], query: str):
    """
    Return the mean number of distinct query words found in each document.

    Both the query and the documents are lower-cased and split on
    whitespace; every document contributes the size of the intersection
    between its word set and the query's word set. An empty document list
    yields 0.0.
    """
    query_terms = set(query.lower().split())
    shared_counts = [
        len(query_terms & set(text.lower().split())) for text in documents
    ]
    if not shared_counts:
        return 0.0
    return np.mean(shared_counts)
def calculate_duplication_rate(documents: List[str]):
    """
    Return the fraction of word occurrences that are repeats.

    Documents are lower-cased and split on whitespace. The rate is
    (total words - distinct words) / total words, computed across all
    documents together; it is 0.0 when there are no words at all.
    """
    token_lists = [text.lower().split() for text in documents]
    token_count = sum(len(tokens) for tokens in token_lists)
    if token_count == 0:
        return 0.0
    distinct_tokens = set().union(*token_lists)
    return (token_count - len(distinct_tokens)) / token_count
def cosine_similarity_score(documents: List[str], query: str):
    """
    Calculate TF-IDF cosine similarity between the query and each document.

    The query and the documents are vectorized together by a single
    TfidfVectorizer, so they share one vocabulary and one set of IDF
    weights. Row 0 of the resulting matrix is the query; the remaining
    rows are the documents in their input order.

    Returns a 1-D array with one similarity score per document. When
    `documents` is empty an empty array is returned.
    """
    if len(documents) == 0:
        # With no documents the matrix slice below would have zero rows,
        # which cosine_similarity rejects with a ValueError. Return an
        # empty result instead, as the other metrics here do for empty input.
        return np.empty(0)

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([query] + documents)
    # Compare the query row against every document row: result is (1, n_docs).
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
    return cosine_similarities[0]
def jaccard_similarity_score(documents: List[str], query: str):
    """
    Return the Jaccard similarity of the query against every document.

    Word sets are built by lower-casing and splitting on whitespace. Each
    score is |intersection| / |union|; when both word sets are empty the
    score is 0. The result is a list in the same order as `documents`.
    """
    query_terms = set(query.lower().split())

    def _jaccard(text):
        # Similarity of one document's word set to the query's word set.
        terms = set(text.lower().split())
        combined = query_terms | terms
        if not combined:
            return 0
        return len(query_terms & terms) / len(combined)

    return [_jaccard(text) for text in documents]
def display_similarity_results(cosine_scores, jaccard_scores, title):
    """
    Render cosine and Jaccard similarity scores as two Streamlit bar charts.

    For each metric a subheader "<title> - <metric> to Query" is written,
    followed by a bar chart with one bar per score (x axis: position of the
    score in the sequence, y axis: the score).

    Each chart is drawn on its own Matplotlib figure. Drawing both on the
    implicit global pyplot figure would leave the cosine bars in place when
    the Jaccard chart is rendered, and would leave figures open across calls.
    """
    charts = (
        ("Cosine Similarity", cosine_scores, None),  # None -> default color
        ("Jaccard Similarity", jaccard_scores, "orange"),
    )
    for metric, scores, color in charts:
        st.subheader(f"{title} - {metric} to Query")
        fig, ax = plt.subplots()
        ax.bar(range(len(scores)), scores, color=color)
        ax.set_xlabel("Documents")
        ax.set_ylabel(metric)
        st.pyplot(fig)
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)