Spaces:
Runtime error
Runtime error
from typing import List | |
import numpy as np | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
import streamlit as st | |
import matplotlib.pyplot as plt | |
def calculate_word_overlaps(documents: List[str], query: str):
    """
    Return the mean number of distinct query words found in each document.

    Both the query and every document are lower-cased and split on
    whitespace; a word counts once per document no matter how often it
    occurs. An empty ``documents`` list yields ``0.0``.
    """
    query_vocab = set(query.lower().split())
    shared_counts = [
        len(query_vocab & set(text.lower().split())) for text in documents
    ]
    if not shared_counts:
        return 0.0
    return np.mean(shared_counts)
def calculate_duplication_rate(documents: List[str]):
    """
    Return the fraction of word occurrences that repeat an earlier word.

    All documents are lower-cased, split on whitespace and pooled together.
    The rate is ``(total occurrences - distinct words) / total occurrences``;
    it is ``0.0`` when the documents contain no words at all.
    """
    # Flatten every document into a single stream of lower-cased tokens.
    tokens = [word for text in documents for word in text.lower().split()]
    token_count = len(tokens)
    if token_count == 0:
        return 0.0
    distinct_count = len(set(tokens))
    return (token_count - distinct_count) / token_count
def cosine_similarity_score(documents: List[str], query: str):
    """
    Calculate TF-IDF cosine similarity between the query and each document.

    A single ``TfidfVectorizer`` is fitted on the query together with all
    documents, so they share one vocabulary and one set of IDF weights.

    Args:
        documents: Texts to compare against the query. Any iterable of
            strings is accepted; it is materialised into a list once.
        query: The reference text.

    Returns:
        A 1-D NumPy array with one similarity per document, in input order.
        An empty array is returned when ``documents`` is empty.

    Raises:
        ValueError: Propagated from scikit-learn, e.g. when no token in the
            query or documents survives the vectorizer's tokenization.
    """
    documents = list(documents)
    if not documents:
        # With no documents the slice ``tfidf_matrix[1:]`` has zero rows and
        # scikit-learn's pairwise validation rejects it; return an empty
        # result instead, matching how the sibling metrics treat empty input.
        return np.empty(0)

    tfidf_vectorizer = TfidfVectorizer()
    # Row 0 holds the query; rows 1..n hold the documents.
    tfidf_matrix = tfidf_vectorizer.fit_transform([query, *documents])
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
    # The result has shape (1, n_documents); unwrap the single query row.
    return cosine_similarities[0]
def jaccard_similarity_score(documents: List[str], query: str):
    """
    Return the Jaccard similarity of the query against every document.

    Texts are lower-cased and split on whitespace into word sets; each score
    is ``|query ∩ doc| / |query ∪ doc|``. When both sets are empty the score
    is ``0``. The result is a list in the same order as ``documents``.
    """
    query_tokens = set(query.lower().split())

    def _jaccard(text):
        # Score one document against the shared query token set.
        doc_tokens = set(text.lower().split())
        combined = query_tokens | doc_tokens
        if not combined:
            return 0
        return len(query_tokens & doc_tokens) / len(combined)

    return [_jaccard(text) for text in documents]
def display_similarity_results(cosine_scores, jaccard_scores, title):
    """
    Render one bar chart per similarity metric in the Streamlit app.

    Each chart is drawn on its own Matplotlib figure. Drawing both on the
    implicit global pyplot figure and passing the ``plt`` module to
    ``st.pyplot`` leaves the first chart's bars in place, so the second
    chart would show both metrics overlaid.

    Args:
        cosine_scores: Sequence of cosine similarities, one per document.
        jaccard_scores: Sequence of Jaccard similarities, one per document.
        title: Prefix used in both subheaders.
    """
    # (metric label, scores, bar colour); None keeps Matplotlib's default.
    charts = (
        ("Cosine Similarity", cosine_scores, None),
        ("Jaccard Similarity", jaccard_scores, "orange"),
    )
    for metric, scores, color in charts:
        st.subheader(f"{title} - {metric} to Query")
        fig, ax = plt.subplots()
        ax.bar(range(len(scores)), scores, color=color)
        ax.set_xlabel("Documents")
        ax.set_ylabel(metric)
        st.pyplot(fig)
        # Release the figure so repeated Streamlit reruns do not pile up
        # open figures in memory.
        plt.close(fig)