|
import streamlit as st |
|
import numpy as np |
|
import plotly.graph_objects as go |
|
from sklearn.manifold import MDS |
|
from collections import defaultdict |
|
|
|
|
|
# Configure the browser tab (title and icon) and use the full-width layout.
# NOTE(review): the icon literal was a mis-decoded emoji; the house emoji is
# the best reconstruction from the surviving bytes -- confirm the intended icon.
st.set_page_config(
    page_title="Address Similarity Explorer",
    page_icon="🏠",
    layout="wide",
)
|
|
|
class ShingleSimilarity:
    """Jaccard similarity between strings based on character k-shingles.

    A shingle is a run of ``k`` consecutive characters taken from the
    lower-cased, whitespace-stripped text.
    """

    def __init__(self, k: int = 3):
        """Store the shingle length.

        Args:
            k: Number of characters per shingle; must be at least 1.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        # k < 1 would make every text collapse to the same degenerate
        # shingle set, so every comparison would silently score 1.0.
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k

    def get_shingles(self, text: str) -> set:
        """Return the set of k-character shingles of ``text``.

        The text is lower-cased and stripped of surrounding whitespace first.

        Args:
            text: The string to split into shingles.

        Returns:
            An empty set when the normalized text is empty, a one-element
            set holding the whole normalized text when it is shorter than
            ``k``, otherwise every substring of length ``k``.
        """
        normalized = text.lower().strip()

        # Blank input contains no shingles; returning {""} here would make
        # two blank strings look like a perfect match.
        if not normalized:
            return set()

        if len(normalized) < self.k:
            return {normalized}

        last_start = len(normalized) - self.k
        return {normalized[i:i + self.k] for i in range(last_start + 1)}

    def similarity(self, text1: str, text2: str) -> float:
        """Return the Jaccard similarity of the shingle sets of two texts.

        Args:
            text1: First string to compare.
            text2: Second string to compare.

        Returns:
            ``|A & B| / |A | B|`` for the two shingle sets, or ``0.0`` when
            both sets are empty (both inputs blank).
        """
        shingles1 = self.get_shingles(text1)
        shingles2 = self.get_shingles(text2)

        union_size = len(shingles1 | shingles2)
        if union_size == 0:
            return 0.0

        return len(shingles1 & shingles2) / union_size
|
|
|
def create_similarity_visualization(texts, shingle_sim):
    """Build an annotated heatmap of pairwise similarities between texts.

    Args:
        texts: Sequence of strings; used for both the x and y axis labels.
        shingle_sim: Object whose ``similarity(a, b)`` method returns the
            score stored for each pair of texts.

    Returns:
        A Plotly ``Figure`` holding a single heatmap whose cells are
        labelled with the scores rounded to four decimals.
    """
    count = len(texts)
    scores = np.zeros((count, count))

    # Visit every (row, col) cell in row-major order, diagonal included.
    for row, col in np.ndindex(count, count):
        scores[row, col] = shingle_sim.similarity(texts[row], texts[col])

    heatmap = go.Heatmap(
        z=scores,
        x=texts,
        y=texts,
        colorscale='Viridis',
        text=np.round(scores, 4),
        texttemplate='%{text}',
        textfont={"size": 12},
    )

    figure = go.Figure(data=heatmap)
    figure.update_layout(
        title="Address Similarity Matrix",
        height=600,
        width=800,
    )
    return figure
|
|
|
def main():
    """Render the Address Similarity Analyzer page.

    Draws a heatmap of pairwise shingle similarities for a fixed list of
    example addresses, then lets the user pick two of those addresses and,
    on button press, shows their similarity score, a same/different verdict
    and the shingles of each address.
    """
    # Single cut-off used both in the explanatory text and in the verdict,
    # so the two cannot drift apart.
    same_location_threshold = 0.4

    st.title("🏠 Address Similarity Analyzer")

    example_addresses = [
        "123 North Hampton Blvd",
        "123 N Hampton Blvd",
        "123 North Hampton Boulevard",
        "123 N. Hampton Blvd",
        "123 N. Hampton Boulevard",
        "65 South Hampton Blvd",
    ]

    st.markdown("### Address Similarity Analysis")
    st.info(
        f"If the similarity score is over {same_location_threshold}, "
        "addresses are considered to be the same location"
    )

    shingle_sim = ShingleSimilarity(k=3)

    fig = create_similarity_visualization(example_addresses, shingle_sim)
    st.plotly_chart(fig)

    st.markdown("### Compare Two Addresses")
    col1, col2 = st.columns(2)

    with col1:
        addr1 = st.selectbox("Select first address:", example_addresses)
    with col2:
        # Default to the second example so the two pickers start out different.
        addr2 = st.selectbox("Select second address:", example_addresses, index=1)

    if st.button("Compare Addresses"):
        similarity = shingle_sim.similarity(addr1, addr2)

        st.metric(
            label="Similarity Score",
            value=f"{similarity:.4f}",
        )

        # Streamlit's :color[...] markdown directive only recognises colour
        # names such as "green" and "red", so those are used for the verdict.
        if similarity > same_location_threshold:
            status, color = "✅ Same Location", "green"
        else:
            status, color = "❌ Different Locations", "red"
        st.markdown(f"**Status:** :{color}[{status}]")

        shingles_col1, shingles_col2 = st.columns(2)
        with shingles_col1:
            st.markdown("**Shingles for Address 1:**")
            st.write(sorted(shingle_sim.get_shingles(addr1)))
        with shingles_col2:
            st.markdown("**Shingles for Address 2:**")
            st.write(sorted(shingle_sim.get_shingles(addr2)))
|
|
|
# Run the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|