Update app.py
Browse files
app.py
CHANGED
@@ -6,35 +6,22 @@ from collections import defaultdict
|
|
6 |
|
7 |
# Page configuration
|
8 |
st.set_page_config(
|
9 |
-
page_title="
|
10 |
-
page_icon="
|
11 |
layout="wide"
|
12 |
)
|
13 |
|
14 |
-
# Custom CSS
|
15 |
-
st.markdown("""
|
16 |
-
<style>
|
17 |
-
.title-font {
|
18 |
-
font-size: 28px !important;
|
19 |
-
font-weight: bold;
|
20 |
-
color: #2c3e50;
|
21 |
-
}
|
22 |
-
</style>
|
23 |
-
""", unsafe_allow_html=True)
|
24 |
-
|
25 |
class ShingleSimilarity:
|
26 |
def __init__(self, k=3):
|
27 |
-
self.k = k
|
28 |
|
29 |
def get_shingles(self, text):
|
30 |
-
"""Convert text to k-shingles"""
|
31 |
text = text.lower().strip()
|
32 |
if len(text) < self.k:
|
33 |
return {text}
|
34 |
return {text[i:i+self.k] for i in range(len(text) - self.k + 1)}
|
35 |
|
36 |
def similarity(self, text1, text2):
|
37 |
-
"""Calculate Jaccard similarity between two texts using shingles"""
|
38 |
shingles1 = self.get_shingles(text1)
|
39 |
shingles2 = self.get_shingles(text2)
|
40 |
|
@@ -43,196 +30,86 @@ class ShingleSimilarity:
|
|
43 |
|
44 |
return intersection / union if union > 0 else 0.0
|
45 |
|
46 |
-
def
|
47 |
-
"""Create visualization based on shingle similarity distances"""
|
48 |
n = len(texts)
|
49 |
-
|
50 |
-
# Create similarity matrix
|
51 |
similarity_matrix = np.zeros((n, n))
|
|
|
52 |
for i in range(n):
|
53 |
for j in range(n):
|
54 |
similarity_matrix[i][j] = shingle_sim.similarity(texts[i], texts[j])
|
55 |
|
56 |
-
#
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
fig.add_trace(go.Scatter(
|
65 |
-
x=[0, 1-similarity],
|
66 |
-
y=[0, 0],
|
67 |
-
mode='markers+text',
|
68 |
-
text=texts,
|
69 |
-
textposition='top center',
|
70 |
-
marker=dict(size=10, color=['blue', 'red'])
|
71 |
-
))
|
72 |
-
|
73 |
-
fig.update_layout(
|
74 |
-
title=f"Text Similarity Visualization (Similarity: {similarity:.3f})",
|
75 |
-
xaxis_title="Relative Distance",
|
76 |
-
yaxis_title="",
|
77 |
-
height=400,
|
78 |
-
showlegend=False,
|
79 |
-
xaxis=dict(range=[-0.1, 1.1]),
|
80 |
-
yaxis=dict(range=[-0.5, 0.5])
|
81 |
-
)
|
82 |
-
else:
|
83 |
-
# For 3 or more texts, use MDS for 3D visualization
|
84 |
-
mds = MDS(n_components=3, dissimilarity='precomputed', random_state=42)
|
85 |
-
coords = mds.fit_transform(distance_matrix)
|
86 |
-
|
87 |
-
fig = go.Figure()
|
88 |
-
|
89 |
-
# Add points
|
90 |
-
fig.add_trace(go.Scatter3d(
|
91 |
-
x=coords[:, 0],
|
92 |
-
y=coords[:, 1],
|
93 |
-
z=coords[:, 2],
|
94 |
-
mode='markers+text',
|
95 |
-
text=texts,
|
96 |
-
textposition='top center',
|
97 |
-
marker=dict(
|
98 |
-
size=10,
|
99 |
-
color=list(range(len(texts))),
|
100 |
-
colorscale='Viridis',
|
101 |
-
opacity=0.8
|
102 |
-
),
|
103 |
-
name='Texts'
|
104 |
-
))
|
105 |
-
|
106 |
-
# Add lines between points
|
107 |
-
for i in range(n):
|
108 |
-
for j in range(i+1, n):
|
109 |
-
opacity = max(0.1, min(1.0, similarity_matrix[i,j]))
|
110 |
-
fig.add_trace(go.Scatter3d(
|
111 |
-
x=[coords[i,0], coords[j,0]],
|
112 |
-
y=[coords[i,1], coords[j,1]],
|
113 |
-
z=[coords[i,2], coords[j,2]],
|
114 |
-
mode='lines',
|
115 |
-
line=dict(color='gray', width=2),
|
116 |
-
opacity=opacity,
|
117 |
-
showlegend=False,
|
118 |
-
hoverinfo='skip'
|
119 |
-
))
|
120 |
-
|
121 |
-
fig.update_layout(
|
122 |
-
title="3D Similarity Visualization",
|
123 |
-
scene=dict(
|
124 |
-
xaxis_title="Dimension 1",
|
125 |
-
yaxis_title="Dimension 2",
|
126 |
-
zaxis_title="Dimension 3",
|
127 |
-
camera=dict(
|
128 |
-
up=dict(x=0, y=0, z=1),
|
129 |
-
center=dict(x=0, y=0, z=0),
|
130 |
-
eye=dict(x=1.5, y=1.5, z=1.5)
|
131 |
-
)
|
132 |
-
),
|
133 |
-
height=700,
|
134 |
-
showlegend=True
|
135 |
-
)
|
136 |
|
137 |
return fig
|
138 |
|
139 |
def main():
|
140 |
-
st.title("
|
141 |
-
st.markdown("<p class='title-font'>Analyze text similarities using k-shingles</p>", unsafe_allow_html=True)
|
142 |
|
143 |
-
#
|
144 |
-
|
145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
|
147 |
-
|
148 |
-
|
149 |
-
- **Shingles** are consecutive character sequences of length k
|
150 |
-
- **Higher k** means more precise matching
|
151 |
-
- **Lower k** captures more general similarities
|
152 |
-
- Similarity is calculated using Jaccard similarity of shingle sets
|
153 |
-
""")
|
154 |
|
155 |
-
|
|
|
156 |
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
st.write(sorted(shingle_sim.get_shingles(text2)))
|
170 |
-
|
171 |
-
if st.button("Analyze Similarity", key="sim_button"):
|
172 |
-
similarity = shingle_sim.similarity(text1, text2)
|
173 |
-
|
174 |
-
col1, col2 = st.columns(2)
|
175 |
-
with col1:
|
176 |
-
st.metric(
|
177 |
-
label="Similarity Score",
|
178 |
-
value=f"{similarity:.4f}",
|
179 |
-
help="1.0 = identical, 0.0 = completely different"
|
180 |
-
)
|
181 |
-
interpretation = (
|
182 |
-
"🟢 Very Similar" if similarity > 0.8
|
183 |
-
else "🟡 Moderately Similar" if similarity > 0.5
|
184 |
-
else "🔴 Different"
|
185 |
-
)
|
186 |
-
st.info(f"Interpretation: {interpretation}")
|
187 |
-
|
188 |
-
with col2:
|
189 |
-
st.plotly_chart(
|
190 |
-
create_similarity_based_visualization([text1, text2], shingle_sim),
|
191 |
-
use_container_width=True
|
192 |
-
)
|
193 |
|
194 |
-
|
195 |
-
|
196 |
-
num_texts = st.slider("Number of texts:", 2, 6, 3)
|
197 |
-
texts = []
|
198 |
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
|
|
207 |
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
)
|
213 |
-
|
214 |
-
|
215 |
-
st.
|
216 |
-
similarity_matrix = np.zeros((len(texts), len(texts)))
|
217 |
-
for i in range(len(texts)):
|
218 |
-
for j in range(len(texts)):
|
219 |
-
similarity_matrix[i][j] = shingle_sim.similarity(texts[i], texts[j])
|
220 |
-
|
221 |
-
fig = go.Figure(data=go.Heatmap(
|
222 |
-
z=similarity_matrix,
|
223 |
-
x=[f"Text {i+1}" for i in range(len(texts))],
|
224 |
-
y=[f"Text {i+1}" for i in range(len(texts))],
|
225 |
-
colorscale='Viridis',
|
226 |
-
text=np.round(similarity_matrix, 3),
|
227 |
-
texttemplate='%{text}',
|
228 |
-
textfont={"size": 12},
|
229 |
-
))
|
230 |
-
|
231 |
-
fig.update_layout(
|
232 |
-
title="Similarity Matrix",
|
233 |
-
height=400
|
234 |
-
)
|
235 |
-
st.plotly_chart(fig, use_container_width=True)
|
236 |
|
237 |
if __name__ == "__main__":
|
238 |
main()
|
|
|
6 |
|
7 |
# Page configuration
|
8 |
st.set_page_config(
|
9 |
+
page_title="Address Similarity Explorer",
|
10 |
+
page_icon="🏠",
|
11 |
layout="wide"
|
12 |
)
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
class ShingleSimilarity:
|
15 |
def __init__(self, k=3):
|
16 |
+
self.k = k
|
17 |
|
18 |
def get_shingles(self, text):
|
|
|
19 |
text = text.lower().strip()
|
20 |
if len(text) < self.k:
|
21 |
return {text}
|
22 |
return {text[i:i+self.k] for i in range(len(text) - self.k + 1)}
|
23 |
|
24 |
def similarity(self, text1, text2):
|
|
|
25 |
shingles1 = self.get_shingles(text1)
|
26 |
shingles2 = self.get_shingles(text2)
|
27 |
|
|
|
30 |
|
31 |
return intersection / union if union > 0 else 0.0
|
32 |
|
33 |
+
def create_similarity_visualization(texts, shingle_sim):
|
|
|
34 |
n = len(texts)
|
|
|
|
|
35 |
similarity_matrix = np.zeros((n, n))
|
36 |
+
|
37 |
for i in range(n):
|
38 |
for j in range(n):
|
39 |
similarity_matrix[i][j] = shingle_sim.similarity(texts[i], texts[j])
|
40 |
|
41 |
+
# Create visualization
|
42 |
+
fig = go.Figure(data=go.Heatmap(
|
43 |
+
z=similarity_matrix,
|
44 |
+
x=texts,
|
45 |
+
y=texts,
|
46 |
+
colorscale='Viridis',
|
47 |
+
text=np.round(similarity_matrix, 4),
|
48 |
+
texttemplate='%{text}',
|
49 |
+
textfont={"size": 12},
|
50 |
+
))
|
51 |
|
52 |
+
fig.update_layout(
|
53 |
+
title="Address Similarity Matrix",
|
54 |
+
height=600,
|
55 |
+
width=800
|
56 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
return fig
|
59 |
|
60 |
def main():
|
61 |
+
st.title("π Address Similarity Analyzer")
|
|
|
62 |
|
63 |
+
# Example addresses from the image
|
64 |
+
example_addresses = [
|
65 |
+
"123 North Hampton Blvd",
|
66 |
+
"123 N Hampton Blvd",
|
67 |
+
"123 North Hampton Boulevard",
|
68 |
+
"123 N. Hampton Blvd",
|
69 |
+
"123 N. Hampton Boulevard",
|
70 |
+
"65 South Hampton Blvd"
|
71 |
+
]
|
72 |
|
73 |
+
st.markdown("### Address Similarity Analysis")
|
74 |
+
st.info("If the similarity score is over 0.4, addresses are considered to be the same location")
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
+
# Initialize shingle similarity with k=3
|
77 |
+
shingle_sim = ShingleSimilarity(k=3)
|
78 |
|
79 |
+
# Display similarity matrix
|
80 |
+
fig = create_similarity_visualization(example_addresses, shingle_sim)
|
81 |
+
st.plotly_chart(fig)
|
82 |
+
|
83 |
+
# Interactive comparison
|
84 |
+
st.markdown("### Compare Two Addresses")
|
85 |
+
col1, col2 = st.columns(2)
|
86 |
+
|
87 |
+
with col1:
|
88 |
+
addr1 = st.selectbox("Select first address:", example_addresses)
|
89 |
+
with col2:
|
90 |
+
addr2 = st.selectbox("Select second address:", example_addresses, index=1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
|
92 |
+
if st.button("Compare Addresses"):
|
93 |
+
similarity = shingle_sim.similarity(addr1, addr2)
|
|
|
|
|
94 |
|
95 |
+
st.metric(
|
96 |
+
label="Similarity Score",
|
97 |
+
value=f"{similarity:.4f}"
|
98 |
+
)
|
99 |
+
|
100 |
+
is_same = similarity > 0.4
|
101 |
+
status = "✅ Same Location" if is_same else "❌ Different Locations"
|
102 |
+
color = "success" if is_same else "error"
|
103 |
+
st.markdown(f"**Status:** :{color}[{status}]")
|
104 |
|
105 |
+
# Show shingles comparison
|
106 |
+
col1, col2 = st.columns(2)
|
107 |
+
with col1:
|
108 |
+
st.markdown(f"**Shingles for Address 1:**")
|
109 |
+
st.write(sorted(shingle_sim.get_shingles(addr1)))
|
110 |
+
with col2:
|
111 |
+
st.markdown(f"**Shingles for Address 2:**")
|
112 |
+
st.write(sorted(shingle_sim.get_shingles(addr2)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
|
114 |
if __name__ == "__main__":
|
115 |
main()
|