DexterSptizu committed on
Commit
47be852
·
verified ·
1 Parent(s): d37b73b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -188
app.py CHANGED
@@ -6,35 +6,22 @@ from collections import defaultdict
6
 
7
  # Page configuration
8
  st.set_page_config(
9
- page_title="Text Similarity Explorer",
10
- page_icon="📝",
11
  layout="wide"
12
  )
13
 
14
- # Custom CSS
15
- st.markdown("""
16
- <style>
17
- .title-font {
18
- font-size: 28px !important;
19
- font-weight: bold;
20
- color: #2c3e50;
21
- }
22
- </style>
23
- """, unsafe_allow_html=True)
24
-
25
  class ShingleSimilarity:
26
  def __init__(self, k=3):
27
- self.k = k # shingle size
28
 
29
  def get_shingles(self, text):
30
- """Convert text to k-shingles"""
31
  text = text.lower().strip()
32
  if len(text) < self.k:
33
  return {text}
34
  return {text[i:i+self.k] for i in range(len(text) - self.k + 1)}
35
 
36
  def similarity(self, text1, text2):
37
- """Calculate Jaccard similarity between two texts using shingles"""
38
  shingles1 = self.get_shingles(text1)
39
  shingles2 = self.get_shingles(text2)
40
 
@@ -43,196 +30,86 @@ class ShingleSimilarity:
43
 
44
  return intersection / union if union > 0 else 0.0
45
 
46
- def create_similarity_based_visualization(texts, shingle_sim):
47
- """Create visualization based on shingle similarity distances"""
48
  n = len(texts)
49
-
50
- # Create similarity matrix
51
  similarity_matrix = np.zeros((n, n))
 
52
  for i in range(n):
53
  for j in range(n):
54
  similarity_matrix[i][j] = shingle_sim.similarity(texts[i], texts[j])
55
 
56
- # Convert similarities to distances
57
- distance_matrix = 1 - similarity_matrix
 
 
 
 
 
 
 
 
58
 
59
- if n == 2:
60
- # For 2 texts, create a simple 2D visualization
61
- fig = go.Figure()
62
- similarity = similarity_matrix[0][1]
63
-
64
- fig.add_trace(go.Scatter(
65
- x=[0, 1-similarity],
66
- y=[0, 0],
67
- mode='markers+text',
68
- text=texts,
69
- textposition='top center',
70
- marker=dict(size=10, color=['blue', 'red'])
71
- ))
72
-
73
- fig.update_layout(
74
- title=f"Text Similarity Visualization (Similarity: {similarity:.3f})",
75
- xaxis_title="Relative Distance",
76
- yaxis_title="",
77
- height=400,
78
- showlegend=False,
79
- xaxis=dict(range=[-0.1, 1.1]),
80
- yaxis=dict(range=[-0.5, 0.5])
81
- )
82
- else:
83
- # For 3 or more texts, use MDS for 3D visualization
84
- mds = MDS(n_components=3, dissimilarity='precomputed', random_state=42)
85
- coords = mds.fit_transform(distance_matrix)
86
-
87
- fig = go.Figure()
88
-
89
- # Add points
90
- fig.add_trace(go.Scatter3d(
91
- x=coords[:, 0],
92
- y=coords[:, 1],
93
- z=coords[:, 2],
94
- mode='markers+text',
95
- text=texts,
96
- textposition='top center',
97
- marker=dict(
98
- size=10,
99
- color=list(range(len(texts))),
100
- colorscale='Viridis',
101
- opacity=0.8
102
- ),
103
- name='Texts'
104
- ))
105
-
106
- # Add lines between points
107
- for i in range(n):
108
- for j in range(i+1, n):
109
- opacity = max(0.1, min(1.0, similarity_matrix[i,j]))
110
- fig.add_trace(go.Scatter3d(
111
- x=[coords[i,0], coords[j,0]],
112
- y=[coords[i,1], coords[j,1]],
113
- z=[coords[i,2], coords[j,2]],
114
- mode='lines',
115
- line=dict(color='gray', width=2),
116
- opacity=opacity,
117
- showlegend=False,
118
- hoverinfo='skip'
119
- ))
120
-
121
- fig.update_layout(
122
- title="3D Similarity Visualization",
123
- scene=dict(
124
- xaxis_title="Dimension 1",
125
- yaxis_title="Dimension 2",
126
- zaxis_title="Dimension 3",
127
- camera=dict(
128
- up=dict(x=0, y=0, z=1),
129
- center=dict(x=0, y=0, z=0),
130
- eye=dict(x=1.5, y=1.5, z=1.5)
131
- )
132
- ),
133
- height=700,
134
- showlegend=True
135
- )
136
 
137
  return fig
138
 
139
  def main():
140
- st.title("📝 Text Similarity Explorer")
141
- st.markdown("<p class='title-font'>Analyze text similarities using k-shingles</p>", unsafe_allow_html=True)
142
 
143
- # Shingle size configuration
144
- shingle_size = st.sidebar.slider("Shingle Size (k)", 2, 5, 3)
145
- shingle_sim = ShingleSimilarity(k=shingle_size)
 
 
 
 
 
 
146
 
147
- with st.expander("ℹ️ About Shingle-based Similarity", expanded=True):
148
- st.markdown("""
149
- - **Shingles** are consecutive character sequences of length k
150
- - **Higher k** means more precise matching
151
- - **Lower k** captures more general similarities
152
- - Similarity is calculated using Jaccard similarity of shingle sets
153
- """)
154
 
155
- tabs = st.tabs(["💫 Text Similarity", "🎯 Multi-Text Analysis"])
 
156
 
157
- with tabs[0]:
158
- st.markdown("### Compare Two Texts")
159
- col1, col2 = st.columns(2)
160
-
161
- with col1:
162
- text1 = st.text_area("First Text", value="I love programming in Python", height=100)
163
- st.markdown("#### Shingles:")
164
- st.write(sorted(shingle_sim.get_shingles(text1)))
165
-
166
- with col2:
167
- text2 = st.text_area("Second Text", value="Coding with Python is amazing", height=100)
168
- st.markdown("#### Shingles:")
169
- st.write(sorted(shingle_sim.get_shingles(text2)))
170
-
171
- if st.button("Analyze Similarity", key="sim_button"):
172
- similarity = shingle_sim.similarity(text1, text2)
173
-
174
- col1, col2 = st.columns(2)
175
- with col1:
176
- st.metric(
177
- label="Similarity Score",
178
- value=f"{similarity:.4f}",
179
- help="1.0 = identical, 0.0 = completely different"
180
- )
181
- interpretation = (
182
- "🟢 Very Similar" if similarity > 0.8
183
- else "🟡 Moderately Similar" if similarity > 0.5
184
- else "🔴 Different"
185
- )
186
- st.info(f"Interpretation: {interpretation}")
187
-
188
- with col2:
189
- st.plotly_chart(
190
- create_similarity_based_visualization([text1, text2], shingle_sim),
191
- use_container_width=True
192
- )
193
 
194
- with tabs[1]:
195
- st.markdown("### Analyze Multiple Texts")
196
- num_texts = st.slider("Number of texts:", 2, 6, 3)
197
- texts = []
198
 
199
- for i in range(num_texts):
200
- text = st.text_area(
201
- f"Text {i+1}",
202
- value=f"Example text {i+1}",
203
- height=100,
204
- key=f"text_{i}"
205
- )
206
- texts.append(text)
 
207
 
208
- if st.button("Analyze Texts", key="analyze_button"):
209
- st.plotly_chart(
210
- create_similarity_based_visualization(texts, shingle_sim),
211
- use_container_width=True
212
- )
213
-
214
- # Show similarity matrix
215
- st.markdown("### Similarity Matrix")
216
- similarity_matrix = np.zeros((len(texts), len(texts)))
217
- for i in range(len(texts)):
218
- for j in range(len(texts)):
219
- similarity_matrix[i][j] = shingle_sim.similarity(texts[i], texts[j])
220
-
221
- fig = go.Figure(data=go.Heatmap(
222
- z=similarity_matrix,
223
- x=[f"Text {i+1}" for i in range(len(texts))],
224
- y=[f"Text {i+1}" for i in range(len(texts))],
225
- colorscale='Viridis',
226
- text=np.round(similarity_matrix, 3),
227
- texttemplate='%{text}',
228
- textfont={"size": 12},
229
- ))
230
-
231
- fig.update_layout(
232
- title="Similarity Matrix",
233
- height=400
234
- )
235
- st.plotly_chart(fig, use_container_width=True)
236
 
237
  if __name__ == "__main__":
238
  main()
 
6
 
7
  # Page configuration
8
  st.set_page_config(
9
+ page_title="Address Similarity Explorer",
10
+ page_icon="🏠",
11
  layout="wide"
12
  )
13
 
 
 
 
 
 
 
 
 
 
 
 
14
  class ShingleSimilarity:
15
  def __init__(self, k=3):
16
+ self.k = k
17
 
18
  def get_shingles(self, text):
 
19
  text = text.lower().strip()
20
  if len(text) < self.k:
21
  return {text}
22
  return {text[i:i+self.k] for i in range(len(text) - self.k + 1)}
23
 
24
  def similarity(self, text1, text2):
 
25
  shingles1 = self.get_shingles(text1)
26
  shingles2 = self.get_shingles(text2)
27
 
 
30
 
31
  return intersection / union if union > 0 else 0.0
32
 
33
+ def create_similarity_visualization(texts, shingle_sim):
 
34
  n = len(texts)
 
 
35
  similarity_matrix = np.zeros((n, n))
36
+
37
  for i in range(n):
38
  for j in range(n):
39
  similarity_matrix[i][j] = shingle_sim.similarity(texts[i], texts[j])
40
 
41
+ # Create visualization
42
+ fig = go.Figure(data=go.Heatmap(
43
+ z=similarity_matrix,
44
+ x=texts,
45
+ y=texts,
46
+ colorscale='Viridis',
47
+ text=np.round(similarity_matrix, 4),
48
+ texttemplate='%{text}',
49
+ textfont={"size": 12},
50
+ ))
51
 
52
+ fig.update_layout(
53
+ title="Address Similarity Matrix",
54
+ height=600,
55
+ width=800
56
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
  return fig
59
 
60
  def main():
61
+ st.title("🏠 Address Similarity Analyzer")
 
62
 
63
+ # Example addresses from the image
64
+ example_addresses = [
65
+ "123 North Hampton Blvd",
66
+ "123 N Hampton Blvd",
67
+ "123 North Hampton Boulevard",
68
+ "123 N. Hampton Blvd",
69
+ "123 N. Hampton Boulevard",
70
+ "65 South Hampton Blvd"
71
+ ]
72
 
73
+ st.markdown("### Address Similarity Analysis")
74
+ st.info("If the similarity score is over 0.4, addresses are considered to be the same location")
 
 
 
 
 
75
 
76
+ # Initialize shingle similarity with k=3
77
+ shingle_sim = ShingleSimilarity(k=3)
78
 
79
+ # Display similarity matrix
80
+ fig = create_similarity_visualization(example_addresses, shingle_sim)
81
+ st.plotly_chart(fig)
82
+
83
+ # Interactive comparison
84
+ st.markdown("### Compare Two Addresses")
85
+ col1, col2 = st.columns(2)
86
+
87
+ with col1:
88
+ addr1 = st.selectbox("Select first address:", example_addresses)
89
+ with col2:
90
+ addr2 = st.selectbox("Select second address:", example_addresses, index=1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
+ if st.button("Compare Addresses"):
93
+ similarity = shingle_sim.similarity(addr1, addr2)
 
 
94
 
95
+ st.metric(
96
+ label="Similarity Score",
97
+ value=f"{similarity:.4f}"
98
+ )
99
+
100
+ is_same = similarity > 0.4
101
+ status = "✅ Same Location" if is_same else "❌ Different Locations"
102
+ color = "success" if is_same else "error"
103
+ st.markdown(f"**Status:** :{color}[{status}]")
104
 
105
+ # Show shingles comparison
106
+ col1, col2 = st.columns(2)
107
+ with col1:
108
+ st.markdown(f"**Shingles for Address 1:**")
109
+ st.write(sorted(shingle_sim.get_shingles(addr1)))
110
+ with col2:
111
+ st.markdown(f"**Shingles for Address 2:**")
112
+ st.write(sorted(shingle_sim.get_shingles(addr2)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
  if __name__ == "__main__":
115
  main()