dejanseo committed on
Commit
e0c7076
·
verified ·
1 Parent(s): 903c83f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +190 -0
app.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
+ from sentence_transformers import SentenceTransformer
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+ from sklearn.manifold import TSNE
8
+ import numpy as np
9
+ from numpy.linalg import norm
10
+ import matplotlib.pyplot as plt
11
+ import plotly.express as px
12
+ import re
13
+
14
# Load the LaBSE model
@st.cache_resource
def load_model():
    """Construct the LaBSE sentence-embedding model.

    The function is decorated with ``st.cache_resource`` so that Streamlit
    reuses the constructed model instead of rebuilding it on every rerun.

    Returns:
        The ``SentenceTransformer`` instance for
        ``sentence-transformers/LaBSE``.
    """
    labse = SentenceTransformer("sentence-transformers/LaBSE")
    return labse


# Shared encoder used by the embedding helpers and the UI code in this file.
model = load_model()
20
+
21
def fetch_sitemap_urls(domain):
    """Collect page URLs for *domain* from its sitemaps.

    Looks at ``/sitemap.xml``, ``/sitemap_index.xml`` and every ``Sitemap:``
    entry declared in ``/robots.txt``. Nested sitemap indexes are followed and
    image URLs are skipped by ``fetch_sitemap_urls_from_xml``.

    Args:
        domain: Host name, optionally prefixed with ``http://``/``https://``
            and/or surrounded by slashes or whitespace.

    Returns:
        A list of unique URLs in first-seen order. Empty when the domain is
        blank or nothing could be fetched.
    """
    host = domain.strip().replace("https://", "").replace("http://", "").strip("/")
    if not host:
        # Nothing to crawl; avoids issuing requests to "https:///sitemap.xml".
        return []

    request_headers = {"User-Agent": "SiteFocusTool/1.0"}
    sitemap_candidates = [
        f"https://{host}/sitemap.xml",
        f"https://{host}/sitemap_index.xml",
    ]

    # robots.txt is the only resource parsed here; XML sitemaps are fetched
    # (once) by fetch_sitemap_urls_from_xml, which handles its own errors.
    try:
        response = requests.get(
            f"https://{host}/robots.txt", headers=request_headers, timeout=10
        )
        response.raise_for_status()
    except requests.RequestException:
        pass  # Best effort: a missing robots.txt just means no extra sitemaps.
    else:
        for line in response.text.splitlines():
            directive, separator, value = line.strip().partition(":")
            declared = value.strip()
            if separator and directive.strip().lower() == "sitemap" and declared:
                sitemap_candidates.append(declared)

    page_urls = []
    # dict.fromkeys drops sitemaps listed twice (e.g. robots.txt pointing at
    # /sitemap.xml) so they are not crawled again.
    for sitemap_url in dict.fromkeys(sitemap_candidates):
        page_urls.extend(
            fetch_sitemap_urls_from_xml(sitemap_url, host, recursive=True)
        )

    # Deduplicate while keeping a deterministic, first-seen order.
    return list(dict.fromkeys(page_urls))
45
+
46
def fetch_sitemap_urls_from_xml(sitemap_url, domain, recursive=False, _visited=None):
    """Fetch page URLs from one sitemap XML document.

    A sitemap index (a document containing ``<sitemap>`` entries) is expanded
    into its child sitemaps when ``recursive`` is true and yields nothing
    otherwise. A regular sitemap yields every ``<loc>`` value that does not
    end in a known image extension.

    Args:
        sitemap_url: Absolute URL of the sitemap to download.
        domain: Site host name. Not used by this function itself; it is
            passed along unchanged on recursive calls.
        recursive: Whether to follow child sitemaps of a sitemap index.
        _visited: Internal set of sitemap URLs already requested, used to
            stop cyclic sitemap indexes from recursing forever. Callers
            normally leave this unset.

    Returns:
        A list of URLs (possibly with duplicates). Empty on any request
        failure, on a blank URL, or when the sitemap was already visited.
    """
    visited = set() if _visited is None else _visited
    if not sitemap_url or sitemap_url in visited:
        return []
    visited.add(sitemap_url)

    try:
        response = requests.get(
            sitemap_url, headers={"User-Agent": "SiteFocusTool/1.0"}, timeout=10
        )
        response.raise_for_status()
    except requests.RequestException:
        return []  # Best effort: an unreachable sitemap contributes no URLs.

    soup = BeautifulSoup(response.content, "xml")
    urls = []

    child_sitemaps = soup.find_all("sitemap")
    if child_sitemaps:
        if not recursive:
            return urls
        for entry in child_sitemaps:
            loc = entry.find("loc")
            # Skip malformed entries that have no (or an empty) <loc>.
            child_url = loc.get_text(strip=True) if loc is not None else ""
            if child_url:
                urls.extend(
                    fetch_sitemap_urls_from_xml(
                        child_url, domain, recursive=True, _visited=visited
                    )
                )
        return urls

    for loc in soup.find_all("loc"):
        # Strip surrounding whitespace so the "$"-anchored filter below works
        # on pretty-printed sitemaps.
        url = loc.get_text(strip=True)
        if url and not re.search(
            r"\.(jpg|jpeg|png|gif|svg|webp|bmp|tif|tiff)$", url, re.IGNORECASE
        ):
            urls.append(url)
    return urls
66
+
67
def clean_text_from_url(url, domain):
    """Turn a page URL into plain words by dropping the site root and punctuation.

    The leading ``http(s)://[www.]<domain>`` part is removed when it is
    followed by ``/`` or the end of the URL. The match is case-insensitive and
    ignores a ``www.`` prefix on either side, so sitemap URLs that spell the
    host slightly differently from the user's input do not leak the scheme and
    host name into the text. Every remaining character that is not a letter,
    digit or whitespace (including ``_``) becomes a space.

    Args:
        url: Absolute page URL.
        domain: Site host name, optionally with scheme and slashes.

    Returns:
        The cleaned text with leading/trailing whitespace removed. URLs on a
        different host are cleaned as a whole, host included.
    """
    host = domain.strip().replace("https://", "").replace("http://", "").strip("/")
    if host.lower().startswith("www."):
        host = host[4:]

    remainder = url.strip()
    if host:
        # Anchor on "/" or end-of-string so "example.com" does not match
        # "example.community".
        site_root = r"^https?://(?:www\.)?" + re.escape(host) + r"(?:/|$)"
        remainder = re.sub(site_root, "", remainder, count=1, flags=re.IGNORECASE)

    # "_" counts as a word character, so it has to be listed explicitly.
    text = re.sub(r"[^\w\s]|_", " ", remainder)
    return text.strip()
74
+
75
def compute_embeddings(data):
    """Add a unit-length embedding for every row's ``"Cleaned Text"``.

    All texts are encoded in a single ``model.encode`` call rather than one
    call per row, then each vector is divided by its Euclidean norm.

    Args:
        data: DataFrame with a ``"Cleaned Text"`` column. It is modified in
            place: an ``"Embedding"`` column holding one 1-D array per row is
            added (or overwritten).

    Returns:
        The same DataFrame object that was passed in.
    """
    texts = data["Cleaned Text"].tolist()

    unit_vectors = []
    if texts:
        vectors = np.asarray(model.encode(texts))
        lengths = np.linalg.norm(vectors, axis=1, keepdims=True)
        # Leave an all-zero vector as zeros instead of producing NaNs.
        lengths[lengths == 0] = 1.0
        unit_vectors = list(vectors / lengths)

    # An explicit object Series keeps one array per cell and preserves the
    # DataFrame's own index.
    data["Embedding"] = pd.Series(unit_vectors, index=data.index, dtype=object)
    return data
80
+
81
def calculate_site_focus_and_radius(embeddings):
    """Compute the site focus score and site radius for a set of page vectors.

    The centroid is the element-wise mean of all embeddings. Each page's
    deviation is ``1 - cosine_similarity(page, centroid)``; the site radius is
    the mean deviation and the focus score is ``1 - radius`` floored at 0.

    Args:
        embeddings: Non-empty 2-D array-like with one row per page.

    Returns:
        A tuple ``(site_focus_score, site_radius, centroid_embedding,
        deviations)`` where ``deviations`` is a list with one float per page,
        in input order.

    Raises:
        ValueError: If ``embeddings`` is empty or not two-dimensional.
    """
    matrix = np.asarray(embeddings)
    if matrix.ndim != 2 or matrix.shape[0] == 0:
        raise ValueError("embeddings must be a non-empty 2-D array of page vectors")

    centroid_embedding = matrix.mean(axis=0)

    # Cosine similarity of every row against the centroid in one pass.
    denominators = np.linalg.norm(matrix, axis=1) * np.linalg.norm(centroid_embedding)
    # A zero vector has similarity 0 to everything (its dot product is 0);
    # replacing the zero denominator avoids a 0/0 NaN.
    denominators[denominators == 0] = 1.0
    similarities = (matrix @ centroid_embedding) / denominators

    deviations = (1 - similarities).tolist()
    site_radius = float(np.mean(deviations))
    site_focus_score = max(0, 1 - site_radius)
    return site_focus_score, site_radius, centroid_embedding, deviations
88
+
89
def plot_gradient_strip_with_indicator(score, title, cmap="RdYlGn_r"):
    """Render *score* as a dashed marker on a horizontal colour gradient.

    Args:
        score: Value expected in ``[0, 1]``. Values outside that range are
            clamped for the marker position only; the title shows the raw
            value as a percentage.
        title: Label shown before the percentage in the plot title.
        cmap: Matplotlib colormap name for the strip. The default
            ``"RdYlGn_r"`` runs green (0) to red (1), i.e. "lower is better";
            pass ``"RdYlGn"`` for a metric where higher is better.
    """
    fig, ax = plt.subplots(figsize=(8, 1))
    try:
        gradient = np.linspace(0, 1, 256).reshape(1, -1)
        gradient = np.vstack((gradient, gradient))
        ax.imshow(gradient, aspect="auto", cmap=cmap)

        # Pixel centres run from 0 to 255, so scale by width - 1; scaling by
        # 256 would push a score of 1.0 just outside the image.
        marker_x = float(np.clip(score, 0.0, 1.0)) * (gradient.shape[1] - 1)
        ax.axvline(x=marker_x, color="black", linestyle="--", linewidth=2)

        ax.set_axis_off()
        ax.set_title(f"{title}: {score * 100:.2f}%")
        # Hand Streamlit the explicit figure rather than the global pyplot
        # module.
        st.pyplot(fig)
    finally:
        # Streamlit reruns the script often; close the figure so they do not
        # pile up in memory.
        plt.close(fig)
100
+
101
def plot_3d_tsne(embeddings, urls, centroid, deviations):
    """Show an interactive 3-D t-SNE scatter plot of the pages and the centroid.

    The centroid is appended to the page embeddings before fitting so that it
    is projected into the same space, then drawn as a separate green marker.

    Args:
        embeddings: 2-D array with one row per page.
        urls: Page URLs, used as hover labels (same order as ``embeddings``).
        centroid: Centroid vector with the same width as an embedding row.
        deviations: Per-page deviation from the centroid, used for colouring.
    """
    page_count = len(embeddings)
    if page_count < 2:
        # min(30, page_count - 1) would give a perplexity of 0 here, which
        # t-SNE cannot use.
        st.info("At least two pages are needed to build the 3D t-SNE projection.")
        return

    tsne = TSNE(n_components=3, random_state=42, perplexity=min(30, page_count - 1))
    projected = tsne.fit_transform(np.vstack([embeddings, centroid]))
    centroid_point = projected[-1]  # Last row is the appended centroid.
    page_points = projected[:-1]  # Everything before it is a page.

    fig = px.scatter_3d(
        x=page_points[:, 0],
        y=page_points[:, 1],
        z=page_points[:, 2],
        color=deviations,
        color_continuous_scale="RdYlGn_r",
        hover_name=urls,
        labels={"color": "Deviation"},
        title="3D t-SNE Projection of Page Embeddings",
    )
    fig.add_scatter3d(
        x=[centroid_point[0]],
        y=[centroid_point[1]],
        z=[centroid_point[2]],
        mode="markers",
        marker=dict(size=15, color="green"),
        name="Centroid",
    )
    st.plotly_chart(fig)
127
+
128
def plot_spherical_distances_optimized(deviations, embeddings, urls):
    """Show each page's deviation from the centroid on a polar scatter plot.

    The radius of a point is the page's deviation. Angles carry no meaning:
    pages are simply spread evenly around the circle in input order.

    Args:
        deviations: Per-page deviation from the centroid.
        embeddings: Accepted for interface compatibility; not used by the
            plot.
        urls: Page URLs, shown as hover text (same order as ``deviations``).
    """
    num_points = len(deviations)
    # Spread the points evenly around the circle.
    angles = np.linspace(0, 2 * np.pi, num_points, endpoint=False)

    fig = px.scatter_polar(
        r=deviations,
        theta=np.degrees(angles),
        color=deviations,
        color_continuous_scale="RdYlGn_r",
        title="Optimized Spherical Plot of Page Distances from Centroid",
        labels={"color": "Deviation"},
    )
    # Markers only; the URL appears on hover.
    fig.update_traces(
        mode="markers",
        hovertemplate="%{text}<extra></extra>",
        text=urls,
    )
    st.plotly_chart(fig)
151
+
152
# Streamlit Interface
# Flow: read a domain, collect its sitemap URLs, embed the URL text, then
# report focus/radius metrics and the supporting visualisations.
st.title("SiteFocus Tool")

domain = st.text_input("Enter domain:", placeholder="example.com")

if st.button("START"):
    if domain:
        urls = fetch_sitemap_urls(domain)
        if not urls:
            st.error("No URLs found. Please check the domain and try again.")
        else:
            cleaned_texts = [clean_text_from_url(url, domain) for url in urls]

            # Encode every text once, in a single batch, then scale each row
            # to unit length.
            raw_embeddings = np.asarray(model.encode(cleaned_texts))
            lengths = norm(raw_embeddings, axis=1, keepdims=True)
            lengths[lengths == 0] = 1.0  # Keep an all-zero vector as zeros, not NaN.
            embeddings = raw_embeddings / lengths

            site_focus_score, site_radius, centroid, deviations = calculate_site_focus_and_radius(embeddings)

            # Visualize siteFocusScore
            st.subheader("siteFocusScore")
            st.markdown("**Description:** The siteFocusScore reflects how tightly aligned a site's content is to a single thematic area. A higher score indicates greater thematic focus, which can improve topical authority in SEO.")
            plot_gradient_strip_with_indicator(site_focus_score, "siteFocusScore")

            # Visualize siteRadius
            st.subheader("siteRadius")
            st.markdown("**Description:** The siteRadius measures how far individual pages deviate from the site's central theme. A smaller radius indicates higher consistency across the site, which is beneficial for SEO.")
            plot_gradient_strip_with_indicator(site_radius, "siteRadius")

            # Sorted dataframe by closeness to centroid.
            # 1 - deviation is the cosine similarity to the centroid, so the
            # column is labelled as a similarity and sorted highest first.
            st.subheader("Pages Closest to Centroid")
            similarities = [1 - dev for dev in deviations]
            df = pd.DataFrame({"URL": urls, "Similarity to Centroid": similarities})
            df_sorted = df.sort_values(by="Similarity to Centroid", ascending=False)
            st.dataframe(df_sorted)

            # Interactive 3D t-SNE plot
            st.subheader("3D t-SNE Projection")
            plot_3d_tsne(embeddings, urls, centroid, deviations)

            # Optimized spherical distance plot
            st.subheader("Spherical Distance Plot")
            plot_spherical_distances_optimized(deviations, embeddings, urls)
    else:
        # Give feedback instead of silently ignoring the click.
        st.warning("Please enter a domain.")