supunTE committed
Commit 814b935 · 1 Parent(s): f2fe0c9

create streamlit app

Files changed (3)
  1. app.py +286 -0
  2. requirements.txt +132 -0
  3. scrape-content.ipynb +3 -1
app.py ADDED
@@ -0,0 +1,286 @@
+import base64
+import copy
+import json
+from collections import Counter
+from urllib.parse import urljoin
+
+import streamlit as st
+from bs4 import BeautifulSoup
+
+
+def remove_svg_elements(element):
+    """
+    Remove all SVG elements from a BeautifulSoup element.
+    Returns a copy of the element with SVGs removed.
+    """
+    # Create a copy of the element to avoid modifying the original
+    element_copy = copy.copy(element)
+
+    # Find and remove all SVG elements
+    if hasattr(element_copy, 'find_all'):
+        svg_elements = element_copy.find_all('svg')
+        for svg in svg_elements:
+            svg.decompose()
+
+    return element_copy
+
+def get_element_signature(element):
+    """
+    Create a signature for an element based on its structure.
+    """
+    signature = {
+        'tag': element.name,
+        'classes': tuple(sorted(element.get('class', []))),
+        'child_tags': tuple(sorted(child.name for child in element.find_all(recursive=False) if child.name)),
+        'has_image': bool(element.find('img')),
+        'has_price': bool(any(c in element.get_text() for c in '$€£¥')),
+        'has_link': bool(element.find('a')),
+    }
+    return str(signature)
+
+def analyze_children_similarity(element):
+    """
+    Analyze how similar the direct children of an element are.
+    """
+    if not element.contents:
+        return 0, 0
+
+    child_signatures = [
+        get_element_signature(child)
+        for child in element.find_all(recursive=False)
+        if child.name
+    ]
+
+    if not child_signatures:
+        return 0, 0
+
+    signature_counts = Counter(child_signatures)
+    most_common_sig, most_common_count = signature_counts.most_common(1)[0]
+    similarity_score = most_common_count / len(child_signatures)
+
+    return similarity_score, most_common_count
+
+def count_images_in_element(element):
+    """
+    Count all images within an element, including nested ones.
+    """
+    return len(element.find_all('img', recursive=True))
+
+def get_element_identifier(element):
+    """
+    Create a unique identifier for an element including tag and classes.
+    """
+    identifier = element.name
+    if element.get('class'):
+        identifier += f" .{' .'.join(element['class'])}"
+    if element.get('id'):
+        identifier += f" #{element['id']}"
+    return identifier
+
+def convert_relative_urls(soup, base_url):
+    """
+    Convert all relative URLs in the soup object to absolute URLs.
+    """
+    for tag in soup.find_all(href=True):
+        tag['href'] = urljoin(base_url, tag['href'])
+    for tag in soup.find_all(src=True):
+        tag['src'] = urljoin(base_url, tag['src'])
+    for tag in soup.find_all(attrs={'data-src': True}):
+        tag['data-src'] = urljoin(base_url, tag['data-src'])
+    return soup
+
+def find_image_rich_parents(html_content, base_url="", min_children=4, min_similarity=0.7):
+    """
+    Find elements containing images and return both sorted list and detailed top element info.
+    """
+    soup = BeautifulSoup(html_content, "html.parser")
+
+    # Convert relative URLs to absolute if base_url is provided
+    if base_url:
+        soup = convert_relative_urls(soup, base_url)
+
+    # Collect potential container elements with their scores
+    elements_with_scores = []
+    for element in soup.find_all():
+        if element.name in ['div', 'ul', 'section', 'main']:
+            similarity_score, similar_children_count = analyze_children_similarity(element)
+            image_count = count_images_in_element(element)
+
+            if similar_children_count >= min_children and similarity_score >= min_similarity and image_count > 0:
+                # Count products (direct children with images)
+                products_count = len([child for child in element.find_all(recursive=False)
+                                      if child.name and child.find('img', recursive=True)])
+
+                combined_score = (similarity_score * similar_children_count * image_count)
+                elements_with_scores.append((element, image_count, combined_score, products_count))
+
+    if not elements_with_scores:
+        return [], {"error": "No elements with images found"}, ""
+
+    # Sort by combined score
+    elements_with_scores.sort(key=lambda x: x[2], reverse=True)
+
+    # Process elements for sorted list output
+    sorted_elements = []
+    for element, image_count, _, products_count in elements_with_scores:
+        sorted_elements.append((get_element_identifier(element), image_count, products_count))
+
+    # Get top element (the one with the highest combined score)
+    top_element = elements_with_scores[0][0]
+
+    # Remove SVGs from the top element for HTML output
+    top_element_no_svg = remove_svg_elements(top_element)
+
+    # Separate child elements with images
+    products = []
+    for child in top_element_no_svg.find_all(recursive=False):
+        if child.name:  # Skip text nodes
+            # Remove SVGs from each product
+            child_no_svg = remove_svg_elements(child)
+            product_info = {
+                "html_content": str(child_no_svg),
+                "images": []
+            }
+
+            # Get all images within this product
+            for img in child_no_svg.find_all('img', recursive=True):
+                image_info = {
+                    "src": img.get('src', 'No source'),
+                    "alt": img.get('alt', 'No alt text')
+                }
+                product_info["images"].append(image_info)
+
+            products.append(product_info)
+
+    # Create result dictionary for top element
+    top_element_info = {
+        "parent": {
+            "tag": top_element_no_svg.name,
+            "identifier": get_element_identifier(top_element_no_svg),
+            "classes": top_element_no_svg.get('class', []),
+            "id": top_element_no_svg.get('id', None)
+        },
+        "products_count": len(products),
+        "products": products
+    }
+
+    html_output = str(top_element_no_svg)
+
+    return sorted_elements, top_element_info, html_output
+
+def get_download_link(content, filename, content_type="application/json"):
+    """Generate a download link for the given content."""
+    b64 = base64.b64encode(content.encode()).decode()
+    return f'<a href="data:{content_type};base64,{b64}" download="{filename}">Download {filename}</a>'
+
+def main():
+    st.title("HTML File Analyzer")
+    st.write("Upload HTML files to analyze their structure and find image-rich elements")
+
+    # File uploader allows multiple files
+    uploaded_files = st.file_uploader("Choose HTML files", accept_multiple_files=True, type=['html'])
+
+    if uploaded_files:
+        all_results = {}
+        all_html_outputs = {}
+
+        # Analysis parameters
+        col1, col2 = st.columns(2)
+        with col1:
+            min_children = st.slider("Minimum number of similar children", 1, 10, 4)
+        with col2:
+            min_similarity = st.slider("Minimum similarity score", 0.0, 1.0, 0.7)
+
+        # Generate button
+        if st.button("Generate Analysis"):
+            # Show processing message
+            with st.spinner('Processing files...'):
+                all_results = {}
+                all_html_outputs = {}
+
+                # Process each file
+                for uploaded_file in uploaded_files:
+                    st.subheader(f"Analysis for {uploaded_file.name}")
+
+                    try:
+                        # Read and process the file
+                        html_content = uploaded_file.read().decode('utf-8')
+                        sorted_elements, top_element_info, html_output = find_image_rich_parents(
+                            html_content,
+                            min_children=min_children,
+                            min_similarity=min_similarity
+                        )
+
+                        # Display results
+                        st.write("Elements containing images:")
+                        for element, img_count, prod_count in sorted_elements:
+                            st.write(f"- {element}: {img_count} images, {prod_count} products")
+
+                        # Store results
+                        all_results[uploaded_file.name] = top_element_info
+                        all_html_outputs[uploaded_file.name] = html_output
+
+                    except Exception as e:
+                        st.error(f"Error processing {uploaded_file.name}: {str(e)}")
+                        continue
+
+                # Create download buttons if we have results
+                if all_results:
+                    st.subheader("Download Results")
+                    col1, col2 = st.columns(2)
+
+                    # JSON download
+                    with col1:
+                        json_str = json.dumps(all_results, indent=2)
+                        st.markdown(get_download_link(json_str, 'analysis_results.json'),
+                                    unsafe_allow_html=True)
+
+                    # HTML download
+                    with col2:
+                        # Combine all HTML outputs with file names as headers
+                        combined_html = """
+                        <!DOCTYPE html>
+                        <html>
+                        <head>
+                        <meta charset='UTF-8'>
+                        <style>
+                            div {
+                                width: auto !important;
+                                height: auto !important;
+                                padding: 0 !important;
+                                margin: 0 !important;
+                            }
+                            img {
+                                width: 300px;
+                                height: 300px;
+                                object-fit: contain;
+                            }
+                            body { font-family: Arial, sans-serif; }
+                            .file-section { margin: 20px 0; }
+                            .file-header {
+                                background: #f0f0f0;
+                                padding: 10px;
+                                margin: 20px 0;
+                            }
+                        </style>
+                        </head>
+                        <body>
+                        """
+                        for filename, html in all_html_outputs.items():
+                            combined_html += f"""
+                            <div class="file-section">
+                                <h2 class="file-header">{filename}</h2>
+                                {html}
+                            </div>
+                            """
+                        combined_html += "</body></html>"
+
+                        st.markdown(get_download_link(combined_html, 'analysis_results.html', 'text/html'),
+                                    unsafe_allow_html=True)
+
+                # Success message
+                st.success("Analysis completed successfully!")
+
+
+if __name__ == "__main__":
+    main()
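
With the pinned dependencies from requirements.txt below installed, the new app can be launched locally with the standard Streamlit command:

streamlit run app.py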
requirements.txt ADDED
@@ -0,0 +1,132 @@
+altair==5.4.1
+anyio==4.6.2.post1
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+black==24.10.0
+bleach==6.1.0
+blinker==1.8.2
+bs4==0.0.2
+cachetools==5.5.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.4.0
+click==8.1.7
+comm==0.2.2
+debugpy==1.8.7
+decorator==5.1.1
+defusedxml==0.7.1
+executing==2.1.0
+fake-headers==1.0.2
+fastjsonschema==2.20.0
+fqdn==1.5.1
+gitdb==4.0.11
+GitPython==3.1.43
+h11==0.14.0
+html5lib==1.1
+httpcore==1.0.6
+httpx==0.27.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.28.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.1
+Jinja2==3.1.4
+json5==0.9.25
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jupyter==1.1.1
+jupyter-console==6.6.3
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mistune==3.0.2
+mypy-extensions==1.0.0
+narwhals==1.11.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook==7.2.2
+notebook_shim==0.2.4
+numpy==2.1.2
+outcome==1.3.0.post0
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+pathspec==0.12.1
+pexpect==4.9.0
+pillow==10.4.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.48
+protobuf==5.28.3
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pyarrow==17.0.0
+pycparser==2.22
+pydeck==0.9.1
+Pygments==2.18.0
+PySocks==1.7.1
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+pytz==2024.2
+PyYAML==6.0.2
+pyzmq==26.2.0
+referencing==0.35.1
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.3
+rpds-py==0.20.0
+selenium==4.25.0
+Send2Trash==1.8.3
+setuptools==75.2.0
+six==1.16.0
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+stack-data==0.6.3
+streamlit==1.39.0
+tenacity==9.0.0
+terminado==0.18.1
+tinycss2==1.4.0
+toml==0.10.2
+tornado==6.4.1
+traitlets==5.14.3
+trio==0.27.0
+trio-websocket==0.11.1
+types-python-dateutil==2.9.0.20241003
+typing_extensions==4.12.2
+tzdata==2024.2
+uri-template==1.3.0
+urllib3==2.2.3
+watchdog==5.0.3
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+websocket-client==1.8.0
+widgetsnbextension==4.0.13
+wsproto==1.2.0
scrape-content.ipynb CHANGED
@@ -207,6 +207,7 @@
     "    if not element.contents:\n",
     "        return 0, 0\n",
     "\n",
+    "    # Get signatures for all direct children that are elements (have a tag name)\n",
     "    child_signatures = [\n",
     "        get_element_signature(child)\n",
     "        for child in element.find_all(recursive=False)\n",
@@ -216,9 +217,10 @@
     "    if not child_signatures:\n",
     "        return 0, 0\n",
     "\n",
+    "    # Count how many times each signature appears and get the most common one\n",
     "    signature_counts = Counter(child_signatures)\n",
     "    most_common_sig, most_common_count = signature_counts.most_common(1)[0]\n",
-    "    similarity_score = most_common_count / len(child_signatures) if child_signatures else 0\n",
+    "    similarity_score = most_common_count / len(child_signatures)\n",
     "\n",
     "    return similarity_score, most_common_count\n",
     "\n",