create streamlit app
- app.py +286 -0
- requirements.txt +132 -0
- scrape-content.ipynb +3 -1
app.py
ADDED
@@ -0,0 +1,286 @@
import base64
import copy
import json
from collections import Counter
from urllib.parse import urljoin

import streamlit as st
from bs4 import BeautifulSoup


def remove_svg_elements(element):
    """
    Remove all SVG elements from a BeautifulSoup element.
    Returns a copy of the element with SVGs removed.
    """
    # Create a copy of the element to avoid modifying the original
    element_copy = copy.copy(element)

    # Find and remove all SVG elements
    if hasattr(element_copy, 'find_all'):
        svg_elements = element_copy.find_all('svg')
        for svg in svg_elements:
            svg.decompose()

    return element_copy

def get_element_signature(element):
    """
    Create a signature for an element based on its structure.
    """
    signature = {
        'tag': element.name,
        'classes': tuple(sorted(element.get('class', []))),
        'child_tags': tuple(sorted(child.name for child in element.find_all(recursive=False) if child.name)),
        'has_image': bool(element.find('img')),
        'has_price': bool(any(c in element.get_text() for c in '$€£¥')),
        'has_link': bool(element.find('a')),
    }
    return str(signature)

def analyze_children_similarity(element):
    """
    Analyze how similar the direct children of an element are.
    """
    if not element.contents:
        return 0, 0

    child_signatures = [
        get_element_signature(child)
        for child in element.find_all(recursive=False)
        if child.name
    ]

    if not child_signatures:
        return 0, 0

    signature_counts = Counter(child_signatures)
    most_common_sig, most_common_count = signature_counts.most_common(1)[0]
    similarity_score = most_common_count / len(child_signatures)

    return similarity_score, most_common_count

def count_images_in_element(element):
    """
    Count all images within an element, including nested ones.
    """
    return len(element.find_all('img', recursive=True))

def get_element_identifier(element):
    """
    Create a unique identifier for an element including tag and classes.
    """
    identifier = element.name
    if element.get('class'):
        identifier += f" .{' .'.join(element['class'])}"
    if element.get('id'):
        identifier += f" #{element['id']}"
    return identifier

def convert_relative_urls(soup, base_url):
    """
    Convert all relative URLs in the soup object to absolute URLs.
    """
    for tag in soup.find_all(href=True):
        tag['href'] = urljoin(base_url, tag['href'])
    for tag in soup.find_all(src=True):
        tag['src'] = urljoin(base_url, tag['src'])
    for tag in soup.find_all(attrs={'data-src': True}):
        tag['data-src'] = urljoin(base_url, tag['data-src'])
    return soup

def find_image_rich_parents(html_content, base_url="", min_children=4, min_similarity=0.7):
    """
    Find elements containing images and return both sorted list and detailed top element info.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Convert relative URLs to absolute if base_url is provided
    if base_url:
        soup = convert_relative_urls(soup, base_url)

    # Collect potential container elements with their scores
    elements_with_scores = []
    for element in soup.find_all():
        if element.name in ['div', 'ul', 'section', 'main']:
            similarity_score, similar_children_count = analyze_children_similarity(element)
            image_count = count_images_in_element(element)

            if similar_children_count >= min_children and similarity_score >= min_similarity and image_count > 0:
                # Count products (direct children with images)
                products_count = len([child for child in element.find_all(recursive=False)
                                      if child.name and child.find('img', recursive=True)])

                combined_score = (similarity_score * similar_children_count * image_count)
                elements_with_scores.append((element, image_count, combined_score, products_count))

    if not elements_with_scores:
        return [], {"error": "No elements with images found"}, ""

    # Sort by combined score
    elements_with_scores.sort(key=lambda x: x[2], reverse=True)

    # Process elements for sorted list output
    sorted_elements = []
    for element, image_count, _, products_count in elements_with_scores:
        sorted_elements.append((get_element_identifier(element), image_count, products_count))

    # Get top element (one with highest combined score)
    top_element = elements_with_scores[0][0]

    # Remove SVGs from the top element for HTML output
    top_element_no_svg = remove_svg_elements(top_element)

    # Separate child elements with images
    products = []
    for child in top_element_no_svg.find_all(recursive=False):
        if child.name:  # Skip text nodes
            # Remove SVGs from each product
            child_no_svg = remove_svg_elements(child)
            product_info = {
                "html_content": str(child_no_svg),
                "images": []
            }

            # Get all images within this product
            for img in child_no_svg.find_all('img', recursive=True):
                image_info = {
                    "src": img.get('src', 'No source'),
                    "alt": img.get('alt', 'No alt text')
                }
                product_info["images"].append(image_info)

            products.append(product_info)

    # Create result dictionary for top element
    top_element_info = {
        "parent": {
            "tag": top_element_no_svg.name,
            "identifier": get_element_identifier(top_element_no_svg),
            "classes": top_element_no_svg.get('class', []),
            "id": top_element_no_svg.get('id', None)
        },
        "products_count": len(products),
        "products": products
    }

    html_output = str(top_element_no_svg)

    return sorted_elements, top_element_info, html_output

def get_download_link(content, filename, content_type="file/json"):
    """Generate a download link for the given content"""
    b64 = base64.b64encode(content.encode()).decode()
    return f'<a href="data:{content_type};base64,{b64}" download="{filename}">Download {filename}</a>'

def main():
    st.title("HTML File Analyzer")
    st.write("Upload HTML files to analyze their structure and find image-rich elements")

    # File uploader allows multiple files
    uploaded_files = st.file_uploader("Choose HTML files", accept_multiple_files=True, type=['html'])

    if uploaded_files:
        all_results = {}
        all_html_outputs = {}

        # Analysis parameters
        col1, col2 = st.columns(2)
        with col1:
            min_children = st.slider("Minimum number of similar children", 1, 10, 4)
        with col2:
            min_similarity = st.slider("Minimum similarity score", 0.0, 1.0, 0.7)

        # Generate button
        if st.button("Generate Analysis"):
            # Show processing message
            with st.spinner('Processing files...'):
                all_results = {}
                all_html_outputs = {}

                # Process each file
                for uploaded_file in uploaded_files:
                    st.subheader(f"Analysis for {uploaded_file.name}")

                    try:
                        # Read and process the file
                        html_content = uploaded_file.read().decode('utf-8')
                        sorted_elements, top_element_info, html_output = find_image_rich_parents(
                            html_content,
                            min_children=min_children,
                            min_similarity=min_similarity
                        )

                        # Display results
                        st.write("Elements containing images:")
                        for element, img_count, prod_count in sorted_elements:
                            st.write(f"- {element}: {img_count} images, {prod_count} products")

                        # Store results
                        all_results[uploaded_file.name] = top_element_info
                        all_html_outputs[uploaded_file.name] = html_output

                    except Exception as e:
                        st.error(f"Error processing {uploaded_file.name}: {str(e)}")
                        continue

                # Create download buttons if we have results
                if all_results:
                    st.subheader("Download Results")
                    col1, col2 = st.columns(2)

                    # JSON download
                    with col1:
                        json_str = json.dumps(all_results, indent=2)
                        st.markdown(get_download_link(json_str, 'analysis_results.json'),
                                    unsafe_allow_html=True)

                    # HTML download
                    with col2:
                        # Combine all HTML outputs with file names as headers
                        combined_html = """
                        <!DOCTYPE html>
                        <html>
                        <head>
                            <meta charset='UTF-8'>
                            <style>
                                div {
                                    width: auto !important;
                                    height: auto !important;
                                    padding: 0 !important;
                                    margin: 0 !important;
                                }
                                img {
                                    width: 300px;
                                    height: 300px;
                                    object-fit: contain;
                                }
                                body { font-family: Arial, sans-serif; }
                                .file-section { margin: 20px 0; }
                                .file-header {
                                    background: #f0f0f0;
                                    padding: 10px;
                                    margin: 20px 0;
                                }
                            </style>
                        </head>
                        <body>
                        """
                        for filename, html in all_html_outputs.items():
                            combined_html += f"""
                            <div class="file-section">
                                <h2 class="file-header">{filename}</h2>
                                {html}
                            </div>
                            """
                        combined_html += "</body></html>"

                        st.markdown(get_download_link(combined_html, 'analysis_results.html', 'text/html'),
                                    unsafe_allow_html=True)

            # Success message
            st.success("Analysis completed successfully!")


if __name__ == "__main__":
    main()
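
For reference, the analysis can also be exercised outside the Streamlit UI (the app itself is launched with streamlit run app.py). A minimal sketch, assuming app.py is importable from the working directory and using a made-up four-card product grid; the expected outputs in the comments follow from the scoring logic above:

    from app import find_image_rich_parents

    sample_html = """
    <div class="grid">
      <div class="card"><a href="/p/1"><img src="/img/1.jpg" alt="Item 1"></a><span>$10</span></div>
      <div class="card"><a href="/p/2"><img src="/img/2.jpg" alt="Item 2"></a><span>$12</span></div>
      <div class="card"><a href="/p/3"><img src="/img/3.jpg" alt="Item 3"></a><span>$15</span></div>
      <div class="card"><a href="/p/4"><img src="/img/4.jpg" alt="Item 4"></a><span>$9</span></div>
    </div>
    """

    # Four structurally identical children with images satisfy the defaults
    # (min_children=4, min_similarity=0.7), so the grid is picked as the top element.
    sorted_elements, top_info, html_out = find_image_rich_parents(sample_html, base_url="https://example.com")
    print(sorted_elements)               # [('div .grid', 4, 4)]
    print(top_info["products_count"])    # 4
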
requirements.txt
ADDED
@@ -0,0 +1,132 @@
altair==5.4.1
anyio==4.6.2.post1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
asttokens==2.4.1
async-lru==2.0.4
attrs==24.2.0
babel==2.16.0
beautifulsoup4==4.12.3
black==24.10.0
bleach==6.1.0
blinker==1.8.2
bs4==0.0.2
cachetools==5.5.0
certifi==2024.8.30
cffi==1.17.1
charset-normalizer==3.4.0
click==8.1.7
comm==0.2.2
debugpy==1.8.7
decorator==5.1.1
defusedxml==0.7.1
executing==2.1.0
fake-headers==1.0.2
fastjsonschema==2.20.0
fqdn==1.5.1
gitdb==4.0.11
GitPython==3.1.43
h11==0.14.0
html5lib==1.1
httpcore==1.0.6
httpx==0.27.2
idna==3.10
ipykernel==6.29.5
ipython==8.28.0
ipywidgets==8.1.5
isoduration==20.11.0
jedi==0.19.1
Jinja2==3.1.4
json5==0.9.25
jsonpointer==3.0.0
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
jupyter==1.1.1
jupyter-console==6.6.3
jupyter-events==0.10.0
jupyter-lsp==2.2.5
jupyter_client==8.6.3
jupyter_core==5.7.2
jupyter_server==2.14.2
jupyter_server_terminals==0.5.3
jupyterlab==4.2.5
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
jupyterlab_widgets==3.0.13
markdown-it-py==3.0.0
MarkupSafe==3.0.2
matplotlib-inline==0.1.7
mdurl==0.1.2
mistune==3.0.2
mypy-extensions==1.0.0
narwhals==1.11.0
nbclient==0.10.0
nbconvert==7.16.4
nbformat==5.10.4
nest-asyncio==1.6.0
notebook==7.2.2
notebook_shim==0.2.4
numpy==2.1.2
outcome==1.3.0.post0
overrides==7.7.0
packaging==24.1
pandas==2.2.3
pandocfilters==1.5.1
parso==0.8.4
pathspec==0.12.1
pexpect==4.9.0
pillow==10.4.0
platformdirs==4.3.6
prometheus_client==0.21.0
prompt_toolkit==3.0.48
protobuf==5.28.3
psutil==6.1.0
ptyprocess==0.7.0
pure_eval==0.2.3
pyarrow==17.0.0
pycparser==2.22
pydeck==0.9.1
Pygments==2.18.0
PySocks==1.7.1
python-dateutil==2.9.0.post0
python-json-logger==2.0.7
pytz==2024.2
PyYAML==6.0.2
pyzmq==26.2.0
referencing==0.35.1
requests==2.32.3
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rich==13.9.3
rpds-py==0.20.0
selenium==4.25.0
Send2Trash==1.8.3
setuptools==75.2.0
six==1.16.0
smmap==5.0.1
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.6
stack-data==0.6.3
streamlit==1.39.0
tenacity==9.0.0
terminado==0.18.1
tinycss2==1.4.0
toml==0.10.2
tornado==6.4.1
traitlets==5.14.3
trio==0.27.0
trio-websocket==0.11.1
types-python-dateutil==2.9.0.20241003
typing_extensions==4.12.2
tzdata==2024.2
uri-template==1.3.0
urllib3==2.2.3
watchdog==5.0.3
wcwidth==0.2.13
webcolors==24.8.0
webencodings==0.5.1
websocket-client==1.8.0
widgetsnbextension==4.0.13
wsproto==1.2.0
scrape-content.ipynb
CHANGED
@@ -207,6 +207,7 @@
 "    if not element.contents:\n",
 "        return 0, 0\n",
 "\n",
+"    # Get signatures for all direct children that are elements (have a tag name)\n",
 "    child_signatures = [\n",
 "        get_element_signature(child)\n",
 "        for child in element.find_all(recursive=False)\n",
@@ -216,9 +217,10 @@
 "    if not child_signatures:\n",
 "        return 0, 0\n",
 "\n",
+"    # Count how many times each signature appears and get the most common one\n",
 "    signature_counts = Counter(child_signatures)\n",
 "    most_common_sig, most_common_count = signature_counts.most_common(1)[0]\n",
-"    similarity_score = most_common_count / len(child_signatures)
+"    similarity_score = most_common_count / len(child_signatures)\n",
 "\n",
 "    return similarity_score, most_common_count\n",
 "\n",
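
For intuition, the similarity score computed in the patched notebook cell is simply the share of direct children that carry the most common structural signature. A toy sketch with made-up signature strings:

    from collections import Counter

    child_signatures = ["sig_card", "sig_card", "sig_card", "sig_banner"]
    signature_counts = Counter(child_signatures)
    most_common_sig, most_common_count = signature_counts.most_common(1)[0]
    similarity_score = most_common_count / len(child_signatures)
    print(similarity_score, most_common_count)  # 0.75 3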