Spaces:
Runtime error
Runtime error
Synced repo using 'sync_with_huggingface' Github Action
Browse files- pages/1_URLs.py +85 -41
- styles.css +3 -0
- utils/__pycache__/footer.cpython-38.pyc +0 -0
pages/1_URLs.py
CHANGED
@@ -25,13 +25,17 @@ def check_sitemap(url):
|
|
25 |
# Check for sitemap-specific elements
|
26 |
if xml_content.tag == 'urlset' or xml_content.tag == 'sitemapindex':
|
27 |
return True
|
28 |
-
except
|
29 |
-
|
30 |
-
|
31 |
# Additional conditions for identifying sitemaps
|
32 |
-
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
return False
|
37 |
|
@@ -152,29 +156,35 @@ def process_urls(sitemap_urls , category):
|
|
152 |
# function to process for a single URL
|
153 |
def run_function(url , category):
|
154 |
extracted_txt = ""
|
155 |
-
# Check if the user has provided a URL
|
156 |
-
if url:
|
157 |
-
if valid_url(url):
|
158 |
-
temp_para = extract_data_from_url_(url)
|
159 |
-
temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
|
160 |
-
extracted_txt = temp_txt_data
|
161 |
-
extracted_jsonl = {"text": str(temp_para), "url":str(url) , "category": category , "timestamp": str(datetime.datetime.now())}
|
162 |
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
|
169 |
-
|
170 |
-
|
|
|
|
|
171 |
else:
|
|
|
|
|
172 |
return False, None, None
|
173 |
-
|
174 |
-
st.error("
|
175 |
-
# return extract status, and the data extracted
|
176 |
-
return False, None, None
|
177 |
|
|
|
178 |
|
179 |
|
180 |
def main():
|
@@ -314,23 +324,26 @@ def main():
|
|
314 |
save_as_json = st.checkbox("jsonl", value=False)
|
315 |
|
316 |
if not save_as_txt and not save_as_json:
|
317 |
-
|
318 |
-
|
319 |
-
st.
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
|
|
|
|
|
|
|
|
328 |
col1, col2 = st.columns([0.5, 0.5])
|
329 |
# save column
|
330 |
with col1:
|
331 |
|
332 |
if is_a_sitemap:
|
333 |
-
|
334 |
if save_as_txt:
|
335 |
if st.download_button(label="Save as txt",data=st.session_state.sitemap_data_text ):
|
336 |
saved_successfully = True
|
@@ -357,7 +370,34 @@ def main():
|
|
357 |
del st.session_state['sitemap_data_jsonl']
|
358 |
st.session_state.button_enter_url = False
|
359 |
st.experimental_rerun()
|
360 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
361 |
if saved_successfully:
|
362 |
# Confirmation message
|
363 |
st.success(f"File saved successfully.")
|
@@ -365,10 +405,14 @@ def main():
|
|
365 |
st.write("#")
|
366 |
else:
|
367 |
st.warning("Data not extracted")
|
368 |
-
|
369 |
-
|
370 |
-
st.
|
371 |
-
|
|
|
|
|
|
|
|
|
372 |
st.write("#")
|
373 |
st.write("#")
|
374 |
|
|
|
25 |
# Check for sitemap-specific elements
|
26 |
if xml_content.tag == 'urlset' or xml_content.tag == 'sitemapindex':
|
27 |
return True
|
28 |
+
except Exception as e:
|
29 |
+
st.error("Invalid sitemap!!")
|
|
|
30 |
# Additional conditions for identifying sitemaps
|
31 |
+
elif 'sitemap' in url.lower():
|
32 |
+
try:
|
33 |
+
response = requests.get(url)
|
34 |
+
# Perform additional checks specific to the website's structure or naming conventions
|
35 |
+
return True
|
36 |
+
except Exception as e:
|
37 |
+
# st.error("Invalid sitemap!!")
|
38 |
+
pass
|
39 |
|
40 |
return False
|
41 |
|
|
|
156 |
# function to process for a single URL
|
157 |
def run_function(url , category):
|
158 |
extracted_txt = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
|
160 |
+
try:
|
161 |
+
response = requests.get(url)
|
162 |
+
# Check if the user has provided a URL
|
163 |
+
if url:
|
164 |
+
if valid_url(url):
|
165 |
+
temp_para = extract_data_from_url_(url)
|
166 |
+
temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
|
167 |
+
extracted_txt = temp_txt_data
|
168 |
+
extracted_jsonl = {"text": str(temp_para), "url":str(url) , "category": category , "timestamp": str(datetime.datetime.now())}
|
169 |
+
|
170 |
+
# displaying extracted txt for single URL
|
171 |
+
st.text_area("Extracted Text", value=extracted_txt, height=200)
|
172 |
+
|
173 |
+
|
174 |
+
extracted_jsonl = json.dumps(extracted_jsonl, ensure_ascii=False)
|
175 |
|
176 |
+
# return extract status, and the data extracted
|
177 |
+
return True, extracted_txt, extracted_jsonl
|
178 |
+
else:
|
179 |
+
return False, None, None
|
180 |
else:
|
181 |
+
st.error("Error: An error occurred while fetching content.")
|
182 |
+
# return extract status, and the data extracted
|
183 |
return False, None, None
|
184 |
+
except Exception as e:
|
185 |
+
st.error("Invalid URL")
|
|
|
|
|
186 |
|
187 |
+
return False, None, None
|
188 |
|
189 |
|
190 |
def main():
|
|
|
324 |
save_as_json = st.checkbox("jsonl", value=False)
|
325 |
|
326 |
if not save_as_txt and not save_as_json:
|
327 |
+
clear_c1, clear_c2 = st.columns([0.5,0.5])
|
328 |
+
with clear_c1:
|
329 |
+
if st.button("Clear"):
|
330 |
+
st.session_state.button_enter_url = False
|
331 |
+
st.session_state.Initial = True
|
332 |
+
st.session_state.extracted_url = False
|
333 |
+
if 'sitemap_data_text' in st.session_state:
|
334 |
+
del st.session_state['sitemap_data_text']
|
335 |
+
if 'sitemap_data_jsonl' in st.session_state:
|
336 |
+
del st.session_state['sitemap_data_jsonl']
|
337 |
+
st.session_state.button_enter_url = False
|
338 |
+
st.experimental_rerun()
|
339 |
+
with clear_c2:
|
340 |
+
print()
|
341 |
+
elif (save_as_txt and not save_as_json) or (save_as_json and not save_as_txt):
|
342 |
col1, col2 = st.columns([0.5, 0.5])
|
343 |
# save column
|
344 |
with col1:
|
345 |
|
346 |
if is_a_sitemap:
|
|
|
347 |
if save_as_txt:
|
348 |
if st.download_button(label="Save as txt",data=st.session_state.sitemap_data_text ):
|
349 |
saved_successfully = True
|
|
|
370 |
del st.session_state['sitemap_data_jsonl']
|
371 |
st.session_state.button_enter_url = False
|
372 |
st.experimental_rerun()
|
373 |
+
elif save_as_txt and save_as_json:
|
374 |
+
savetxt_c1,saveJson_c2,clear_c3 = st.columns([0.25,0.25,0.5])
|
375 |
+
with savetxt_c1:
|
376 |
+
if is_a_sitemap:
|
377 |
+
if st.download_button(label="Save as txt",data=st.session_state.sitemap_data_text ):
|
378 |
+
saved_successfully = True
|
379 |
+
else:
|
380 |
+
if st.download_button(label="Save as txt",data=data_txt ):
|
381 |
+
saved_successfully = True
|
382 |
+
with saveJson_c2:
|
383 |
+
if is_a_sitemap:
|
384 |
+
if st.download_button(label="Save as jsonl", data=st.session_state.sitemap_data_jsonl, mime="application/json"):
|
385 |
+
saved_successfully = True
|
386 |
+
else:
|
387 |
+
if save_as_json:
|
388 |
+
if st.download_button(label="Save as jsonl", data=data_jsonl, mime="application/json"):
|
389 |
+
saved_successfully = True
|
390 |
+
with clear_c3:
|
391 |
+
if st.button("Clear"):
|
392 |
+
st.session_state.button_enter_url = False
|
393 |
+
st.session_state.Initial = True
|
394 |
+
st.session_state.extracted_url = False
|
395 |
+
if 'sitemap_data_text' in st.session_state:
|
396 |
+
del st.session_state['sitemap_data_text']
|
397 |
+
if 'sitemap_data_jsonl' in st.session_state:
|
398 |
+
del st.session_state['sitemap_data_jsonl']
|
399 |
+
st.session_state.button_enter_url = False
|
400 |
+
st.experimental_rerun()
|
401 |
if saved_successfully:
|
402 |
# Confirmation message
|
403 |
st.success(f"File saved successfully.")
|
|
|
405 |
st.write("#")
|
406 |
else:
|
407 |
st.warning("Data not extracted")
|
408 |
+
notextracted_c1,notextracted_c2 = st.columns([0.5,0.5])
|
409 |
+
with notextracted_c1:
|
410 |
+
if st.button("clear"):
|
411 |
+
st.session_state.button_enter_url = False
|
412 |
+
st.session_state.extracted_url = False
|
413 |
+
st.experimental_rerun()
|
414 |
+
with notextracted_c2:
|
415 |
+
print()
|
416 |
st.write("#")
|
417 |
st.write("#")
|
418 |
|
styles.css
CHANGED
@@ -25,6 +25,9 @@
|
|
25 |
margin: 0px;
|
26 |
}
|
27 |
|
|
|
|
|
|
|
28 |
/* #root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) {
|
29 |
background-color: #3498db;
|
30 |
padding: 5px;
|
|
|
25 |
margin: 0px;
|
26 |
}
|
27 |
|
28 |
+
button.css-1oz26th.edgvbvh10 {
|
29 |
+
width: 100%;
|
30 |
+
}
|
31 |
/* #root > div:nth-child(1) > div.withScreencast > div > div > div > section > div:nth-child(1) {
|
32 |
background-color: #3498db;
|
33 |
padding: 5px;
|
utils/__pycache__/footer.cpython-38.pyc
CHANGED
Binary files a/utils/__pycache__/footer.cpython-38.pyc and b/utils/__pycache__/footer.cpython-38.pyc differ
|
|