import streamlit as st

# Set the page config to a centered layout (must run before any other Streamlit call)
st.set_page_config(layout="centered")

from utils.footer import cust_footer
from lxml import etree
import justext
import concurrent.futures
import datetime
import requests
from bs4 import BeautifulSoup
import json

# ----- FUNCTIONS -----
# Check whether the given URL points to a sitemap
def check_sitemap(url):
    # Check the URL's ending
    if url.lower().endswith(('sitemap.xml', 'sitemap_index.xml', 'sitemap')):
        try:
            # Fetch and parse the content as XML
            response = requests.get(url)
            xml_content = etree.fromstring(response.content)
            # Check for sitemap-specific root elements; sitemap XML is usually
            # namespaced, so compare the local tag name rather than the raw tag
            if etree.QName(xml_content).localname in ('urlset', 'sitemapindex'):
                return True
        except Exception:
            st.error("Invalid sitemap!")
    # Additional condition for identifying sitemaps by name
    elif 'sitemap' in url.lower():
        try:
            requests.get(url)
            # Further checks specific to the site's structure or naming
            # conventions could go here
            return True
        except Exception:
            pass
    return False
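
# Minimal usage sketch (hypothetical URLs, for illustration only):
#   check_sitemap("https://example.com/sitemap.xml")  -> True when the XML root
#   is <urlset> or <sitemapindex>; ordinary page URLs return False.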
# Recursively extract page URLs from a sitemap, following nested sitemaps
def extract_urls_from_sitemaps(xml_url):
    # Make a GET request to the URL and parse the XML content
    response = requests.get(xml_url)
    soup = BeautifulSoup(response.text, 'xml')
    extracted_urls = []
    # Check whether the sitemap contains nested sitemaps
    sitemap_tags = soup.find_all('sitemap')
    if sitemap_tags:
        # Process nested sitemaps recursively
        for sitemap_tag in sitemap_tags:
            print("sitemap_tag: " + str(sitemap_tag))
            nested_url = sitemap_tag.find('loc').text
            print('nested_url:', nested_url)
            nested_urls = extract_urls_from_sitemaps(nested_url)
            extracted_urls.extend(nested_urls)
    else:
        # Extract URLs from the current sitemap, skipping non-page assets
        loc_tags = soup.find_all('loc')
        for loc_tag in loc_tags:
            url = loc_tag.text
            if url.endswith(('.pdf', '.jpg', '.jpeg')):
                print(f"url skipped because it is a {url.split('.')[-1]}")
            else:
                print('url:', url)
                extracted_urls.append(url)
    return extracted_urls
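
# Note on the recursion above: per the sitemap protocol, a <sitemapindex> root
# lists child sitemaps as <sitemap><loc>...</loc></sitemap> entries, which this
# function follows recursively, while a <urlset> root lists page URLs directly
# in <url><loc>...</loc></url> entries.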
# Check whether the entered URL is valid and reachable
def valid_url(url):
    try:
        # Make a GET request and report success only on HTTP 200
        response = requests.get(url)
        return response.status_code == 200
    except requests.exceptions.RequestException:
        return False
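
# A lighter-weight variant (hypothetical helper, not wired into the app): a
# HEAD request with a timeout avoids downloading the whole page just to test
# reachability. Sketch only; some servers reject HEAD requests.
def valid_url_head(url, timeout=5):
    try:
        # Follow redirects and report success only on HTTP 200
        response = requests.head(url, timeout=timeout, allow_redirects=True)
        return response.status_code == 200
    except requests.exceptions.RequestException:
        return False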
# Build a custom Odia stop-word list for justext
def custom_stoplist():
    odia_stopwords = [
        "ଏହି", "ଏକ", "ଏକାଉଣଟ", "ମୁଁ", "ମୋର", "ମୁଁ ନିଜେ", "ଆମେ", "ଆମର", "ଆମର", "ଆମେ ନିଜେ", "ତୁମେ", "ତୁମର", "ତୁମର",
        "ନିଜେ", "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର",
        "ନିଜେ", "ସେ", "ତାଙ୍କୁ", "ତାଙ୍କର", "ନିଜେ", "ଏହା", "ଏହାର", "ନିଜେ |", "ସେମାନେ", "ସେଗୁଡିକ", "ସେମାନଙ୍କର",
        "ସେମାନଙ୍କର", "ନିଜେ |", "କଣ", "ଯାହା", "କିଏ", "କାହାକୁ",
        "ଏହା", "ତାହା", "ଏଗୁଡ଼ିକ", "ସେଗୁଡ଼ିକ", "ମୁଁ", "ହେଉଛି", "ହେଉଛି |", "ଥିଲା", "ଥିଲା |", "ହୁଅ", "ହୋଇସାରିଛି |", "ହେବା",
        "ଅଛି", "ଅଛି", "ଥିଲା", "ଅଛି", "କର", "କରେ |",
        "କରିଛନ୍ତି", "କରିବା", "ଏବଂ", "କିନ୍ତୁ", "ଯଦି", "କିମ୍ବା", "କାରଣ", "ଯେପରି", "ପର୍ଯ୍ୟନ୍ତ", "ଯେତେବେଳେ", "ର", "ପାଇଁ",
        "ସହିତ", "ବିଷୟରେ", "ବିପକ୍ଷରେ", "ମଧ୍ୟରେ", "ଭିତରକୁ", "ମାଧ୍ୟମରେ",
        "ସମୟରେ", "ପୂର୍ବରୁ", "ପରେ", "ଉପରେ", "ନିମ୍ନରେ |", "କୁ", "ଠାରୁ", "ଅପ୍", "ତଳକୁ", "ଭିତରେ", "ବାହାରେ", "ଉପରେ", "ବନ୍ଦ",
        "ସମାପ୍ତ", "ତଳେ |", "ପୁନର୍ବାର", "ଆଗକୁ",
        "ତାପରେ", "ଥରେ |", "ଏଠାରେ", "ସେଠାରେ", "କେବେ", "କେଉଁଠାରେ", "କିପରି", "ସମସ୍ତ", "ଉଭୟ", "ପ୍ରତ୍ୟେକ", "ଅଳ୍ପ", "ଅଧିକ",
        "ଅଧିକାଂଶ", "ଅନ୍ୟ", "କେତେକ", "ଏହିପରି",
        "ନୁହେଁ |", "କେବଳ", "ନିଜର", "ସମାନ", "ତେଣୁ", "ଅପେକ୍ଷା", "ମଧ୍ୟ", "ବହୁତ", "କରିପାରିବେ |", "ଇଚ୍ଛା", "କେବଳ",
        "କରିବା ଉଚିତ", "ବର୍ତ୍ତମାନ"
    ]
    return frozenset(odia_stopwords)
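
# justext uses this stoplist to measure the stop-word density of each
# paragraph; the stopwords_low/stopwords_high thresholds passed to
# justext.justext() below then decide how aggressively low-density paragraphs
# are marked as boilerplate.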
# Extract the main text content of a page using justext
def extract_data_from_url_(url):
    response = requests.get(url)
    response.raise_for_status()
    page = response.content
    para = ""
    # Positional arguments map to justext's tuning knobs: length_low=70,
    # length_high=140, stopwords_low=0.0, stopwords_high=0.02,
    # max_link_density=0.5, max_heading_distance=150, no_headings=False
    paragraphs = justext.justext(page, custom_stoplist(), 70, 140, 0.0, 0.02, 0.5, 150, False)
    for paragraph in paragraphs:
        if not paragraph.is_boilerplate:
            para = para + '\n' + paragraph.text
    return para


sitemap_data = ""  # module-level buffer (currently unused)
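
# Usage sketch (hypothetical URL): returns the non-boilerplate paragraphs
# joined by newlines, e.g.
#   text = extract_data_from_url_("https://example.com/article")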
# Process a batch of URLs from a sitemap; returns (plain text, JSONL string)
def process_urls(sitemap_urls, category):
    extracted_txt = ""
    extracted_jsonl_list = []
    for url in sitemap_urls:
        if valid_url(url):
            print(url)
            # Use justext to extract the main text
            temp_para = extract_data_from_url_(url)
            temp_txt_data = '\n\nFrom url:' + url + '\n' + temp_para + '\n'
            temp_jsonl_data = {"text": temp_para, "url": url, "category": category,
                               "timestamp": str(datetime.datetime.now())}
            extracted_txt += temp_txt_data
            extracted_jsonl_list.append(temp_jsonl_data)
        else:
            st.error("Couldn't extract data from " + url)
    # Convert the list of records to a JSONL string
    extracted_jsonl_list_encoded = [json.dumps(data, ensure_ascii=False) for data in extracted_jsonl_list]
    extracted_jsonl = '\n'.join(extracted_jsonl_list_encoded)
    return extracted_txt, extracted_jsonl
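
# Each line of the JSONL string returned by process_urls is one JSON record
# (illustrative values only):
#   {"text": "...", "url": "https://example.com/page", "category": "News Articles", "timestamp": "2023-07-01 12:00:00.000000"}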
# Process a single URL; returns (success flag, text data, JSONL data)
def run_function(url, category):
    extracted_txt = ""
    try:
        # Check if the user has provided a URL
        if url:
            if valid_url(url):
                temp_para = extract_data_from_url_(url)
                extracted_txt = '\n\nFrom url:' + url + '\n' + temp_para + '\n'
                extracted_jsonl = {"text": str(temp_para), "url": str(url), "category": category,
                                   "timestamp": str(datetime.datetime.now())}
                # Display the extracted text for the single URL
                st.text_area("Extracted Text", value=extracted_txt, height=200)
                extracted_jsonl = json.dumps(extracted_jsonl, ensure_ascii=False)
                # Return the extraction status and the extracted data
                return True, extracted_txt, extracted_jsonl
            else:
                return False, None, None
        else:
            st.error("Error: No URL provided.")
            return False, None, None
    except Exception:
        st.error("Invalid URL")
        return False, None, None
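
# Hypothetical hardening sketch (not used by the app as written): retry
# transient network failures a few times before giving up on a URL.
def fetch_with_retries(url, attempts=3, timeout=10):
    last_exc = None
    for _ in range(attempts):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as exc:
            last_exc = exc
    # All attempts failed; re-raise the last error
    raise last_exc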
def main():
    st.subheader("Extract Data from URLs")
    category = st.selectbox(
        'Select a Category',
        ('News Articles', 'Poems', 'Magazines', 'Other'))
    # Divide the body into 2 columns for the URL input and the Enter button
    col1, col2 = st.columns([0.7, 0.3])
    with col1:
        url_or_xml = st.text_input(label='', placeholder="Enter URL")
        is_a_sitemap = check_sitemap(url_or_xml)
    with col2:
        st.write('##')
        if "button_enter_url" not in st.session_state:
            st.session_state.button_enter_url = False
        if st.button("Enter"):
            st.session_state.button_enter_url = True
    if "extracted_url" not in st.session_state:
        st.session_state.extracted_url = False
    data = ""
    # Handle the Enter button
    if st.session_state.button_enter_url:
        # Check whether the input is a sitemap or a single URL
        if is_a_sitemap:
            if "Initial" not in st.session_state:
                st.session_state.Initial = True
            # Only crawl on the initial run; later reruns reuse session state
            if st.session_state.Initial == True:
                xml = url_or_xml
                st.write("It is a sitemap")
                stored_sitemap_urls = extract_urls_from_sitemaps(xml)
                print('\nno. of urls: ', len(stored_sitemap_urls))
                st.write('no. of urls: {}'.format(len(stored_sitemap_urls)))
                if stored_sitemap_urls:
                    print(stored_sitemap_urls)
                    current_time = datetime.datetime.now()
                    print(current_time)
                    st.write(current_time)
                    num_threads = 16  # number of worker threads
                    # Calculate the slice size for each thread; with fewer URLs
                    # than threads, the early slices are empty and the last
                    # thread takes everything
                    split_size = len(stored_sitemap_urls) // num_threads
                    # Create a ThreadPoolExecutor with at most num_threads workers
                    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
                        futures = []
                        for i in range(num_threads):
                            start_index = i * split_size
                            # the last thread also takes any remainder
                            end_index = start_index + split_size if i != num_threads - 1 else None
                            temp_urls = stored_sitemap_urls[start_index:end_index]
                            future = executor.submit(process_urls, temp_urls, category)
                            futures.append(future)
                        # Retrieve the extracted data from each thread
                        text_data = []
                        jsonl_data = []
                        for future in futures:
                            text_result, jsonl_result = future.result()
                            text_data.append(text_result)
                            jsonl_data.append(jsonl_result)
                    # Combine the extracted data from all threads
                    combined_text_data = ''.join(text_data)
                    combined_jsonl_data = '\n'.join(jsonl_data)
                    if "sitemap_data_jsonl" not in st.session_state:
                        st.session_state.sitemap_data_jsonl = combined_jsonl_data
                    if "sitemap_data_text" not in st.session_state:
                        st.session_state.sitemap_data_text = combined_text_data
                    current_time = datetime.datetime.now()
                    print(current_time)
                    st.write(current_time)
                    st.session_state.Initial = False
                    print("\n2) Initial State:", st.session_state.Initial, "\n")
                    st.session_state.extracted_url = True
                else:
                    st.error("Error: Invalid sitemap.")
        else:
            url = url_or_xml
            st.session_state.extracted_url, data_txt, data_jsonl = run_function(url, category)
        if st.session_state.extracted_url:
            # Display the extracted text for sitemaps
            if is_a_sitemap:
                st.text_area("Extracted Text", value=st.session_state.sitemap_data_text, height=300)
            save_as, checkbox_c1, checkbox_c2 = st.columns([0.33, 0.33, 0.33])
            # Initialize the checkbox booleans
            save_as_txt = False
            save_as_json = False
            saved_successfully = False
            with save_as:
                st.write("Save as ")
            with checkbox_c1:
                save_as_txt = st.checkbox("text", value=False)
            with checkbox_c2:
                save_as_json = st.checkbox("jsonl", value=False)
            if not save_as_txt and not save_as_json:
                clear_c1, clear_c2 = st.columns([0.5, 0.5])
                with clear_c1:
                    if st.button("Clear"):
                        st.session_state.button_enter_url = False
                        st.session_state.Initial = True
                        st.session_state.extracted_url = False
                        if 'sitemap_data_text' in st.session_state:
                            del st.session_state['sitemap_data_text']
                        if 'sitemap_data_jsonl' in st.session_state:
                            del st.session_state['sitemap_data_jsonl']
                        st.experimental_rerun()
                with clear_c2:
                    pass  # empty placeholder column
            elif (save_as_txt and not save_as_json) or (save_as_json and not save_as_txt):
                col1, col2 = st.columns([0.5, 0.5])
                # Save column
                with col1:
                    if is_a_sitemap:
                        if save_as_txt:
                            if st.download_button(label="Save as txt", data=st.session_state.sitemap_data_text):
                                saved_successfully = True
                        if save_as_json:
                            if st.download_button(label="Save as jsonl", data=st.session_state.sitemap_data_jsonl, mime="application/json"):
                                saved_successfully = True
                    else:
                        if save_as_txt:
                            if st.download_button(label="Save as txt", data=data_txt):
                                saved_successfully = True
                        if save_as_json:
                            if st.download_button(label="Save as jsonl", data=data_jsonl, mime="application/json"):
                                saved_successfully = True
                # Clear column
                with col2:
                    if st.button("Clear"):
                        st.session_state.button_enter_url = False
                        st.session_state.Initial = True
                        st.session_state.extracted_url = False
                        if 'sitemap_data_text' in st.session_state:
                            del st.session_state['sitemap_data_text']
                        if 'sitemap_data_jsonl' in st.session_state:
                            del st.session_state['sitemap_data_jsonl']
                        st.experimental_rerun()
            elif save_as_txt and save_as_json:
                savetxt_c1, saveJson_c2, clear_c3 = st.columns([0.25, 0.25, 0.5])
                with savetxt_c1:
                    if is_a_sitemap:
                        if st.download_button(label="Save as txt", data=st.session_state.sitemap_data_text):
                            saved_successfully = True
                    else:
                        if st.download_button(label="Save as txt", data=data_txt):
                            saved_successfully = True
                with saveJson_c2:
                    if is_a_sitemap:
                        if st.download_button(label="Save as jsonl", data=st.session_state.sitemap_data_jsonl, mime="application/json"):
                            saved_successfully = True
                    else:
                        if st.download_button(label="Save as jsonl", data=data_jsonl, mime="application/json"):
                            saved_successfully = True
                with clear_c3:
                    if st.button("Clear"):
                        st.session_state.button_enter_url = False
                        st.session_state.Initial = True
                        st.session_state.extracted_url = False
                        if 'sitemap_data_text' in st.session_state:
                            del st.session_state['sitemap_data_text']
                        if 'sitemap_data_jsonl' in st.session_state:
                            del st.session_state['sitemap_data_jsonl']
                        st.experimental_rerun()
            if saved_successfully:
                # Confirmation message
                st.success("File saved successfully.")
            st.write("#")
            st.write("#")
        else:
            st.warning("Data not extracted")
            notextracted_c1, notextracted_c2 = st.columns([0.5, 0.5])
            with notextracted_c1:
                if st.button("Clear"):
                    st.session_state.button_enter_url = False
                    st.session_state.extracted_url = False
                    st.experimental_rerun()
            with notextracted_c2:
                pass  # empty placeholder column
            st.write("#")
            st.write("#")
    # Add a success message to the sidebar
    st.sidebar.success("Select a page above.")
    # Render the custom footer from utils
    cust_footer()


if __name__ == "__main__":
    main()