Spaces:
Runtime error
Runtime error
Synced repo using 'sync_with_huggingface' Github Action
Browse files- pages/1_URLs.py +16 -7
pages/1_URLs.py
CHANGED
@@ -125,7 +125,7 @@ sitemap_data = ""
|
|
125 |
|
126 |
|
127 |
# function to process a batch of URLS in sitemaps
|
128 |
-
def process_urls(sitemap_urls):
|
129 |
|
130 |
extracted_txt = ""
|
131 |
extracted_jsonl_list= []
|
@@ -135,7 +135,7 @@ def process_urls(sitemap_urls):
|
|
135 |
# using justext to extract data
|
136 |
temp_para = extract_data_from_url_(url)
|
137 |
temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
|
138 |
-
temp_jsonl_data = {"text": temp_para, "url": url}
|
139 |
extracted_txt += temp_txt_data
|
140 |
extracted_jsonl_list.append(temp_jsonl_data)
|
141 |
else:
|
@@ -150,7 +150,7 @@ def process_urls(sitemap_urls):
|
|
150 |
|
151 |
|
152 |
# function to process for a single URL
|
153 |
-
def run_function(url):
|
154 |
extracted_txt = ""
|
155 |
# Check if the user has provided a URL
|
156 |
if url:
|
@@ -158,7 +158,7 @@ def run_function(url):
|
|
158 |
temp_para = extract_data_from_url_(url)
|
159 |
temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
|
160 |
extracted_txt = temp_txt_data
|
161 |
-
extracted_jsonl = {"text": str(temp_para), "url":str(url)}
|
162 |
|
163 |
# displaying extracted txt for single URL
|
164 |
st.text_area("Extracted Text", value=extracted_txt, height=200)
|
@@ -180,6 +180,10 @@ def run_function(url):
|
|
180 |
def main():
|
181 |
st.subheader("Extract Data from URLs")
|
182 |
|
|
|
|
|
|
|
|
|
183 |
# dividing the body section into 2 columns for url and enter button
|
184 |
col1, col2 = st.columns([0.7,0.3])
|
185 |
|
@@ -199,6 +203,8 @@ def main():
|
|
199 |
st.session_state.extracted_url = False
|
200 |
data = ""
|
201 |
|
|
|
|
|
202 |
# the enter button
|
203 |
if st.session_state.button_enter_url:
|
204 |
# check if it is a sitemap or not
|
@@ -240,7 +246,7 @@ def main():
|
|
240 |
start_index = i * split_size
|
241 |
end_index = start_index + split_size if i != num_threads - 1 else None
|
242 |
temp_urls = stored_sitemap_urls[start_index:end_index]
|
243 |
-
future = executor.submit(process_urls, temp_urls)
|
244 |
futures.append(future)
|
245 |
|
246 |
# Retrieve the extracted data from each thread
|
@@ -284,7 +290,7 @@ def main():
|
|
284 |
|
285 |
else:
|
286 |
url = url_or_xml
|
287 |
-
st.session_state.extracted_url, data_txt, data_jsonl = run_function(url)
|
288 |
|
289 |
|
290 |
if st.session_state.extracted_url:
|
@@ -355,13 +361,16 @@ def main():
|
|
355 |
if saved_successfully:
|
356 |
# Confirmation message
|
357 |
st.success(f"File saved successfully.")
|
358 |
-
|
|
|
359 |
else:
|
360 |
st.warning("Data not extracted")
|
361 |
if st.button("clear"):
|
362 |
st.session_state.button_enter_url = False
|
363 |
st.session_state.extracted_url = False
|
364 |
st.experimental_rerun()
|
|
|
|
|
365 |
|
366 |
|
367 |
# Add a success message to the sidebar
|
|
|
125 |
|
126 |
|
127 |
# function to process a batch of URLS in sitemaps
|
128 |
+
def process_urls(sitemap_urls , category):
|
129 |
|
130 |
extracted_txt = ""
|
131 |
extracted_jsonl_list= []
|
|
|
135 |
# using justext to extract data
|
136 |
temp_para = extract_data_from_url_(url)
|
137 |
temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
|
138 |
+
temp_jsonl_data = {"text": temp_para, "url": url, "category": category, "timestamp": str(datetime.datetime.now())}
|
139 |
extracted_txt += temp_txt_data
|
140 |
extracted_jsonl_list.append(temp_jsonl_data)
|
141 |
else:
|
|
|
150 |
|
151 |
|
152 |
# function to process for a single URL
|
153 |
+
def run_function(url , category):
|
154 |
extracted_txt = ""
|
155 |
# Check if the user has provided a URL
|
156 |
if url:
|
|
|
158 |
temp_para = extract_data_from_url_(url)
|
159 |
temp_txt_data = ('\n\nFrom url:' + url + '\n' + temp_para + '\n')
|
160 |
extracted_txt = temp_txt_data
|
161 |
+
extracted_jsonl = {"text": str(temp_para), "url":str(url) , "category": category , "timestamp": str(datetime.datetime.now())}
|
162 |
|
163 |
# displaying extracted txt for single URL
|
164 |
st.text_area("Extracted Text", value=extracted_txt, height=200)
|
|
|
180 |
def main():
|
181 |
st.subheader("Extract Data from URLs")
|
182 |
|
183 |
+
category = st.selectbox(
|
184 |
+
'Select a Category',
|
185 |
+
('News Articles','Poems','Magazines', 'Other') )
|
186 |
+
|
187 |
# dividing the body section into 2 columns for url and enter button
|
188 |
col1, col2 = st.columns([0.7,0.3])
|
189 |
|
|
|
203 |
st.session_state.extracted_url = False
|
204 |
data = ""
|
205 |
|
206 |
+
|
207 |
+
|
208 |
# the enter button
|
209 |
if st.session_state.button_enter_url:
|
210 |
# check if it is a sitemap or not
|
|
|
246 |
start_index = i * split_size
|
247 |
end_index = start_index + split_size if i != num_threads - 1 else None
|
248 |
temp_urls = stored_sitemap_urls[start_index:end_index]
|
249 |
+
future = executor.submit(process_urls, temp_urls, category)
|
250 |
futures.append(future)
|
251 |
|
252 |
# Retrieve the extracted data from each thread
|
|
|
290 |
|
291 |
else:
|
292 |
url = url_or_xml
|
293 |
+
st.session_state.extracted_url, data_txt, data_jsonl = run_function(url , category)
|
294 |
|
295 |
|
296 |
if st.session_state.extracted_url:
|
|
|
361 |
if saved_successfully:
|
362 |
# Confirmation message
|
363 |
st.success(f"File saved successfully.")
|
364 |
+
st.write("#")
|
365 |
+
st.write("#")
|
366 |
else:
|
367 |
st.warning("Data not extracted")
|
368 |
if st.button("clear"):
|
369 |
st.session_state.button_enter_url = False
|
370 |
st.session_state.extracted_url = False
|
371 |
st.experimental_rerun()
|
372 |
+
st.write("#")
|
373 |
+
st.write("#")
|
374 |
|
375 |
|
376 |
# Add a success message to the sidebar
|