Spaces:
Running
Running
function for naming output file
Browse files
app.py
CHANGED
@@ -2,6 +2,13 @@ import streamlit as st
|
|
2 |
import os
|
3 |
import time
|
4 |
import json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
def check_scraping_status(log_file="scraping_status.log"):
|
7 |
try:
|
@@ -12,14 +19,19 @@ def check_scraping_status(log_file="scraping_status.log"):
|
|
12 |
except FileNotFoundError:
|
13 |
return "Scraping not run yet" # Log file does not exist; assume scraping is ongoing
|
14 |
|
15 |
-
def run_scraping(url):
|
|
|
|
|
|
|
16 |
|
17 |
-
if os.path.exists(
|
18 |
-
os.remove(
|
19 |
|
20 |
-
|
|
|
21 |
|
22 |
-
st.success("Scraping started")
|
|
|
23 |
|
24 |
# Streamlit interface
|
25 |
st.title("Scraping")
|
|
|
2 |
import os
|
3 |
import time
|
4 |
import json
|
5 |
+
import re
|
6 |
+
|
7 |
+
def clean_string_for_filename(s):
    """Return *s* reduced to characters that are safe in a filename.

    Every character that is not a word character, whitespace, or a hyphen
    is dropped; each run of whitespace then becomes a single underscore;
    finally, underscores at either end of the result are trimmed.
    """
    invalid_chars = re.compile(r"[^\w\s-]")
    whitespace_run = re.compile(r"\s+")

    kept = invalid_chars.sub("", s)
    joined = whitespace_run.sub("_", kept)
    return joined.strip("_")
|
12 |
|
13 |
def check_scraping_status(log_file="scraping_status.log"):
|
14 |
try:
|
|
|
19 |
except FileNotFoundError:
|
20 |
return "Scraping not run yet" # Log file does not exist; assume scraping is ongoing
|
21 |
|
22 |
+
def run_scraping(url, depth_limit, pagecount_limit):
    """Start the ``homespider.py`` Scrapy spider for *url* in the background.

    The output file name is derived from the URL with
    ``clean_string_for_filename``. Any file already present under that name
    is deleted before the spider is launched. The spider is started without
    waiting for it to finish.

    Args:
        url: Start URL, passed to the spider as ``start_url``.
        depth_limit: Passed to the spider as ``depth_limit``.
        pagecount_limit: Passed to the spider as ``pagecount_limit``.

    Returns:
        The name of the JSON file the spider is asked to write. The name is
        returned even when the spider process could not be started; in that
        case an error is shown in the Streamlit UI instead of the success
        message.
    """
    # Local import so this function does not depend on the file's import block.
    import subprocess

    # Generate a safe filename based on the URL
    identifier = clean_string_for_filename(url)
    output_filename = f"output_{identifier}.json"

    # Remove a previous result. Attempting the removal directly avoids the
    # check-then-delete race of os.path.exists() followed by os.remove().
    try:
        os.remove(output_filename)
    except FileNotFoundError:
        pass  # Nothing left over from an earlier run.

    # Pass the arguments as a list, without a shell: the URL is user input,
    # and characters such as '&', ';', '?' or spaces would otherwise be
    # interpreted by the shell (truncating the command or injecting new ones).
    command = [
        "scrapy",
        "runspider",
        "homespider.py",
        "-a",
        f"start_url={url}",
        "-a",
        f"depth_limit={depth_limit}",
        "-a",
        f"pagecount_limit={pagecount_limit}",
        "-o",
        output_filename,
    ]

    try:
        # Popen returns immediately, so the spider runs in the background.
        # stdout is discarded (the previous os.popen pipe was never read);
        # stderr is still inherited from this process.
        subprocess.Popen(command, stdout=subprocess.DEVNULL)
    except OSError as exc:
        # e.g. the scrapy executable is not on PATH.
        st.error(f"Could not start scraping: {exc}")
        return output_filename

    st.success(f"Scraping started. Output will be saved to {output_filename}.")
    return output_filename
|
35 |
|
36 |
# Streamlit interface
|
37 |
st.title("Scraping")
|