bsenst committed on
Commit
78ad8e5
·
verified ·
1 Parent(s): 1233a02

function for naming output file

Browse files
Files changed (1) hide show
  1. app.py +17 -5
app.py CHANGED
@@ -2,6 +2,13 @@ import streamlit as st
2
  import os
3
  import time
4
  import json
 
 
 
 
 
 
 
5
 
6
  def check_scraping_status(log_file="scraping_status.log"):
7
  try:
@@ -12,14 +19,19 @@ def check_scraping_status(log_file="scraping_status.log"):
12
  except FileNotFoundError:
13
  return "Scraping not run yet" # Log file does not exist; assume scraping is ongoing
14
 
15
- def run_scraping(url):
 
 
 
16
 
17
- if os.path.exists("output.json"):
18
- os.remove("output.json")
19
 
20
- os.popen(f"scrapy runspider homespider.py -a start_url={url} -a depth_limit={depth_limit} -a pagecount_limit={pagecount_limit} -o output.json")
 
21
 
22
- st.success("Scraping started")
 
23
 
24
  # Streamlit interface
25
  st.title("Scraping")
 
2
  import os
3
  import time
4
  import json
5
+ import re
6
+
7
+ def clean_string_for_filename(s):
8
+ """Cleans a string to make it safe for use as a filename."""
9
+ s = re.sub(r"[^\w\s-]", "", s) # Remove invalid characters
10
+ s = re.sub(r"\s+", "_", s) # Replace spaces with underscores
11
+ return s.strip("_")
12
 
13
  def check_scraping_status(log_file="scraping_status.log"):
14
  try:
 
19
  except FileNotFoundError:
20
  return "Scraping not run yet" # Log file does not exist; assume scraping is ongoing
21
 
22
+ def run_scraping(url, depth_limit, pagecount_limit):
23
+ # Generate a safe filename based on the URL
24
+ identifier = clean_string_for_filename(url)
25
+ output_filename = f"output_{identifier}.json"
26
 
27
+ if os.path.exists(output_filename):
28
+ os.remove(output_filename)
29
 
30
+ # Run the scraping command
31
+ os.popen(f"scrapy runspider homespider.py -a start_url={url} -a depth_limit={depth_limit} -a pagecount_limit={pagecount_limit} -o {output_filename}")
32
 
33
+ st.success(f"Scraping started. Output will be saved to {output_filename}.")
34
+ return output_filename
35
 
36
  # Streamlit interface
37
  st.title("Scraping")