import streamlit as st
import subprocess
import os
import json
import re


def clean_string_for_filename(s):
    """Clean a string to make it safe for use as a filename."""
    s = re.sub(r"[^\w\s-]", "", s)  # Remove invalid characters
    s = re.sub(r"\s+", "_", s)  # Replace spaces with underscores
    return s.strip("_")


def check_scraping_status(log_file="scraping_status.log"):
    """Return the last status line written by the spider, if any."""
    try:
        with open(log_file, "r") as file:
            lines = file.readlines()
        # Strip the trailing newline so comparisons like == "Scraping running" work
        return lines[-1].strip() if lines else "Scraping not run yet"
    except FileNotFoundError:
        return "Scraping not run yet"  # Log file does not exist; scraping has never run


def run_scraping(url, depth_limit, pagecount_limit):
    # Generate a safe filename based on the URL
    identifier = clean_string_for_filename(url)
    output_filename = f"output_{identifier}.json"
    if os.path.exists(output_filename):
        os.remove(output_filename)
    # Launch the spider in the background without blocking the Streamlit script;
    # passing arguments as a list avoids shell-quoting issues with the URL
    subprocess.Popen([
        "scrapy", "runspider", "homespider.py",
        "-a", f"start_url={url}",
        "-a", f"depth_limit={depth_limit}",
        "-a", f"pagecount_limit={pagecount_limit}",
        "-o", output_filename,
    ])
    st.success(f"Scraping started. Output will be saved to {output_filename}.")
    return output_filename


# Streamlit interface
st.title("Scraping Tool with URL-based Output File")

col1, col2 = st.columns(2)
with col1:
    depth_limit = st.slider("Depth Limit", min_value=1, value=2, max_value=5, step=1)
with col2:
    pagecount_limit = st.slider("Page Count Limit", min_value=10, value=10, max_value=50, step=10)

url = st.text_input("Enter URL", value="https://bsenst.github.io/toscrape/app-website/")

if st.button("Run Scraping"):
    if check_scraping_status() == "Scraping running":
        st.warning("Scraping in progress...")
    else:
        output_filename = run_scraping(url, depth_limit, pagecount_limit)

if st.button("Status Scraping"):
    identifier = clean_string_for_filename(url)
    output_file = f"output_{identifier}.json"
    if check_scraping_status() == "Scraping running":
        st.warning("Scraping is running.")
    elif os.path.exists(output_file):
        try:
            with open(output_file, "r") as f:
                scraped_data = json.load(f)
            page_count = len(scraped_data)
            # Show download button if output file exists
            st.download_button(
                "Download Scraping Output",
                data=json.dumps(scraped_data),
                file_name=output_file,
            )
            # Display number of pages scraped
            st.write(f"{page_count} pages scraped:")
            # Display scraping results
            st.write([(el["url"], el["title"]) for el in scraped_data])
        except Exception as e:
            st.warning(f"Error opening {output_file}: {e}")
    else:
        st.warning("No output file found. Please run the scraping command.")
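
# ---------------------------------------------------------------------------
# homespider.py — a minimal sketch of a spider compatible with the app above.
# Everything below is an assumption inferred from this script (the spider
# arguments, the status-log lines "Scraping running"/"Scraping finished", and
# the "url"/"title" output fields); the real homespider.py may differ.
# Run the app itself with: streamlit run <this_file>.py
# ---------------------------------------------------------------------------
import scrapy


class HomeSpider(scrapy.Spider):
    name = "homespider"

    def __init__(self, start_url=None, depth_limit=2, pagecount_limit=10, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.start_urls = [start_url] if start_url else []
        self.depth_limit = int(depth_limit)
        self.pagecount_limit = int(pagecount_limit)
        self.pagecount = 0

    def start_requests(self):
        # Mark the crawl as running; the app polls the last line of this file
        with open("scraping_status.log", "a") as f:
            f.write("Scraping running\n")
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        if self.pagecount >= self.pagecount_limit:
            return
        self.pagecount += 1
        # Emit the fields the app displays: url and title
        yield {"url": response.url, "title": response.css("title::text").get()}
        # Scrapy's DepthMiddleware records depth in response.meta;
        # stop following links once the limit is reached
        if response.meta.get("depth", 0) < self.depth_limit:
            for href in response.css("a::attr(href)").getall():
                yield response.follow(href, callback=self.parse)

    def closed(self, reason):
        # Called by Scrapy when the crawl ends; flip the status for the app
        with open("scraping_status.log", "a") as f:
            f.write("Scraping finished\n")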