bsenst committed on
Commit
ca9e6a8
·
verified ·
1 Parent(s): 28b9e2a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -0
app.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import time
4
+ import json
5
+
6
def check_scraping_status(log_file="scraping_status.log"):
    """Return the most recent status line recorded in the scraping log.

    Args:
        log_file: Path of the log file to read.

    Returns:
        The last non-blank line of ``log_file`` with surrounding whitespace
        (including the trailing newline) removed, or
        ``"Scraping not run yet"`` when the file does not exist or contains
        no status line.
    """
    try:
        with open(log_file, "r") as file:
            lines = file.readlines()
    except FileNotFoundError:
        # No log file means no run has written a status so far.
        return "Scraping not run yet"

    # Walk backwards so trailing blank lines do not hide the latest status.
    # Stripping matters: an exact comparison such as
    # `status == "Scraping running"` never matches a line that still ends
    # with a newline.
    for line in reversed(lines):
        status = line.strip()
        if status:
            return status

    # Empty log: indexing lines[-1] here would raise IndexError.
    return "Scraping not run yet"
14
+
15
def run_scraping(url):
    """Start a Scrapy crawl of ``url`` in a background process.

    Removes any existing ``output.json``, launches ``homespider.py`` through
    ``scrapy runspider`` without waiting for it to finish, and reports the
    outcome in the Streamlit UI.

    The crawl limits are read from the module-level ``depth_limit`` and
    ``pagecount_limit`` values, which must be defined before this function
    is called.

    Args:
        url: Start URL handed to the spider as its ``start_url`` argument.
    """
    # Imported locally so the function does not depend on the file's
    # top-level import block being changed.
    import subprocess

    # Remove the previous export so the new run starts from a clean file.
    # EAFP avoids the exists()/remove() race.
    try:
        os.remove("output.json")
    except FileNotFoundError:
        pass

    # Pass the arguments as a list (no shell): the URL is typed by the user,
    # so interpolating it into a shell command line would allow command
    # injection and would break on URLs containing characters like "&" or "?".
    command = [
        "scrapy",
        "runspider",
        "homespider.py",
        "-a",
        f"start_url={url}",
        "-a",
        f"depth_limit={depth_limit}",
        "-a",
        f"pagecount_limit={pagecount_limit}",
        "-o",
        "output.json",
    ]

    try:
        # Popen returns immediately; the crawl keeps running in the background.
        subprocess.Popen(command)
    except OSError as exc:
        # E.g. the scrapy executable is missing; do not claim success.
        st.error(f"Could not start scraping: {exc}")
        return

    st.success("Scraping started")
23
+
24
# Streamlit interface: crawl settings first, action buttons below.
st.title("Scraping")
left_col, right_col = st.columns(2)

# depth_limit and pagecount_limit stay module-level names because
# run_scraping() reads them when it builds the scrapy command.
with left_col:
    depth_limit = st.slider(
        "Depth Limit",
        min_value=1,
        value=2,
        max_value=5,
        step=1,
    )
with right_col:
    pagecount_limit = st.slider(
        "Page Count",
        min_value=10,
        value=10,
        max_value=50,
        step=10,
    )

# NOTE(review): indentation was lost in the reviewed view; the URL input is
# assumed to sit below both columns rather than inside the second one.
url = st.text_input(
    "Enter URL",
    value="https://bsenst.github.io/toscrape/app-website/",
)

# Launch a new crawl only when the log does not report one in progress.
if st.button("Run Scraping"):
    scraping_active = check_scraping_status() == "Scraping running"
    if not scraping_active:
        run_scraping(url)
    else:
        st.warning("Scraping in progress...")

# Show the latest status line from the log on demand.
if st.button("Status Scraping"):
    current_status = check_scraping_status()
    st.warning(current_status)