Commit de712b3
1 Parent(s): 0c80300
Create app.py

app.py ADDED
@@ -0,0 +1,112 @@
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import quote
import streamlit as st
import json
import time  # required by time.sleep() in the scraper loop; missing in the original

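# Scraper: pages through klikindomaret.com search results and collects the
# name, price, and link of every anchor whose href contains '/product/'.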
@st.cache_data
def scrape_e_commerce(nama_barang, num_items):
    products = []
    page = 1
    query = quote(nama_barang)
    progress_text = "Scraping in progress. Please wait."
    my_bar = st.progress(0, text=progress_text)  # start at 0; the original passed len(data), but `data` is undefined here

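    # Page through the search results until num_items products are collected.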
    while len(products) < num_items:
        url = f"https://www.klikindomaret.com/search/?key={query}&categories=&productbrandid=&sortcol=&pagesize=54&page={page}&startprice=&endprice=&attributes=&ShowItem="
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        product_list = soup.find_all('a', href=True)
        # Stop once the site runs out of results; otherwise the loop never ends.
        if not any('/product/' in a['href'] for a in product_list):
            break

        for product in product_list:
            product_href = product['href']
            if '/product/' in product_href:
                product_name = product.find('div', class_='title').text.strip()
                product_price = product.find('span', class_='normal price-value').text.strip()
                product_link = f"https://www.klikindomaret.com{product_href}"
                products.append({
                    'product': product_name,
                    'price': product_price,
                    'link': product_link
                })

                prop = min(len(products) / num_items, 1)
                my_bar.progress(prop, text=progress_text)
                if len(products) >= num_items:  # was `>`; `>=` stops exactly at the target
                    products = products[:num_items]
                    break
        page += 1

        time.sleep(1)  # be polite: pause between page requests
    my_bar.empty()
    return products

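# Note: @st.cache_data memoizes scrape_e_commerce per (nama_barang, num_items)
# pair, so repeated calls with the same arguments return the cached list
# instead of re-scraping.
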
#---------------------------------------------------User Interface----------------------------------------------------------------------

# Streamlit UI
st.title("Scraping E-Commerce")

with st.expander("Settings :"):
    # Option to choose which website to scrape
    selected_site = st.selectbox("Pilih Situs Web :", ["klikindomaret.com"])

    nama_barang = st.text_input("Masukkan Nama Barang :")
    estimated_num_items = st.number_input("Masukkan Estimasi Banyak Data :", min_value=1, step=1, placeholder="Type a number...")

    download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON"])
    st.info('Tekan "Mulai Scraping" kembali jika tampilan menghilang ', icon="ℹ️")

# Hidden variable to store the scraping results
hidden_data = []

scraping_done = False  # flag marking whether scraping has finished

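# Main flow: scrape when "Mulai Scraping" is pressed, then display the results
# and offer them for download below.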
if selected_site == "klikindomaret.com":
    if st.button("Mulai Scraping"):
        if not nama_barang:  # was `if not query:`; `query` only exists inside the scraper
            st.error("Mohon isi query.")
        else:
            data_df = scrape_e_commerce(nama_barang, estimated_num_items)
            hidden_data = data_df  # store the results in the hidden variable
            scraping_done = True  # mark scraping as done

            scraped_products = hidden_data  # reuse the result rather than scraping a second time

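# Caveat: scraping_done and hidden_data are plain variables, so they reset on
# every Streamlit rerun (e.g. after a download button click). That is why the
# info box above suggests pressing "Mulai Scraping" again; st.session_state
# could persist them across reruns.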

# Save the DataFrame to a file
output_file = f"scraped_{nama_barang}.xlsx"
output_file_csv = f"scraped_{nama_barang}.csv"
output_file_json = f"scraped_{nama_barang}.json"


#---------------------------------------------------Download File & Scraping Results----------------------------------------------------------------------

# Display the scraping results
if scraping_done:
    if hidden_data:
        # Show the scraped results in an expandable box
        with st.expander(f"Hasil Scraping {selected_site} :"):
            st.write(pd.DataFrame(scraped_products))  # closing parenthesis was missing here
            if download_format == "XLSX":
                df = pd.DataFrame(scraped_products)
                df.to_excel(output_file, index=False)  # needs an Excel engine such as openpyxl
                st.download_button(label=f"Unduh XLSX ({len(hidden_data)} data)", data=open(output_file, "rb").read(), key="xlsx_download", file_name=output_file)
            elif download_format == "CSV":
                df = pd.DataFrame(scraped_products)
                csv = df.to_csv(index=False)
                st.download_button(label=f"Unduh CSV ({len(hidden_data)} data)", data=csv, key="csv_download", file_name=output_file_csv)
            elif download_format == "JSON":
                json_data = pd.DataFrame(scraped_products).to_json(orient="records")
                st.download_button(label=f"Unduh JSON ({len(hidden_data)} data)", data=json_data, key="json_download", file_name=output_file_json)
if not scraping_done:
    st.write("Tidak ada data untuk diunduh.")

st.divider()
github_link = "https://github.com/naufalnashif/"
st.markdown(f"GitHub: [{github_link}]({github_link})")
instagram_link = "https://www.instagram.com/naufal.nashif/"
st.markdown(f"Instagram: [{instagram_link}]({instagram_link})")
st.write('Terima kasih telah mencoba demo ini!')
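
# A minimal sketch of how this app could be run locally, assuming the imports
# above are the only dependencies (openpyxl is additionally required by
# pandas.DataFrame.to_excel for the XLSX export):
#
#   pip install streamlit requests beautifulsoup4 pandas openpyxl
#   streamlit run app.py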