# Source recovered from a web diff view ("Browse files", hunk "@@ -0,0 +1,112 @@").
import io
import json
from urllib.parse import quote

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup


def scrape_e_commerce(nama_barang, num_items):
    """Scrape product listings for a search term, one result page at a time.

    Pages are fetched until ``num_items`` products have been collected, a
    request fails, or a page contains no usable product entries. A Streamlit
    progress bar is updated while products are collected.

    Args:
        nama_barang: Search term (product name); URL-quoted before use.
        num_items: Maximum number of products to return.

    Returns:
        A list of at most ``num_items`` dicts with the keys ``'product'``,
        ``'price'`` and ``'link'``.
    """
    products = []
    page = 1
    query = quote(nama_barang)
    progress_text = "Scraping in progress. Please wait."
    # Start the bar at 0: the original passed len(data), but no name `data`
    # exists in this scope, so the call raised NameError.
    my_bar = st.progress(0, text=progress_text)

    while len(products) < num_items:
        # NOTE(review): this URL has no scheme/host in front of the query, so
        # requests cannot resolve it as written -- the site prefix appears to
        # be missing and must be restored before this can run.
        url = f"{query}&categories=&productbrandid=&sortcol=&pagesize=54&page={page}&startprice=&endprice=&attributes=&ShowItem="
        response = requests.get(url, timeout=30)
        if not response.ok:
            # Keep what was collected so far instead of parsing an error page.
            break
        soup = BeautifulSoup(response.text, 'html.parser')

        found_on_page = 0
        for product in soup.find_all('a', href=True):
            product_href = product['href']
            if '/product/' not in product_href:
                continue

            name_tag = product.find('div', class_='title')
            price_tag = product.find('span', class_='normal price-value')
            if name_tag is None or price_tag is None:
                # Anchor points at a product but is not a listing card.
                continue

            product_link = f"{product_href}"
            # The append call is reconstructed: only the three dict entries
            # survived in the original text.
            products.append({
                'product': name_tag.text.strip(),
                'price': price_tag.text.strip(),
                'link': product_link,
            })
            found_on_page += 1

            # Use a float bound: an int 1 would be read by Streamlit as 1%.
            prop = min(len(products) / num_items, 1.0)
            my_bar.progress(prop, text=progress_text)

            if len(products) >= num_items:
                break

        if found_on_page == 0:
            # No further results; without this the loop would never end.
            break

        page += 1

    return products[:num_items]


#---------------------------------------------------User Interface----------------------------------------------------------------------

# Streamlit UI: collect the search settings, run the scraper on demand and
# offer the result as a download in the chosen format.
st.title("Scraping E-Commerce")

with st.expander("Settings :"):
    # Choice of which website to scrape
    selected_site = st.selectbox("Pilih Situs Web :", [""])

    nama_barang = st.text_input("Masukkan Nama Barang :")
    estimated_num_items = st.number_input("Masukkan Estimasi Banyak Data :", min_value=1, step=1, placeholder="Type a number...")

    download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON"])
    # NOTE(review): only the tail of this call survived in the original text
    # (it started with a leading `+`, so an earlier string was concatenated in
    # front of it). Reconstructed as st.info with the surviving message;
    # confirm the intended widget, placement and the missing first part.
    st.info('Tekan "Mulai Scraping" kembali jika tampilan menghilang ', icon="ℹ️")

# Hidden variable that stores the scraping result
hidden_data = []

# Set to True only after a scrape has actually been run in this script run
scraping_done = False

if selected_site == "":
    if st.button("Mulai Scraping"):
        # The original tested an undefined name `query` here; the search term
        # entered by the user is `nama_barang`.
        if not nama_barang:
            st.error("Mohon isi query.")
        else:
            # Scrape once and reuse the result (the original ran the scraper
            # twice per click).
            scraped_products = scrape_e_commerce(nama_barang, estimated_num_items)
            hidden_data = scraped_products
            scraping_done = True

# File names suggested to the browser for each download format
output_file = f"scraped_{nama_barang}.xlsx"
output_file_csv = f"scraped_{nama_barang}.csv"
output_file_json = f"scraped_{nama_barang}.json"

#---------------------------------------------------Download File & Hasil Scraping----------------------------------------------------------------------

# Show the scraping result
if scraping_done and hidden_data:
    # Show the results in an expandable box
    with st.expander(f"Hasil Scraping {selected_site} :"):
        df = pd.DataFrame(hidden_data)

        if download_format == "XLSX":
            # Build the workbook in memory: no file is written to a path
            # derived from user input and no file handle is left open.
            xlsx_buffer = io.BytesIO()
            df.to_excel(xlsx_buffer, index=False)
            st.download_button(label=f"Unduh XLSX ({len(hidden_data)} data)", data=xlsx_buffer.getvalue(), key="xlsx_download", file_name=output_file)
        elif download_format == "CSV":
            csv = df.to_csv(index=False)
            st.download_button(label=f"Unduh CSV ({len(hidden_data)} data)", data=csv, key="csv_download", file_name=output_file_csv)
        elif download_format == "JSON":
            json_data = df.to_json(orient="records")
            st.download_button(label=f"Unduh JSON ({len(hidden_data)} data)", data=json_data, key="json_download", file_name=output_file_json)
else:
    # Nothing scraped yet, or the scrape returned no rows
    st.write("Tidak ada data untuk diunduh.")

github_link = ""
st.markdown(f"GitHub: [{github_link}]({github_link})")
instagram_link = ""
st.markdown(f"Instagram: [{instagram_link}]({instagram_link})")
st.write('Terima kasih telah mencoba demo ini!')