import io
import json
import time
from urllib.parse import quote

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
@st.cache_data
def scrape_e_commerce(nama_barang, num_items):
    """Scrape product listings from the klikindomaret.com search pages.

    Search result pages are fetched one after another (54 items per page)
    until ``num_items`` products have been collected or a page contributes
    no further products. A Streamlit progress bar is shown while scraping
    and removed when it finishes.

    Args:
        nama_barang: Search keyword; it is URL-quoted before being placed
            in the query string.
        num_items: Maximum number of products to return.

    Returns:
        A list of at most ``num_items`` dicts, each with the keys
        ``'product'`` (title text), ``'price'`` (price text as shown on the
        page) and ``'link'`` (absolute product URL).
    """
    products = []
    page = 1
    query = quote(nama_barang)
    progress_text = "Scraping in progress. Please wait."
    # Start the bar at 0%; nothing has been collected yet.
    my_bar = st.progress(0, text=progress_text)

    while len(products) < num_items:
        url = (
            "https://www.klikindomaret.com/search/"
            f"?key={query}&categories=&productbrandid=&sortcol="
            f"&pagesize=54&page={page}&startprice=&endprice="
            "&attributes=&ShowItem="
        )
        # A timeout keeps the app from hanging forever on a stalled request.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')

        count_before_page = len(products)
        for anchor in soup.find_all('a', href=True):
            product_href = anchor['href']
            if '/product/' not in product_href:
                continue

            title_tag = anchor.find('div', class_='title')
            price_tag = anchor.find('span', class_='normal price-value')
            # Some product anchors carry no title/price markup; skip them
            # instead of failing on ``None.text``.
            if title_tag is None or price_tag is None:
                continue

            products.append({
                'product': title_tag.text.strip(),
                'price': price_tag.text.strip(),
                'link': f"https://www.klikindomaret.com{product_href}",
            })

            # Pass a float: st.progress treats ints as a 0-100 percentage,
            # so an int 1 would render as 1% rather than "complete".
            fraction = min(len(products) / num_items, 1.0)
            my_bar.progress(fraction, text=progress_text)

            if len(products) >= num_items:
                break

        # A page that added nothing means the results are exhausted (or the
        # page could not be parsed); stop rather than loop forever.
        if len(products) == count_before_page:
            break
        page += 1

    # Leave the final bar state visible briefly before clearing it.
    time.sleep(1)
    my_bar.empty()
    return products[:num_items]
# --- Page layout: settings, scrape trigger, results/download, footer ---
st.title("Scraping E-Commerce")

with st.expander("Settings :"):
    selected_site = st.selectbox("Pilih Situs Web :", ["klikindomaret.com"])
    nama_barang = st.text_input("Masukkan Nama Barang :")
    estimated_num_items = st.number_input(
        "Masukkan Estimasi Banyak Data :",
        min_value=1,
        step=1,
        placeholder="Type a number...",
    )
    download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON"])
    st.info('Tekan "Mulai Scraping" kembali jika tampilan menghilang ', icon="ℹ️")

# State for this script run; filled in only when the button is pressed.
hidden_data = []
scraped_products = []
scraping_done = False

if selected_site == "klikindomaret.com":
    if st.button("Mulai Scraping"):
        # Validate the text the user actually typed (the search keyword).
        if not nama_barang.strip():
            st.error("Mohon isi query.")
        else:
            # Scrape once and reuse the result for display and download.
            scraped_products = scrape_e_commerce(nama_barang, estimated_num_items)
            hidden_data = scraped_products
            scraping_done = True

# Suggested file names for the browser download.
output_file = f"scraped_{nama_barang}.xlsx"
output_file_csv = f"scraped_{nama_barang}.csv"
output_file_json = f"scraped_{nama_barang}.json"

if scraping_done and hidden_data:
    df = pd.DataFrame(scraped_products)

    with st.expander(f"Hasil Scraping {selected_site} :"):
        st.write(df)

    if download_format == "XLSX":
        # Build the workbook in memory: nothing is written to the server's
        # disk and no file handle is left open.
        xlsx_buffer = io.BytesIO()
        df.to_excel(xlsx_buffer, index=False)
        st.download_button(
            label=f"Unduh XLSX ({len(hidden_data)} data)",
            data=xlsx_buffer.getvalue(),
            key="xlsx_download",
            file_name=output_file,
        )
    elif download_format == "CSV":
        csv = df.to_csv(index=False)
        st.download_button(
            label=f"Unduh CSV ({len(hidden_data)} data)",
            data=csv,
            key="csv_download",
            file_name=output_file_csv,
        )
    elif download_format == "JSON":
        json_data = df.to_json(orient="records")
        st.download_button(
            label=f"Unduh JSON ({len(hidden_data)} data)",
            data=json_data,
            key="json_download",
            file_name=output_file_json,
        )
else:
    # Shown before any scrape has run and when a scrape returned nothing.
    st.write("Tidak ada data untuk diunduh.")

st.divider()
github_link = "https://github.com/naufalnashif/"
st.markdown(f"GitHub: [{github_link}]({github_link})")
instagram_link = "https://www.instagram.com/naufal.nashif/"
st.markdown(f"Instagram: [{instagram_link}]({instagram_link})")
st.write('Terima kasih telah mencoba demo ini!')