import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import quote
import streamlit as st
import json
import time
from selenium import webdriver
from selenium.common.exceptions import WebDriverException


@st.cache_data
def scrape_klikindomaret(nama_barang, num_items):
    """Scrape product listings from klikindomaret.com search results.

    Args:
        nama_barang: search keyword (URL-encoded internally).
        num_items: maximum number of products to collect.

    Returns:
        List of dicts with keys: product, original_price,
        discount_percentage, price, link.
    """
    products = []
    page = 1
    query = quote(nama_barang)
    progress_text = "Scraping in progress. Please wait."
    my_bar = st.progress(0, text=progress_text)

    # Keep paging until the requested number of items is collected or a
    # page stops yielding products (end of results / network failure).
    while len(products) < num_items:
        my_bar.progress(min(len(products) / num_items, 1.0), text=progress_text)
        url = (
            "https://www.klikindomaret.com/search/"
            f"?key={query}&categories=&productbrandid=&sortcol="
            f"&pagesize=54&page={page}"
            "&startprice=&endprice=&attributes=&ShowItem="
        )
        try:
            # BUG FIX: add a timeout so a stalled request cannot hang the UI.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
        except requests.RequestException:
            break  # network problem: return what was collected so far

        soup = BeautifulSoup(response.text, 'html.parser')
        found_any = False
        for product in soup.find_all('a', href=True):
            product_href = product['href']
            if '/product/' not in product_href:
                continue
            name_el = product.find('div', class_='title')
            price_el = product.find('span', class_='normal price-value')
            if name_el is None or price_el is None:
                continue  # malformed card: skip instead of raising AttributeError
            found_any = True
            product_name = name_el.text.strip()
            product_price = price_el.text.strip()

            # Check for a pre-discount price and a discount percentage.
            discount_element = product.find('span', class_='strikeout disc-price')
            if discount_element:
                discount_percentage = discount_element.find('span', class_='discount').text.strip()
                original_price = discount_element.text.replace(discount_percentage, '').strip()
            else:
                # No discount: report "0%" and the current price as original.
                discount_percentage = "0%"
                original_price = product_price

            products.append({
                'product': product_name,
                'original_price': original_price,
                'discount_percentage': discount_percentage,
                'price': product_price,
                'link': f"https://www.klikindomaret.com{product_href}",
            })
            if len(products) >= num_items:
                break

        if not found_any:
            break  # empty page: no more results, stop paging
        page += 1
        time.sleep(1)  # be polite to the server

    my_bar.empty()
    return products[:num_items]
@st.cache_data def scrape_shopee(nama_barang, num_items): products = [] page = 1 query = quote(nama_barang) progress_text = "Scraping in progress. Please wait." my_bar = st.progress(0, text=progress_text) url = f'https://shopee.co.id/search?keyword={query}&page={page}' #path = '' #Customize chrome display #chrome_options = Options() #chrome_options.add_argument('--no-sandbox') #chrome_options.add_argument('--headless') #chrome_options.add_argument('--disable-notifications') #chrome_options.add_argument('--disable-infobars') #Customize chrome display huggingface options = webdriver.ChromeOptions() options.add_argument('--no-sandbox') options.add_argument('--headless') options.add_argument('--disable-notifications') options.add_argument('--disable-infobars') options.add_argument('--disable-dev-shm-usage') try : #driver = webdriver.Chrome(executable_path = path, options = chrome_options) driver = webdriver.Chrome(options = options) while len(products) < num_items : #Cek agar produk sesuai jumlah yang diminta if len (products) > num_items : products = products[:num_items] break driver.set_window_size(1080, 720) # Adjust the window size here driver.get(url) html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML") soup = BeautifulSoup(html, "html.parser") product_list = soup.find_all('li', class_="col-xs-2-4 shopee-search-item-result__item" ) for product in product_list: # Mencari tag di dalam setiap tag
  • a_tag = product.find_all('a', href=True) for product_info in a_tag: # Mendapatkan URL dari atribut 'href' product_href = product_info['href'] product_name = product.find('div', class_="ie3A+n bM+7UW Cve6sh").text.strip() product_price = product.find('div', class_="vioxXd rVLWG6").text.strip() product_terjual = product.find('div', class_="r6HknA uEPGHT").text.strip() product_asal = product.find('div', class_="zGGwiV").text.strip() # Cek apakah ada harga sebelum diskon dan persentase diskon #discount_element = product.find('span', class_='strikeout disc-price') #discount_percentage = "" #original_price = "" #if discount_element: # discount_percentage = discount_element.find('span', class_='discount').text.strip() # original_price = discount_element.text.replace(discount_percentage, '').strip() #else: # # Jika tidak ada diskon, set discount_percentage ke "0%" dan original_price ke product_price # discount_percentage = "0%" # original_price = product_price # product_link = f"https://shopee.co.id/{product_href}" products.append({ 'product': product_name, #'original_price': original_price, #'discount_percentage': discount_percentage, 'price': product_price, 'terjual' : product_terjual, 'asal' : product_asal, 'link': product_link }) prop = min(len(products)/num_items, 1) my_bar.progress(prop, text=progress_text) page += 1 except WebDriverException as e: return products finally: if wd: wd.quit() time.sleep(1) my_bar.empty() return products #---------------------------------------------------User Interface---------------------------------------------------------------------- # Streamlit UI st.title("Scraping E-Commerce") with st.expander("Settings :"): # Pilihan untuk memilih situs web selected_site = st.selectbox("Pilih Situs Web :", ["klikindomaret.com", "shopee.co.id"]) nama_barang = st.text_input("Masukkan Nama Barang :") num_items = st.number_input("Masukkan Estimasi Banyak Data :", min_value = 1, step = 1, placeholder="Type a number...") download_format = 
st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON"]) st.info('Tekan "Mulai Scraping" kembali jika tampilan menghilang ', icon="ℹ️") # Variabel tersembunyi untuk menyimpan hasil scraping hidden_data = [] scraping_done = False # Tambahkan variabel ini if selected_site == "klikindomaret.com": if st.button("Mulai Scraping"): if not nama_barang: st.error("Mohon isi Nama Barang.") else: scraped_products = scrape_klikindomaret(nama_barang, num_items) hidden_data = scraped_products # Simpan data ke dalam variabel tersembunyi scraping_done = True # Set scraping_done menjadi True if selected_site == "shopee.co.id": #st.error("Sedang dalam pengembangan. Silahkan pilih situs yang lain") if st.button("Mulai Scraping"): if not nama_barang: st.error("Mohon isi Nama Barang.") else: scraped_products = scrape_shopee(nama_barang, num_items) hidden_data = scraped_products # Simpan data ke dalam variabel tersembunyi scraping_done = True # Set scraping_done menjadi True # Simpan DataFrame ke dalam file output_file = f"scraped_{selected_site}_{nama_barang}.xlsx" output_file_csv = f"scraped_{selected_site}_{nama_barang}.csv" output_file_json = f"scraped_{selected_site}_{nama_barang}.json" #---------------------------------------------------Download File & Hasil Scraping---------------------------------------------------------------------- # Tampilkan hasil scraping if scraping_done: if hidden_data: # Menampilkan hasil sentimen dalam kotak yang dapat diperluas with st.expander(f"Hasil Scraping {selected_site} :"): st.write(pd.DataFrame(scraped_products)) if download_format == "XLSX": df = pd.DataFrame(scraped_products) df.to_excel(output_file, index=False) st.download_button(label=f"Unduh XLSX ({len(hidden_data)} data)", data=open(output_file, "rb").read(), key="xlsx_download", file_name=output_file) elif download_format == "CSV": df = pd.DataFrame(scraped_products) csv = df.to_csv(index=False) st.download_button(label=f"Unduh CSV ({len(hidden_data)} data)", data=csv, 
key="csv_download", file_name=output_file_csv) elif download_format == "JSON": json_data = pd.DataFrame(scraped_products).to_json(orient="records") st.download_button(label=f"Unduh JSON ({len(hidden_data)} data)", data=json_data, key="json_download", file_name=output_file_json) elif not hidden_data: st.warning(f"Tidak ada data pada query '{query}'", icon="⚠️") if not scraping_done: st.write("Tidak ada data untuk diunduh.") st.divider() github_link = "https://github.com/naufalnashif/" st.markdown(f"GitHub: [{github_link}]({github_link})") instagram_link = "https://www.instagram.com/naufal.nashif/" st.markdown(f"Instagram: [{instagram_link}]({instagram_link})") st.write('Terima kasih telah mencoba demo ini!')