import logging
import time
from urllib.parse import quote

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
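
# Streamlit demo that scrapes product listings from klikindomaret.com,
# shopee.co.id, and tokopedia.com, then offers the results for download.
# To run locally (assuming this file is saved as app.py): streamlit run app.py
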
@st.cache_data
def scrape_klikindomaret(nama_barang, num_items):
    products = []
    page = 1
    query = quote(nama_barang)
    progress_text = "Scraping in progress. Please wait."
    my_bar = st.progress(0, text=progress_text)

    # Fetch result pages until the requested number of items has been collected.
    for _ in range(num_items):
        if len(products) >= num_items:
            products = products[:num_items]
            break
        prop = min(len(products) / num_items, 1)
        my_bar.progress(prop, text=progress_text)

        url = f"https://www.klikindomaret.com/search/?key={query}&categories=&productbrandid=&sortcol=&pagesize=54&page={page}&startprice=&endprice=&attributes=&ShowItem="
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        product_list = soup.find_all('a', href=True)

        for product in product_list:
            product_href = product['href']
            if '/product/' in product_href:
                product_name = product.find('div', class_='title').text.strip()
                product_price = product.find('span', class_='normal price-value').text.strip()

                # Check whether a pre-discount price and a discount percentage exist.
                discount_element = product.find('span', class_='strikeout disc-price')
                if discount_element:
                    discount_percentage = discount_element.find('span', class_='discount').text.strip()
                    original_price = discount_element.text.replace(discount_percentage, '').strip()
                else:
                    # No discount: report "0%" and use the current price as the original price.
                    discount_percentage = "0%"
                    original_price = product_price

                product_link = f"https://www.klikindomaret.com{product_href}"
                products.append({
                    'product': product_name,
                    'original_price': original_price,
                    'discount_percentage': discount_percentage,
                    'price': product_price,
                    'link': product_link
                })
        page += 1
        time.sleep(1)

    my_bar.empty()
    return products
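
# Shopee renders its search results client-side, so the page is loaded in
# headless Chrome via Selenium and the rendered HTML is parsed afterwards.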
@st.cache_data
def scrape_shopee(nama_barang, num_items):
    products = []
    driver = None

    # Headless Chrome options (needed on containerized hosts such as Hugging Face Spaces).
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')
    options.add_argument('--disable-notifications')
    options.add_argument('--disable-infobars')
    options.add_argument('--disable-dev-shm-usage')

    try:
        page = 1
        query = quote(nama_barang)
        driver = webdriver.Chrome(options=options)
        url = f'https://shopee.co.id/search?keyword={query}&page={page}'
        driver.get(url)
        time.sleep(10)  # Give the client-side rendering time to finish.

        # Grab the rendered HTML from the <html> element and parse it.
        html_element = driver.find_element(By.TAG_NAME, "html")
        html = html_element.get_attribute("innerHTML")
        soup = BeautifulSoup(html, "html.parser")

        # Note: only the first results page is scraped; num_items is not enforced here.
        for i in soup.find_all('div', class_="ie3A+n bM+7UW Cve6sh"):
            products.append(i.text)
    except WebDriverException as e:
        st.error(f"An error occurred: {e}")
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
    finally:
        if driver:
            driver.quit()
    return products
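
# Tokopedia search pages can be fetched directly with requests; browser-like
# headers are sent so the request is less likely to be rejected as a bot.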
@st.cache_data
def scrape_tokped(nama_barang, num_items):
    products = []
    page = 1
    query = quote(nama_barang)

    while len(products) < num_items:
        url = f'https://www.tokopedia.com/search?navsource=&page={page}&q={query}&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title=&st='
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
            'Accept-Language': 'en-US, en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive'
        }
        timeout = 10
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            product_container_list = soup.find_all('a', class_="pcv3__info-content css-gwkf0u", href=True)

            for product_info in product_container_list:
                link = product_info['href']
                title = product_info.find('div', class_="prd_link-product-name css-3um8ox").text.strip()
                harga = product_info.find('div', class_="prd_link-product-price css-h66vau").text.strip()
                # Units sold (may be absent).
                terjual_element = product_info.find('span', class_="prd_label-integrity css-1sgek4h")
                terjual = terjual_element.text.strip() if terjual_element else None
                # Rating (may be absent).
                rating_element = product_info.find('span', class_='prd_rating-average-text css-t70v7i')
                rating = rating_element.text if rating_element else None
                toko = product_info.find('span', class_="prd_link-shop-name css-1kdc32b flip").text.strip()
                asal_product = product_info.find('span', class_="prd_link-shop-loc css-1kdc32b flip").text.strip()
                products.append({
                    'link': link,
                    'produk': title,
                    'harga': harga,
                    'terjual': terjual,
                    'rating': rating,
                    'toko': toko,
                    'asal_product': asal_product,
                })
                if len(products) >= num_items:
                    products = products[:num_items]
                    break
        except requests.exceptions.HTTPError as e:
            logging.error(f"HTTP Error: {e}")
            break
        except requests.exceptions.RequestException as e:
            logging.error(f"An error occurred while sending the request: {e}")
            break
        except Exception as e:
            logging.error(f"An unexpected error occurred: {e}")
            break
        page += 1
    return products
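
# All three scrapers are wrapped in @st.cache_data, so repeating the same
# query returns the cached results instead of re-scraping.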
#---------------------------------------------------User Interface----------------------------------------------------------------------
# Streamlit UI
st.title("Scraping E-Commerce")

with st.expander("Settings :"):
    # Choose which site to scrape.
    selected_site = st.selectbox("Pilih Situs Web :", ["klikindomaret.com", "shopee.co.id", "tokopedia.com"])
    nama_barang = st.text_input("Masukkan Nama Barang :")
    num_items = st.number_input("Masukkan Estimasi Banyak Data :", min_value=1, step=1, placeholder="Type a number...")
    download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON"])
    st.info('Tekan "Mulai Scraping" kembali jika tampilan menghilang ', icon="ℹ️")

# Hidden variables that hold the scraping results.
hidden_data = []
scraping_done = False

if selected_site == "klikindomaret.com":
    if st.button("Mulai Scraping"):
        if not nama_barang:
            st.error("Mohon isi Nama Barang.")
        else:
            scraped_products = scrape_klikindomaret(nama_barang, num_items)
            hidden_data = scraped_products  # Store the results in the hidden variable.
            scraping_done = True

if selected_site == "shopee.co.id":
    if st.button("Mulai Scraping"):
        if not nama_barang:
            st.error("Mohon isi Nama Barang.")
        else:
            scraped_products = scrape_shopee(nama_barang, num_items)
            hidden_data = scraped_products
            scraping_done = True

if selected_site == "tokopedia.com":
    if st.button("Mulai Scraping"):
        if not nama_barang:
            st.error("Mohon isi Nama Barang.")
        else:
            scraped_products = scrape_tokped(nama_barang, num_items)
            hidden_data = scraped_products
            scraping_done = True

# Output file names for each download format.
output_file = f"scraped_{selected_site}_{nama_barang}.xlsx"
output_file_csv = f"scraped_{selected_site}_{nama_barang}.csv"
output_file_json = f"scraped_{selected_site}_{nama_barang}.json"

#---------------------------------------------------Download File & Hasil Scraping----------------------------------------------------------------------
# Show the scraping results.
if scraping_done:
    if hidden_data:
        # Show the results in an expandable box.
        with st.expander(f"Hasil Scraping {selected_site} :"):
            st.write(pd.DataFrame(hidden_data))
        if download_format == "XLSX":
            df = pd.DataFrame(hidden_data)
            df.to_excel(output_file, index=False)
            st.download_button(label=f"Unduh XLSX ({len(hidden_data)} data)", data=open(output_file, "rb").read(), key="xlsx_download", file_name=output_file)
        elif download_format == "CSV":
            df = pd.DataFrame(hidden_data)
            csv = df.to_csv(index=False)
            st.download_button(label=f"Unduh CSV ({len(hidden_data)} data)", data=csv, key="csv_download", file_name=output_file_csv)
        elif download_format == "JSON":
            json_data = pd.DataFrame(hidden_data).to_json(orient="records")
            st.download_button(label=f"Unduh JSON ({len(hidden_data)} data)", data=json_data, key="json_download", file_name=output_file_json)
    else:
        st.warning(f"Tidak ada data pada query '{nama_barang}'", icon="⚠️")
else:
    st.write("Tidak ada data untuk diunduh.")

st.divider()
github_link = "https://github.com/naufalnashif/"
st.markdown(f"GitHub: [{github_link}]({github_link})")
instagram_link = "https://www.instagram.com/naufal.nashif/"
st.markdown(f"Instagram: [{instagram_link}]({instagram_link})")
st.write('Terima kasih telah mencoba demo ini!')