import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import quote
import streamlit as st
import logging
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

logging.basicConfig(level=logging.DEBUG)
@st.cache_data
def scrape_klikindomaret(nama_barang, num_items):
    products = []
    page = 1
    query = quote(nama_barang)
    while len(products) < num_items:
        url = f"https://www.klikindomaret.com/search/?key={query}&categories=&productbrandid=&sortcol=&pagesize=54&page={page}&startprice=&endprice=&attributes=&ShowItem="
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        product_list = soup.find_all('a', href=True)
        found_before = len(products)
        for product in product_list:
            product_href = product['href']
            if '/product/' in product_href:
                name_element = product.find('div', class_='title')
                price_element = product.find('span', class_='normal price-value')
                if name_element is None or price_element is None:
                    continue  # skip anchors that are not full product cards
                product_name = name_element.text.strip()
                product_price = price_element.text.strip()
                # Check whether a pre-discount price and discount percentage exist
                discount_element = product.find('span', class_='strikeout disc-price')
                if discount_element:
                    discount_percentage = discount_element.find('span', class_='discount').text.strip()
                    original_price = discount_element.text.replace(discount_percentage, '').strip()
                else:
                    # No discount: report "0%" and use the current price as the original price
                    discount_percentage = "0%"
                    original_price = product_price
                product_link = f"https://www.klikindomaret.com{product_href}"
                products.append({
                    'product': product_name,
                    'original_price': original_price,
                    'discount_percentage': discount_percentage,
                    'price': product_price,
                    'link': product_link
                })
                if len(products) >= num_items:
                    break
        if len(products) == found_before:
            break  # no new results on this page; stop to avoid an infinite loop
        page += 1
    return products[:num_items]
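
# A minimal sketch of a politeness/retry helper that the scrapers above could
# call in place of a bare requests.get. This is an illustrative assumption,
# not part of the original app: the delay bounds and retry count are arbitrary.
import time
import random

def polite_get(url, headers=None, timeout=10, retries=3):
    """Fetch a URL with a small random delay and simple retries (hypothetical helper)."""
    for attempt in range(retries):
        # Sleep 1-3 seconds so consecutive page requests don't hammer the server
        time.sleep(random.uniform(1.0, 3.0))
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            logging.warning(f"Attempt {attempt + 1}/{retries} failed: {e}")
    return None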
@st.cache_data
def scrape_tokped(nama_barang, num_items):
    products = []
    page = 1
    query = quote(nama_barang)
    while len(products) < num_items:
        url = f'https://www.tokopedia.com/search?navsource=&page={page}&q={query}&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title=&st='
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9,id-ID;q=0.8,id;q=0.7,ja;q=0.6,ru;q=0.5,zh-CN;q=0.4,zh;q=0.3,af;q=0.2,nl;q=0.1',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
        }
        timeout = 10
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            product_container_list = soup.find_all('a', class_="pcv3__info-content css-gwkf0u", href=True)
            if not product_container_list:
                break  # no results on this page; stop instead of looping forever
            for product_info in product_container_list:
                link = product_info['href']
                title_element = product_info.find('div', class_="prd_link-product-name css-3um8ox")
                title = title_element.text.strip() if title_element else None
                harga_element = product_info.find('div', class_="prd_link-product-price css-h66vau")
                harga = harga_element.text.strip() if harga_element else None
                terjual_element = product_info.find('span', class_="prd_label-integrity css-1sgek4h")
                terjual = terjual_element.text if terjual_element else None
                rating_element = product_info.find('span', class_='prd_rating-average-text css-t70v7i')
                rating = rating_element.text if rating_element else None
                toko_element = product_info.find('span', class_="prd_link-shop-name css-1kdc32b flip")
                toko = toko_element.text.strip() if toko_element else None
                asal_product_element = product_info.find('span', class_="prd_link-shop-loc css-1kdc32b flip")
                asal_product = asal_product_element.text.strip() if asal_product_element else None
                products.append({
                    'link': link,
                    'produk': title,
                    'harga': harga,
                    'terjual': terjual,
                    'rating': rating,
                    'toko': toko,
                    'asal_product': asal_product,
                })
                if len(products) >= num_items:
                    break
        except Exception as e:
            st.error(f"An unexpected error occurred: {e}")
            st.write("Run this script in your IDE or on colab.research.google.com instead:")
            code = '''
!pip install beautifulsoup4
!pip install requests
!pip install pandas

from bs4 import BeautifulSoup
import requests
from urllib.parse import quote
import pandas as pd
import logging

def scrape_tokped(nama_barang, num_items):
    products = []
    page = 1
    query = quote(nama_barang)
    while len(products) < num_items:
        url = f'https://www.tokopedia.com/search?navsource=&page={page}&q={query}&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title=&st='
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        }
        timeout = 10
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            product_container_list = soup.find_all('a', class_="pcv3__info-content css-gwkf0u", href=True)
            if not product_container_list:
                break
            for product_info in product_container_list:
                link = product_info['href']
                title_element = product_info.find('div', class_="prd_link-product-name css-3um8ox")
                title = title_element.text.strip() if title_element else None
                harga_element = product_info.find('div', class_="prd_link-product-price css-h66vau")
                harga = harga_element.text.strip() if harga_element else None
                terjual_element = product_info.find('span', class_="prd_label-integrity css-1sgek4h")
                terjual = terjual_element.text if terjual_element else None
                rating_element = product_info.find('span', class_='prd_rating-average-text css-t70v7i')
                rating = rating_element.text if rating_element else None
                toko_element = product_info.find('span', class_="prd_link-shop-name css-1kdc32b flip")
                toko = toko_element.text.strip() if toko_element else None
                asal_product_element = product_info.find('span', class_="prd_link-shop-loc css-1kdc32b flip")
                asal_product = asal_product_element.text.strip() if asal_product_element else None
                products.append({
                    'link': link,
                    'produk': title,
                    'harga': harga,
                    'terjual': terjual,
                    'rating': rating,
                    'toko': toko,
                    'asal_product': asal_product,
                })
                if len(products) >= num_items:
                    break
        except requests.exceptions.HTTPError as e:
            # HTTPError is a subclass of RequestException, so it must be caught first
            logging.error(f"HTTP Error: {e}")
            break
        except requests.exceptions.RequestException as e:
            logging.error(f"Request failed: {e}")
            break
        except Exception as e:
            logging.error(f"An unexpected error occurred: {e}")
            break
        page += 1
    return products[:num_items]

nama_barang = input("Enter the product name: ")
num_items = int(input("Enter how many items to scrape: "))
# Scrape with scrape_tokped and display the result
hasil = scrape_tokped(nama_barang, num_items)
pd.DataFrame(hasil)'''
            st.code(code, language='python')
            break
        page += 1
    return products[:num_items]
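
# A minimal sketch of a price normaliser for the scraped strings (e.g. turning
# "Rp10.000" into the integer 10000) so the resulting DataFrame can be sorted
# numerically. The "Rp" prefix and "." thousands separator are assumptions
# about the site's displayed price format; adjust if the markup differs.
import re

def parse_rupiah(price_text):
    """Convert a displayed rupiah price string to an int, or None if unparseable."""
    if not price_text:
        return None
    digits = re.sub(r'[^0-9]', '', price_text)  # keep only the digits
    return int(digits) if digits else None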
@st.cache_data
def scrape_tokped_with_selenium(nama_barang, num_items):
    products = []
    page = 1
    query = quote(nama_barang)
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')
    options.add_argument('--disable-notifications')
    options.add_argument('--disable-infobars')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    driver.implicitly_wait(20)  # wait up to 20 seconds for elements to load
    try:
        while len(products) < num_items:
            try:
                url = f'https://www.tokopedia.com/search?navsource=&page={page}&q={query}&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title=&st='
                driver.get(url)
                # Take the page source after JavaScript has rendered the results
                html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
                # Parse the rendered HTML with BeautifulSoup
                soup = BeautifulSoup(html, "html.parser")
                product_container_list = soup.find_all('a', class_="pcv3__info-content css-gwkf0u", href=True)
                if not product_container_list:
                    break  # no results on this page; stop instead of looping forever
                for product_info in product_container_list:
                    link = product_info['href']
                    title_element = product_info.find('div', class_="prd_link-product-name css-3um8ox")
                    title = title_element.text.strip() if title_element else None
                    harga_element = product_info.find('div', class_="prd_link-product-price css-h66vau")
                    harga = harga_element.text.strip() if harga_element else None
                    terjual_element = product_info.find('span', class_="prd_label-integrity css-1sgek4h")
                    terjual = terjual_element.text if terjual_element else None
                    rating_element = product_info.find('span', class_='prd_rating-average-text css-t70v7i')
                    rating = rating_element.text if rating_element else None
                    toko_element = product_info.find('span', class_="prd_link-shop-name css-1kdc32b flip")
                    toko = toko_element.text.strip() if toko_element else None
                    asal_product_element = product_info.find('span', class_="prd_link-shop-loc css-1kdc32b flip")
                    asal_product = asal_product_element.text.strip() if asal_product_element else None
                    products.append({
                        'link': link,
                        'produk': title,
                        'harga': harga,
                        'terjual': terjual,
                        'rating': rating,
                        'toko': toko,
                        'asal_product': asal_product,
                    })
                    if len(products) >= num_items:
                        break
            except WebDriverException as e:
                # WebDriverException must be caught before the generic Exception handler
                logging.error(f"WebDriver error: {e}")
                st.error(f"WebDriver error: {e}")
                break
            except Exception as e:
                logging.error(f"An unexpected error occurred: {e}")
                st.error(f"An unexpected error occurred: {e}")
                break
            page += 1
    finally:
        # Quit once, after the loop; quitting inside the loop would kill the
        # browser session before the second page could be fetched
        driver.quit()
    return products[:num_items]
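
# A minimal sketch of replacing the implicit wait above with an explicit wait,
# so the scraper only proceeds once at least one product card has rendered.
# The CSS selector reuses the same class assumption as the parser above; the
# app does not currently call this helper.
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def wait_for_products(driver, timeout=20):
    """Block until a product card is present, or raise TimeoutException (sketch)."""
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "a.pcv3__info-content"))
    )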
#---------------------------------------------------User Interface----------------------------------------------------------------------
# Streamlit UI
st.title("Scraping E-Commerce")
with st.expander("Settings :"):
    # Let the user pick which site to scrape
    selected_site = st.selectbox("Choose a website:", ["klikindomaret.com", "tokopedia.com", "tokopedia.com(selenium)"])
    nama_barang = st.text_input("Enter a product name:")
    num_items = st.number_input("Enter an estimated number of items:", min_value=1, step=1, placeholder="Type a number...")
    download_format = st.selectbox("Choose a download format:", ["XLSX", "CSV", "JSON"])
    st.info('Press "Start Scraping" again if the results disappear', icon="ℹ️")

# Holds the scraping results for the current run
hidden_data = []
scraping_done = False  # becomes True once a scrape has finished
if selected_site == "klikindomaret.com":
if st.button("Mulai Scraping"):
if not nama_barang:
st.error("Mohon isi Nama Barang.")
else:
scraped_products = scrape_klikindomaret(nama_barang, num_items)
hidden_data = scraped_products # Simpan data ke dalam variabel tersembunyi
scraping_done = True # Set scraping_done menjadi True
if selected_site =="tokopedia.com":
st.warning("Jika mengalami error karena sedang dalam pengembangan. Silahkan pilih situs yang lain", icon="⚠️")
if st.button("Mulai Scraping"):
if not nama_barang:
st.error("Mohon isi Nama Barang.")
else:
scraped_products = scrape_tokped(nama_barang, num_items)
hidden_data = scraped_products # Simpan data ke dalam variabel tersembunyi
scraping_done = True # Set scraping_done menjadi True
if selected_site == "tokopedia.com(selenium)":
st.warning("Jika mengalami error karena sedang dalam pengembangan. Silahkan pilih situs yang lain", icon="⚠️")
if st.button("Mulai Scraping"):
if not nama_barang:
st.error("Mohon isi Nama Barang.")
else:
scraped_products = scrape_tokped_with_selenium(nama_barang, num_items)
hidden_data = scraped_products # Simpan data ke dalam variabel tersembunyi
scraping_done = True # Set scraping_done menjadi True
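
# A minimal sketch of persisting results across Streamlit reruns with
# st.session_state, which would remove the need for the "press Start Scraping
# again" notice above. The 'scraped' key is an assumption; the app does not
# currently call this helper.
def remember_results(data, key="scraped"):
    """Store scraped rows in st.session_state and return whatever is remembered."""
    if data:
        st.session_state[key] = data
    return st.session_state.get(key, [])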
# File names for the exported results
output_file = f"scraped_{selected_site}_{nama_barang}.xlsx"
output_file_csv = f"scraped_{selected_site}_{nama_barang}.csv"
output_file_json = f"scraped_{selected_site}_{nama_barang}.json"
#---------------------------------------------------Download File & Scraping Results----------------------------------------------------------------------
# Show the scraping results
if scraping_done:
    if hidden_data:
        # Show the scraped rows in an expandable box
        with st.expander(f"Scraping results for {selected_site}:"):
            st.write(pd.DataFrame(hidden_data))
        if download_format == "XLSX":
            df = pd.DataFrame(hidden_data)
            df.to_excel(output_file, index=False)
            with open(output_file, "rb") as f:
                st.download_button(label=f"Download XLSX ({len(hidden_data)} rows)", data=f.read(), key="xlsx_download", file_name=output_file)
        elif download_format == "CSV":
            csv = pd.DataFrame(hidden_data).to_csv(index=False)
            st.download_button(label=f"Download CSV ({len(hidden_data)} rows)", data=csv, key="csv_download", file_name=output_file_csv)
        elif download_format == "JSON":
            json_data = pd.DataFrame(hidden_data).to_json(orient="records")
            st.download_button(label=f"Download JSON ({len(hidden_data)} rows)", data=json_data, key="json_download", file_name=output_file_json)
    else:
        st.warning(f"No data for query '{nama_barang}'", icon="⚠️")
else:
    st.write("No data to download.")
st.divider()
github_link = "https://github.com/naufalnashif/"
st.markdown(f"GitHub: [{github_link}]({github_link})")
instagram_link = "https://www.instagram.com/naufal.nashif/"
st.markdown(f"Instagram: [{instagram_link}]({instagram_link})")
st.write('Thank you for trying this demo!')