File size: 19,534 Bytes
de712b3
 
 
 
 
 
9774ac6
ea87ab7
fdd5f95
486ab89
d4f441a
de712b3
e12fb59
e8b6098
dd2b9b0
e12fb59
ea87ab7
 
de712b3
e12fb59
de712b3
 
 
8dae00d
045f2fb
de84a05
2097450
 
64cf71d
de712b3
 
 
 
 
 
 
0e2310d
de712b3
 
 
 
a1b9884
 
 
 
 
 
 
 
 
 
 
 
 
de712b3
 
 
a1b9884
 
045f2fb
de712b3
 
8dae00d
 
 
de712b3
64cf71d
de712b3
 
937df2f
 
46125ea
 
 
 
 
8d1f86b
46125ea
402be75
 
 
 
 
 
46125ea
b68f935
46125ea
65390b0
46125ea
 
 
8d1f86b
46125ea
 
ddbcdcd
 
 
 
 
 
 
 
 
46125ea
 
ddbcdcd
 
 
 
 
 
8d1f86b
46125ea
 
 
 
 
 
 
 
 
 
 
 
8d1f86b
8192311
bf06733
7602f66
3d426b8
 
7602f66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30eaa90
46125ea
98fefe1
a6136fe
 
 
 
 
 
 
28cdeb1
a6136fe
29ad273
a6136fe
 
 
a71e00a
28cdeb1
a6136fe
6174621
 
 
98dbc9e
 
 
29ad273
 
 
 
 
 
 
 
 
457213e
 
29ad273
7f4c8d9
29ad273
 
 
278bf08
3b89bd5
b5f962a
 
7f4c8d9
457213e
6174621
457213e
 
 
 
6174621
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457213e
6174621
 
 
 
 
 
 
 
 
 
 
 
 
 
a6136fe
932884f
 
 
457213e
a6136fe
de712b3
 
 
 
 
 
 
ca3127b
de712b3
 
9208784
de712b3
 
 
 
 
 
 
 
 
 
 
85f3012
 
de712b3
e12fb59
624e519
de712b3
 
d783b67
97fe47a
d783b67
 
 
 
 
 
 
 
 
97fe47a
937df2f
 
 
 
a1683d0
937df2f
 
624e519
de712b3
e12fb59
 
 
de712b3
 
 
 
 
 
 
 
 
6d727dd
de712b3
fb5ba9c
de712b3
 
 
fb5ba9c
de712b3
 
 
 
 
0e2310d
efd6aa7
0e2310d
de712b3
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import quote
import streamlit as st
import json
import time
import logging
import random
from lxml_html_clean import Cleaner
from requests_html import HTMLSession

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

logging.basicConfig(level=logging.DEBUG)

@st.cache_data
def scrape_klikindomaret(nama_barang, num_items):
    """Scrape product listings from klikindomaret.com search results.

    Args:
        nama_barang: Search query (product name), URL-quoted before use.
        num_items: Maximum number of products to collect.

    Returns:
        List of dicts with keys: 'product', 'original_price',
        'discount_percentage', 'price', 'link'.

    Raises:
        requests.HTTPError: If a search page returns an error status.
    """
    products = []
    page = 1
    query = quote(nama_barang)

    while len(products) < num_items:
        url = f"https://www.klikindomaret.com/search/?key={query}&categories=&productbrandid=&sortcol=&pagesize=54&page={page}&startprice=&endprice=&attributes=&ShowItem="
        response = requests.get(url)
        # Fail loudly on HTTP errors instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        found_on_page = 0
        for product in soup.find_all('a', href=True):
            product_href = product['href']
            if '/product/' not in product_href:
                continue

            name_el = product.find('div', class_='title')
            price_el = product.find('span', class_='normal price-value')
            if name_el is None or price_el is None:
                # Anchor links to a product but lacks the expected card
                # markup (e.g. a banner) — skip instead of crashing.
                continue
            found_on_page += 1
            product_name = name_el.text.strip()
            product_price = price_el.text.strip()

            # Harga sebelum diskon dan persentase diskon, bila ada.
            discount_element = product.find('span', class_='strikeout disc-price')
            if discount_element:
                discount_percentage = discount_element.find('span', class_='discount').text.strip()
                original_price = discount_element.text.replace(discount_percentage, '').strip()
            else:
                # Tanpa diskon: "0%" dan harga asli = harga jual.
                discount_percentage = "0%"
                original_price = product_price

            products.append({
                'product': product_name,
                'original_price': original_price,
                'discount_percentage': discount_percentage,
                'price': product_price,
                'link': f"https://www.klikindomaret.com{product_href}",
            })
            if len(products) >= num_items:
                break

        if found_on_page == 0:
            # No products on this page: results are exhausted. Stop instead
            # of incrementing `page` forever (the original looped infinitely
            # when fewer than num_items products existed).
            break
        page += 1

    return products[:num_items]

@st.cache_data
def scrape_tokped(nama_barang, num_items):
    """Scrape product listings from Tokopedia search results via requests.

    NOTE(review): Tokopedia renders results client-side, so a plain HTTP
    request often yields no product markup; on any failure a runnable
    fallback script is shown to the user via st.code instead.

    Args:
        nama_barang: Search query (product name), URL-quoted before use.
        num_items: Maximum number of products to collect.

    Returns:
        List of dicts with keys: 'link', 'produk', 'harga', 'terjual',
        'rating', 'toko', 'asal_product'.
    """
    products = []
    page = 1
    query = quote(nama_barang)
    while len(products) < num_items:
        url = f'https://www.tokopedia.com/search?navsource=&page={page}&q={query}&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title=&st='

        # Browser-like headers to reduce the chance of being blocked.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'en-US,en;q=0.9,id-ID;q=0.8,id;q=0.7,ja;q=0.6,ru;q=0.5,zh-CN;q=0.4,zh;q=0.3,af;q=0.2,nl;q=0.1',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
        }
        timeout = 10
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            product_container_list = soup.find_all('a', class_="pcv3__info-content css-gwkf0u", href=True)

            for product_info in product_container_list:
                link = product_info['href']
                title_element = product_info.find('div', class_="prd_link-product-name css-3um8ox")
                title = title_element.text.strip() if title_element else None

                harga_element = product_info.find('div', class_="prd_link-product-price css-h66vau")
                harga = harga_element.text.strip() if harga_element else None

                terjual_element = product_info.find('span', class_="prd_label-integrity css-1sgek4h")
                terjual = terjual_element.text if terjual_element else None

                rating_element = product_info.find('span', class_='prd_rating-average-text css-t70v7i')
                rating = rating_element.text if rating_element else None

                toko_element = product_info.find('span', class_="prd_link-shop-name css-1kdc32b flip")
                toko = toko_element.text.strip() if toko_element else None

                asal_product_element = product_info.find('span', class_="prd_link-shop-loc css-1kdc32b flip")
                asal_product = asal_product_element.text.strip() if asal_product_element else None

                products.append({
                    'link': link,
                    'produk': title,
                    'harga': harga,
                    'terjual': terjual,
                    'rating': rating,
                    'toko': toko,
                    'asal_product': asal_product,
                })
            if len(products) >= num_items:
                products = products[:num_items]
                break

        except Exception as e:
            st.error(f"Terjadi kesalahan yang tidak diketahui: {e}")
            st.write("Jalankan script ini di IDE/colab.research.google.com Anda :")
            # Fallback script shown to the user. Fixed vs. the original:
            # `return products)` syntax error removed, missing
            # `import logging` added, and HTTPError (a RequestException
            # subclass) is caught *before* RequestException so its branch
            # is reachable.
            code = '''
                    !pip install beautifulsoup4
                    !pip install requests
                    !pip install streamlit
                    from bs4 import BeautifulSoup
                    import requests
                    from urllib.parse import quote
                    import pandas as pd
                    import streamlit as st
                    import logging
                    def scrape_tokped(nama_barang, num_items):
                        products = []
                        page = 1
                        query = quote(nama_barang)
                        while len(products) < num_items :
                            url = f'https://www.tokopedia.com/search?navsource=&page={page}&q={query}&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title=&st='

                            headers = {
                                'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                            }
                            timeout = 10
                            try :
                                response = requests.get(url, headers = headers, timeout = timeout)
                                response.raise_for_status()

                                soup = BeautifulSoup(response.text, 'html.parser')

                                product_container_list = soup.find_all('a', class_="pcv3__info-content css-gwkf0u", href = True)

                                for product_info in product_container_list:
                                    link = product_info['href']
                                    title_element = product_info.find('div', class_="prd_link-product-name css-3um8ox")
                                    title = title_element.text.strip() if title_element else None

                                    harga_element = product_info.find('div', class_="prd_link-product-price css-h66vau")
                                    harga = harga_element.text.strip() if harga_element else None

                                    terjual_element = product_info.find('span', class_="prd_label-integrity css-1sgek4h")
                                    terjual = terjual_element.text if terjual_element else None

                                    rating_element = product_info.find('span', class_='prd_rating-average-text css-t70v7i')
                                    rating = rating_element.text if rating_element else None

                                    toko_element = product_info.find('span', class_="prd_link-shop-name css-1kdc32b flip")
                                    toko = toko_element.text.strip() if toko_element else None

                                    asal_product_element = product_info.find('span', class_="prd_link-shop-loc css-1kdc32b flip")
                                    asal_product = asal_product_element.text.strip() if asal_product_element else None

                                    products.append({
                                        'link': link,
                                        'produk' : title,
                                        'harga' : harga,
                                        'terjual' : terjual,
                                        'rating' : rating,
                                        'toko' : toko,
                                        'asal_product' : asal_product,
                                    })
                                if len(products) >= num_items:
                                    products = products[:num_items]
                                    break

                            except requests.exceptions.HTTPError as e:
                                logging.error(f"HTTP Error: {e}")
                                break
                            except requests.exceptions.RequestException as e:
                                logging.error(f"Terjadi kesalahan saat mengirim permintaan: {e}")
                                break
                            except Exception as e:
                                logging.error(f"Terjadi kesalahan yang tidak diketahui: {e}")
                                break
                            page += 1
                        return products

                    nama_barang = input("Masukkan nama barang: ")
                    num_items = int(input("Masukkan jumlah barang yang ingin diambil: "))

                    # Melakukan scraping menggunakan fungsi scrape_tokped
                    hasil = scrape_tokped(nama_barang, num_items)
                    pd.DataFrame(hasil)'''
            st.code(code, language='python')
            break
        page += 1
    return products

@st.cache_data
def scrape_tokped_with_selenium(nama_barang, num_items):
    """Scrape product listings from Tokopedia using a headless Chrome driver.

    Renders the search page with Selenium (Tokopedia is client-side
    rendered), then parses the resulting HTML with BeautifulSoup.

    Args:
        nama_barang: Search query (product name), URL-quoted before use.
        num_items: Maximum number of products to collect.

    Returns:
        List of dicts with keys: 'link', 'produk', 'harga', 'terjual',
        'rating', 'toko', 'asal_product'.
    """
    products = []
    page = 1
    query = quote(nama_barang)

    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')
    options.add_argument('--disable-notifications')
    options.add_argument('--disable-infobars')
    options.add_argument('--disable-dev-shm-usage')

    driver = webdriver.Chrome(options=options)
    # The original had `finally: driver.quit()` INSIDE the while loop, which
    # destroyed the driver after the first iteration; quit once, after the
    # whole loop, so multi-page scrapes work.
    try:
        while len(products) < num_items:
            try:
                url = f'https://www.tokopedia.com/search?navsource=&page={page}&q={query}&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title=&st='

                driver.get(url)
                driver.implicitly_wait(20)  # tunggu maksimal 20 detik

                # Ambil seluruh HTML setelah JavaScript selesai dieksekusi.
                html = driver.execute_script("return document.getElementsByTagName('html')[0].innerHTML")
                st.write(html)  # debug output, as in the original
                soup = BeautifulSoup(html, "html.parser")

                product_container_list = soup.find_all('a', class_="pcv3__info-content css-gwkf0u", href=True)

                for product_info in product_container_list:
                    link = product_info['href']
                    st.write(link)  # debug output, as in the original
                    title_element = product_info.find('div', class_="prd_link-product-name css-3um8ox")
                    title = title_element.text.strip() if title_element else None

                    harga_element = product_info.find('div', class_="prd_link-product-price css-h66vau")
                    harga = harga_element.text.strip() if harga_element else None

                    terjual_element = product_info.find('span', class_="prd_label-integrity css-1sgek4h")
                    terjual = terjual_element.text if terjual_element else None

                    rating_element = product_info.find('span', class_='prd_rating-average-text css-t70v7i')
                    rating = rating_element.text if rating_element else None

                    toko_element = product_info.find('span', class_="prd_link-shop-name css-1kdc32b flip")
                    toko = toko_element.text.strip() if toko_element else None

                    asal_product_element = product_info.find('span', class_="prd_link-shop-loc css-1kdc32b flip")
                    asal_product = asal_product_element.text.strip() if asal_product_element else None

                    products.append({
                        'link': link,
                        'produk': title,
                        'harga': harga,
                        'terjual': terjual,
                        'rating': rating,
                        'toko': toko,
                        'asal_product': asal_product,
                    })
                if len(products) >= num_items:
                    products = products[:num_items]
                    break

            # WebDriverException must precede the generic Exception handler;
            # in the original it came after and was unreachable.
            except WebDriverException as e:
                logging.error(f"An error occurred: {e}")
                st.error(f"An error occurred: {e}")
                break
            except Exception as e:
                logging.error(f"Terjadi kesalahan yang tidak diketahui: {e}")
                st.error(f"Terjadi kesalahan yang tidak diketahui: {e}")
                break
            page += 1
    finally:
        driver.quit()
    return products
#---------------------------------------------------User Interface----------------------------------------------------------------------

# Streamlit UI
st.title("Scraping E-Commerce")

with st.expander("Settings :"):
    # Pilihan untuk memilih situs web
    selected_site = st.selectbox("Pilih Situs Web :", ["klikindomaret.com", "tokopedia.com", "tokopedia.com(selenium)"])

    nama_barang = st.text_input("Masukkan Nama Barang :")
    num_items = st.number_input("Masukkan Estimasi Banyak Data :", min_value = 1, step = 1, placeholder="Type a number...")

    download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON"])
    st.info('Tekan "Mulai Scraping" kembali jika tampilan menghilang ', icon="ℹ️")

# Hasil scraping disimpan di sini; kosong sampai tombol ditekan.
hidden_data = []

scraping_done = False  # True hanya jika scraping baru saja dijalankan

if selected_site == "klikindomaret.com":
    if st.button("Mulai Scraping"):
        if not nama_barang:
            st.error("Mohon isi Nama Barang.")
        else:
            hidden_data = scrape_klikindomaret(nama_barang, num_items)
            scraping_done = True

if selected_site == "tokopedia.com":
    st.warning("Jika mengalami error karena sedang dalam pengembangan. Silahkan pilih situs yang lain", icon="⚠️")
    if st.button("Mulai Scraping"):
        if not nama_barang:
            st.error("Mohon isi Nama Barang.")
        else:
            hidden_data = scrape_tokped(nama_barang, num_items)
            scraping_done = True

if selected_site == "tokopedia.com(selenium)":
    st.warning("Jika mengalami error karena sedang dalam pengembangan. Silahkan pilih situs yang lain", icon="⚠️")
    if st.button("Mulai Scraping"):
        if not nama_barang:
            st.error("Mohon isi Nama Barang.")
        else:
            hidden_data = scrape_tokped_with_selenium(nama_barang, num_items)
            scraping_done = True

# Nama file keluaran untuk setiap format unduhan
output_file = f"scraped_{selected_site}_{nama_barang}.xlsx"
output_file_csv = f"scraped_{selected_site}_{nama_barang}.csv"
output_file_json = f"scraped_{selected_site}_{nama_barang}.json"


#---------------------------------------------------Download File & Hasil Scraping----------------------------------------------------------------------

# Tampilkan hasil scraping
if scraping_done:
    if hidden_data:
        # Menampilkan hasil scraping dalam kotak yang dapat diperluas
        with st.expander(f"Hasil Scraping {selected_site} :"):
            st.write(pd.DataFrame(hidden_data))
        if download_format == "XLSX":
            df = pd.DataFrame(hidden_data)
            df.to_excel(output_file, index=False)
            # Baca kembali dengan context manager agar file handle tertutup
            # (versi asli membiarkan handle `open(...)` bocor).
            with open(output_file, "rb") as fh:
                xlsx_bytes = fh.read()
            st.download_button(label=f"Unduh XLSX ({len(hidden_data)} data)", data=xlsx_bytes, key="xlsx_download", file_name=output_file)
        elif download_format == "CSV":
            df = pd.DataFrame(hidden_data)
            csv = df.to_csv(index=False)
            st.download_button(label=f"Unduh CSV ({len(hidden_data)} data)", data=csv, key="csv_download", file_name=output_file_csv)
        elif download_format == "JSON":
            json_data = pd.DataFrame(hidden_data).to_json(orient="records")
            st.download_button(label=f"Unduh JSON ({len(hidden_data)} data)", data=json_data, key="json_download", file_name=output_file_json)
    else:
        st.warning(f"Tidak ada data pada query '{nama_barang}'", icon="⚠️")

if not scraping_done:
    st.write("Tidak ada data untuk diunduh.")

st.divider()
github_link = "https://github.com/naufalnashif/"
st.markdown(f"GitHub: [{github_link}]({github_link})")
instagram_link = "https://www.instagram.com/naufal.nashif/"
st.markdown(f"Instagram: [{instagram_link}]({instagram_link})")
st.write('Terima kasih telah mencoba demo ini!')