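"""Streamlit demo that scrapes product listings from klikindomaret.com, shopee.co.id, and tokopedia.com."""
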
import logging
import time
from urllib.parse import quote

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

@st.cache_data
def scrape_klikindomaret(nama_barang, num_items):
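    """Scrape up to num_items products from klikindomaret.com search results."""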
    products = []
    page = 1
    query = quote(nama_barang)
    progress_text = "Scraping in progress. Please wait."
    my_bar = st.progress(0, text=progress_text)

    # Each iteration fetches one result page; stop once enough items have been collected.
    for _ in range(num_items):
        if len(products) >= num_items:
            products = products[:num_items]
            break

        prop = min(len(products) / num_items, 1.0)
        my_bar.progress(prop, text=progress_text)
            
        url = f"https://www.klikindomaret.com/search/?key={query}&categories=&productbrandid=&sortcol=&pagesize=54&page={page}&startprice=&endprice=&attributes=&ShowItem="
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        product_list = soup.find_all('a', href=True)

        for product in product_list:
            
            product_href = product['href']
            if '/product/' in product_href:
                product_name = product.find('div', class_='title').text.strip()
                product_price = product.find('span', class_='normal price-value').text.strip()

                # Check for a pre-discount price and a discount percentage
                discount_element = product.find('span', class_='strikeout disc-price')
                discount_percentage = ""
                original_price = ""
                if discount_element:
                    discount_percentage = discount_element.find('span', class_='discount').text.strip()
                    original_price = discount_element.text.replace(discount_percentage, '').strip()
                else:
                    # No discount: report "0%" and use the current price as the original price
                    discount_percentage = "0%"
                    original_price = product_price
                
                product_link = f"https://www.klikindomaret.com{product_href}"
                products.append({
                    'product': product_name,
                    'original_price': original_price,
                    'discount_percentage': discount_percentage,
                    'price': product_price,
                    'link': product_link
                })
        
        page += 1
        
    time.sleep(1)   
    my_bar.empty()
    return products

@st.cache_data
def scrape_shopee(nama_barang, num_items):
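    """Scrape product names (up to num_items) from the first page of Shopee search results via headless Chrome."""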
    products = []

    # Headless Chrome options (needed on sandboxed hosts such as Hugging Face Spaces)
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')
    options.add_argument('--disable-notifications')
    options.add_argument('--disable-infobars')
    options.add_argument('--disable-dev-shm-usage')

    driver = None
    try:
        page = 1
        query = quote(nama_barang)
        driver = webdriver.Chrome(options=options)
        url = f'https://shopee.co.id/search?keyword={query}&page={page}'
        driver.get(url)
        time.sleep(10)  # wait for the JavaScript-rendered results to load

        # Parse the rendered page source with BeautifulSoup
        html = driver.find_element(By.TAG_NAME, "html").get_attribute("innerHTML")
        soup = BeautifulSoup(html, "html.parser")

        # Product name cards (Shopee's obfuscated class names may change at any time)
        for i in soup.find_all('div', class_="ie3A+n bM+7UW Cve6sh"):
            products.append(i.text)

    except WebDriverException as e:
        st.error(f"An error occurred: {e}")
    except Exception as e:
        logging.error(f"An unknown error occurred: {e}")
    finally:
        if driver:
            driver.quit()

    return products[:num_items]

@st.cache_data
def scrape_tokped(nama_barang, num_items):
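    """Scrape up to num_items products from Tokopedia search results."""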
    products = []
    page = 1
    query = quote(nama_barang)
    while len(products) < num_items :
        url = f'https://www.tokopedia.com/search?navsource=&page={page}&q={query}&srp_component_id=02.01.00.00&srp_page_id=&srp_page_title=&st='
      
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
            'Accept-Language': 'en-US, en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
        }
        timeout = 10
        try :
            response = requests.get(url, headers = headers, timeout = timeout)
            response.raise_for_status()
        
            soup = BeautifulSoup(response.text, 'html.parser')
            product_container_list = soup.find_all('a', class_="pcv3__info-content css-gwkf0u", href = True)
        
            for product_info in product_container_list:
                link = product_info['href']
                title = product_info.find('div', class_="prd_link-product-name css-3um8ox").text.strip()
                harga = product_info.find('div', class_="prd_link-product-price css-h66vau").text.strip()
                # Units sold and rating are optional fields on a product card
                terjual_element = product_info.find('span', class_="prd_label-integrity css-1sgek4h")
                terjual = terjual_element.text.strip() if terjual_element else None
                rating_element = product_info.find('span', class_='prd_rating-average-text css-t70v7i')
                rating = rating_element.text if rating_element else None

                toko = product_info.find('span', class_="prd_link-shop-name css-1kdc32b flip").text.strip()
                asal_product = product_info.find('span', class_="prd_link-shop-loc css-1kdc32b flip").text.strip()
        
                products.append({
                    'link': link,
                    'produk': title,
                    'harga': harga,
                    'terjual': terjual,
                    'rating': rating,
                    'toko': toko,
                    'asal_product': asal_product,
                })
            if len(products) >= num_items:
                products = products[:num_items]
                break
                
        except requests.exceptions.HTTPError as e:
            logging.error(f"HTTP error: {e}")
            break
        except requests.exceptions.RequestException as e:
            logging.error(f"Request failed: {e}")
            break
        except Exception as e:
            logging.error(f"An unknown error occurred: {e}")
            break
        page += 1
    return products
#---------------------------------------------------User Interface----------------------------------------------------------------------

# Streamlit UI
st.title("Scraping E-Commerce")

with st.expander("Settings :"):
    # Choose which site to scrape
    selected_site = st.selectbox("Pilih Situs Web :", ["klikindomaret.com", "shopee.co.id", "tokopedia.com"])
    
    nama_barang = st.text_input("Masukkan Nama Barang :")
    num_items = st.number_input("Masukkan Estimasi Banyak Data :", min_value = 1, step = 1, placeholder="Type a number...")
    
    download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON"])
    st.info('Tekan "Mulai Scraping" kembali jika tampilan menghilang ', icon="ℹ️")

# Holds the scraped results
hidden_data = []

scraping_done = False  # Set to True once a scrape has completed

# Dispatch the selected site to its scraper
scrapers = {
    "klikindomaret.com": scrape_klikindomaret,
    "shopee.co.id": scrape_shopee,
    "tokopedia.com": scrape_tokped,
}

if st.button("Mulai Scraping"):
    if not nama_barang:
        st.error("Mohon isi Nama Barang.")
    else:
        hidden_data = scrapers[selected_site](nama_barang, num_items)
        scraping_done = True

# Output file names for each download format
output_file = f"scraped_{selected_site}_{nama_barang}.xlsx"
output_file_csv = f"scraped_{selected_site}_{nama_barang}.csv"
output_file_json = f"scraped_{selected_site}_{nama_barang}.json"


#---------------------------------------------------Download File & Hasil Scraping----------------------------------------------------------------------

# Display the scraping results
if scraping_done:
    if hidden_data:
        # Show the results in an expandable box
        with st.expander(f"Hasil Scraping {selected_site} :"):
            st.write(pd.DataFrame(hidden_data))

        df = pd.DataFrame(hidden_data)
        if download_format == "XLSX":
            df.to_excel(output_file, index=False)
            with open(output_file, "rb") as f:
                st.download_button(label=f"Unduh XLSX ({len(hidden_data)} data)", data=f.read(), key="xlsx_download", file_name=output_file)
        elif download_format == "CSV":
            csv = df.to_csv(index=False)
            st.download_button(label=f"Unduh CSV ({len(hidden_data)} data)", data=csv, key="csv_download", file_name=output_file_csv)
        elif download_format == "JSON":
            json_data = df.to_json(orient="records")
            st.download_button(label=f"Unduh JSON ({len(hidden_data)} data)", data=json_data, key="json_download", file_name=output_file_json)
    else:
        st.warning(f"Tidak ada data pada query '{nama_barang}'", icon="⚠️")
else:
    st.write("Tidak ada data untuk diunduh.")

st.divider()
github_link = "https://github.com/naufalnashif/"
st.markdown(f"GitHub: [{github_link}]({github_link})")
instagram_link = "https://www.instagram.com/naufal.nashif/"
st.markdown(f"Instagram: [{instagram_link}]({instagram_link})")
st.write('Terima kasih telah mencoba demo ini!')