naufalnashif commited on
Commit
de712b3
·
1 Parent(s): 0c80300

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +112 -0
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import time
from urllib.parse import quote

import pandas as pd
import requests
import streamlit as st
from bs4 import BeautifulSoup
+
8
@st.cache_data
def scrape_e_commerce(nama_barang, num_items):
    """Scrape product listings from klikindomaret.com search results.

    Pages through the site's search results until at least ``num_items``
    products have been collected (or a page yields nothing new), updating
    a Streamlit progress bar along the way.

    Args:
        nama_barang: Search keyword; URL-quoted before being embedded in the URL.
        num_items: Maximum number of products to return.

    Returns:
        list[dict]: One dict per product with keys 'product', 'price', 'link'.
    """
    products = []
    page = 1
    query = quote(nama_barang)
    progress_text = "Scraping in progress. Please wait."
    # BUG FIX: the original called st.progress(len(data), ...) where `data`
    # is undefined (NameError on first call). A progress bar starts at 0.
    my_bar = st.progress(0, text=progress_text)

    while len(products) < num_items:
        url = f"https://www.klikindomaret.com/search/?key={query}&categories=&productbrandid=&sortcol=&pagesize=54&page={page}&startprice=&endprice=&attributes=&ShowItem="
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Product cards are anchors whose href contains '/product/'.
        product_list = soup.find_all('a', href=True)

        found_before = len(products)
        for product in product_list:
            product_href = product['href']
            if '/product/' in product_href:
                product_name = product.find('div', class_='title').text.strip()
                product_price = product.find('span', class_='normal price-value').text.strip()
                product_link = f"https://www.klikindomaret.com{product_href}"
                products.append({
                    'product': product_name,
                    'price': product_price,
                    'link': product_link
                })

        prop = min(len(products) / num_items, 1)
        my_bar.progress(prop, text=progress_text)
        if len(products) > num_items:
            products = products[:num_items]
            break
        # ROBUSTNESS: stop when a page contributes no new products;
        # otherwise an exhausted result set would loop forever.
        if len(products) == found_before:
            break
        page += 1

        time.sleep(1)  # be polite: throttle between page fetches
    my_bar.empty()
    return products
45
+
46
# ---------------------------------------------------User Interface----------------------------------------------------------------------

# Streamlit UI
st.title("Scraping E-Commerce")

with st.expander("Settings :"):
    # Choice of target website (single option for now).
    selected_site = st.selectbox("Pilih Situs Web :", ["klikindomaret.com"])

    nama_barang = st.text_input("Masukkan Nama Barang :")
    estimated_num_items = st.number_input("Masukkan Estimasi Banyak Data :", min_value=1, step=1, placeholder="Type a number...")

    download_format = st.selectbox("Pilih Format Unduhan :", ["XLSX", "CSV", "JSON"])
    st.info('Tekan "Mulai Scraping" kembali jika tampilan menghilang ', icon="ℹ️")

# Holds the scraped rows for the download/display section below.
hidden_data = []

scraping_done = False  # flips to True once a scrape has completed this run

if selected_site == "klikindomaret.com":
    if st.button("Mulai Scraping"):
        # BUG FIX: the original tested the undefined name `query`
        # (a local of scrape_e_commerce) — NameError on every click.
        if not nama_barang:
            st.error("Mohon isi query.")
        else:
            # BUG FIX: the original scraped twice (data_df and
            # scraped_products); a single call feeds everything below.
            hidden_data = scrape_e_commerce(nama_barang, estimated_num_items)
            scraping_done = True

# Output file names for each supported download format.
output_file = f"scraped_{nama_barang}.xlsx"
output_file_csv = f"scraped_{nama_barang}.csv"
output_file_json = f"scraped_{nama_barang}.json"


# ---------------------------------------------------Download File & Hasil Scraping----------------------------------------------------------------------

# Show the scraped results and offer them for download.
if scraping_done:
    if hidden_data:
        df = pd.DataFrame(hidden_data)
        # Expandable box with the raw results table.
        with st.expander(f"Hasil Scraping {selected_site} :"):
            # BUG FIX: the original st.write(pd.DataFrame(... was missing
            # its closing parenthesis — a SyntaxError.
            st.write(df)
        if download_format == "XLSX":
            df.to_excel(output_file, index=False)
            # Close the file handle deterministically before serving it.
            with open(output_file, "rb") as fh:
                st.download_button(label=f"Unduh XLSX ({len(hidden_data)} data)", data=fh.read(), key="xlsx_download", file_name=output_file)
        elif download_format == "CSV":
            csv = df.to_csv(index=False)
            st.download_button(label=f"Unduh CSV ({len(hidden_data)} data)", data=csv, key="csv_download", file_name=output_file_csv)
        elif download_format == "JSON":
            json_data = df.to_json(orient="records")
            st.download_button(label=f"Unduh JSON ({len(hidden_data)} data)", data=json_data, key="json_download", file_name=output_file_json)
if not scraping_done:
    st.write("Tidak ada data untuk diunduh.")

st.divider()
github_link = "https://github.com/naufalnashif/"
st.markdown(f"GitHub: [{github_link}]({github_link})")
instagram_link = "https://www.instagram.com/naufal.nashif/"
st.markdown(f"Instagram: [{instagram_link}]({instagram_link})")
st.write('Terima kasih telah mencoba demo ini!')