Rooobert committed on
Commit
f39d20c
·
verified ·
1 Parent(s): 65e4f78

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -133
app.py CHANGED
@@ -1,168 +1,159 @@
 
 
 
 
1
  import pandas as pd
 
 
2
  import plotly.express as px
3
  import plotly.graph_objects as go
4
- import numpy as np
5
  from plotly.subplots import make_subplots
6
- import streamlit as st
7
 
8
- # 讀取數據
 
 
 
 
 
 
 
 
 
9
  @st.cache_data
10
- def load_data():
11
- df = pd.read_csv('booking_hotels_tainan.csv')
12
-
13
- # 數據預處理
14
- def clean_rating(x):
15
- if pd.isna(x) or x == '無評分':
16
- return 0
17
- return float(str(x).replace('分數', '').replace('分', ''))
18
-
19
- # 清理評分數據
20
- df['評分'] = df['評分'].apply(clean_rating)
21
-
22
- # 創建價格區間
23
- df['價格區間'] = pd.qcut(df['價格'],
24
- q=3,
25
- labels=['經濟型', '中價位', '高價位'])
26
- return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- df = load_data()
 
 
 
29
 
30
- def create_price_rating_scatter():
31
- """價格vs評分散點圖"""
 
 
 
 
32
  fig = px.scatter(
33
  df,
34
  x='價格',
35
  y='評分',
36
  text='飯店名稱',
37
- size='價格', # 點的大小根據價格變化
38
- color='評分', # 顏色根據評分變化
39
  title='台南飯店價格與評分關係圖',
40
  labels={'價格': '房價 (TWD)', '評分': '評分 (0-10)'}
41
  )
42
-
43
- fig.update_traces(
44
- textposition='top center',
45
- marker=dict(sizeref=2.*max(df['價格'])/(40.**2))
46
- )
47
-
48
- fig.update_layout(
49
- height=600,
50
- showlegend=True,
51
- title_x=0.5,
52
- title_font_size=20
53
- )
54
-
55
  return fig
56
 
57
- def create_price_distribution():
58
- """價格分布圖"""
59
  fig = go.Figure()
60
-
61
- # 添加直方圖
62
  fig.add_trace(go.Histogram(
63
  x=df['價格'],
64
  name='價格分布',
65
  nbinsx=10,
66
  marker_color='rgb(55, 83, 109)'
67
  ))
68
-
69
- # 添加箱型圖
70
  fig.add_trace(go.Box(
71
  x=df['價格'],
72
  name='價格箱型圖',
73
  marker_color='rgb(26, 118, 255)'
74
  ))
75
-
76
- fig.update_layout(
77
- title_text='台南飯店價格分布',
78
- title_x=0.5,
79
- title_font_size=20,
80
- xaxis_title='價格 (TWD)',
81
- yaxis_title='數量',
82
- height=500,
83
- bargap=0.2,
84
- showlegend=True
85
- )
86
-
87
  return fig
88
 
89
- def create_rating_box_by_price_range():
90
- """不同價格區間的評分箱型圖"""
91
- fig = px.box(
92
- df,
93
- x='價格區間',
94
- y='評分',
95
- title='不同價格區間的評分分布',
96
- labels={'價格區間': '價格類型', '評分': '評分 (0-10)'},
97
- color='價格區間'
98
- )
99
-
100
- fig.update_layout(
101
- title_x=0.5,
102
- title_font_size=20,
103
- height=500,
104
- showlegend=False
105
- )
106
-
107
- return fig
108
 
109
- def create_hotel_comparison():
110
- """飯店評分與價格比較圖"""
111
- fig = make_subplots(specs=[[{"secondary_y": True}]])
112
-
113
- # 排序數據
114
- df_sorted = df.sort_values('評分', ascending=True)
115
-
116
- # 添加評分柱狀圖
117
- fig.add_trace(
118
- go.Bar(
119
- x=df_sorted['飯店名稱'],
120
- y=df_sorted['評分'],
121
- name="評分",
122
- marker_color='rgb(55, 83, 109)'
123
- )
124
- )
125
-
126
- # 添加價格線圖
127
- fig.add_trace(
128
- go.Scatter(
129
- x=df_sorted['飯店名稱'],
130
- y=df_sorted['價格'],
131
- name="價格",
132
- marker_color='rgb(26, 118, 255)'
133
- ),
134
- secondary_y=True
135
- )
136
-
137
- fig.update_layout(
138
- title_text='台南飯店評分與價格比較',
139
- title_x=0.5,
140
- title_font_size=20,
141
- height=700,
142
- showlegend=True,
143
- xaxis_tickangle=45
144
- )
145
-
146
- fig.update_yaxes(title_text="評分", secondary_y=False)
147
- fig.update_yaxes(title_text="價格 (TWD)", secondary_y=True)
148
-
149
- return fig
150
 
151
- def main():
152
- st.set_page_config(page_title="台南飯店分析", layout="wide")
153
- st.title("台南飯店分析")
154
-
155
- scatter_fig = create_price_rating_scatter()
156
- dist_fig = create_price_distribution()
157
- box_fig = create_rating_box_by_price_range()
158
- comparison_fig = create_hotel_comparison()
159
-
160
- st.plotly_chart(scatter_fig, use_container_width=True)
161
- st.plotly_chart(dist_fig, use_container_width=True)
162
- st.plotly_chart(box_fig, use_container_width=True)
163
- st.plotly_chart(comparison_fig, use_container_width=True)
164
-
165
- st.write("分析完成!請查看上方產生的互動式視覺化圖表。")
166
 
167
- if __name__ == "__main__":
168
- main()
 
 
 
 
 
 
 
1
+ # file_path: app.py
2
+ import streamlit as st
3
+ import requests
4
+ from bs4 import BeautifulSoup
5
  import pandas as pd
6
+ from google.oauth2.service_account import Credentials
7
+ import gspread
8
  import plotly.express as px
9
  import plotly.graph_objects as go
 
10
  from plotly.subplots import make_subplots
 
11
 
12
+ # Google Sheets credentials
13
+ SCOPE = ['https://www.googleapis.com/auth/spreadsheets']
14
+ SERVICE_ACCOUNT_FILE = "realtime-441511-f5708eabdf26.json"
15
+ SPREADSHEET_URL = "https://docs.google.com/spreadsheets/d/1tIsXCbB8P6ZxdnZNnv7S7BBWbbT7lrSjW990zG-vQAA/edit?gid=0#gid=0"
16
+
17
+ # Streamlit app
18
+ st.title("Booking.com 台南飯店資料爬取與分析")
19
+ st.sidebar.header("功能選擇")
20
+ mode = st.sidebar.selectbox("選擇模式", ["資料爬取", "資料視覺化", "上傳至 Google Sheet"])
21
+
22
  @st.cache_data
23
+ def scrape_booking_hotel():
24
+ url = "https://www.booking.com/searchresults.zh-tw.html"
25
+ headers = {
26
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
27
+ 'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
28
+ }
29
+ params = {
30
+ 'ss': '台南',
31
+ 'checkin': '2024-11-16',
32
+ 'checkout': '2024-11-17',
33
+ 'group_adults': '2',
34
+ 'no_rooms': '1',
35
+ 'group_children': '0',
36
+ 'dest_id': '-2637868',
37
+ 'dest_type': 'city'
38
+ }
39
+ try:
40
+ response = requests.get(url, headers=headers, params=params)
41
+ response.raise_for_status()
42
+ soup = BeautifulSoup(response.text, 'html.parser')
43
+ hotels_data = []
44
+ hotel_cards = soup.find_all('div', {'data-testid': 'property-card'})
45
+
46
+ for hotel in hotel_cards:
47
+ try:
48
+ name_elem = hotel.find('div', {'data-testid': 'title', 'class': 'f6431b446c'})
49
+ name = name_elem.text.strip() if name_elem else "無資料"
50
+ price_elem = hotel.find('span', {
51
+ 'data-testid': 'price-and-discounted-price',
52
+ 'class': 'f6431b446c'
53
+ })
54
+ price = price_elem.text.strip() if price_elem else "無資料"
55
+ price = price.replace('TWD', '').replace(' ', '').replace(',', '').strip()
56
+ rating_container = hotel.find('div', {'class': 'a3b8729ab1'})
57
+ rating_elem = rating_container.find('div', {'class': 'ac4a7896c7'}) if rating_container else None
58
+ rating = rating_elem.text.strip() if rating_elem else "無評分"
59
+ description_elem = hotel.find('div', {'data-testid': 'recommended-units'})
60
+ if description_elem:
61
+ room_type = description_elem.find('h4', {'class': 'abf093bdfe'})
62
+ room_type = room_type.text.strip() if room_type else ""
63
+ bed_info = description_elem.find('div', {'class': 'abf093bdfe'})
64
+ bed_info = bed_info.text.strip() if bed_info else ""
65
+ cancellation = description_elem.find('strong', text='可免費取消')
66
+ cancellation = "可免費取消" if cancellation else ""
67
+ payment = description_elem.find('strong', text='無需訂金')
68
+ payment = "無需訂金" if payment else ""
69
+ description = f"{room_type} | {bed_info} | {cancellation} | {payment}".strip(' |')
70
+ else:
71
+ description = "無說明"
72
+ hotels_data.append({
73
+ '飯店名稱': name,
74
+ '價格': price,
75
+ '評分': rating,
76
+ '說明': description
77
+ })
78
+ except AttributeError:
79
+ continue
80
 
81
+ df = pd.DataFrame(hotels_data).drop_duplicates()
82
+ return df
83
+ except requests.RequestException:
84
+ return pd.DataFrame()
85
 
86
+ def clean_rating(x):
87
+ if pd.isna(x) or x == '無評分':
88
+ return 0
89
+ return float(str(x).replace('分數', '').replace('分', ''))
90
+
91
+ def create_price_rating_scatter(df):
92
  fig = px.scatter(
93
  df,
94
  x='價格',
95
  y='評分',
96
  text='飯店名稱',
97
+ size='價格',
98
+ color='評分',
99
  title='台南飯店價格與評分關係圖',
100
  labels={'價格': '房價 (TWD)', '評分': '評分 (0-10)'}
101
  )
102
+ fig.update_layout(height=600, title_x=0.5)
 
 
 
 
 
 
 
 
 
 
 
 
103
  return fig
104
 
105
+ def create_price_distribution(df):
 
106
  fig = go.Figure()
 
 
107
  fig.add_trace(go.Histogram(
108
  x=df['價格'],
109
  name='價格分布',
110
  nbinsx=10,
111
  marker_color='rgb(55, 83, 109)'
112
  ))
 
 
113
  fig.add_trace(go.Box(
114
  x=df['價格'],
115
  name='價格箱型圖',
116
  marker_color='rgb(26, 118, 255)'
117
  ))
118
+ fig.update_layout(title_text='台南飯店價格分布', title_x=0.5, height=500)
 
 
 
 
 
 
 
 
 
 
 
119
  return fig
120
 
121
+ def upload_to_google_sheets(df):
122
+ creds = Credentials.from_service_account_file(SERVICE_ACCOUNT_FILE, scopes=SCOPE)
123
+ gs = gspread.authorize(creds)
124
+ sheet = gs.open_by_url(SPREADSHEET_URL)
125
+ worksheet = sheet.get_worksheet(0)
126
+ df1 = df.astype(str)
127
+ worksheet.update([df1.columns.values.tolist()] + df1.values.tolist())
128
+ return "資料已成功上傳到 Google Sheet!"
 
 
 
 
 
 
 
 
 
 
 
129
 
130
+ if mode == "資料爬取":
131
+ st.header("爬取台南飯店資料")
132
+ if st.button("開始爬取"):
133
+ df = scrape_booking_hotel()
134
+ if not df.empty:
135
+ st.dataframe(df)
136
+ df.to_csv('booking_hotels_tainan.csv', index=False, encoding='utf-8-sig')
137
+ st.success("資料爬取成功,已儲存至 booking_hotels_tainan.csv")
138
+ else:
139
+ st.error("未能成功爬取資料")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
+ elif mode == "資料視覺化":
142
+ st.header("分析與視覺化")
143
+ try:
144
+ df = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig')
145
+ df['價格'] = pd.to_numeric(df['價格'], errors='coerce')
146
+ df['評分'] = df['評分'].apply(clean_rating)
147
+ st.plotly_chart(create_price_rating_scatter(df))
148
+ st.plotly_chart(create_price_distribution(df))
149
+ except Exception as e:
150
+ st.error(f"讀取或分析資料時發生錯誤:{e}")
 
 
 
 
 
151
 
152
+ elif mode == "上傳至 Google Sheet":
153
+ st.header("上傳資料至 Google Sheet")
154
+ try:
155
+ df = pd.read_csv('booking_hotels_tainan.csv', encoding='utf-8-sig')
156
+ result = upload_to_google_sheets(df)
157
+ st.success(result)
158
+ except Exception as e:
159
+ st.error(f"上傳資料時發生錯誤:{e}")