committed on
Test av borttagning av dubbletter, tillägg av hållplatser samt start på sortering
Browse files- app.py +115 -38
- test.py +44 -0
- test_data.csv +17 -0
@@ -74,50 +74,70 @@ def get_buses():
74 |
short_bus_list = list(pd.unique(bus_df["route_short_name"]))
75 |
return bus_df, bus_list, short_bus_list
76 |
77 |
78 |
79 |
80 |
Removes duplicate trips based on route_id
81 |
82 |
83 |
df (pd.DataFrame): Input DataFrame containing trip data.
84 |
route_id_col (str): Column name for route IDs.
85 |
trip_id_col (str): Column name for trip IDs.
86 |
stop_id_col (str): Column name for stop IDs.
87 |
datetime_col (str): Column name for departure times.
88 |
time_window (str): Time window for considering trips as duplicates (e.g., '3min').
89 |
90 |
91 |
pd.DataFrame: Filtered DataFrame with duplicates removed.
92 |
93 |
# Ensure the datetime column is of datetime type
94 |
df[datetime_col] = pd.to_datetime(df[datetime_col])
95 |
96 |
# Sort by route_id
97 |
df = df.sort_values(by=[route_id_col,
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
# Filter the original DataFrame to retain only the non-duplicate trips
114 |
unique_trip_ids = filtered_first_stops[trip_id_col].unique()
115 |
result = df[df[trip_id_col].isin(unique_trip_ids)]
116 |
117 |
return result
118 |
119 |
def plot_graph(plot_df):
120 |
#Nu vill vi plotta!
121 |
categories = {0 : 'Empty',
122 |
1: 'Many seats available',
123 |
2:'Few seats available',
@@ -125,7 +145,7 @@ def plot_graph(plot_df):
125 |
4:'Crushed standing room',
126 |
5: 'Full'}
127 |
128 |
plot_df = plot_df[["datetime", "vehicle_occupancystatus", "stop_name"]]
129 |
plot_df = plot_df.sort_values("datetime")
130 |
131 |
@@ -146,6 +166,36 @@ def plot_graph(plot_df):
146 |
147 |
st.altair_chart(chart, use_container_width=True)
148 |
149 |
def visualize(filtered_data):
150 |
import folium
151 |
from streamlit_folium import st_folium
@@ -198,7 +248,7 @@ def main():
198 |
st.session_state.data = load_local_data("data.csv")
199 |
if "first" not in st.session_state:
200 |
st.session_state.first = True
201 |
st.session_state.data =
202 |
203 |
# Fetch data if local data is invalid
204 |
if st.session_state.hopsworks_project is None:
@@ -252,7 +302,18 @@ def main():
252 |
st.session_state.direction = not st.session_state.direction
253 |
254 |
255 |
256 |
#direction = st.sidebar.checkbox('Direction of bus', value=True)
257 |
258 |
today = datetime.now()
@@ -271,17 +332,16 @@ def main():
271 |
start_time = st.sidebar.time_input("Select a start time", value=None)
272 |
end_time = st.sidebar.time_input("Select an end time", value=None)
273 |
274 |
275 |
trips = buses_df[buses_df["route_long_name"]==bus]
276 |
bus_trips = st.session_state.data[st.session_state.data["route_long_name"]==bus]
277 |
bus_trips["datetime"] = pd.to_datetime(bus_trips["datetime"])
278 |
bus_trips["datetime"] = bus_trips["datetime"].dt.tz_convert(None)
279 |
280 |
#TODO remove
281 |
trip_ids = list(trips["trip_id"])
282 |
plot_df = st.session_state.data[st.session_state.data["trip_id"]==trip_ids[0]]
283 |
284 |
#TODO direction
285 |
286 |
print(f"start time {type(start_time)}")
287 |
print(f"end time {type(end_time)}")
@@ -290,10 +350,27 @@ def main():
290 |
if start_time != None and end_time != None:
291 |
#TODO hur filtrera på tid?
292 |
st.write(f"Displaying buses between {start_time.strftime('%H:%M')} and {end_time.strftime('%H:%M')} the {day_choice}")
293 |
selected_trips = bus_trips[(bus_trips["datetime"] >= datetime.combine(date_options[day_choice], start_time))
294 |
& (bus_trips["datetime"] <= datetime.combine(date_options[day_choice], end_time))
295 |
& (bus_trips["direction_id"] == st.session_state.direction )
296 |
trip_ids = list(pd.unique(selected_trips["trip_id"]))
297 |
st.write(f"Length {len(trip_ids)}")
298 |
for id in trip_ids:
299 |
74 |
short_bus_list = list(pd.unique(bus_df["route_short_name"]))
75 |
return bus_df, bus_list, short_bus_list
76 |
77 |
# Function to remove duplicates
78 |
def remove_near_duplicates(data):
79 |
80 |
result = []
81 |
data["datetime"] = pd.to_datetime(data["datetime"])
82 |
for _, group in data.groupby(['route_id', 'stop_name']):
83 |
# Initialize a list to store rows that are not duplicates
84 |
filtered_rows = []
85 |
last_row = None
86 |
87 |
for idx, row in group.iterrows():
88 |
if last_row is None or (row['datetime'] - last_row['datetime'] > pd.Timedelta(minutes = 3)):
89 |
# Keep the row if it's the first or sufficiently far apart in time
90 |
91 |
last_row = row
92 |
93 |
# Add filtered rows to the result
94 |
95 |
filtered_df = pd.DataFrame(result)
96 |
# Return the filtered dataframe
97 |
98 |
return filtered_df
99 |
100 |
def remove_duplicate_trips(df, route_id_col="route_id", trip_id_col="trip_id", datetime_col="datetime", time_window='3min'):
    """
    Removes near-duplicate rows per route based on time proximity.

    Rows are ordered by route and time. Within each route, a row is treated
    as a duplicate of the row before it when the gap between the two is less
    than or equal to ``time_window``. The first row of every route is always
    kept, and a run of rows that are each within the window of their
    predecessor collapses to the first row of that run.

    Parameters:
        df (pd.DataFrame): Input DataFrame containing trip data. It is not
            modified; the work is done on a copy.
        route_id_col (str): Column name for route IDs.
        trip_id_col (str): Column name for trip IDs. Accepted for interface
            compatibility; the filtering itself does not read this column.
        datetime_col (str): Column name for departure times. Values are
            parsed with ``pd.to_datetime``.
        time_window (str): Time window for considering rows as duplicates
            (e.g., '3min'). A gap exactly equal to the window counts as a
            duplicate.

    Returns:
        pd.DataFrame: Rows of ``df`` with duplicates removed, sorted by
        route and time, with the original index labels and columns.
    """
    # Nothing to compare in an empty frame; hand back an independent copy.
    if df.empty:
        return df.copy()

    # Work on a copy so the caller's DataFrame keeps its original dtypes.
    ordered = df.copy()
    ordered[datetime_col] = pd.to_datetime(ordered[datetime_col])

    # Chronological order within each route is required for diff() below.
    ordered = ordered.sort_values(by=[route_id_col, datetime_col])

    # Gap to the previous row of the same route; NaT marks the first row of
    # a route. Filling NaT with zero would flag that first row as a
    # duplicate, so it is left as NaT and handled explicitly.
    gap = ordered.groupby(route_id_col)[datetime_col].diff()

    # The decision is per row. A group-wide aggregate here would discard
    # every row of any route that contains at least one duplicate.
    keep = gap.isna() | (gap > pd.to_timedelta(time_window))

    return ordered[keep]
137 |
138 |
def plot_graph(plot_df):
139 |
#Nu vill vi plotta!
140 |
#TODO ska den bara visa de stopp man vill eller alla?
141 |
categories = {0 : 'Empty',
142 |
1: 'Many seats available',
143 |
2:'Few seats available',
145 |
4:'Crushed standing room',
146 |
5: 'Full'}
147 |
148 |
plot_df = plot_df[["datetime", "vehicle_occupancystatus", "stop_name", "route_id"]]
149 |
plot_df = plot_df.sort_values("datetime")
150 |
151 |
166 |
167 |
st.altair_chart(chart, use_container_width=True)
168 |
169 |
def plot_graph_title(plot_df, stop, time):
170 |
#Nu vill vi plotta!
171 |
#TODO ska den bara visa de stopp man vill eller alla?
172 |
categories = {0 : 'Empty',
173 |
1: 'Many seats available',
174 |
2:'Few seats available',
175 |
3:'Standing room only',
176 |
4:'Crushed standing room',
177 |
5: 'Full'}
178 |
179 |
plot_df = plot_df[["datetime", "vehicle_occupancystatus", "stop_name", "route_id"]]
180 |
plot_df = plot_df.sort_values("datetime")
181 |
#plot_df = plot_df.set_index("datetime")
182 |
plot_df["Occupancy"] = plot_df["vehicle_occupancystatus"].map(categories)
183 |
# Explicitly set the order for Y_category
184 |
category_order = list(categories.values()) # ['Empty', 'Many seats available', ..., 'Full']
185 |
186 |
187 |
188 |
# Create the Altair chart
189 |
chart = alt.Chart(plot_df).mark_line(point=True, interpolate="step-after").encode(
190 |
x=alt.X('stop_name:N', title="Stop name"), # Use column name as string
191 |
y=alt.Y('Occupancy:N', title="Vehicle Occupancy Status (Categories)", sort=category_order, scale=alt.Scale(domain=category_order)), # Treat Y as categorical
192 |
tooltip=["datetime", 'stop_name', 'Occupancy'] # Add tooltips for interactivity
193 |
194 |
title=f"Vehicle Occupancy For Bus arriving at {stop} at {time}"
195 |
196 |
st.altair_chart(chart, use_container_width=True)
197 |
198 |
199 |
def visualize(filtered_data):
200 |
import folium
201 |
from streamlit_folium import st_folium
248 |
st.session_state.data = load_local_data("data.csv")
249 |
if "first" not in st.session_state:
250 |
st.session_state.first = True
251 |
#st.session_state.data = remove_near_duplicates(st.session_state.data)
252 |
253 |
# Fetch data if local data is invalid
254 |
if st.session_state.hopsworks_project is None:
302 |
st.session_state.direction = not st.session_state.direction
303 |
304 |
305 |
#Plocka alla aktuella trip_ids från buses
306 |
trips = buses_df[buses_df["route_long_name"]==bus]
307 |
bus_trips = st.session_state.data[st.session_state.data["route_long_name"]==bus]
308 |
bus_trips["datetime"] = pd.to_datetime(bus_trips["datetime"])
309 |
bus_trips["datetime"] = bus_trips["datetime"].dt.tz_convert(None)
310 |
311 |
stops = list(pd.unique(bus_trips["stop_name"]))
312 |
stop_choice = st.sidebar.selectbox(
313 |
"Select your bus stop:",
314 |
315 |
help="Select one bus stop to se occupancy."
316 |
317 |
#direction = st.sidebar.checkbox('Direction of bus', value=True)
318 |
319 |
today = datetime.now()
332 |
start_time = st.sidebar.time_input("Select a start time", value=None)
333 |
end_time = st.sidebar.time_input("Select an end time", value=None)
334 |
335 |
336 |
337 |
#TODO remove
338 |
#trip_ids = list(trips["trip_id"])
339 |
#plot_df = st.session_state.data[st.session_state.data["trip_id"]==trip_ids[0]]
340 |
341 |
#TODO hållsplats
342 |
#Kolla på route_id för att plocka alla hållplatser
343 |
344 |
345 |
346 |
print(f"start time {type(start_time)}")
347 |
print(f"end time {type(end_time)}")
350 |
if start_time != None and end_time != None:
351 |
#TODO hur filtrera på tid?
352 |
st.write(f"Displaying buses between {start_time.strftime('%H:%M')} and {end_time.strftime('%H:%M')} the {day_choice}")
353 |
"""selected_trips = bus_trips[(bus_trips["datetime"] >= datetime.combine(date_options[day_choice], start_time))
354 |
& (bus_trips["datetime"] <= datetime.combine(date_options[day_choice], end_time))
355 |
& (bus_trips["direction_id"] == st.session_state.direction )]"""
356 |
selected_trips = bus_trips[(bus_trips["datetime"] >= datetime.combine(date_options[day_choice], start_time))
357 |
& (bus_trips["datetime"] <= datetime.combine(date_options[day_choice], end_time))
358 |
& (bus_trips["direction_id"] == st.session_state.direction )
359 |
& (bus_trips["stop_name"] == stop_choice)]
360 |
trip_ids = list(pd.unique(selected_trips["trip_id"]))
361 |
362 |
chioce = selected_trips[selected_trips["stop_name"]==stop_choice]
363 |
364 |
chioce = chioce[["trip_id", "stop_name", "datetime"]]
365 |
#Ev lägga stop_chioce i session_state
366 |
367 |
chioce = chioce.sort_values(by=["datetime"])
368 |
chioce = chioce.drop_duplicates("datetime")
369 |
370 |
for idx, row in chioce.iterrows():
371 |
st.write(f"The bus arrives at {row['stop_name']} at {row['datetime'].strftime('%H:%M')}")
372 |
plot_graph_title(st.session_state.data[st.session_state.data["trip_id"]==row["trip_id"]], row["stop_name"], row['datetime'].strftime('%H:%M'))
373 |
374 |
st.write(f"Length {len(trip_ids)}")
375 |
for id in trip_ids:
376 |
@@ -0,0 +1,44 @@
1 |
import os
2 |
import pandas as pd
3 |
4 |
# Load local data
5 |
def load_local_data():
6 |
7 |
#filepath = os.path.join(current_dir, "test_data.csv")
8 |
filepath = "WheelyFunTimes/test_data.csv"
9 |
return pd.read_csv(filepath)
10 |
"""if os.path.exists(filepath):
11 |
return pd.read_csv(filepath)
12 |
13 |
return None"""
14 |
15 |
def remove_near_duplicates(data):
16 |
17 |
result = []
18 |
data["datetime"] = pd.to_datetime(data["datetime"])
19 |
for _, group in data.groupby(['route_id', 'stop_name']):
20 |
# Initialize a list to store rows that are not duplicates
21 |
filtered_rows = []
22 |
last_row = None
23 |
24 |
for idx, row in group.iterrows():
25 |
if last_row is None or (row['datetime'] - last_row['datetime'] > pd.Timedelta(minutes = 3)):
26 |
# Keep the row if it's the first or sufficiently far apart in time
27 |
28 |
last_row = row
29 |
30 |
# Add filtered rows to the result
31 |
32 |
filtered_df = pd.DataFrame(result)
33 |
# Return the filtered dataframe
34 |
35 |
return filtered_df
36 |
37 |
df = load_local_data()
38 |
39 |
df = remove_near_duplicates(df)
40 |
41 |
42 |
43 |
44 |
@@ -0,0 +1,17 @@
1 |
2 |
FALSE,NO4,Kristianstad - Simrishamn,1,2025-01-05 13:04:37+00:01,9986.36.00,167.37.00,2,1.6,0.0,9935.39.00,87.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,13,15,1,Malmö Spångatan
3 |
TRUE,8,Lindängen - Västra hamnen,2,2025-01-05 15:50:14+00:00,9586.03.00,410.28.00,3,01.25,0.0,1623.03.00,83.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,15,50,0,Malmö Nydala
4 |
FALSE,3,Lindängen - Västra hamnen,2,2025-01-05 07:45:42+00:00,363.43.00,3756.51.00,3,1.9,0.0,4126.46.00,100.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,7,45,0,Viby Kvarnnäsvägen
5 |
FALSE,4,Lindängen - Västra hamnen,2,2025-01-05 14:27:33+00:00,11973.42.00,3358.39.00,3,01.35,0.0,10456.25.00,84.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,14,27,0,Lund Smörlyckan
6 |
FALSE,502,Lindängen - Västra hamnen,2,2025-01-05 14:45:15+00:00,7171.11.00,14479.54.00,3,01.35,0.0,10456.25.00,84.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,14,45,0,Båstad Kalkvägen
7 |
TRUE,511,Lund C - ESS (spårvagn),3,2025-01-05 12:44:36+00:00,12025.35.00,3912.03.00,5,01.25,0.0,1623.03.00,83.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,15,48,0,Lund Solbjer
8 |
TRUE,150,Lund C - ESS (spårvagn),3,2025-01-05 10:08:00+00:00,10014.42.00,4345.02.00,4,1.8,0.0,16011.59.00,97.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,10,8,1,Klågerup busstation
9 |
FALSE,5,Lund C - ESS (spårvagn),3,2025-01-05 10:09:00+00:00,10015.42.00,4346.02.00,6,01.25,0.0,1623.03.00,83.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,15,55,0,Klågerup busstation
10 |
TRUE,511,Lund C - ESS (spårvagn),3,2025-01-05 12:44:36+00:00,12025.35.00,3912.03.00,7,01.25,0.0,1623.03.00,83.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,15,48,0,Lund Solbjer
11 |
TRUE,150,Lund C - ESS (spårvagn),3,2025-01-05 10:08:00+00:00,10014.42.00,4345.02.00,7,1.8,0.0,16011.59.00,97.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,10,8,1,Klågerup busstation
12 |
TRUE,1,Lund C - ESS (spårvagn),3,2025-01-05 12:43:36+00:00,12024.35.00,3911.03.00,6,02.15,0.0,11580.39.00,90.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,12,43,0,Lund Solbjer
13 |
FALSE,817,Kristianstad - Simrishamn,1,2025-01-05 21:49:00+00:01,14016.02.00,1519.11.00,2,01.35,0.0,10456.25.00,84.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,14,20,1,Degeberga Skaddevägen
14 |
TRUE,174,Kristianstad - Simrishamn,1,2025-01-05 13:04:37+00:00,9985.36.00,166.37.00,1,1.6,0.0,9935.39.00,87.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,13,4,0,Malmö Spångatan
15 |
TRUE,1,Lund C - ESS (spårvagn),3,2025-01-05 12:43:36+00:00,12024.35.00,3911.03.00,4,02.15,0.0,11580.39.00,90.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,12,43,0,Lund Solbjer
16 |
FALSE,3,Kristianstad - Simrishamn,1,2025-01-05 21:49:00+00:00,14015.02.00,1518.11.00,1,1.4,0.2,1678.34.00,100.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,21,49,0,Degeberga Skaddevägen
17 |
FALSE,5,Lund C - ESS (spårvagn),3,2025-01-05 10:09:00+00:00,10015.42.00,4346.02.00,5,01.25,0.0,1623.03.00,83.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,15,55,0,Klågerup busstation