elli-teu committed on
Commit
8769306
·
1 Parent(s): 5184bfe

Test av borttagning av dubbletter, tillägg av hållplatser samt start på sortering

Browse files
Files changed (3) hide show
  1. app.py +115 -38
  2. test.py +44 -0
  3. test_data.csv +17 -0
app.py CHANGED
@@ -74,50 +74,70 @@ def get_buses():
74
  short_bus_list = list(pd.unique(bus_df["route_short_name"]))
75
  return bus_df, bus_list, short_bus_list
76
 
77
-
78
def remove_duplicate_trips(df, route_id_col="route_id", trip_id_col = "trip_id", stop_id_col = "stop_name", datetime_col = "datetime", time_window='3min'):
    """
    Removes duplicate trips based on route_id, start stop_id, and starting time proximity within a time window.

    A trip's start is its chronologically first row. Trips are compared within
    each (route, start stop) pair in order of start time: a trip is kept when
    it is the first of its pair or starts more than ``time_window`` after the
    most recently kept trip; otherwise all of its rows are dropped.

    Parameters:
    df (pd.DataFrame): Input DataFrame containing trip data. It is not modified.
    route_id_col (str): Column name for route IDs.
    trip_id_col (str): Column name for trip IDs.
    stop_id_col (str): Column name for stop IDs.
    datetime_col (str): Column name for departure times.
    time_window (str): Time window for considering trips as duplicates (e.g., '3min').

    Returns:
    pd.DataFrame: Filtered DataFrame with duplicates removed, sorted by
    route, stop and time, with the datetime column converted to datetimes.
    """
    # Work on a copy so the caller's frame keeps its original datetime column.
    df = df.copy()
    df[datetime_col] = pd.to_datetime(df[datetime_col])
    window = pd.to_timedelta(time_window)

    # The start of a trip is its earliest row; sorting by time alone (rather
    # than by stop name first) is what makes "first" mean "earliest".
    # NOTE(review): assumes a trip_id is not shared between routes - confirm.
    first_stops = (
        df.sort_values(by=datetime_col, kind="stable")
        .drop_duplicates(subset=trip_id_col, keep="first")
    )

    # Walk the starts of every (route, start stop) pair chronologically and
    # keep a trip only when it is far enough from the last trip that was kept.
    kept_trip_ids = []
    for _, starts in first_stops.groupby([route_id_col, stop_id_col]):
        last_kept = None
        for trip_id, start in zip(starts[trip_id_col], starts[datetime_col]):
            if last_kept is None or start - last_kept > window:
                kept_trip_ids.append(trip_id)
                last_kept = start

    # Retain every row of the surviving trips, in route/stop/time order.
    df = df.sort_values(by=[route_id_col, stop_id_col, datetime_col], kind="stable")
    result = df[df[trip_id_col].isin(kept_trip_ids)]
    return result
118
 
119
  def plot_graph(plot_df):
120
  #Nu vill vi plotta!
 
121
  categories = {0 : 'Empty',
122
  1: 'Many seats available',
123
  2:'Few seats available',
@@ -125,7 +145,7 @@ def plot_graph(plot_df):
125
  4:'Crushed standing room',
126
  5: 'Full'}
127
 
128
- plot_df = plot_df[["datetime", "vehicle_occupancystatus", "stop_name"]]
129
  plot_df = plot_df.sort_values("datetime")
130
  st.write(plot_df.head())
131
  st.write(plot_df.tail())
@@ -146,6 +166,36 @@ def plot_graph(plot_df):
146
  )
147
  st.altair_chart(chart, use_container_width=True)
148
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  def visualize(filtered_data):
150
  import folium
151
  from streamlit_folium import st_folium
@@ -198,7 +248,7 @@ def main():
198
  st.session_state.data = load_local_data("data.csv")
199
  if "first" not in st.session_state:
200
  st.session_state.first = True
201
- st.session_state.data = remove_duplicate_trips(st.session_state.data)
202
  else:
203
  # Fetch data if local data is invalid
204
  if st.session_state.hopsworks_project is None:
@@ -252,7 +302,18 @@ def main():
252
  st.session_state.direction = not st.session_state.direction
253
  print(st.session_state.direction)
254
 
255
-
 
 
 
 
 
 
 
 
 
 
 
256
  #direction = st.sidebar.checkbox('Direction of bus', value=True)
257
 
258
  today = datetime.now()
@@ -271,17 +332,16 @@ def main():
271
  start_time = st.sidebar.time_input("Select a start time", value=None)
272
  end_time = st.sidebar.time_input("Select an end time", value=None)
273
 
274
- #Plocka alla aktuella trip_ids från buses
275
- trips = buses_df[buses_df["route_long_name"]==bus]
276
- bus_trips = st.session_state.data[st.session_state.data["route_long_name"]==bus]
277
- bus_trips["datetime"] = pd.to_datetime(bus_trips["datetime"])
278
- bus_trips["datetime"] = bus_trips["datetime"].dt.tz_convert(None)
279
 
280
  #TODO remove
281
- trip_ids = list(trips["trip_id"])
282
- plot_df = st.session_state.data[st.session_state.data["trip_id"]==trip_ids[0]]
 
 
 
 
283
 
284
- #TODO direction
285
 
286
  print(f"start time {type(start_time)}")
287
  print(f"end time {type(end_time)}")
@@ -290,10 +350,27 @@ def main():
290
  if start_time != None and end_time != None:
291
  #TODO hur filtrera på tid?
292
  st.write(f"Displaying buses between {start_time.strftime('%H:%M')} and {end_time.strftime('%H:%M')} the {day_choice}")
 
 
 
293
  selected_trips = bus_trips[(bus_trips["datetime"] >= datetime.combine(date_options[day_choice], start_time))
294
  & (bus_trips["datetime"] <= datetime.combine(date_options[day_choice], end_time))
295
- & (bus_trips["direction_id"] == st.session_state.direction )]
 
296
  trip_ids = list(pd.unique(selected_trips["trip_id"]))
 
 
 
 
 
 
 
 
 
 
 
 
 
297
  st.write(f"Length {len(trip_ids)}")
298
  for id in trip_ids:
299
  plot_graph(st.session_state.data[st.session_state.data["trip_id"]==id])
 
74
  short_bus_list = list(pd.unique(bus_df["route_short_name"]))
75
  return bus_df, bus_list, short_bus_list
76
 
77
+ # Function to remove duplicates
78
def remove_near_duplicates(data):
    """Drop rows that repeat the same route and stop within three minutes.

    Rows are grouped by ``route_id`` and ``stop_name`` and walked in
    chronological order. A row is kept when it is the first of its group or
    lies more than three minutes after the most recently kept row of that
    group; the rows in between are treated as near-duplicates and dropped.

    Parameters:
    data (pd.DataFrame): Frame with ``route_id``, ``stop_name``, ``trip_id``
        and ``datetime`` columns. It is not modified.

    Returns:
    pd.DataFrame: The kept rows, ordered by group and then by time, carrying
    their original index labels and with ``datetime`` converted to datetimes.
    """
    # Diagnostic output: number of distinct trips before filtering.
    print(data["trip_id"].nunique())

    # Work on a copy so the caller's frame keeps its original datetime column.
    data = data.copy()
    data["datetime"] = pd.to_datetime(data["datetime"])
    window = pd.Timedelta(minutes=3)

    # Time differences are only meaningful on chronologically ordered rows.
    # The original labels are set aside and restored at the end so that
    # positions can be used safely even when the index has repeated labels.
    ordered = data.sort_values("datetime", kind="stable")
    original_labels = ordered.index
    ordered = ordered.reset_index(drop=True)

    keep = []
    for _, group in ordered.groupby(["route_id", "stop_name"]):
        last_kept = None
        for position, when in group["datetime"].items():
            if last_kept is None or when - last_kept > window:
                keep.append(position)
                last_kept = when

    # Selecting from the frame (instead of rebuilding it from row Series)
    # preserves column dtypes and also works when nothing is kept.
    filtered_df = ordered.iloc[keep]
    filtered_df.index = original_labels.take(keep)

    # Diagnostic output: number of distinct trips after filtering.
    print(filtered_df["trip_id"].nunique())
    return filtered_df
99
+
100
def remove_duplicate_trips(df, route_id_col="route_id", trip_id_col="trip_id", datetime_col="datetime", time_window='3min'):
    """
    Removes duplicate trips based on route_id and starting time proximity within a time window.

    A trip's start is its chronologically first row. Trips of the same route
    are compared in order of start time: a trip is kept when it is the first
    of its route or starts more than ``time_window`` after the most recently
    kept trip; otherwise all of its rows are dropped.

    Parameters:
    df (pd.DataFrame): Input DataFrame containing trip data. It is not modified.
    route_id_col (str): Column name for route IDs.
    trip_id_col (str): Column name for trip IDs.
    datetime_col (str): Column name for departure times.
    time_window (str): Time window for considering trips as duplicates (e.g., '3min').

    Returns:
    pd.DataFrame: Filtered DataFrame with duplicates removed, sorted by route
    and time, with the datetime column converted to datetimes.
    """
    # Diagnostic output: number of distinct trips before filtering.
    print(df[trip_id_col].nunique())

    # Work on a copy so the caller's frame keeps its original datetime column
    # and never receives helper columns.
    df = df.copy()
    df[datetime_col] = pd.to_datetime(df[datetime_col])

    # Sort by route_id and datetime for correct chronological order within each route
    df = df.sort_values(by=[route_id_col, datetime_col], kind="stable")
    window = pd.to_timedelta(time_window)

    # In the sorted frame the first row of every trip is that trip's start.
    # NOTE(review): assumes a trip_id is not shared between routes - confirm.
    trip_starts = df.drop_duplicates(subset=trip_id_col, keep="first")

    # Compare each start with the last *kept* start of the same route. The
    # first trip of a route has nothing to be compared against and is always
    # kept, so a route can no longer be wiped out as a whole.
    kept_trip_ids = []
    for _, route_starts in trip_starts.groupby(route_id_col):
        last_kept = None
        for trip_id, start in zip(route_starts[trip_id_col], route_starts[datetime_col]):
            if last_kept is None or start - last_kept > window:
                kept_trip_ids.append(trip_id)
                last_kept = start

    # Retain every row of the surviving trips.
    result = df[df[trip_id_col].isin(kept_trip_ids)]

    # Diagnostic output: number of distinct trips after filtering.
    print(result[trip_id_col].nunique())
    return result
137
 
138
  def plot_graph(plot_df):
139
  #Nu vill vi plotta!
140
+ #TODO ska den bara visa de stopp man vill eller alla?
141
  categories = {0 : 'Empty',
142
  1: 'Many seats available',
143
  2:'Few seats available',
 
145
  4:'Crushed standing room',
146
  5: 'Full'}
147
 
148
+ plot_df = plot_df[["datetime", "vehicle_occupancystatus", "stop_name", "route_id"]]
149
  plot_df = plot_df.sort_values("datetime")
150
  st.write(plot_df.head())
151
  st.write(plot_df.tail())
 
166
  )
167
  st.altair_chart(chart, use_container_width=True)
168
 
169
def plot_graph_title(plot_df, stop, time):
    """Render a titled step chart of occupancy per stop for one set of rows.

    Parameters:
    plot_df (pd.DataFrame): Rows to plot; must contain the ``datetime``,
        ``vehicle_occupancystatus``, ``stop_name`` and ``route_id`` columns.
        It is not modified.
    stop: Stop name interpolated into the chart title.
    time: Arrival time (already formatted by the caller) interpolated into
        the chart title.

    Returns:
    None. The chart is written to the Streamlit page.
    """
    # TODO: should the chart show only the selected stop(s) or all of them?
    categories = {0: 'Empty',
                  1: 'Many seats available',
                  2: 'Few seats available',
                  3: 'Standing room only',
                  4: 'Crushed standing room',
                  5: 'Full'}

    # Copy the column subset so that adding "Occupancy" below never writes
    # into a slice of the caller's frame.
    plot_df = plot_df[["datetime", "vehicle_occupancystatus", "stop_name", "route_id"]].copy()
    plot_df = plot_df.sort_values("datetime")
    plot_df["Occupancy"] = plot_df["vehicle_occupancystatus"].map(categories)

    # Explicit order for the Y categories, listed from 'Full' to 'Empty'.
    category_order = list(categories.values())
    category_order.reverse()

    # Create the Altair chart
    chart = alt.Chart(plot_df).mark_line(point=True, interpolate="step-after").encode(
        # sort=None keeps the stops in data order (chronological after the
        # sort above); the default for a nominal axis is alphabetical, which
        # would scramble the order in which the stops are visited.
        x=alt.X('stop_name:N', title="Stop name", sort=None),
        # Treat Y as categorical with a fixed category order.
        y=alt.Y('Occupancy:N', title="Vehicle Occupancy Status (Categories)", sort=category_order, scale=alt.Scale(domain=category_order)),
        tooltip=["datetime", 'stop_name', 'Occupancy']  # Add tooltips for interactivity
    ).properties(
        title=f"Vehicle Occupancy For Bus arriving at {stop} at {time}"
    )
    st.altair_chart(chart, use_container_width=True)
197
+
198
+
199
  def visualize(filtered_data):
200
  import folium
201
  from streamlit_folium import st_folium
 
248
  st.session_state.data = load_local_data("data.csv")
249
  if "first" not in st.session_state:
250
  st.session_state.first = True
251
+ #st.session_state.data = remove_near_duplicates(st.session_state.data)
252
  else:
253
  # Fetch data if local data is invalid
254
  if st.session_state.hopsworks_project is None:
 
302
  st.session_state.direction = not st.session_state.direction
303
  print(st.session_state.direction)
304
 
305
+ #Plocka alla aktuella trip_ids från buses
306
+ trips = buses_df[buses_df["route_long_name"]==bus]
307
+ bus_trips = st.session_state.data[st.session_state.data["route_long_name"]==bus]
308
+ bus_trips["datetime"] = pd.to_datetime(bus_trips["datetime"])
309
+ bus_trips["datetime"] = bus_trips["datetime"].dt.tz_convert(None)
310
+
311
+ stops = list(pd.unique(bus_trips["stop_name"]))
312
+ stop_choice = st.sidebar.selectbox(
313
+ "Select your bus stop:",
314
+ options=stops,
315
+ help="Select one bus stop to se occupancy."
316
+ )
317
  #direction = st.sidebar.checkbox('Direction of bus', value=True)
318
 
319
  today = datetime.now()
 
332
  start_time = st.sidebar.time_input("Select a start time", value=None)
333
  end_time = st.sidebar.time_input("Select an end time", value=None)
334
 
335
+
 
 
 
 
336
 
337
  #TODO remove
338
+ #trip_ids = list(trips["trip_id"])
339
+ #plot_df = st.session_state.data[st.session_state.data["trip_id"]==trip_ids[0]]
340
+
341
+ #TODO hållsplats
342
+ #Kolla på route_id för att plocka alla hållplatser
343
+
344
 
 
345
 
346
  print(f"start time {type(start_time)}")
347
  print(f"end time {type(end_time)}")
 
350
  if start_time != None and end_time != None:
351
  #TODO hur filtrera på tid?
352
  st.write(f"Displaying buses between {start_time.strftime('%H:%M')} and {end_time.strftime('%H:%M')} the {day_choice}")
353
+ """selected_trips = bus_trips[(bus_trips["datetime"] >= datetime.combine(date_options[day_choice], start_time))
354
+ & (bus_trips["datetime"] <= datetime.combine(date_options[day_choice], end_time))
355
+ & (bus_trips["direction_id"] == st.session_state.direction )]"""
356
  selected_trips = bus_trips[(bus_trips["datetime"] >= datetime.combine(date_options[day_choice], start_time))
357
  & (bus_trips["datetime"] <= datetime.combine(date_options[day_choice], end_time))
358
+ & (bus_trips["direction_id"] == st.session_state.direction )
359
+ & (bus_trips["stop_name"] == stop_choice)]
360
  trip_ids = list(pd.unique(selected_trips["trip_id"]))
361
+
362
+ chioce = selected_trips[selected_trips["stop_name"]==stop_choice]
363
+ chioce.head()
364
+ chioce = chioce[["trip_id", "stop_name", "datetime"]]
365
+ #Ev lägga stop_chioce i session_state
366
+
367
+ chioce = chioce.sort_values(by=["datetime"])
368
+ chioce = chioce.drop_duplicates("datetime")
369
+
370
+ for idx, row in chioce.iterrows():
371
+ st.write(f"The bus arrives at {row['stop_name']} at {row['datetime'].strftime('%H:%M')}")
372
+ plot_graph_title(st.session_state.data[st.session_state.data["trip_id"]==row["trip_id"]], row["stop_name"], row['datetime'].strftime('%H:%M'))
373
+
374
  st.write(f"Length {len(trip_ids)}")
375
  for id in trip_ids:
376
  plot_graph(st.session_state.data[st.session_state.data["trip_id"]==id])
test.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+
4
+ # Load local data
5
def load_local_data(filepath="WheelyFunTimes/test_data.csv"):
    """Load a CSV file into a DataFrame.

    Parameters:
    filepath (str): Path of the CSV file to read. Defaults to the test
        fixture path this script was written against.

    Returns:
    pd.DataFrame: The parsed file contents.

    Raises:
    FileNotFoundError: Propagated from ``pd.read_csv`` when the file does
        not exist.
    """
    return pd.read_csv(filepath)
14
+
15
def remove_near_duplicates(data):
    """Drop rows that repeat the same route and stop within three minutes.

    Rows are grouped by ``route_id`` and ``stop_name`` and walked in
    chronological order. A row is kept when it is the first of its group or
    lies more than three minutes after the most recently kept row of that
    group; the rows in between are treated as near-duplicates and dropped.

    Parameters:
    data (pd.DataFrame): Frame with ``route_id``, ``stop_name``, ``trip_id``
        and ``datetime`` columns. It is not modified.

    Returns:
    pd.DataFrame: The kept rows, ordered by group and then by time, carrying
    their original index labels and with ``datetime`` converted to datetimes.
    """
    # Diagnostic output: number of distinct trips before filtering.
    print(data["trip_id"].nunique())

    # Work on a copy so the caller's frame keeps its original datetime column.
    data = data.copy()
    data["datetime"] = pd.to_datetime(data["datetime"])
    window = pd.Timedelta(minutes=3)

    # Time differences are only meaningful on chronologically ordered rows.
    # The original labels are set aside and restored at the end so that
    # positions can be used safely even when the index has repeated labels.
    ordered = data.sort_values("datetime", kind="stable")
    original_labels = ordered.index
    ordered = ordered.reset_index(drop=True)

    keep = []
    for _, group in ordered.groupby(["route_id", "stop_name"]):
        last_kept = None
        for position, when in group["datetime"].items():
            if last_kept is None or when - last_kept > window:
                keep.append(position)
                last_kept = when

    # Selecting from the frame (instead of rebuilding it from row Series)
    # preserves column dtypes and also works when nothing is kept.
    filtered_df = ordered.iloc[keep]
    filtered_df.index = original_labels.take(keep)

    # Diagnostic output: number of distinct trips after filtering.
    print(filtered_df["trip_id"].nunique())
    return filtered_df
36
+
37
if __name__ == "__main__":
    # Manual smoke check: print the first rows of the fixture before and
    # after near-duplicate removal. Guarded so that importing this module
    # does not read the CSV or print anything.
    df = load_local_data()
    print(df.head(12))
    df = remove_near_duplicates(df)
    print(df.head(12))
41
+
42
+
43
+
44
+
test_data.csv ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ direction_id,route_short_name,route_long_name,route_id,datetime,stop_lat,stop_lon,trip_id,temperature_2m,precipitation,wind_speed_10m,hourly_cloud_cover,dag_i_vecka,arbetsfri_dag,holiday,helgdag,squeeze_day,helgdagsafton,day_before_holiday,hour,minute,vehicle_occupancystatus,stop_name
2
+ FALSE,NO4,Kristianstad - Simrishamn,1,2025-01-05 13:04:37+00:01,9986.36.00,167.37.00,2,1.6,0.0,9935.39.00,87.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,13,15,1,Malmö Spångatan
3
+ TRUE,8,Lindängen - Västra hamnen,2,2025-01-05 15:50:14+00:00,9586.03.00,410.28.00,3,01.25,0.0,1623.03.00,83.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,15,50,0,Malmö Nydala
4
+ FALSE,3,Lindängen - Västra hamnen,2,2025-01-05 07:45:42+00:00,363.43.00,3756.51.00,3,1.9,0.0,4126.46.00,100.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,7,45,0,Viby Kvarnnäsvägen
5
+ FALSE,4,Lindängen - Västra hamnen,2,2025-01-05 14:27:33+00:00,11973.42.00,3358.39.00,3,01.35,0.0,10456.25.00,84.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,14,27,0,Lund Smörlyckan
6
+ FALSE,502,Lindängen - Västra hamnen,2,2025-01-05 14:45:15+00:00,7171.11.00,14479.54.00,3,01.35,0.0,10456.25.00,84.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,14,45,0,Båstad Kalkvägen
7
+ TRUE,511,Lund C - ESS (spårvagn),3,2025-01-05 12:44:36+00:00,12025.35.00,3912.03.00,5,01.25,0.0,1623.03.00,83.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,15,48,0,Lund Solbjer
8
+ TRUE,150,Lund C - ESS (spårvagn),3,2025-01-05 10:08:00+00:00,10014.42.00,4345.02.00,4,1.8,0.0,16011.59.00,97.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,10,8,1,Klågerup busstation
9
+ FALSE,5,Lund C - ESS (spårvagn),3,2025-01-05 10:09:00+00:00,10015.42.00,4346.02.00,6,01.25,0.0,1623.03.00,83.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,15,55,0,Klågerup busstation
10
+ TRUE,511,Lund C - ESS (spårvagn),3,2025-01-05 12:44:36+00:00,12025.35.00,3912.03.00,7,01.25,0.0,1623.03.00,83.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,15,48,0,Lund Solbjer
11
+ TRUE,150,Lund C - ESS (spårvagn),3,2025-01-05 10:08:00+00:00,10014.42.00,4345.02.00,7,1.8,0.0,16011.59.00,97.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,10,8,1,Klågerup busstation
12
+ TRUE,1,Lund C - ESS (spårvagn),3,2025-01-05 12:43:36+00:00,12024.35.00,3911.03.00,6,02.15,0.0,11580.39.00,90.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,12,43,0,Lund Solbjer
13
+ FALSE,817,Kristianstad - Simrishamn,1,2025-01-05 21:49:00+00:01,14016.02.00,1519.11.00,2,01.35,0.0,10456.25.00,84.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,14,20,1,Degeberga Skaddevägen
14
+ TRUE,174,Kristianstad - Simrishamn,1,2025-01-05 13:04:37+00:00,9985.36.00,166.37.00,1,1.6,0.0,9935.39.00,87.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,13,4,0,Malmö Spångatan
15
+ TRUE,1,Lund C - ESS (spårvagn),3,2025-01-05 12:43:36+00:00,12024.35.00,3911.03.00,4,02.15,0.0,11580.39.00,90.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,12,43,0,Lund Solbjer
16
+ FALSE,3,Kristianstad - Simrishamn,1,2025-01-05 21:49:00+00:00,14015.02.00,1518.11.00,1,1.4,0.2,1678.34.00,100.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,21,49,0,Degeberga Skaddevägen
17
+ FALSE,5,Lund C - ESS (spårvagn),3,2025-01-05 10:09:00+00:00,10015.42.00,4346.02.00,5,01.25,0.0,1623.03.00,83.0,7,TRUE,TRUE,FALSE,FALSE,TRUE,FALSE,15,55,0,Klågerup busstation