File size: 1,279 Bytes
8769306 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 |
import os
import pandas as pd
def load_local_data(filepath="WheelyFunTimes/test_data.csv"):
    """Read a local CSV file into a DataFrame.

    Args:
        filepath: Path of the CSV file to read. Defaults to the path this
            function previously hard-coded, so existing zero-argument
            calls behave as before.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when ``filepath``
            does not exist.
    """
    return pd.read_csv(filepath)
def remove_near_duplicates(data, min_gap_minutes=3):
    """Drop rows that closely follow an already-kept row of the same group.

    Rows are grouped by ``(route_id, stop_name)``. Within each group the rows
    are visited in chronological order; a row is kept when it is the first
    of its group or when its ``datetime`` is more than ``min_gap_minutes``
    after the previously kept row of that group.

    The number of unique ``trip_id`` values is printed before and after
    filtering.

    Args:
        data: DataFrame with ``trip_id``, ``route_id``, ``stop_name`` and
            ``datetime`` columns. It is not modified.
        min_gap_minutes: Spacing, in minutes, that a row must exceed relative
            to the last kept row of its group. Defaults to 3.

    Returns:
        A new DataFrame with the kept rows, ordered group by group, carrying
        their original index labels and with ``datetime`` converted by
        ``pd.to_datetime``. Empty when no row is kept.
    """
    print(data["trip_id"].nunique())

    # Convert on a copy so the caller's frame keeps its original column.
    frame = data.copy()
    frame["datetime"] = pd.to_datetime(frame["datetime"])
    min_gap = pd.Timedelta(minutes=min_gap_minutes)

    # Positional index: kept rows are later picked with .iloc, which stays
    # unambiguous even if the original index labels repeat.
    by_position = frame[["route_id", "stop_name", "datetime"]].reset_index(
        drop=True
    )

    kept_positions = []
    for _, stamps in by_position.groupby(["route_id", "stop_name"])["datetime"]:
        last_kept = None
        # The gap check only makes sense in chronological order; a stable
        # sort keeps the input order for equal timestamps. Missing
        # timestamps sort last and never satisfy the comparison below.
        for position, stamp in stamps.sort_values(kind="mergesort").items():
            if last_kept is None or stamp - last_kept > min_gap:
                kept_positions.append(position)
                last_kept = stamp

    # Selecting from the frame (rather than rebuilding it from row objects)
    # keeps the columns and dtypes, including when nothing was kept.
    filtered_df = frame.iloc[kept_positions]
    print(filtered_df["trip_id"].nunique())
    return filtered_df
# Script section: load the CSV, then preview the first rows before and after
# near-duplicate removal.
_PREVIEW_ROWS = 12

df = load_local_data()
print(df.head(_PREVIEW_ROWS))

df = remove_near_duplicates(df)
print(df.head(_PREVIEW_ROWS))
|