import os | |
import pandas as pd | |
# Load local data
def load_local_data(filepath: str = "WheelyFunTimes/test_data.csv") -> pd.DataFrame:
    """Load the local trip test data from a CSV file.

    Args:
        filepath: Path of the CSV file to read. Defaults to the bundled
            test data set.

    Returns:
        A DataFrame containing the file's contents.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when the file
            does not exist.
    """
    return pd.read_csv(filepath)
def remove_near_duplicates(data: pd.DataFrame) -> pd.DataFrame:
    """Drop rows recorded within 3 minutes of the previously kept row
    for the same (route_id, stop_name) pair.

    Args:
        data: DataFrame with at least ``trip_id``, ``route_id``,
            ``stop_name`` and ``datetime`` columns; ``datetime`` may be
            strings or datetimes (it is parsed with ``pd.to_datetime``).

    Returns:
        A filtered copy of ``data`` whose ``datetime`` column is parsed
        to datetime64. The input frame is not modified.
    """
    print(data["trip_id"].nunique())
    # Work on a copy so the caller's frame is not mutated in place.
    data = data.copy()
    data["datetime"] = pd.to_datetime(data["datetime"])
    # Hoist the loop-invariant threshold out of the per-row comparison.
    min_gap = pd.Timedelta(minutes=3)
    kept_rows = []
    for _, group in data.groupby(["route_id", "stop_name"]):
        last_kept = None
        for _, row in group.iterrows():
            # Keep the first row of each group, and any row strictly more
            # than `min_gap` after the previously KEPT row (not the
            # previously seen row).
            if last_kept is None or (row["datetime"] - last_kept["datetime"] > min_gap):
                kept_rows.append(row)
                last_kept = row
    if kept_rows:
        filtered_df = pd.DataFrame(kept_rows)
    else:
        # Preserve the column schema when everything is filtered out so
        # the nunique() lookup below does not raise KeyError.
        filtered_df = data.iloc[0:0]
    print(filtered_df["trip_id"].nunique())
    return filtered_df
# Script entry: load the raw data, preview it, de-duplicate near-identical
# stop records, then preview the filtered result.
df = load_local_data()
print(df.head(12))  # preview before filtering
df = remove_near_duplicates(df)
print(df.head(12))  # preview after filtering