File size: 1,279 Bytes
8769306
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import os
import pandas as pd

# Load local data
def load_local_data(filepath="WheelyFunTimes/test_data.csv"):
    """Read the local test data CSV into a DataFrame.

    Args:
        filepath: Path of the CSV file to read. Defaults to the test data
            file at ``WheelyFunTimes/test_data.csv``; a relative path is
            resolved against the current working directory.

    Returns:
        The file's contents as returned by ``pd.read_csv``.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist. The error is left
            to propagate rather than being turned into ``None``.
    """
    return pd.read_csv(filepath)
    
def remove_near_duplicates(data, min_gap=pd.Timedelta(minutes=3)):
    """Drop rows that repeat a (route_id, stop_name) pair within a short time.

    Within each ``(route_id, stop_name)`` group the rows are visited in
    chronological order. The earliest row is kept, and every later row is
    kept only if its ``datetime`` is strictly more than ``min_gap`` after the
    most recently *kept* row of that group.

    The number of distinct ``trip_id`` values is printed before and after
    filtering.

    Args:
        data: DataFrame with ``trip_id``, ``route_id``, ``stop_name`` and
            ``datetime`` columns. It is not modified.
        min_gap: Minimum spacing between kept rows of one group; any value
            accepted by ``pd.Timedelta``. Defaults to 3 minutes.

    Returns:
        A new DataFrame containing the kept rows with their original index
        labels and with ``datetime`` converted by ``pd.to_datetime``. Rows
        are ordered by group key and, within a group, by time. When no row is
        kept the result is empty but still has all of the input's columns.
    """
    print(data["trip_id"].nunique())

    # Convert on a copy so the caller's frame keeps its original column.
    frame = data.assign(datetime=pd.to_datetime(data["datetime"]))
    min_gap = pd.Timedelta(min_gap)

    # Work with positional labels so that the selection below is unambiguous
    # even if the input index contains repeated labels.
    positional = frame[["route_id", "stop_name", "datetime"]].reset_index(drop=True)

    kept_positions = []
    for _, stamps in positional.groupby(["route_id", "stop_name"])["datetime"]:
        last_kept = None
        # Stable sort: rows sharing a timestamp keep their file order, so the
        # first of them is the one retained. Missing times (NaT) sort last and
        # never compare greater than the gap, so such a row is only kept when
        # it is the first row visited in its group.
        for position, stamp in stamps.sort_values(kind="mergesort").items():
            if last_kept is None or stamp - last_kept > min_gap:
                kept_positions.append(position)
                last_kept = stamp

    # Selecting from the frame (instead of rebuilding it from row objects)
    # preserves column dtypes and keeps the columns when nothing is selected.
    filtered_df = frame.iloc[kept_positions]
    print(filtered_df["trip_id"].nunique())
    return filtered_df
    
# Load the test data and preview its first twelve rows.
df = load_local_data()
print(df.head(n=12))

# Run the frame through remove_near_duplicates and preview the result the same way.
df = remove_near_duplicates(data=df)
print(df.head(n=12))