dataset-creator-reddit-uwaterloo

Running

File size: 2,333 Bytes

285612d
 
 
2703fdd
285612d
d0c9304
76a52b4
 
285612d
 
d0c9304
285612d
 
d0c9304
 
76a52b4
285612d
 
76a52b4
 
 
285612d
 
 
d0c9304
04fde16
76a52b4
285612d
d0c9304
04fde16
d0c9304
285612d
d0c9304
 
285612d
04fde16
76a52b4
 
04fde16
 
285612d
76a52b4
 
 
04fde16
 
 
76a52b4
5d9e0b8
 
d0c9304
 
 
5d9e0b8
 
 
 
d0c9304
 
 
 
 
 
 
2703fdd

import pandas as pd


def data_processing(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse rows that share an id into a single row per id.

    For each id the result carries the longest 'content' and the highest
    'score' found among the rows with that id; the two values may come from
    different input rows. When several rows tie, the first one wins. Any other
    input columns are taken from the first input row of that id.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id', 'content', and 'score'.
                         It is not modified, and its index is ignored.

    Returns:
    - pd.DataFrame: A DataFrame with unique ids, where each id is associated
                    with the longest content available and the highest score from
                    potentially different rows, and a boolean column 'updated'.
                    'updated' is True when the selected content differs from the
                    content of the first input row for that id. A change in
                    score alone does not set the flag.
    """

    # Work on a private frame with a fresh, unique positional index. This keeps
    # the caller's DataFrame untouched and guarantees that the labels returned
    # by idxmax() identify exactly one row each, even when the input index
    # contains duplicates (e.g. after pd.concat without ignore_index).
    work = df.reset_index(drop=True)

    # Length of each content string, kept as a standalone Series so that no
    # helper column leaks into the input or the output.
    content_length = work['content'].str.len()

    # Row holding the longest content for each id
    idx_longest_content = content_length.groupby(work['id']).idxmax()
    longest_content = work.loc[idx_longest_content, ['id', 'content']]

    # Row holding the highest score for each id
    idx_highest_score = work.groupby('id')['score'].idxmax()
    highest_score = work.loc[idx_highest_score, ['id', 'score']]

    # One row per id: best content next to best score
    merged = longest_content.merge(highest_score, on='id')

    # Join back onto every input row; the input's own 'content' and 'score'
    # come back with an '_original' suffix so they can be compared.
    merged = merged.merge(work, on='id', suffixes=('', '_original'))

    # Only content is compared here; score differences are not considered.
    merged['updated'] = merged['content'] != merged['content_original']

    # The suffixed columns were only needed for the comparison above
    merged = merged.drop(columns=['content_original', 'score_original'])

    # The join yields one row per input row, in input order within each id, so
    # keeping the first row per id keeps the comparison against the first
    # input row of that id.
    return merged.drop_duplicates(subset='id')


if __name__ == '__main__':
    # Hand-made sample: ids 1 and 2 each appear twice with different
    # content and score, id 3 appears once.
    column_names = ['id', 'content', 'score', 'another_column']
    sample_rows = [
        (1, 'short', 10, 'a'),
        (1, 'much longer content', 5, 'a'),
        (2, 'mid', 7, 'b'),
        (2, 'size', 9, 'b'),
        (3, 'constant', 6, 'c'),
    ]
    df = pd.DataFrame(sample_rows, columns=column_names)

    # Show the input first, then the collapsed one-row-per-id result.
    print("Original DataFrame:")
    print(df)
    print("\nFiltered DataFrame:")
    print(data_processing(df))