File size: 2,333 Bytes
285612d 2703fdd 285612d d0c9304 76a52b4 285612d d0c9304 285612d d0c9304 76a52b4 285612d 76a52b4 285612d d0c9304 04fde16 76a52b4 285612d d0c9304 04fde16 d0c9304 285612d d0c9304 285612d 04fde16 76a52b4 04fde16 285612d 76a52b4 04fde16 76a52b4 5d9e0b8 d0c9304 5d9e0b8 d0c9304 2703fdd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import pandas as pd
def data_processing(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse rows sharing an 'id' into one row with the best content and score.

    For each id, the result holds the longest 'content' and the highest
    'score' found among that id's rows; the two values may come from
    different rows. Every other column is taken from the first input row of
    that id. The input DataFrame is not modified.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with columns 'id', 'content',
      and 'score'. Rows are picked by index label, so index labels are
      expected to be unique.

    Returns:
    - pd.DataFrame: A DataFrame with unique ids (ordered by id), where each
      id is associated with the longest content available and the highest
      score, plus a boolean column 'updated' that is True when the content
      or the score differs from the first input row of that id.
    """
    # Measure content length in a standalone Series instead of a helper
    # column, so the caller's DataFrame is never written to.
    content_length = df['content'].str.len()

    # Index label of the row with the longest content for each 'id'
    # (ties resolve to the first occurrence).
    idx_longest_content = content_length.groupby(df['id']).idxmax()
    df_longest_content = df.loc[idx_longest_content, ['id', 'content']]

    # Index label of the row with the highest score for each 'id'
    idx_highest_score = df.groupby('id')['score'].idxmax()
    df_highest_score = df.loc[idx_highest_score, ['id', 'score']]

    # One row per id carrying the best content and the best score
    df_merged = pd.merge(df_longest_content, df_highest_score, on='id')

    # Join back onto the input rows: this brings in the remaining columns and
    # exposes each original content/score under the '_original' suffix.
    df_merged = df_merged.merge(df, on='id', suffixes=('', '_original'))

    # A row counts as updated when EITHER value differs from its original;
    # comparing content alone would miss rows where only the score changed.
    df_merged['updated'] = (
        (df_merged['content'] != df_merged['content_original'])
        | (df_merged['score'] != df_merged['score_original'])
    )

    # The '_original' columns were only needed for the comparison above
    df_merged = df_merged.drop(columns=['content_original', 'score_original'])

    # The join produced one row per input row; keep the first one per id so
    # the extra columns and the 'updated' flag refer to the id's first row.
    return df_merged.drop_duplicates(subset='id')
if __name__ == '__main__':
    # Demo input: ids 1 and 2 each have two competing rows, id 3 has one.
    sample_frame = pd.DataFrame(
        {
            'id': [1, 1, 2, 2, 3],
            'content': ['short', 'much longer content', 'mid', 'size', 'constant'],
            'score': [10, 5, 7, 9, 6],
            'another_column': ['a', 'a', 'b', 'b', 'c'],
        }
    )

    # Show the raw rows first so the collapsed result can be compared to them.
    print("Original DataFrame:")
    print(sample_frame)

    print("\nFiltered DataFrame:")
    collapsed = data_processing(sample_frame)
    print(collapsed)
|