derek-thomas's picture
derek-thomas HF staff
Refactoring to make it clear which functions are user-defined
2703fdd
raw
history blame
2.35 kB
import pandas as pd
def data_processing(df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse rows sharing an 'id' into one row holding the longest content
    and the highest score found for that id.

    The longest content and the highest score are looked up independently,
    so they may originate from different input rows. The input DataFrame is
    left untouched: no helper columns are written to it.

    Parameters:
    - df (pd.DataFrame): Input with at least the columns 'id', 'content'
      and 'score'.

    Returns:
    - pd.DataFrame: One row per id with columns 'id', 'content', 'score',
      any additional input columns, and a boolean column 'updated'.
      'updated' is True when the combined content or score differs from the
      single input row that is kept for that id after de-duplication (the
      first one the merge yields for the id). Additional columns are taken
      from that same row. The index is whatever remains after
      de-duplication; it is not reset.
    """
    # Measure content length in a standalone Series rather than adding a
    # column to `df`, so the caller's DataFrame is never modified.
    content_length = df['content'].str.len()

    # Index label of the longest content per id
    idx_longest_content = content_length.groupby(df['id']).idxmax().values
    df_longest_content = df.loc[idx_longest_content, ['id', 'content']]

    # Index label of the highest score per id
    idx_highest_score = df.groupby('id')['score'].idxmax().values
    df_highest_score = df.loc[idx_highest_score, ['id', 'score']]

    # Combine the best content and best score for each id
    df_merged = pd.merge(df_longest_content, df_highest_score, on='id')

    # Join back onto the input rows: this yields one row per original row,
    # with the original values available under the '_original' suffix.
    df_merged = df_merged.merge(df, on='id', suffixes=('', '_original'))
    df_merged['updated'] = (
        (df_merged['content'] != df_merged['content_original'])
        | (df_merged['score'] != df_merged['score_original'])
    )

    # Keep a single row per id (the first), which also fixes which original
    # row the 'updated' flag was compared against.
    df_merged = df_merged.drop_duplicates(subset='id')

    # The original content/score were only needed for the comparison
    return df_merged.drop(columns=['content_original', 'score_original'])
if __name__ == '__main__':
    # Small hand-written sample: ids 1 and 2 appear twice, id 3 only once
    sample_columns = {
        'id': [1, 1, 2, 2, 3],
        'content': ['short', 'much longer content', 'mid', 'size', 'constant'],
        'score': [10, 5, 7, 9, 6],
        'another_column': ['a', 'a', 'b', 'b', 'c'],
    }
    sample_df = pd.DataFrame(sample_columns)

    # Show the input first, then the result of collapsing it per id
    print("Original DataFrame:", sample_df, sep="\n")
    print("\nFiltered DataFrame:")
    print(data_processing(sample_df))