import gradio as gr
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
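
# This Gradio app computes an RFM (Recency, Frequency, Monetary) segmentation
# from an uploaded purchases CSV. Judging from the code below, the file is
# assumed to contain at least these columns:
#   'Email'        - customer identifier used for grouping
#   'Fecha compra' - purchase date, formatted MM/DD/YYYY
#   'Valor compra' - purchase amount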

def calculate_rfm(df):
    # Convert 'Fecha compra' to datetime (MM/DD/YYYY) and compute recency as
    # whole days since the purchase date; .dt.days replaces the deprecated
    # .astype('timedelta64[D]') path, which raises in pandas 2.x
    df['Fecha compra'] = pd.to_datetime(df['Fecha compra'], format='%m/%d/%Y')
    fecha_actual = pd.Timestamp.now().normalize()
    df['recencia'] = (fecha_actual - df['Fecha compra']).dt.days

    # Group by 'Email' and calculate frequency and monetary value
    grouped = df.groupby('Email')
    frequency = grouped['Email'].count().to_frame().rename(columns={"Email": "frecuencia"})
    monetary = grouped['Valor compra'].sum().to_frame().rename(columns={'Valor compra': 'monetario'})
    monetary['monetario'] = monetary['monetario'].round(2)

    # Join the recency dataframe with frequency and monetary dataframes
    df = df.join(frequency, on='Email')
    df = df.join(monetary, on='Email')

    # Keep only the latest purchase for each customer
    df = df.sort_values(by=['Email', 'Fecha compra'], ascending=False)
    df = df.drop_duplicates(subset='Email', keep='first')

    # Clean up the final dataframe
    df.drop(['Fecha compra', 'Valor compra'], axis=1, inplace=True)
    df.set_index('Email', inplace=True)

    # Standardise the RFM features; rows where any feature has |z-score| > 3
    # are flagged as outliers (note the scaler is fit before they are removed)
    scaler = StandardScaler()
    scaled_columns = ['recencia', 'frecuencia', 'monetario']
    scaled_values = scaler.fit_transform(df[scaled_columns])
    z_scores = np.abs(scaled_values)
    outlier_mask = (z_scores > 3).any(axis=1)

    for i, column in enumerate(scaled_columns):
        df[f"{column}_scaled"] = scaled_values[:, i]

    df = df[~outlier_mask]

    # Cluster the scaled features; random_state (rather than np.random.seed,
    # which scikit-learn ignores) makes the KMeans initialisation reproducible
    scaled_columns = ['recencia_scaled', 'frecuencia_scaled', 'monetario_scaled']
    kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)
    df = df.copy()
    df['cluster'] = kmeans.fit_predict(df[scaled_columns])

    # Map numeric cluster ids to segment names; KMeans label numbers are
    # arbitrary, so this fixed mapping only holds for this random_state/dataset
    df['cluster'] = df['cluster'].replace({4: 'Dormidos', 0: 'Nuevos', 2: 'Potenciales', 1: 'En riesgo', 3: 'Fieles'})

    # Drop the scaled columns
    df.drop(scaled_columns, axis=1, inplace=True)

    # Reset the index
    df = df.reset_index()

    # Return the desired columns
    return df[['Email', 'recencia', 'frecuencia', 'monetario', 'cluster']]
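
# Minimal usage sketch with synthetic data (hypothetical values; KMeans needs
# at least n_clusters=5 distinct customers to fit):
#
#   sample = pd.DataFrame({
#       'Email': ['a@x.com', 'a@x.com', 'b@x.com', 'c@x.com',
#                 'd@x.com', 'e@x.com', 'f@x.com'],
#       'Fecha compra': ['01/15/2024', '03/02/2024', '02/20/2024',
#                        '01/05/2024', '03/10/2024', '02/01/2024', '03/15/2024'],
#       'Valor compra': [10.0, 25.5, 40.0, 12.5, 99.9, 5.0, 60.0],
#   })
#   print(calculate_rfm(sample))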

def read_csv(file):
    # Gradio's File component passes a tempfile-like object; .name is its path
    df = pd.read_csv(file.name)
    # Show only the first 10 segmented customers in the app output
    return calculate_rfm(df).head(10)

demo = gr.Interface(fn=read_csv,
                    inputs=[gr.components.File(label="Select a CSV file")],
                    outputs="dataframe",
                    title="RFM Automatizado con Inteligencia Artificial")

if __name__ == "__main__":
    demo.launch()