# curfox_model_trainer
#
# Sleeping
#
# File size: 14,398 Bytes

import asyncio
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import requests
import pandas as pd
import json
import os,datetime
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from joblib import dump, load
import numpy as np
import requests
import mysql.connector
from mysql.connector import Error


# ASGI application object; the route decorators below register handlers on it.
app = FastAPI()

# Install CORS handling that accepts any origin, any HTTP method and any
# request header, and allows credentialed requests.
# NOTE(review): FastAPI's CORS documentation says wildcard origins/methods/
# headers should not be combined with allow_credentials=True — confirm whether
# an explicit origin list is needed for the deployed front-end.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@app.get("/trigger_the_model_trainer")
async def train_the_model():
    """Train the XGBoost status classifier from the CSV dumps and persist it.

    Steps:
      1. Load and concatenate the trainer CSV files under ``model/``.
      2. Balance the classes of ``status_name`` by oversampling every class
         (with replacement) up to the size of the largest class.
      3. Keep the selected feature columns, drop rows with missing values and
         label-encode every object-typed column (including the target).
      4. Fit an ``XGBClassifier`` on an 80/20 train/test split.
      5. Dump the model and the fitted encoders with joblib.

    Returns:
        A tuple ``(accuracy, classification_report_text, message)``.

    Raises:
        FileNotFoundError: if one of the trainer CSV files is missing.
        KeyError: if an expected column is absent from the data.
    """
    csv_files = [
        'model/trainer_data.csv',
        'model/trainer_data2.csv',
        'model/trainer_data3.csv',
        'model/trainer_data4.csv',
    ]
    data = pd.concat((pd.read_csv(path) for path in csv_files), ignore_index=True)

    # Analyze class distribution
    class_distribution = data['status_name'].value_counts()
    print("Class Distribution before balancing:\n", class_distribution)

    # Every class is resampled up to the size of the largest one.
    max_class_size = class_distribution.max()

    # Build all resampled groups first and concatenate once, instead of
    # growing a DataFrame inside the loop.
    # NOTE(review): oversampling happens before the train/test split, so
    # duplicated rows can land in both sets and inflate the reported accuracy.
    oversampled_groups = [
        resample(group,
                 replace=True,              # sample with replacement
                 n_samples=max_class_size,  # match the majority class
                 random_state=123)          # reproducibility
        for _, group in data.groupby('status_name')
    ]
    oversampled_data = pd.concat(oversampled_groups, axis=0)

    print("Class Distribution after oversampling:\n",
          oversampled_data['status_name'].value_counts())

    # The previous version re-read two unrelated CSV files here only to print
    # their row counts, which made training fail when they were absent.
    data = oversampled_data
    print(data["customer_name"].count())

    selected_columns = ['customer_name', 'customer_address', 'customer_phone_no',
                        'weight', 'cod', 'pickup_address', 'client_number',
                        'destination_city', 'status_name']

    # Rows with any missing selected value are discarded rather than imputed.
    data_filled = data[selected_columns].dropna()

    # One LabelEncoder per object-typed column; the fitted encoders are saved
    # so the prediction endpoint can apply the same mapping.
    encoders = {col: LabelEncoder()
                for col in selected_columns
                if data_filled[col].dtype == 'object'}
    for col, encoder in encoders.items():
        data_filled[col] = encoder.fit_transform(data_filled[col])

    # Splitting the dataset
    X = data_filled.drop('status_name', axis=1)
    y = data_filled['status_name']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    params = {
        'colsample_bytree': 0.9,
        'learning_rate': 0.1,
        'max_depth': 30,
        'n_estimators': 600,
        'subsample': 0.9,
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }

    xgb = XGBClassifier(**params)
    xgb.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = xgb.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    # Persist the model and the encoders used to build its inputs
    model_filename = 'model/transexpress_xgb_model.joblib'
    dump(xgb, model_filename)

    encoders_filename = 'model/transexpress_encoders.joblib'
    dump(encoders, encoders_filename)

    return accuracy, classification_rep, "Model trained with new data"
    
@app.get("/trigger_the_data_fecher")
async def get_data(page: str, paginate: str):
    """Fetch one page of return-to-client orders and append it to the trainer CSV.

    Args:
        page: Page number to request from the report API.
        paginate: Number of records per page.

    Returns:
        ``{"page_number": page, "data_count": <records on this page>}``.

    Raises:
        requests.HTTPError: if the report API answers with an error status.
        requests.Timeout: if the report API does not answer in time.
    """
    print("data fetcher running.....")

    url = "https://report.transexpress.lk/api/orders/delivery-success-rate/return-to-client-orders"
    # NOTE(review): hard-coded session cookie — consider loading it from
    # configuration instead of source control.
    headers = {
      'Cookie': 'development_trans_express_session=NaFDGzh5WQCFwiortxA6WEFuBjsAG9GHIQrbKZ8B'
    }

    # Let requests build (and URL-encode) the query string; a timeout keeps
    # the worker from hanging forever on an unresponsive upstream.
    response = requests.get(
        url,
        params={"page": page, "per_page": paginate},
        headers=headers,
        timeout=60,
    )
    response.raise_for_status()

    # Extracting 'data' for conversion
    data = response.json()["return_to_client_orders"]['data']
    data_count = len(data)

    if not data:
        # An empty page has no 'status_name' column to normalise and nothing
        # to append, so report it without touching the CSV.
        print({"page_number": page, "data_count": data_count})
        return {"page_number": page, "data_count": data_count}

    df = pd.json_normalize(data)

    # Collapse status variants into the labels the trainer works with.
    df['status_name'] = df['status_name'].replace({
        'Partially Delivered': 'Delivered',
        'Received by Client': 'Returned to Client',
    })

    print("data collected from page : "+page)

    file_path = 'model/trainer_data5.csv'
    try:
        source_csv = pd.read_csv(file_path)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # First run (or an empty file): start the CSV from this page only.
        df.to_csv(file_path, index=False)
        print("data created")
    else:
        # Files written before index=False was used carry stray index columns
        # ("Unnamed: 0", "Unnamed: 0.1", ...); drop them so they stop piling up.
        stray_index_cols = source_csv.columns.str.startswith("Unnamed:")
        source_csv = source_csv.loc[:, ~stray_index_cols]
        combined_df_final = pd.concat([source_csv, df], ignore_index=True)
        combined_df_final.to_csv(file_path, index=False)
        print("data added")

    print({"page_number": page, "data_count": data_count})
    return {"page_number": page, "data_count": data_count}

@app.get("/get_module_versions")
async def get_versions():
    """Return the installed packages in ``pip freeze`` format.

    Returns:
        A list of requirement strings as produced by pip's freeze operation.
    """
    try:
        from pip._internal.operations import freeze
    except ImportError:  # pip < 10.0
        from pip.operations import freeze

    # freeze() yields lazily; materialise it once so that printing the entries
    # does not exhaust the iterator before it is returned to the client.
    pkgs = list(freeze.freeze())
    for pkg in pkgs:
        print(pkg)
    return pkgs
 

@app.get("/get_latest_model_updated_time")
async def model_updated_time():
    """Report modification times and size of the persisted model artefacts.

    Returns:
        A dict with the encoder file's modification time, the model file's
        modification time and the model file's size in bytes. If either file
        cannot be read, a one-element set holding an explanatory message is
        returned instead (kept as-is for existing clients).
    """
    model_path = 'model/transexpress_xgb_model.joblib'
    encoders_path = 'model/transexpress_encoders.joblib'
    try:
        file_size = os.path.getsize(model_path)
        m_time_encoder = os.path.getmtime(encoders_path)
        m_time_model = os.path.getmtime(model_path)
    except OSError:
        # Missing/unreadable artefacts: the model has not been trained yet.
        return {"no model found so first trained the model using data fecther"}

    return {"base model created time ": datetime.datetime.fromtimestamp(m_time_encoder),
            "last model updated time": datetime.datetime.fromtimestamp(m_time_model),
            "The size of the file is bytes": file_size
            }



# Database connection parameters, read by fetch_customer_data() each time it
# opens a MySQL connection.
# NOTE(review): credentials are hard-coded in source — consider loading them
# from environment variables or a secrets store and rotating this password.
DB_HOST = 'trans-prod-clone-staging.mysql.database.azure.com'
DB_PORT = 3306
DB_DATABASE = 'defaultdb'
DB_USERNAME = 'wwwdata'
DB_PASSWORD = 'fcLa8F3sxgNYQ$K@%'
# Connect to the database

#calling this function for each request
def fetch_customer_data(phone_number):
    """Look up past orders whose phone number contains ``phone_number``.

    A new MySQL connection is opened for every call and is always closed
    before returning, including when the query raises.

    Args:
        phone_number: Substring matched against ``orders.phone_no`` with
            ``LIKE %...%``. SQL wildcards inside the value are not escaped.

    Returns:
        A list of dict rows with the keys ``customer_name``,
        ``customer_address``, ``customer_phone_no`` and ``status_name``, or
        ``None`` if the connection reports that it is not connected.

    Raises:
        mysql.connector.Error: if connecting or querying fails.
    """
    connection = mysql.connector.connect(
        host=DB_HOST,
        port=DB_PORT,
        database=DB_DATABASE,
        user=DB_USERNAME,
        password=DB_PASSWORD
    )
    try:
        if not connection.is_connected():
            return None
        print("Connected to the database")

        # SQL query
        query = """
        SELECT 
            orders.customer_name AS customer_name,
            orders.address AS customer_address,
            orders.phone_no AS customer_phone_no,
            primary_statuses.name AS status_name
        FROM 
            orders
        INNER JOIN 
            statuses ON orders.status_id = statuses.id
        INNER JOIN 
            primary_statuses ON statuses.name = primary_statuses.key
        WHERE orders.phone_no LIKE %s
        """

        cursor = connection.cursor(dictionary=True)
        try:
            # The value is bound as a parameter; only the wildcards are added here.
            cursor.execute(query, (f"%{phone_number}%",))
            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Runs on success and on failure so connections are never leaked.
        connection.close()
        print("Database connection closed")


# Endpoint for making predictions
def _safe_transform(encoder, column):
    """Encode ``column`` with a fitted LabelEncoder, mapping unseen labels to -1."""
    classes = encoder.classes_
    return [encoder.transform([x])[0] if x in classes else -1 for x in column]


def _extract_phone_numbers(customer_phone):
    """Split a (possibly URL-encoded) '/'-separated phone field into numbers.

    Blank entries are dropped, so an empty field yields an empty list instead
    of a single empty string.
    """
    from urllib.parse import unquote

    decoded_phone = unquote(customer_phone)
    stripped = (phone.strip() for phone in decoded_phone.split('/'))
    return [phone for phone in stripped if phone]


def _calculate_delivery_factor(phone_number):
    """Estimate a delivery probability (0..1) from the customer's order history.

    The share of 'Delivered' orders among the customer's finished orders is
    clamped to [0.05, 0.95]; customers without relevant history get 0.50.
    A uniform random offset in [-0.05, 0.05] is then added.
    """
    orders = fetch_customer_data(phone_number)

    # Only finished orders say something about delivery success.
    valid_statuses = ['Failed to Deliver', 'Delivered', 'Returned to Client']
    relevant_orders = [order for order in orders if order['status_name'] in valid_statuses]

    if not relevant_orders:
        base_probability = 0.50
    else:
        delivered_count = sum(1 for order in relevant_orders
                              if order['status_name'] == 'Delivered')
        base_probability = delivered_count / len(relevant_orders)
        base_probability = max(0.05, min(base_probability, 0.95))

    # Add a narrower random component
    random_component = np.random.uniform(-0.05, 0.05)
    return base_probability + random_component


@app.post("/predict")
async def predict(
    date : str,
    customer_name: str,
    customer_address: str,
    customer_phone: str,
    weight: float,
    cod: int,
    pickup_address: str,
    client_number:str,
    destination_city:str
    ):
    """Predict the delivery outcome for an order.

    The primary estimate comes from the customer's order history in the
    database, looked up by the first phone number in ``customer_phone``. If
    that lookup fails for any reason (no phone number, database error, ...)
    the trained XGBoost model's prediction is used instead.

    Args:
        date: Accepted for API compatibility; not used by the prediction.
        customer_name, customer_address, customer_phone, weight, cod,
        pickup_address, client_number, destination_city: Order attributes,
            matching the feature columns the model was trained on.

    Returns:
        ``{"Probability": <percentage>, "predicted_status": <label>}``, or a
        one-element set holding a message when no trained model is available
        (kept as-is for existing clients).
    """
    try:
        # Load your trained model and encoders
        xgb_model = load('model/transexpress_xgb_model.joblib')
        encoders = load('model/transexpress_encoders.joblib')
    except Exception as e:
        # Any load failure (missing or unreadable artefact) is reported the
        # same way as before, but no longer swallows SystemExit/KeyboardInterrupt.
        print(f"Error: {e}")
        return {"no model found so first trained the model using data fecther"}

    # Convert input data to DataFrame
    input_data = {
        'customer_name': customer_name,
        'customer_address': customer_address,
        'customer_phone_no': customer_phone,
        'weight': float(weight),
        'cod': int(cod),
        'pickup_address': pickup_address,
        'client_number': client_number,
        'destination_city': destination_city
    }
    input_df = pd.DataFrame([input_data])

    # Encode categorical variables using the same encoders used during training
    for col in input_df.columns:
        if col in encoders:
            input_df[col] = _safe_transform(encoders[col], input_df[col])

    # Predict and obtain probabilities (used only by the fallback path below)
    pred = xgb_model.predict(input_df)
    pred_proba = xgb_model.predict_proba(input_df)

    try:
        print(customer_phone)
        phone_numbers = _extract_phone_numbers(customer_phone)
        print(phone_numbers, "api calling ......")
        # An empty list raises IndexError here and deliberately triggers the
        # model fallback, rather than querying the database with LIKE '%%'.
        probability = _calculate_delivery_factor(phone_numbers[0])
        probability = round((probability * 100), 2)

        print(f"new model probability: {probability}")
        predicted_status = "delivered"

    except Exception as e:
        print(f"Error: {e}")
        label_index = int(pred[0])
        if label_index == -1:
            # Previously this case produced the string "Unknown" and then
            # crashed on the numeric comparison below.
            return {"Probability": "Unknown", "predicted_status": "Unknown"}

        predicted_status = str(encoders['status_name'].inverse_transform([label_index])[0])
        # Convert the numpy scalar to a plain float so the response can be
        # serialised to JSON.
        probability = float(pred_proba[0][label_index]) * 100
        print(str(predicted_status), probability)
        if probability > 98:
            probability = probability - 1
        if predicted_status == "Returned to Client":
            # Report the chance of delivery, not the chance of a return.
            probability = 100 - probability

    return {"Probability": round(probability, 2), "predicted_status": predicted_status}