Spaces:

Tevfik-istanbullu
/

ArabicTextClassification

Sleeping

File size: 2,532 Bytes

import joblib
import gradio as gr
from datasets import Dataset, DatasetDict, load_dataset
from huggingface_hub import login
import os

# --- Hugging Face authentication ---------------------------------------------
# The access token comes from the HF_TOKEN environment variable; the lookup
# yields None when the variable is not set.
# NOTE(review): recent huggingface_hub releases appear to deprecate the
# `write_permission` argument - confirm against the pinned version.
token = os.environ.get('HF_TOKEN')
login(
    token,
    add_to_git_credential=True,
    write_permission=True,
)

# --- Pre-trained artefacts ---------------------------------------------------
# Loaded once at import time from paths relative to the working directory.
model = joblib.load('arabic_text_classifier.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')
label_encoder = joblib.load('label_encoder.pkl')

# Class names known to the label encoder; shown in the UI description.
available_labels = label_encoder.classes_


def predict_category(text, min_confidence=0.5):
    """Classify *text* and return its label, or ``"Other"`` when unsure.

    Args:
        text: Raw input string. ``None``, empty, and whitespace-only input is
            reported as ``"Other"`` without consulting the model, because
            there is nothing to classify.
        min_confidence: Lowest top-class probability that is accepted as a
            real prediction. Defaults to 0.5.

    Returns:
        The label decoded by ``label_encoder`` for the most probable class,
        or the string ``"Other"`` when the highest probability is below
        ``min_confidence``.
    """
    if text is None or not text.strip():
        return "Other"

    text_vector = vectorizer.transform([text])
    probabilities = model.predict_proba(text_vector)[0]

    # Single inference pass: the winning class is read from the probability
    # row instead of calling model.predict() a second time.
    # Assumes predict_proba columns follow model.classes_ order (the
    # scikit-learn classifier convention) - confirm if the model type changes.
    best_index = probabilities.argmax()
    if probabilities[best_index] < min_confidence:
        return "Other"

    predicted_category = model.classes_[best_index]
    return label_encoder.inverse_transform([predicted_category])[0]

def flag_data(text, prediction, repo_id="Tevfik34/crowdsourced-text-classification-data"):
    """Append one (text, prediction) record to the crowdsourced Hub dataset.

    The current ``train`` split of ``repo_id`` is loaded, the new record is
    appended, and the result is pushed back to the same repository.

    Args:
        text: The user-supplied input text.
        prediction: The label produced for ``text``.
        repo_id: Hub dataset repository to read from and write to. Defaults
            to the repository this app has always used.

    Returns:
        None. If the existing dataset cannot be loaded for a reason other
        than "not found", the record is skipped and nothing is pushed.

    Raises:
        Whatever ``add_item`` or ``push_to_hub`` raise; those errors are not
        handled here.
    """
    import logging

    logger = logging.getLogger(__name__)

    try:
        dataset = load_dataset(repo_id, split="train")
    except FileNotFoundError:
        # Presumably covers "repo missing" and "repo has no data files yet"
        # (the datasets not-found errors derive from FileNotFoundError -
        # confirm for the pinned version). Only then is it safe to start
        # from an empty dataset.
        dataset = Dataset.from_dict({"text": [], "prediction": []})
    except Exception:
        # Any other failure (network outage, auth problem, ...) says nothing
        # about whether data already exists. Pushing a fresh one-row dataset
        # here could replace everything collected so far, so skip this record.
        logger.exception("Could not load %s; skipping flagging of this sample", repo_id)
        return

    # NOTE(review): values are wrapped in single-element lists, so each cell
    # is stored as a list. Kept as-is to stay compatible with whatever schema
    # the existing Hub dataset already has - confirm before changing.
    new_data = {"text": [text], "prediction": [prediction]}
    dataset = dataset.add_item(new_data)

    dataset.push_to_hub(repo_id)


def classify_and_flag(text):
    """Predict the category of *text* and record the pair via ``flag_data``.

    Recording is a best-effort side effect: if ``flag_data`` raises, the
    error is logged and the prediction is still returned, so a logging
    failure does not hide a successful classification from the caller.

    Args:
        text: The input text to classify.

    Returns:
        The value returned by ``predict_category(text)``.
    """
    import logging

    prediction = predict_category(text)
    try:
        flag_data(text, prediction)
    except Exception:
        # Boundary between the UI callback and the logging side effect:
        # report the failure but keep serving the prediction.
        logging.getLogger(__name__).exception("Failed to record sample")
    return prediction


# Input and output widgets for the classifier UI.
text_input = gr.Textbox(lines=5, placeholder="Enter text in Arabic here...", label="Text")
category_output = gr.Label(label="Predicted Category")

# Description text shown in the UI; the single ``{}`` placeholder receives the
# comma-separated label names.
description_template = """
    This interface allows you to classify Arabic text into different categories using a machine learning model trained on 160,000 real-world text samples.
    
    **Model Overview**:
    - The model is based on **Logistic Regression**.
    - It was trained on a large dataset of **160,000 Arabic text entries**, ensuring robustness and accuracy in classifying Arabic text.

    **How to use**:
    - Enter any Arabic text in the input box.
    - The model will predict the category that the text most likely belongs to.
    - If the model is uncertain, it will classify the text as 'Other'.
    
    **Available Labels**:
    The model can predict the following categories:
    - {}
    
    Try entering some text in Arabic to see how the model works.
    """
label_list = ", ".join(available_labels)

# Every submission goes through classify_and_flag.
interface = gr.Interface(
    fn=classify_and_flag,
    inputs=text_input,
    outputs=category_output,
    title="Arabic Text Classifier",
    description=description_template.format(label_list),
    theme="ParityError/Interstellar",
)

# Start the app as soon as the module is executed.
interface.launch()