|
import joblib |
|
import gradio as gr |
|
from datasets import Dataset, DatasetDict, load_dataset |
|
from huggingface_hub import login |
|
import os |
|
|
|
# Hugging Face access token, read from the environment (None when unset).
# NOTE(review): with no token, login() presumably falls back to its
# interactive prompt -- confirm that is acceptable where this app is deployed.
token = os.getenv('HF_TOKEN')

# Authenticate against the Hub and also store the token as a git credential.
# `write_permission` is intentionally not passed: huggingface_hub deprecated
# the argument, so passing it only emits a warning or fails on newer releases.
login(token, add_to_git_credential=True)

# NOTE: joblib.load unpickles these files, and unpickling can execute
# arbitrary code -- only ship artifacts that come from a trusted source.
model = joblib.load('arabic_text_classifier.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')
label_encoder = joblib.load('label_encoder.pkl')

# Class names known to the label encoder.
available_labels = label_encoder.classes_
|
def predict_category(text, threshold=0.5):
    """Classify ``text`` and return its category label, or ``"Other"``.

    The text is vectorized with the module-level ``vectorizer`` and scored
    with ``model.predict_proba``. When the highest class probability is below
    ``threshold`` the function returns ``"Other"``; otherwise the class chosen
    by ``model.predict`` is decoded through ``label_encoder``.

    Args:
        text: Raw input text. ``None``, empty, or whitespace-only input is
            reported as ``"Other"`` without calling the model.
        threshold: Minimum top-class probability required to report a
            concrete label. Defaults to 0.5.

    Returns:
        The decoded label, or the string ``"Other"``.
    """
    # Nothing to classify; this also avoids handing None to the vectorizer.
    if text is None or not text.strip():
        return "Other"

    text_vector = vectorizer.transform([text])
    probabilities = model.predict_proba(text_vector)[0]

    # Bail out before the second inference pass when confidence is too low.
    if max(probabilities) < threshold:
        return "Other"

    predicted_category = model.predict(text_vector)[0]
    return label_encoder.inverse_transform([predicted_category])[0]
|
|
|
def flag_data(text, prediction,
              repo_id="Tevfik34/crowdsourced-text-classification-data"):
    """Append one (text, prediction) record to the dataset stored on the Hub.

    Loads the ``train`` split of ``repo_id``, adds the new record, and pushes
    the resulting dataset back to the same repository.

    Args:
        text: The input text that was classified.
        prediction: The label returned for ``text``.
        repo_id: Hub dataset repository to read from and push to.

    Raises:
        Any error from ``load_dataset`` other than ``FileNotFoundError``, and
        any error from ``push_to_hub``, propagates to the caller.
    """
    try:
        dataset = load_dataset(repo_id, split="train")
    except FileNotFoundError:
        # Only a missing or empty dataset means "start from scratch". Other
        # failures (network errors, rate limits, ...) must propagate: falling
        # back to an empty dataset and pushing it would risk overwriting the
        # records collected so far with a single row.
        dataset = Dataset.from_dict({"text": [], "prediction": []})

    # NOTE(review): each value is wrapped in a one-element list, so the stored
    # row holds lists rather than plain strings. Kept as-is so new rows stay
    # compatible with data already on the Hub -- confirm this is intended.
    new_data = {"text": [text], "prediction": [prediction]}
    dataset = dataset.add_item(new_data)

    # NOTE: load -> append -> push is not atomic; concurrent requests can
    # drop each other's rows.
    dataset.push_to_hub(repo_id)
|
|
|
|
|
def classify_and_flag(text):
    """Predict the category of ``text``, record the pair, and return the label.

    The label comes from ``predict_category``; the input and label are then
    handed to ``flag_data`` before the label is returned to the caller.
    """
    label = predict_category(text)
    flag_data(text, label)
    return label
|
|
|
|
|
# Comma-separated list of the labels the encoder knows, shown in the UI text.
_label_summary = ", ".join(available_labels)

# Markdown shown under the title of the app.
_description = f"""
This interface allows you to classify Arabic text into different categories using a machine learning model trained on 160,000 real-world text samples.

**Model Overview**:
- The model is based on **Logistic Regression**.
- It was trained on a large dataset of **160,000 Arabic text entries**, ensuring robustness and accuracy in classifying Arabic text.

**How to use**:
- Enter any Arabic text in the input box.
- The model will predict the category that the text most likely belongs to.
- If the model is uncertain, it will classify the text as 'Other'.

**Available Labels**:
The model can predict the following categories:
- {_label_summary}

Try entering some text in Arabic to see how the model works.
"""

# Single-textbox UI: the entered text goes to classify_and_flag and the
# returned label is displayed in a Label component.
interface = gr.Interface(
    fn=classify_and_flag,
    inputs=gr.Textbox(
        lines=5,
        placeholder="Enter text in Arabic here...",
        label="Text",
    ),
    outputs=gr.Label(label="Predicted Category"),
    title="Arabic Text Classifier",
    description=_description,
    theme="ParityError/Interstellar",
)

interface.launch()