# -*- coding: utf-8 -*-
"""app.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/11FAEDRYHuCI7iX5w3JaeKoD76-9pwrLi
"""
import os
import json
import pandas as pd
from rank_bm25 import BM25Okapi
import gradio as gr
import openai
from datasets import load_dataset
# Ensure Hugging Face CLI is authenticated
if "HF_TOKEN" not in os.environ:
print("Please authenticate with Hugging Face CLI or set HF_TOKEN as an environment variable.")
exit(1)
# Explicitly define dataset file paths
data_files = {
    "train": "hf://datasets/farah1/mental-health-posts-classification/train.csv",
    "validation": "hf://datasets/farah1/mental-health-posts-classification/validation.csv",
}
# Load dataset
try:
    print("Loading dataset...")
    dataset = load_dataset("csv", data_files=data_files)
    train_data = dataset["train"].to_pandas()
    validation_data = dataset["validation"].to_pandas()
    print("Dataset loaded successfully.")
    print("Train dataset columns:", train_data.columns)
except Exception as e:
    print(f"Failed to load dataset: {e}")
    train_data = pd.DataFrame()  # Fallback to empty DataFrame
    validation_data = pd.DataFrame()
# Check and create the 'text' column
if "text" not in train_data.columns:
if "title" in train_data.columns and "content" in train_data.columns:
train_data["text"] = train_data["title"] + " " + train_data["content"]
else:
raise ValueError("The 'text' column is missing, and the required 'title' and 'content' columns are not available to create it.")
# Ensure the necessary columns exist in the training dataset
required_columns = ["text", "Ground_Truth_Stress", "Ground_Truth_Anxiety", "Ground_Truth_Depression", "Ground_Truth_Other_binary"]
for column in required_columns:
    if column not in train_data.columns:
        raise ValueError(f"Missing required column '{column}' in the training dataset.")
# Initialize BM25 over the whitespace-tokenized training texts
# (cast to str / fill missing values so empty rows do not break tokenization)
tokenized_train = [doc.split() for doc in train_data["text"].fillna("").astype(str)]
bm25 = BM25Okapi(tokenized_train)
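# At query time, bm25.get_scores(query_tokens) returns one relevance score per training
# document; classify_text() below keeps the k highest-scoring rows as few-shot examples.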
# Set OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    raise ValueError("OpenAI API key is not set. Please set it as an environment variable.")
# Few-shot classification function
def classify_text(input_text, k=20):
    # Tokenize input text
    tokenized_text = input_text.split()
    # Get top-k similar examples using BM25
    scores = bm25.get_scores(tokenized_text)
    top_k_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]
    # Build few-shot examples for the prompt
    examples = "\n".join(
        f"Example {i+1}:\nText: {train_data.iloc[idx]['text']}\nClassification: "
        f"Stress={train_data.iloc[idx]['Ground_Truth_Stress']}, "
        f"Anxiety={train_data.iloc[idx]['Ground_Truth_Anxiety']}, "
        f"Depression={train_data.iloc[idx]['Ground_Truth_Depression']}, "
        f"Other={train_data.iloc[idx]['Ground_Truth_Other_binary']}\n"
        for i, idx in enumerate(top_k_indices)
    )
    # Construct the OpenAI prompt; request a JSON object so the reply can be parsed with json.loads
    prompt = f"""
You are a mental health specialist. Classify the text into Stress, Anxiety, Depression, or Other:
### Examples:
{examples}
### Text to Classify:
"{input_text}"
### Output Format:
Respond with only a JSON object of the form:
{{"Ground_Truth_Stress": 1 or 0, "Ground_Truth_Anxiety": 1 or 0, "Ground_Truth_Depression": 1 or 0, "Ground_Truth_Other_binary": 1 or 0}}
"""
    try:
        response = openai.ChatCompletion.create(
            messages=[
                {"role": "system", "content": "You are a mental health specialist."},
                {"role": "user", "content": prompt},
            ],
            model="gpt-4",
            temperature=0,
        )
        results = response.choices[0].message.content
        # Any API or JSON-parsing failure is returned to the UI as an error dict
        return json.loads(results)
    except Exception as e:
        return {"error": str(e)}
# Gradio Interface
interface = gr.Interface(
    fn=classify_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter text for classification..."),
    outputs="json",
    title="Mental Health Text Classifier",
    description="Classify text into Stress, Anxiety, Depression, or Other using BM25 and GPT-4.",
)
if __name__ == "__main__":
interface.launch()
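    # On Hugging Face Spaces the app is served automatically; when running locally,
    # interface.launch(share=True) can optionally be used to expose a temporary public URL.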