# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/11FAEDRYHuCI7iX5w3JaeKoD76-9pwrLi

Few-shot mental-health post classifier: BM25 retrieves similar labeled
training posts, GPT-4 classifies the input given those examples, and a
Gradio UI serves the result.
"""

import json
import os
import re
import sys

import gradio as gr
import openai
import pandas as pd
from datasets import load_dataset
from rank_bm25 import BM25Okapi

# Ensure Hugging Face CLI is authenticated — the hf:// dataset paths below
# require a token.
if "HF_TOKEN" not in os.environ:
    print("Please authenticate with Hugging Face CLI or set HF_TOKEN as an environment variable.")
    sys.exit(1)

# Explicitly define dataset file paths
data_files = {
    "train": "hf://datasets/farah1/mental-health-posts-classification/train.csv",
    "validation": "hf://datasets/farah1/mental-health-posts-classification/validation.csv",
}

# Load dataset. On failure we fall back to empty frames; the column checks
# below then raise with a clear message instead of a confusing KeyError.
try:
    print("Loading dataset...")
    dataset = load_dataset("csv", data_files=data_files)
    train_data = dataset["train"].to_pandas()
    validation_data = dataset["validation"].to_pandas()
    print("Dataset loaded successfully.")
    print("Train dataset columns:", train_data.columns)
except Exception as e:
    print(f"Failed to load dataset: {e}")
    train_data = pd.DataFrame()  # Fallback to empty DataFrame
    validation_data = pd.DataFrame()

# Check and create the 'text' column. fillna("") guards against NaN cells,
# which would otherwise propagate NaN through the string concatenation.
if "text" not in train_data.columns:
    if "title" in train_data.columns and "content" in train_data.columns:
        train_data["text"] = (
            train_data["title"].fillna("") + " " + train_data["content"].fillna("")
        )
    else:
        raise ValueError(
            "The 'text' column is missing, and the required 'title' and 'content' "
            "columns are not available to create it."
        )

# Ensure the necessary columns exist in the training dataset
required_columns = [
    "text",
    "Ground_Truth_Stress",
    "Ground_Truth_Anxiety",
    "Ground_Truth_Depression",
    "Ground_Truth_Other_binary",
]
for column in required_columns:
    if column not in train_data.columns:
        raise ValueError(f"Missing required column '{column}' in the training dataset.")

# Initialize BM25 over whitespace-tokenized training texts.
tokenized_train = [doc.split() for doc in train_data["text"]]
bm25 = BM25Okapi(tokenized_train)

# Set OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    raise ValueError("OpenAI API key is not set. Please set it as an environment variable.")


def classify_text(input_text, k=20):
    """Classify *input_text* into the four binary mental-health labels.

    Retrieves the ``k`` most BM25-similar training posts, builds a few-shot
    prompt from their ground-truth labels, and asks GPT-4 for a JSON verdict.

    Parameters
    ----------
    input_text : str
        Free text to classify.
    k : int, optional
        Number of few-shot examples to include (default 20; silently capped
        at the training-set size by the slice below).

    Returns
    -------
    dict
        The four ``Ground_Truth_*`` keys mapped to 0/1, or ``{"error": ...}``
        if the API call or JSON parsing fails.
    """
    # Tokenize the input the same way the BM25 index was built (whitespace).
    tokenized_text = input_text.split()

    # Get top-k similar examples using BM25.
    scores = bm25.get_scores(tokenized_text)
    top_k_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]

    # Build examples for prompt
    examples = "\n".join(
        f"Example {i+1}:\nText: {train_data.iloc[idx]['text']}\nClassification: "
        f"Stress={train_data.iloc[idx]['Ground_Truth_Stress']}, "
        f"Anxiety={train_data.iloc[idx]['Ground_Truth_Anxiety']}, "
        f"Depression={train_data.iloc[idx]['Ground_Truth_Depression']}, "
        f"Other={train_data.iloc[idx]['Ground_Truth_Other_binary']}\n"
        for i, idx in enumerate(top_k_indices)
    )

    # Construct the OpenAI prompt. The output format is requested as strict
    # JSON so the response can be fed straight to json.loads — the previous
    # markdown-bullet format ("- **Ground_Truth_Stress**: 1 or 0") made
    # parsing fail on every call.
    prompt = f"""
You are a mental health specialist. Classify the text into Stress, Anxiety, Depression, or Other:

### Examples:
{examples}

### Text to Classify:
"{input_text}"

### Output Format:
Respond with ONLY a JSON object and no other text, exactly in this shape:
{{"Ground_Truth_Stress": 0, "Ground_Truth_Anxiety": 0, "Ground_Truth_Depression": 0, "Ground_Truth_Other_binary": 0}}
where each value is 1 or 0.
"""
    try:
        response = openai.ChatCompletion.create(
            messages=[
                {"role": "system", "content": "You are a mental health specialist."},
                {"role": "user", "content": prompt},
            ],
            model="gpt-4",
            temperature=0,
        )
        results = response.choices[0].message.content
        # Strip markdown code fences the model sometimes wraps around JSON.
        results = re.sub(r"^```(?:json)?\s*|\s*```$", "", results.strip())
        return json.loads(results)
    except Exception as e:
        # Surface the failure to the UI rather than crashing the app.
        return {"error": str(e)}


# Gradio Interface
interface = gr.Interface(
    fn=classify_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter text for classification..."),
    outputs="json",
    title="Mental Health Text Classifier",
    description="Classify text into Stress, Anxiety, Depression, or Other using BM25 and GPT-4.",
)

if __name__ == "__main__":
    interface.launch()