# -*- coding: utf-8 -*-
"""app.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/11FAEDRYHuCI7iX5w3JaeKoD76-9pwrLi

Few-shot mental-health post classifier: BM25 retrieves similar labeled
training posts, GPT-4 classifies the input given those examples, and a
Gradio UI serves the result.
"""

import json
import os
import re
import sys

import gradio as gr
import openai
import pandas as pd
from datasets import load_dataset
from rank_bm25 import BM25Okapi

# Ensure Hugging Face CLI is authenticated — the hf:// dataset paths below
# require a token.
if "HF_TOKEN" not in os.environ:
    print("Please authenticate with Hugging Face CLI or set HF_TOKEN as an environment variable.")
    sys.exit(1)

# Explicitly define dataset file paths
data_files = {
    "train": "hf://datasets/farah1/mental-health-posts-classification/train.csv",
    "validation": "hf://datasets/farah1/mental-health-posts-classification/validation.csv",
}

# Load dataset. On failure we fall back to empty frames; the column checks
# below then raise with a clear message instead of a confusing KeyError.
try:
    print("Loading dataset...")
    dataset = load_dataset("csv", data_files=data_files)
    train_data = dataset["train"].to_pandas()
    validation_data = dataset["validation"].to_pandas()
    print("Dataset loaded successfully.")
    print("Train dataset columns:", train_data.columns)
except Exception as e:
    print(f"Failed to load dataset: {e}")
    train_data = pd.DataFrame()  # Fallback to empty DataFrame
    validation_data = pd.DataFrame()

# Check and create the 'text' column. fillna("") guards against NaN cells,
# which would otherwise propagate NaN through the string concatenation.
if "text" not in train_data.columns:
    if "title" in train_data.columns and "content" in train_data.columns:
        train_data["text"] = (
            train_data["title"].fillna("") + " " + train_data["content"].fillna("")
        )
    else:
        raise ValueError(
            "The 'text' column is missing, and the required 'title' and 'content' "
            "columns are not available to create it."
        )

# Ensure the necessary columns exist in the training dataset
required_columns = [
    "text",
    "Ground_Truth_Stress",
    "Ground_Truth_Anxiety",
    "Ground_Truth_Depression",
    "Ground_Truth_Other_binary",
]
for column in required_columns:
    if column not in train_data.columns:
        raise ValueError(f"Missing required column '{column}' in the training dataset.")

# Initialize BM25 over whitespace-tokenized training texts.
tokenized_train = [doc.split() for doc in train_data["text"]]
bm25 = BM25Okapi(tokenized_train)

# Set OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    raise ValueError("OpenAI API key is not set. Please set it as an environment variable.")


def classify_text(input_text, k=20):
    """Classify *input_text* into the four binary mental-health labels.

    Retrieves the ``k`` most BM25-similar training posts, builds a few-shot
    prompt from their ground-truth labels, and asks GPT-4 for a JSON verdict.

    Parameters
    ----------
    input_text : str
        Free text to classify.
    k : int, optional
        Number of few-shot examples to include (default 20; silently capped
        at the training-set size by the slice below).

    Returns
    -------
    dict
        The four ``Ground_Truth_*`` keys mapped to 0/1, or ``{"error": ...}``
        if the API call or JSON parsing fails.
    """
    # Tokenize the input the same way the BM25 index was built (whitespace).
    tokenized_text = input_text.split()

    # Get top-k similar examples using BM25.
    scores = bm25.get_scores(tokenized_text)
    top_k_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]

    # Build examples for prompt
    examples = "\n".join(
        f"Example {i+1}:\nText: {train_data.iloc[idx]['text']}\nClassification: "
        f"Stress={train_data.iloc[idx]['Ground_Truth_Stress']}, "
        f"Anxiety={train_data.iloc[idx]['Ground_Truth_Anxiety']}, "
        f"Depression={train_data.iloc[idx]['Ground_Truth_Depression']}, "
        f"Other={train_data.iloc[idx]['Ground_Truth_Other_binary']}\n"
        for i, idx in enumerate(top_k_indices)
    )

    # Construct the OpenAI prompt. The output format is requested as strict
    # JSON so the response can be fed straight to json.loads — the previous
    # markdown-bullet format ("- **Ground_Truth_Stress**: 1 or 0") made
    # parsing fail on every call.
    prompt = f"""
You are a mental health specialist. Classify the text into Stress, Anxiety, Depression, or Other:

### Examples:
{examples}

### Text to Classify:
"{input_text}"

### Output Format:
Respond with ONLY a JSON object and no other text, exactly in this shape:
{{"Ground_Truth_Stress": 0, "Ground_Truth_Anxiety": 0, "Ground_Truth_Depression": 0, "Ground_Truth_Other_binary": 0}}
where each value is 1 or 0.
"""
    try:
        response = openai.ChatCompletion.create(
            messages=[
                {"role": "system", "content": "You are a mental health specialist."},
                {"role": "user", "content": prompt},
            ],
            model="gpt-4",
            temperature=0,
        )
        results = response.choices[0].message.content
        # Strip markdown code fences the model sometimes wraps around JSON.
        results = re.sub(r"^```(?:json)?\s*|\s*```$", "", results.strip())
        return json.loads(results)
    except Exception as e:
        # Surface the failure to the UI rather than crashing the app.
        return {"error": str(e)}


# Gradio Interface
interface = gr.Interface(
    fn=classify_text,
    inputs=gr.Textbox(lines=5, placeholder="Enter text for classification..."),
    outputs="json",
    title="Mental Health Text Classifier",
    description="Classify text into Stress, Anxiety, Depression, or Other using BM25 and GPT-4.",
)

if __name__ == "__main__":
    interface.launch()