farah1 committed
Commit edaa6c5 · verified · 1 parent: a180cf5

Update app.py

Files changed (1):
  1. app.py +18 -41
app.py CHANGED
@@ -16,52 +16,29 @@ import gradio as gr
 import openai
 from datasets import get_dataset_config_names, get_dataset_split_names
 
-# Verify configurations
-configurations = get_dataset_config_names("farah1/mental-health-posts-classification")
-print("Available configurations:", configurations)
-
-# Verify splits for the default configuration
-splits = get_dataset_split_names("farah1/mental-health-posts-classification", config="default")
-print("Available splits:", splits)
-
-# Set Hugging Face token for authentication
-HF_TOKEN = os.getenv("HF_TOKEN")  # Ensure this environment variable is set
-
-# Dataset URL (modify as needed)
-DATASET_NAME = "farah1/mental-health-posts-classification"
-BASE_URL = "https://datasets-server.huggingface.co/rows"
-HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
+from datasets import load_dataset
+import os
 
-def fetch_dataset(split="train", offset=0, length=50):
-    """
-    Fetch dataset split from Hugging Face using its API.
-    Handles small datasets by adjusting length dynamically.
-    """
-    dataset_url = f"{BASE_URL}?dataset={DATASET_NAME}&config=default&split={split}&offset={offset}&length={length}"
-    try:
-        response = requests.get(dataset_url, headers=HEADERS)
-        response.raise_for_status()
-        rows = response.json().get("rows", [])
-        if not rows:
-            raise ValueError("No data returned. Check dataset split or parameters.")
-        records = [row["row"] for row in rows]
-        return pd.DataFrame(records)
-    except requests.exceptions.HTTPError as e:
-        print(f"HTTP error occurred: {e}")
-    except Exception as e:
-        print(f"Error fetching dataset: {e}")
-    return pd.DataFrame()  # Return an empty DataFrame on failure
+# Set Hugging Face token
+HF_TOKEN = os.getenv("HF_TOKEN")
+if not HF_TOKEN:
+    raise ValueError("Hugging Face token is not set. Please set HF_TOKEN as an environment variable.")
 
+# Explicitly define dataset file paths
+data_files = {
+    "train": "hf://datasets/farah1/mental-health-posts-classification/train.csv",
+    "validation": "hf://datasets/farah1/mental-health-posts-classification/validation.csv",
+}
 
-# Fetch and prepare the train dataset
+# Load dataset with explicit files
 try:
-    print("Fetching the dataset...")
-    train_data = fetch_dataset(split="train", offset=0, length=1000)
-    train_data["text"] = train_data["title"] + " " + train_data["content"]
-    print(f"Loaded {len(train_data)} rows from the training dataset.")
+    dataset = load_dataset("csv", data_files=data_files, use_auth_token=HF_TOKEN)
+    train_data = dataset["train"]
+    validation_data = dataset["validation"]
+    print("Train data sample:")
+    print(train_data.head())
 except Exception as e:
-    print(f"Failed to fetch dataset: {e}")
-    exit(1)
+    print(f"Failed to load dataset: {e}")
 
 # Initialize BM25
 tokenized_train = [doc.split() for doc in train_data["text"]]
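
Note: as committed, the new loading block likely fails in two places: datasets.Dataset has no .head() method (that is a pandas API), and the combined "text" column consumed by the BM25 setup below is no longer built anywhere. Below is a minimal corrected sketch, assuming the CSVs carry the "title" and "content" columns that the removed code referenced; token= is the current name for the deprecated use_auth_token= argument in recent datasets releases.

import os

from datasets import load_dataset

HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("Hugging Face token is not set. Please set HF_TOKEN as an environment variable.")

# Same explicit file paths as in the commit.
data_files = {
    "train": "hf://datasets/farah1/mental-health-posts-classification/train.csv",
    "validation": "hf://datasets/farah1/mental-health-posts-classification/validation.csv",
}

try:
    dataset = load_dataset("csv", data_files=data_files, token=HF_TOKEN)
    # Rebuild the combined "text" column that the BM25 tokenization expects;
    # "title" and "content" are the column names used by the removed fetch code
    # (an assumption about the CSV schema).
    dataset = dataset.map(lambda row: {"text": row["title"] + " " + row["content"]})
    train_data = dataset["train"]
    validation_data = dataset["validation"]
    print("Train data sample:")
    print(train_data[:5])  # a Dataset has no .head(); slice it, or use train_data.to_pandas().head()
except Exception as e:
    print(f"Failed to load dataset: {e}")
    raise  # stand-in for the removed exit(1): stop before train_data is used while undefined

Without the re-raise, a failed load would leave train_data undefined by the time the BM25 tokenization runs; if the dataset files already include a text column, the map step can be dropped.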