farah1 committed
Commit edaa6c5 · verified · 1 parent: a180cf5

Update app.py

Files changed (1):
  1. app.py +18 -41
app.py CHANGED
@@ -16,52 +16,29 @@ import gradio as gr
 import openai
 from datasets import get_dataset_config_names, get_dataset_split_names
 
-# Verify configurations
-configurations = get_dataset_config_names("farah1/mental-health-posts-classification")
-print("Available configurations:", configurations)
-
-# Verify splits for the default configuration
-splits = get_dataset_split_names("farah1/mental-health-posts-classification", config="default")
-print("Available splits:", splits)
-
-# Set Hugging Face token for authentication
-HF_TOKEN = os.getenv("HF_TOKEN")  # Ensure this environment variable is set
-
-# Dataset URL (modify as needed)
-DATASET_NAME = "farah1/mental-health-posts-classification"
-BASE_URL = "https://datasets-server.huggingface.co/rows"
-HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"}
+from datasets import load_dataset
+import os
 
-def fetch_dataset(split="train", offset=0, length=50):
-    """
-    Fetch dataset split from Hugging Face using its API.
-    Handles small datasets by adjusting length dynamically.
-    """
-    dataset_url = f"{BASE_URL}?dataset={DATASET_NAME}&config=default&split={split}&offset={offset}&length={length}"
-    try:
-        response = requests.get(dataset_url, headers=HEADERS)
-        response.raise_for_status()
-        rows = response.json().get("rows", [])
-        if not rows:
-            raise ValueError("No data returned. Check dataset split or parameters.")
-        records = [row["row"] for row in rows]
-        return pd.DataFrame(records)
-    except requests.exceptions.HTTPError as e:
-        print(f"HTTP error occurred: {e}")
-    except Exception as e:
-        print(f"Error fetching dataset: {e}")
-    return pd.DataFrame()  # Return an empty DataFrame on failure
+# Set Hugging Face token
+HF_TOKEN = os.getenv("HF_TOKEN")
+if not HF_TOKEN:
+    raise ValueError("Hugging Face token is not set. Please set HF_TOKEN as an environment variable.")
 
+# Explicitly define dataset file paths
+data_files = {
+    "train": "hf://datasets/farah1/mental-health-posts-classification/train.csv",
+    "validation": "hf://datasets/farah1/mental-health-posts-classification/validation.csv",
+}
 
-# Fetch and prepare the train dataset
+# Load dataset with explicit files
 try:
-    print("Fetching the dataset...")
-    train_data = fetch_dataset(split="train", offset=0, length=1000)
-    train_data["text"] = train_data["title"] + " " + train_data["content"]
-    print(f"Loaded {len(train_data)} rows from the training dataset.")
+    dataset = load_dataset("csv", data_files=data_files, use_auth_token=HF_TOKEN)
+    train_data = dataset["train"]
+    validation_data = dataset["validation"]
+    print("Train data sample:")
+    print(train_data.head())
 except Exception as e:
-    print(f"Failed to fetch dataset: {e}")
-    exit(1)
+    print(f"Failed to load dataset: {e}")
 
 # Initialize BM25
 tokenized_train = [doc.split() for doc in train_data["text"]]
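
Note: as committed, the new loading block likely fails in two places: datasets.Dataset has no .head() method (that is a pandas API), and the combined "text" column consumed by the BM25 setup below is no longer built anywhere. Below is a minimal corrected sketch, assuming the CSVs carry the "title" and "content" columns that the removed code referenced; token= is the current name for the deprecated use_auth_token= argument in recent datasets releases.

import os

from datasets import load_dataset

HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise ValueError("Hugging Face token is not set. Please set HF_TOKEN as an environment variable.")

# Same explicit file paths as in the commit.
data_files = {
    "train": "hf://datasets/farah1/mental-health-posts-classification/train.csv",
    "validation": "hf://datasets/farah1/mental-health-posts-classification/validation.csv",
}

try:
    dataset = load_dataset("csv", data_files=data_files, token=HF_TOKEN)
    # Rebuild the combined "text" column that the BM25 tokenization expects;
    # "title" and "content" are the column names used by the removed fetch code
    # (an assumption about the CSV schema).
    dataset = dataset.map(lambda row: {"text": row["title"] + " " + row["content"]})
    train_data = dataset["train"]
    validation_data = dataset["validation"]
    print("Train data sample:")
    print(train_data[:5])  # a Dataset has no .head(); slice it, or use train_data.to_pandas().head()
except Exception as e:
    print(f"Failed to load dataset: {e}")
    raise  # stand-in for the removed exit(1): stop before train_data is used while undefined

Without the re-raise, a failed load would leave train_data undefined by the time the BM25 tokenization runs; if the dataset files already include a text column, the map step can be dropped.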