Spaces:
Runtime error
Runtime error
import sys | |
import time | |
import os | |
import argilla as rg | |
import pandas as pd | |
import requests | |
from datasets import load_dataset, concatenate_datasets | |
from argilla.listeners import listener | |
# Hugging Face access token read from the environment; None when unset,
# in which case pushing validated records to the Hub is skipped.
HF_TOKEN = os.getenv("HF_TOKEN")

# Hub repository the records are loaded from.
HUB_DATASET_NAME = "mserras/alpaca-es-hackaton"

# Hub repository the validated records are pushed to.
HUB_DATASET_NAME_VAL = "mserras/alpaca-es-hackaton-validated"
# The loader calls `save_validated_to_hub.start()`, which only exists on an
# Argilla listener object, so this callback must be wrapped by `@listener`.
# The dataset name matches the one configured and logged by the loader.
# NOTE(review): the query and the polling interval are assumptions - confirm
# them against the intended deployment settings.
@listener(
    dataset="somos-alpaca-es",
    query="status:Validated",
    execution_interval_in_seconds=1200,
)
def save_validated_to_hub(records, ctx):
    """Push the records handed over by the listener to the Hugging Face Hub.

    Args:
        records: Records matched by the listener query. An empty (or missing)
            collection is reported and nothing is pushed.
        ctx: Listener execution context supplied by Argilla (unused here).

    The push only happens when ``HF_TOKEN`` is set; otherwise a reminder is
    printed and the function returns without contacting the Hub.
    """
    if not records:
        print("NO RECORDS found")
        return

    ds = rg.DatasetForTextClassification(records=records).to_datasets()

    if not HF_TOKEN:
        print("SET HF_TOKEN and HUB_DATASET_NAME TO SYNC YOUR DATASET!!!")
        return

    print("Pushing the dataset")
    print(ds)
    ds.push_to_hub(HUB_DATASET_NAME_VAL, token=HF_TOKEN)
class LoadDatasets:
    """Initialise the Argilla client and load the hackathon dataset into it."""

    def __init__(self, api_key, workspace="team"):
        """Connect the Argilla client.

        Args:
            api_key: API key passed to ``rg.init``.
            workspace: Argilla workspace passed to ``rg.init``.
        """
        rg.init(api_key=api_key, workspace=workspace)

    @staticmethod
    def load_somos():
        """Load the Hub dataset into Argilla and start the sync listener.

        The dataset named by ``HUB_DATASET_NAME`` is downloaded, converted to
        text-classification records, and logged to the ``somos-alpaca-es``
        Argilla dataset. If the download fails or yields nothing, the method
        prints a notice and returns without logging or starting the listener.

        Declared static because it uses no instance state; it can be called
        on the class or on an instance.
        """
        # Read the dataset from the Hub.
        try:
            print(f"Trying to sync with {HUB_DATASET_NAME}")
            dataset = load_dataset(HUB_DATASET_NAME, split="train")
        except Exception as e:
            print(f"Not possible to sync with {HUB_DATASET_NAME}")
            print(e)
            dataset = None

        # Check for a missing/empty dataset BEFORE touching it, so a failed
        # download is skipped instead of raising AttributeError on None.
        if dataset is None or len(dataset) == 0:
            print("There is no DATASET - Skipping!")
            return

        # Drop the "metrics" column only when it is present, so a dataset
        # without it does not abort the load.
        if "metrics" in dataset.column_names:
            dataset = dataset.remove_columns("metrics")

        print("Generating records from the dataset")
        records = rg.DatasetForTextClassification.from_datasets(dataset)

        settings = rg.TextClassificationSettings(
            label_schema=[
                "BAD INSTRUCTION",
                "BAD INPUT",
                "BAD OUTPUT",
                "INAPPROPRIATE",
                "BIASED",
                "ALL GOOD",
                "HALLUCINATION",
                "UNPROCESSABLE",
            ]
        )
        rg.configure_dataset(name="somos-alpaca-es", settings=settings, workspace="team")

        print("Logging the dataset!")
        # Log the dataset
        rg.log(
            records,
            name="somos-alpaca-es",
            tags={"description": "SomosNLP Hackathon dataset - instruction filtering version"},
            batch_size=200,
        )

        # Run the listener that syncs validated records back to the Hub.
        save_validated_to_hub.start()
if __name__ == "__main__":
    # Expected command line: <script> <API_KEY> <LOAD_DATASETS>
    # Passing "none" (any casing) as LOAD_DATASETS skips dataset loading.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <API_KEY> <LOAD_DATASETS>")

    API_KEY = sys.argv[1]
    LOAD_DATASETS = sys.argv[2]

    if LOAD_DATASETS.lower() == "none":
        print("No datasets being loaded")
    else:
        # Poll the local server until it answers with HTTP 200, then load the
        # dataset once and stop polling.
        while True:
            try:
                # A timeout keeps a stalled connection from blocking the
                # retry loop forever.
                response = requests.get("http://0.0.0.0:6900/", timeout=10)
                if response.status_code == 200:
                    ld = LoadDatasets(API_KEY)
                    ld.load_somos()
                    break
            except requests.exceptions.ConnectionError:
                # Server is not accepting connections yet; retry below.
                pass
            except Exception as e:
                # Any other failure (including one raised while loading) is
                # reported, followed by a longer back-off before retrying.
                print(e)
                time.sleep(10)

            time.sleep(5)

    # Keep the process alive indefinitely.
    # NOTE(review): the flattened source does not show whether this loop was
    # nested under the `else` branch; it is placed at the top level of the
    # main guard here - confirm against the original layout.
    while True:
        time.sleep(60)