ejbejaranos and dvilasuero (HF staff) committed
Commit 30a30bf
0 parent(s)

Duplicate from somosnlp/somos-alpaca-es


Co-authored-by: Daniel Vila <dvilasuero@users.noreply.huggingface.co>

Files changed (4):
  1. .gitattributes +34 -0
  2. Dockerfile +5 -0
  3. README.md +13 -0
  4. load_data.py +73 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,5 @@
+ FROM argilla/argilla-quickstart:latest
+
+ COPY load_data.py /
+
+ CMD whoami && /start_quickstart_argilla.sh
README.md ADDED
@@ -0,0 +1,13 @@
+ ---
+ title: Hackathon SomosNLP Reto Datasets LLM Español
+ emoji: 🦙 🏷️
+ colorFrom: purple
+ colorTo: red
+ sdk: docker
+ app_port: 6900
+ fullWidth: true
+ tags:
+ - argilla
+ - somosnlp
+ duplicated_from: somosnlp/somos-alpaca-es
+ ---
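For anyone reproducing this setup, the Space can also be duplicated programmatically instead of through the web UI. The snippet below is a minimal sketch, assuming a recent huggingface_hub client that provides duplicate_space; the target namespace is a placeholder, not a value from this commit.

from huggingface_hub import duplicate_space

# Duplicate the original hackathon Space into your own namespace (placeholder repo id).
duplicate_space("somosnlp/somos-alpaca-es", to_id="<your-user>/somos-alpaca-es")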
load_data.py ADDED
@@ -0,0 +1,73 @@
+ # Copyright 2021-present, the Recognai S.L. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import sys
+ import time
+
+ import argilla as rg
+ import pandas as pd
+ import requests
+ from argilla.labeling.text_classification import Rule, add_rules
+ from datasets import load_dataset
+
+
+ class LoadDatasets:
+     def __init__(self, api_key, workspace="team"):
+         rg.init(api_key=api_key, workspace=workspace)
+
+
+     @staticmethod
+     def load_somos():
+         print("Loading somos dataset")
+         # Read the dataset from the Hub
+         dataset = load_dataset("somosnlp/somos-alpaca-es", split="train")
+         dataset = dataset.remove_columns("metrics")  # if this fails, this line can be commented out
+         records = rg.DatasetForTextClassification.from_datasets(dataset)
+
+         # Log the dataset into Argilla
+         rg.log(
+             records,
+             name="somos-alpaca-es",
+             tags={"description": "SomosNLP Hackathon dataset"},
+         )
+         settings = rg.TextClassificationSettings(
+             label_schema=["BAD INSTRUCTION", "BAD INPUT", "BAD OUTPUT", "INAPPROPRIATE", "BIASED", "ALL GOOD"]
+         )
+         rg.configure_dataset(name="somos-alpaca-es", settings=settings, workspace="team")
+
+
+ if __name__ == "__main__":
+     API_KEY = sys.argv[1]
+     LOAD_DATASETS = sys.argv[2]
+
+     if LOAD_DATASETS.lower() == "none":
+         print("No datasets being loaded")
+     else:
+         while True:  # poll until the Argilla server answers, then load the dataset once
+             try:
+                 response = requests.get("http://0.0.0.0:6900/")
+                 if response.status_code == 200:
+                     ld = LoadDatasets(API_KEY)
+
+                     ld.load_somos()
+                     break
+
+             except requests.exceptions.ConnectionError:
+                 pass
+             except Exception as e:
+                 print(e)
+                 time.sleep(10)
+                 pass
+
+             time.sleep(5)
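Once the Space is running and annotations have been collected, the records logged by load_data.py can be pulled back out of Argilla and exported to the Hub. The snippet below is a minimal sketch assuming the same Argilla 1.x client API used above (rg.init, rg.load, to_datasets); the URL, API key, and target repo id are placeholders, not values from this commit.

import argilla as rg

# Connect to the running Argilla instance (placeholder URL, key, and workspace).
rg.init(api_url="http://0.0.0.0:6900", api_key="<your-api-key>", workspace="team")

# Pull the annotated records back from the dataset created by load_data.py.
dataset = rg.load("somos-alpaca-es")

# Convert to a Hugging Face dataset and push it to a placeholder Hub repo.
dataset.to_datasets().push_to_hub("<your-user>/somos-alpaca-es-annotations")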