Text Classification · sentence-transformers · PyTorch · setfit · Spanish · mpnet

mserras committed
Commit 2a2462b · 1 Parent(s): 42547a8
README.md CHANGED
@@ -1,31 +1,18 @@
 ---
+license: apache-2.0
 tags:
 - setfit
 - sentence-transformers
 - text-classification
 pipeline_tag: text-classification
-datasets:
-- mserras/alpaca-es-hackaton
-- somosnlp/somos-clean-alpaca-es
-language:
-- es
 ---

 # mserras/setfit-alpaca-es-unprocessable-sample-detection

-This is a [SetFit model](https://github.com/huggingface/setfit) that can be used for filtering the Alpaca ES instruction dataset.
-
-The base model is the multilingual [paraphrase-multilingual-mpnet-base-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2) model from Sentence Transformers.
-
-This model was developed during the 2023 hackathon organized by [SomosNLP](https://somosnlp.org/) ([HF card](https://huggingface.co/somosnlp)), with GPUs provided by [Q Blocks](https://www.qblocks.cloud).
-
-It was trained on "unprocessable" samples of the translated [Clean Alpaca Es](https://huggingface.co/datasets/somosnlp/somos-clean-alpaca-es) dataset from the HF [Argilla](https://argilla.io) space https://huggingface.co/spaces/mserras/somos-alpaca-es.
-
-To this end, a custom tag is proposed: "unprocessable", which marks instruction/input/output triplets that require processing images, fetching information from the open web, or similar tasks where the LLM has no ability to act, and thus ends up hallucinating or producing strange outputs.
-
-As this model was trained on samples of Alpaca, which were generated using ChatGPT 3.5, it **cannot be used for commercial purposes or to compete against OpenAI**.
+This is a [SetFit model](https://github.com/huggingface/setfit) that can be used for text classification. The model has been trained using an efficient few-shot learning technique that involves:
+
+1. Fine-tuning a [Sentence Transformer](https://www.sbert.net) with contrastive learning.
+2. Training a classification head with features from the fine-tuned Sentence Transformer.

 ## Usage

@@ -39,36 +26,13 @@ You can then run inference as follows:

 ```python
 from setfit import SetFitModel
-import argilla as rg
-

 # Download from Hub and run inference
 model = SetFitModel.from_pretrained("mserras/setfit-alpaca-es-unprocessable-sample-detection")
-
-def instruct_fields_to_text(field_instruction: str, field_input: str, field_output: str):
-    """Given the instruction, input and output fields, return a text to be used by setfit."""
-    return f"INSTRUCTION:\n{field_instruction}\nINPUT:\n{field_input}\nOUTPUT:\n{field_output}\n"
-
-def sample_to_text(sample: rg.TextClassificationRecord) -> str:
-    """Convert an Argilla TextClassificationRecord to a text to be used by setfit."""
-    return instruct_fields_to_text(sample.inputs["1-instruction"], sample.inputs["2-input"], sample.inputs["3-output"])
-
-# For a given Argilla record:
-
-unprocessable_score = model.predict_proba([sample_to_text(argilla_record)])[0].tolist()[1]
-
+# Run inference
+preds = model(["i loved the spiderman movie!", "pineapple on pizza is the worst 🤮"])
 ```

-## Evaluation
-
-*Disclaimer*: No formal evaluation was done, just a few people inspecting the data and the outcomes.
-
-## Changelog
-
-- [09/04/2023] SQL code generation, date conversion, percentage discounts and renewable energy topics are no longer detected as unprocessable.
-- [06/04/2023] Password generation is no longer detected as unprocessable.
-
-
 ## BibTeX entry and citation info

 ```bibtex
@@ -82,4 +46,4 @@ publisher = {arXiv},
 year = {2022},
 copyright = {Creative Commons Attribution 4.0 International}
 }
-```
+```
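The diff swaps the Argilla-based scoring walkthrough for the stock SetFit card, whose two-step recipe maps onto the `setfit` training API of this period (`SetFitTrainer` in setfit 0.x). A minimal sketch of that recipe, assuming the post-commit MPNet backbone; the example triplets, label ordering, and trainer defaults here are illustrative, not taken from this repo:

```python
from datasets import Dataset
from setfit import SetFitModel, SetFitTrainer  # setfit 0.x API

# Illustrative few-shot data: instruction/input/output triplets rendered as text,
# mirroring the INSTRUCTION:/INPUT:/OUTPUT: format of the removed helper functions.
# Label 1 = "unprocessable" is assumed from the removed predict_proba(...)[1] usage.
train_ds = Dataset.from_dict({
    "text": [
        "INSTRUCTION:\nTranslate to French\nINPUT:\nGood morning\nOUTPUT:\nBonjour\n",
        "INSTRUCTION:\nDescribe the attached image\nINPUT:\n\nOUTPUT:\n(hallucinated)\n",
    ],
    "label": [0, 1],
})

# Step 1: contrastive fine-tuning of the sentence embedder;
# step 2: fitting a classification head on the resulting embeddings.
model = SetFitModel.from_pretrained("sentence-transformers/paraphrase-mpnet-base-v2")
trainer = SetFitTrainer(model=model, train_dataset=train_ds)
trainer.train()

print(model.predict_proba(["INSTRUCTION:\nFetch today's news\nINPUT:\n\nOUTPUT:\n...\n"]))
```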
config.json CHANGED
@@ -1,13 +1,11 @@
 {
-  "_name_or_path": "/home/mserras/Downloads/setfit-model/backup-model-setfit-unprocessable/",
+  "_name_or_path": "/home/mserras/Downloads/setfit-model-nomulti/backup-model-setfit-unprocessable/",
   "architectures": [
-    "XLMRobertaModel"
+    "MPNetModel"
   ],
   "attention_probs_dropout_prob": 0.1,
   "bos_token_id": 0,
-  "classifier_dropout": null,
   "eos_token_id": 2,
-  "gradient_checkpointing": false,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 768,
@@ -15,15 +13,12 @@
   "intermediate_size": 3072,
   "layer_norm_eps": 1e-05,
   "max_position_embeddings": 514,
-  "model_type": "xlm-roberta",
+  "model_type": "mpnet",
   "num_attention_heads": 12,
   "num_hidden_layers": 12,
-  "output_past": true,
   "pad_token_id": 1,
-  "position_embedding_type": "absolute",
+  "relative_attention_num_buckets": 32,
   "torch_dtype": "float32",
   "transformers_version": "4.27.4",
-  "type_vocab_size": 1,
-  "use_cache": true,
-  "vocab_size": 250002
+  "vocab_size": 30527
 }
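The backbone swap above (XLM-RoBERTa → MPNet, vocabulary 250,002 → 30,527) can be sanity-checked from the Hub without downloading the weights; a small sketch using the standard `transformers` config loader:

```python
from transformers import AutoConfig

# Fetches only config.json (a few KB), not the 438 MB weight file.
cfg = AutoConfig.from_pretrained("mserras/setfit-alpaca-es-unprocessable-sample-detection")
print(cfg.model_type)   # "mpnet" after this commit ("xlm-roberta" before)
print(cfg.vocab_size)   # 30527 after this commit (250002 before)
```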
model_head.pkl CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fad7d4c0ce4f486ba42180d19aa1647bcde1c0847b6b8b29004c86ab4d1b98de
+oid sha256:a895306e4dfca3415d2d3c6700eab5bd0eded8b713bb6bb2b27a53600dcc95ff
 size 6991
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7130a90b18531f9303ffac78c1a02fbf97be1098c86ae7b584de8a9c425580d8
-size 1112242989
+oid sha256:a7e35ed7e8ee9d2a85b66741c57f9858eb69a6c15eb1eea1b5015161b6096589
+size 438013677
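The new weight size is consistent with the backbone swap: 438,013,677 bytes / 4 bytes per float32 parameter ≈ 110M parameters, in line with MPNet-base, whereas the old 1,112,242,989 bytes ≈ 278M parameters, matching XLM-RoBERTa-base with its far larger multilingual embedding matrix (250,002 × 768 embeddings alone account for roughly 192M parameters).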
sentence_bert_config.json CHANGED
@@ -1,4 +1,4 @@
 {
-  "max_seq_length": 128,
+  "max_seq_length": 512,
   "do_lower_case": false
 }
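Raising `max_seq_length` from 128 to 512 matters for this model because the rendered INSTRUCTION/INPUT/OUTPUT triplets are long, and anything beyond the limit is silently truncated before embedding. A quick check, assuming the repo loads as a plain Sentence Transformer (SetFit checkpoints do):

```python
from sentence_transformers import SentenceTransformer

st = SentenceTransformer("mserras/setfit-alpaca-es-unprocessable-sample-detection")
print(st.max_seq_length)  # 512 after this commit; longer inputs are truncated
```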
special_tokens_map.json CHANGED
@@ -11,5 +11,5 @@
   },
   "pad_token": "<pad>",
   "sep_token": "</s>",
-  "unk_token": "<unk>"
+  "unk_token": "[UNK]"
 }
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b60b6b43406a48bf3638526314f3d232d97058bc93472ff2de930d43686fa441
-size 17082913
+oid sha256:5858fbf3342d891d40efa32ba2478c09c895e7eb3d94cdb721651126f0ba58f9
+size 711831
tokenizer_config.json CHANGED
@@ -1,7 +1,30 @@
 {
-  "bos_token": "<s>",
-  "cls_token": "<s>",
-  "eos_token": "</s>",
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "__type": "AddedToken",
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "do_basic_tokenize": true,
+  "do_lower_case": true,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
   "mask_token": {
     "__type": "AddedToken",
     "content": "<mask>",
@@ -11,9 +34,33 @@
     "single_word": false
   },
   "model_max_length": 512,
-  "pad_token": "<pad>",
-  "sep_token": "</s>",
+  "never_split": null,
+  "pad_token": {
+    "__type": "AddedToken",
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "__type": "AddedToken",
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
   "special_tokens_map_file": null,
-  "tokenizer_class": "XLMRobertaTokenizer",
-  "unk_token": "<unk>"
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "MPNetTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "[UNK]",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
 }
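To confirm the tokenizer side of the migration, the declared class (`MPNetTokenizer`) and the new `[UNK]` token can be checked with a short sketch; `AutoTokenizer` normally returns the fast variant of the declared class:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("mserras/setfit-alpaca-es-unprocessable-sample-detection")
print(type(tok).__name__)  # e.g. "MPNetTokenizerFast"
print(tok.unk_token)       # "[UNK]" after this commit ("<unk>" before)
```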