PereLluis13
committed on
Commit
•
5843d66
1
Parent(s):
1800ae6
Update README.md
Browse files
README.md
CHANGED
@@ -51,9 +51,9 @@ mREBEL is introduced in the ACL 2023 paper [RED^{FM}: a Filtered and Multilingua
|
|
51 |
url = "https://arxiv.org/abs/2306.09802",
|
52 |
}
|
53 |
|
54 |
-
The original repository for the paper can be found [here](https://github.com/Babelscape/rebel)
|
55 |
|
56 |
-
Be aware that the inference widget at the right does not output special tokens, which are necessary to distinguish the subject, object and relation types. For a demo of
|
57 |
|
58 |
## Pipeline usage
|
59 |
|
@@ -146,7 +146,11 @@ def extract_triplets_typed(text):
|
|
146 |
return triplets
|
147 |
|
148 |
# Load model and tokenizer
|
149 |
-
tokenizer = AutoTokenizer.from_pretrained("Babelscape/mrebel-large-32", src_lang="en_XX",
|
|
|
|
|
|
|
|
|
150 |
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/mrebel-large-32")
|
151 |
gen_kwargs = {
|
152 |
"max_length": 256,
|
@@ -166,7 +170,7 @@ model_inputs = tokenizer(text, max_length=256, padding=True, truncation=True, re
|
|
166 |
generated_tokens = model.generate(
|
167 |
model_inputs["input_ids"].to(model.device),
|
168 |
attention_mask=model_inputs["attention_mask"].to(model.device),
|
169 |
-
decoder_start_token_id =
|
170 |
**gen_kwargs,
|
171 |
)
|
172 |
|
@@ -179,6 +183,7 @@ for idx, sentence in enumerate(decoded_preds):
|
|
179 |
print(extract_triplets_typed(sentence))
|
180 |
```
|
181 |
|
|
|
182 |
## License
|
183 |
|
184 |
This model is licensed under the CC BY-NC-SA 4.0 license. The text of the license can be found [here](https://creativecommons.org/licenses/by-nc-sa/4.0/).
|
|
|
51 |
url = "https://arxiv.org/abs/2306.09802",
|
52 |
}
|
53 |
|
54 |
+
The original repository for the paper can be found [here](https://github.com/Babelscape/rebel#REDFM)
|
55 |
|
56 |
+
Be aware that the inference widget on the right does not output special tokens, which are necessary to distinguish the subject, object and relation types. For a demo of mREBEL and its pre-training dataset, check the [Spaces demo](https://huggingface.co/spaces/Babelscape/mrebel-demo).
|
57 |
|
58 |
## Pipeline usage
|
59 |
|
|
|
146 |
return triplets
|
147 |
|
148 |
# Load model and tokenizer
|
149 |
+
tokenizer = AutoTokenizer.from_pretrained("Babelscape/mrebel-large-32", src_lang="en_XX", tgt_lang="tp_XX")
|
150 |
+
# Here we set English ("en_XX") as the source language. To change the source language, swap the first token of the input for the desired language code, or set src_lang to another supported language. For Catalan ("ca_XX") or Greek ("el_EL"), which were not included in mBART pretraining, you need a workaround:
|
151 |
+
# tokenizer._src_lang = "ca_XX"
|
152 |
+
# tokenizer.cur_lang_code_id = tokenizer.convert_tokens_to_ids("ca_XX")
|
153 |
+
# tokenizer.set_src_lang_special_tokens("ca_XX")
|
154 |
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/mrebel-large-32")
|
155 |
gen_kwargs = {
|
156 |
"max_length": 256,
|
|
|
170 |
generated_tokens = model.generate(
|
171 |
model_inputs["input_ids"].to(model.device),
|
172 |
attention_mask=model_inputs["attention_mask"].to(model.device),
|
173 |
+
decoder_start_token_id = tokenizer.convert_tokens_to_ids("tp_XX"),
|
174 |
**gen_kwargs,
|
175 |
)
|
176 |
|
|
|
183 |
print(extract_triplets_typed(sentence))
|
184 |
```
|
185 |
|
186 |
+
|
187 |
## License
|
188 |
|
189 |
This model is licensed under the CC BY-NC-SA 4.0 license. The text of the license can be found [here](https://creativecommons.org/licenses/by-nc-sa/4.0/).
|