model v1.0
Browse files- README.md +133 -0
- config.json +30 -0
- pytorch_model.bin +3 -0
- scheduler.pt +3 -0
- special_tokens_map.json +1 -0
- spiece.model +3 -0
- tokenizer.json +0 -0
- tokenizer_config.json +1 -0
- training_args.bin +3 -0
README.md
ADDED
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
------------
|
2 |
+
## Arabic and English News Summarization NLP Model
|
3 |
+
|
4 |
+
### About
|
5 |
+
|
6 |
+
This model is for summarizing news stories in short highlights for both Arabic and English tasks.
|
7 |
+
|
8 |
+
نموذج معرفي متخصص في تلخيص الأخبار العربية و الإنجليزية الى مجموعة من أهم النقاط
|
9 |
+
|
10 |
+
### Fine-Tuning
|
11 |
+
|
12 |
+
The model was fine-tuned from the [Arabic T5 Model](https://huggingface.co/bakrianoo/t5-arabic-large), which was developed by [Abu Bakr Soliman](http://github.com/bakrianoo).
|
13 |
+
|
14 |
+
The primary summarization model was also developed by the same developer.
|
15 |
+
|
16 |
+
### How to Use
|
17 |
+
|
18 |
+
- You can use this [Colab Notebook](https://colab.research.google.com/drive/1DWND1CAfCXD4OxrfmLBEaKeXhjGmYkod?usp=sharing) to test the model
|
19 |
+
|
20 |
+
1. Install [PyTorch](https://pytorch.org/)
|
21 |
+
|
22 |
+
2. Install the following Python packages
|
23 |
+
|
24 |
+
`$ pip3 install transformers==4.7.0 nltk==3.5 protobuf==3.15.3 sentencepiece==0.1.96`
|
25 |
+
|
26 |
+
3. Run this code
|
27 |
+
|
28 |
+
```python
|
29 |
+
|
30 |
+
from transformers import AutoModelForSeq2SeqLM, AutoModelWithLMHead, AutoTokenizer
|
31 |
+
import torch
|
32 |
+
import nltk
|
33 |
+
nltk.download('punkt')
|
34 |
+
from nltk.tokenize import sent_tokenize
|
35 |
+
|
36 |
+
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
m_name = "marefa-nlp/summarization-arabic-english-news"

tokenizer = AutoTokenizer.from_pretrained(m_name)
# The checkpoint's config declares the T5ForConditionalGeneration architecture
# (an encoder-decoder model), so it is loaded through the seq2seq auto class;
# the generic AutoModelWithLMHead entry point is deprecated.
model = AutoModelForSeq2SeqLM.from_pretrained(m_name).to(device)
|
41 |
+
|
42 |
+
def get_summary(text, tokenizer, model, device="cpu", num_beams=2):
    """Summarize a news article into a list of short highlights.

    Args:
        text: The article to summarize.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        model: Model exposing ``generate``.
        device: Device the encoded input tensor is moved to before
            generation (presumably the device the model lives on --
            confirm against the caller).
        num_beams: Beam width forwarded to ``model.generate``.

    Returns:
        A list of non-empty highlight strings obtained by splitting the
        decoded output on ``<hl>``. When the stripped input is shorter
        than 50 characters, a one-element list holding a message that
        asks for a longer text is returned instead.
    """
    # Guard: very short inputs are rejected before any tokenization happens.
    if len(text.strip()) < 50:
        return ["Please provide more longer text"]

    # Build the prompt: a "summarize:" prefix, every non-empty sentence
    # introduced by a "<paragraph>" marker, and a closing "</s>".
    sentences = [s.strip() for s in sent_tokenize(text) if s.strip() != ""]
    text = "summarize: <paragraph> " + " <paragraph> ".join(sentences) + " </s>"
    # Newlines left inside a sentence are removed (not replaced by spaces).
    text = text.strip().replace("\n", "")

    tokenized_text = tokenizer.encode(text, return_tensors="pt").to(device)

    summary_ids = model.generate(
        tokenized_text,
        max_length=512,
        num_beams=num_beams,
        repetition_penalty=1.5,
        length_penalty=1.0,
        early_stopping=True,
    )

    # Only the first returned sequence is decoded; "<hl>" separates highlights.
    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return [s.strip() for s in output.split("<hl>") if s.strip() != ""]
|
62 |
+
|
63 |
+
## Prepare Samples
|
64 |
+
|
65 |
+
samples = [
|
66 |
+
"""
|
67 |
+
قال المدافع الإيطالي ليوناردو بونوتشي إن منتخب بلاده ليس خائفا من مواجهة نظيره الانجليزي على أرضه في المباراة النهائية في بطولة يورو 2020 لكرة القدم، في حين وصف المدافع الانجليزي جون ستونز المباراة المرتقبة بأنها ستكون "أكثر تميزا".
|
68 |
+
وسوف تقام المباراة في استاد ويمبلي، شمال غربي لندن، يوم الأحد.
|
69 |
+
وتسعى إيطاليا لإحراز اللقب الأوروبي للمرة الثانية بعد فوزها به أول مرة عام 1968.
|
70 |
+
ولم يفز الفريق الانجليزي بهذا اللقب القاري من قبل. والبطولة الرئيسية الوحيدة التي فازت بها انجلترا هي كأس العالم عام 1966 الذي أقيمت مباراته النهائية في استاد ويمبلي.
|
71 |
+
""",
|
72 |
+
|
73 |
+
"""
|
74 |
+
On a night fraught with tension, Italy clinched its first major title for 15 years with a penalty shootout win over England in the Euro 2020 final.
|
75 |
+
Luke Shaw's goal inside the opening two minutes gave England a lead it looked like it would hold onto all night, before a goalmouth scramble midway through the second half allowed Leonardo Bonucci to poke home an equalizer for Italy.
|
76 |
+
For the remainder of the match it felt as though extra-time and penalties were inevitable, as neither side seemed willing or brave enough to commit enough men forward to really trouble the opposing defenders.
|
77 |
+
England had suffered innumerable heartbreaks on penalties over the years and this time it was Italy's turn to inflict yet more pain on beleaguered English fans as Marcus Rashford, Jadon Sancho and Bukayo Saka all missed from the spot.
|
78 |
+
""",
|
79 |
+
]
|
80 |
+
|
81 |
+
## Get summaries
|
82 |
+
|
83 |
+
print("Original Article:", samples[0])
|
84 |
+
print("\n===========\nSummary: \n")
|
85 |
+
hls = get_summary(samples[0], tokenizer, model, device)
|
86 |
+
for hl in hls:
|
87 |
+
print("\t-", hl)
|
88 |
+
|
89 |
+
|
90 |
+
print("Original Article:", samples[1])
|
91 |
+
print("\n=========== \nSummary: \n")
|
92 |
+
hls = get_summary(samples[1], tokenizer, model, device)
|
93 |
+
for hl in hls:
|
94 |
+
print("\t-", hl)
|
95 |
+
```
|
96 |
+
|
97 |
+
Results
|
98 |
+
|
99 |
+
```
|
100 |
+
Original Article:
|
101 |
+
|
102 |
+
قال المدافع الإيطالي ليوناردو بونوتشي إن منتخب بلاده ليس خائفا من مواجهة نظيره الانجليزي على أرضه في المباراة النهائية في بطولة يورو 2020 لكرة القدم، في حين وصف المدافع الانجليزي جون ستونز المباراة المرتقبة بأنها ستكون "أكثر تميزا".
|
103 |
+
وسوف تقام المباراة في استاد ويمبلي، شمال غربي لندن، يوم الأحد.
|
104 |
+
وتسعى إيطاليا لإحراز اللقب الأوروبي للمرة الثانية بعد فوزها به أول مرة عام 1968.
|
105 |
+
ولم يفز الفريق الانجليزي بهذا اللقب القاري من قبل. والبطولة الرئيسية الوحيدة التي فازت بها انجلترا هي كأس العالم عام 1966 الذي أقيمت مباراته النهائية في استاد ويمبلي.
|
106 |
+
|
107 |
+
|
108 |
+
===========
|
109 |
+
Summary:
|
110 |
+
|
111 |
+
- وسوف تواجه إيطاليا إنجلترا في بطولة يورو 2020 لكرة القدم يوم الأحد.
|
112 |
+
- ستقام المباراة في استاد ويمبلي، شمال غربي لندن، يوم الأحد.
|
113 |
+
- ولم يفز الفريق الانجليزي بهذا اللقب القاري قبل.
|
114 |
+
|
115 |
+
```
|
116 |
+
|
117 |
+
```
|
118 |
+
Original Article:
|
119 |
+
|
120 |
+
On a night fraught with tension, Italy clinched its first major title for 15 years with a penalty shootout win over England in the Euro 2020 final.
|
121 |
+
Luke Shaw's goal inside the opening two minutes gave England a lead it looked like it would hold onto all night, before a goalmouth scramble midway through the second half allowed Leonardo Bonucci to poke home an equalizer for Italy.
|
122 |
+
For the remainder of the match it felt as though extra-time and penalties were inevitable, as neither side seemed willing or brave enough to commit enough men forward to really trouble the opposing defenders.
|
123 |
+
England had suffered innumerable heartbreaks on penalties over the years and this time it was Italy's turn to inflict yet more pain on beleaguered English fans as Marcus Rashford, Jadon Sancho and Bukayo Saka all missed from the spot.
|
124 |
+
|
125 |
+
|
126 |
+
===========
|
127 |
+
Summary:
|
128 |
+
|
129 |
+
- Italy beat England 1-0 in the Euro 2020 final.
|
130 |
+
- Luke Shaw's goal gave England a lead it looked like it would hold onto all night.
|
131 |
+
- Leonardo Bonucci scored the equalizer for Italy.
|
132 |
+
- Marcus Rashford, Jadon Sancho and Bukayo Saka all missed.
|
133 |
+
```
|
config.json
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "./t5-full-models-12k/stat-checkpoint-6000",
|
3 |
+
"architectures": [
|
4 |
+
"T5ForConditionalGeneration"
|
5 |
+
],
|
6 |
+
"d_ff": 2816,
|
7 |
+
"d_kv": 64,
|
8 |
+
"d_model": 1024,
|
9 |
+
"decoder_start_token_id": 0,
|
10 |
+
"dropout_rate": 0.1,
|
11 |
+
"eos_token_id": 1,
|
12 |
+
"feed_forward_proj": "gated-gelu",
|
13 |
+
"gradient_checkpointing": false,
|
14 |
+
"initializer_factor": 1.0,
|
15 |
+
"is_encoder_decoder": true,
|
16 |
+
"layer_norm_epsilon": 1e-06,
|
17 |
+
"model_type": "t5",
|
18 |
+
"num_decoder_layers": 24,
|
19 |
+
"num_heads": 16,
|
20 |
+
"num_layers": 24,
|
21 |
+
"output_past": true,
|
22 |
+
"pad_token_id": 0,
|
23 |
+
"relative_attention_num_buckets": 32,
|
24 |
+
"tie_word_embeddings": false,
|
25 |
+
"tokenizer_class": "T5Tokenizer",
|
26 |
+
"torch_dtype": "float32",
|
27 |
+
"transformers_version": "4.9.0.dev0",
|
28 |
+
"use_cache": true,
|
29 |
+
"vocab_size": 35000
|
30 |
+
}
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7d7d6bfb2959ba117cf44d11e0f7d33b175fce72b1f2f17458c4e150ccfeba57
|
3 |
+
size 3156380325
|
scheduler.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e32bd296dbe0d71091a57a3210b6f53ad1fe87d520fb368e9cfdbc61eb5237bc
|
3 |
+
size 623
|
special_tokens_map.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>"}
|
spiece.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0b1638b19c09773fb382835ebed2a612ba15aceff181dd79703365aad4ae6821
|
3 |
+
size 847294
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 0, "additional_special_tokens": null, "sp_model_kwargs": {}, "special_tokens_map_file": "/root/.cache/huggingface/transformers/83fa3c1d6e6d11afc201e70615946ccb1b2464e8d577d134c3c97287f0ab5037.294ebaa4cd17bb284635004c92d2c4d522ec488c828dcce0c2471b6f28e3fe82", "name_or_path": "./t5-full-models-12k/stat-checkpoint-6000", "tokenizer_class": "T5Tokenizer"}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aec94cf0829b76ab2c3927d7eba16c7f880c0ab4079b02415b9e903b7c844883
|
3 |
+
size 2735
|