a-bab committed on
Commit
3a93ef8
·
1 Parent(s): 645234c

Upload 2 files

Files changed (3)
  1. .gitattributes +1 -0
  2. Arabic-SQuAD.json +3 -0
  3. app.py +96 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ Arabic-SQuAD.json filter=lfs diff=lfs merge=lfs -text
Arabic-SQuAD.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b60a3bf0f71ec884bc286e6c0a1336bdc7e5f89c6a3d63a723e4c5129c373a9a
+ size 51323713
app.py ADDED
@@ -0,0 +1,96 @@
+ import json
+ from datasets import Dataset
+ from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, pipeline
+
+ # Load the dataset (uploaded to the repository root in this commit)
+ with open('./Arabic-SQuAD.json', 'r', encoding='utf-8') as file:
+     soqal_dataset = json.load(file)
+
+ # Convert the SQuAD-style JSON into a Hugging Face Dataset
+ def convert_to_dataset(dataset_dict):
+     data = []
+     for article in dataset_dict['data']:
+         for paragraph in article['paragraphs']:
+             context = paragraph['context']
+             for qa in paragraph['qas']:
+                 question = qa['question']
+                 qa_id = qa['id']
+                 answers = qa.get('answers', [])
+                 if answers:
+                     text = answers[0]['text']
+                     start = answers[0]['answer_start']
+                     data.append({'context': context, 'question': question, 'id': qa_id,
+                                  'answer_text': text, 'start_position': start})
+     return Dataset.from_dict({'context': [d['context'] for d in data],
+                               'question': [d['question'] for d in data],
+                               'answer_text': [d['answer_text'] for d in data],
+                               'id': [d['id'] for d in data],
+                               'start_position': [d['start_position'] for d in data]})
+
+ soqal_formatted_dataset = convert_to_dataset(soqal_dataset)
+
+ # Tokenize the dataset
+ tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
+
+ # Encode each context/question pair and map the character-level answer span to
+ # token-level start/end positions, which is what the QA model expects as labels
+ def tokenize_function(examples):
+     encodings = tokenizer(examples['context'], examples['question'],
+                           truncation=True, padding='max_length', max_length=512)
+
+     start_positions = []
+     end_positions = []
+     for i, (start_char, answer_text) in enumerate(zip(examples['start_position'], examples['answer_text'])):
+         end_char = start_char + len(answer_text) - 1
+         # sequence_index=0 refers to the context, which is passed to the tokenizer first
+         start_token = encodings.char_to_token(i, start_char, sequence_index=0)
+         end_token = encodings.char_to_token(i, end_char, sequence_index=0)
+         # Fall back to the [CLS] token (index 0) when the answer was truncated away
+         start_positions.append(start_token if start_token is not None else 0)
+         end_positions.append(end_token if end_token is not None else 0)
+
+     encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
+     return encodings
+
+ tokenized_soqal_datasets = soqal_formatted_dataset.map(tokenize_function, batched=True)
+
+ # Split the dataset: even indices for training, odd indices for evaluation (50/50)
+ small_train_dataset = tokenized_soqal_datasets.select(range(0, len(tokenized_soqal_datasets), 2))
+ small_eval_dataset = tokenized_soqal_datasets.select(range(1, len(tokenized_soqal_datasets), 2))
+
+ # Initialize the model and the Trainer
+ model = AutoModelForQuestionAnswering.from_pretrained("aubmindlab/bert-base-arabertv02")
+
+ training_args = TrainingArguments(
+     output_dir='./results',
+     num_train_epochs=3,
+     per_device_train_batch_size=4,
+     per_device_eval_batch_size=4,
+     warmup_steps=500,
+     weight_decay=0.01,
+     logging_dir='./logs',
+     logging_steps=100,
+     do_train=True,
+     do_eval=True,
+     evaluation_strategy="epoch",
+     save_strategy="epoch",
+     push_to_hub=False,
+ )
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=small_train_dataset,
+     eval_dataset=small_eval_dataset,
+ )
+
+ # Train and save the model
+ trainer.train()
+ trainer.save_model("./arabic_qa_model")
+
+ # Evaluate the model
+ results = trainer.evaluate()
+ print(results)
+
+ # Quick smoke test after training, using placeholder Arabic text
+ # (context: "Place the Arabic text containing the information here.";
+ #  question: "What is the question you want answered?")
+ nlp = pipeline("question-answering", model=model, tokenizer=tokenizer)
+
+ context = "يرجى وضع النص العربي هنا الذي يحتوي على المعلومات."
+ question = "ما هو السؤال الذي تريد الإجابة عليه؟"
+
+ answer = nlp(question=question, context=context)
+ print(answer)
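
For reference, a minimal follow-up sketch (not part of this commit) showing how the checkpoint that app.py writes to ./arabic_qa_model could be reloaded for inference in a later session. It assumes the base AraBERT tokenizer is reloaded from the Hub, since the Trainer above is not given the tokenizer and so does not save one alongside the model weights.

from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

# Reload the fine-tuned weights saved by trainer.save_model("./arabic_qa_model")
tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
model = AutoModelForQuestionAnswering.from_pretrained("./arabic_qa_model")
qa = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Same placeholder question as in app.py ("What is the question you want answered?"),
# asked against a caller-supplied Arabic passage
result = qa(question="ما هو السؤال الذي تريد الإجابة عليه؟",
            context="يرجى وضع النص العربي هنا الذي يحتوي على المعلومات.")
print(result)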