Create diacritics.py
diacritics.py +39 -0
ADDED
@@ -0,0 +1,39 @@
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import hazm

# Load the ParsBERT model.
# Note: this is the base checkpoint; meaningful predictions assume a
# token-classification head fine-tuned for diacritic labels.
model_name = "HooshvareLab/bert-fa-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

def add_diacritics(text):
    # Normalize and tokenize the input
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)
    words = hazm.word_tokenize(text)

    # Prepare the input for the model
    inputs = tokenizer(words, return_tensors="pt", is_split_into_words=True)

    # Run the model
    with torch.no_grad():
        outputs = model(**inputs).logits

    # Predicted label for every sub-word token (including special tokens)
    predictions = torch.argmax(outputs, dim=2).tolist()[0]

    # Label-to-diacritic mapping: fatha, kasra, damma
    diacritics = {1: 'َ', 2: 'ِ', 3: 'ُ'}

    # Align sub-word predictions with the original words:
    # keep the prediction of each word's first sub-token.
    word_ids = inputs.word_ids(batch_index=0)
    word_predictions = {}
    for idx, word_id in enumerate(word_ids):
        if word_id is not None and word_id not in word_predictions:
            word_predictions[word_id] = predictions[idx]

    result = []
    for i, word in enumerate(words):
        prediction = word_predictions.get(i)
        if prediction in diacritics:
            word += diacritics[prediction]
        result.append(word)

    # Rebuild the sentence and fix spacing around punctuation marks
    final_text = " ".join(result)
    final_text = final_text.replace(" ،", "،").replace(" .", ".").replace(" ؛", "؛")

    return final_text
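A minimal usage sketch, assuming transformers, torch, and hazm are installed and the checkpoint above (or a fine-tuned variant with diacritic labels) is available; the sample sentence is only illustrative:

if __name__ == "__main__":
    # Hypothetical example call; output quality depends on the loaded checkpoint.
    sample_text = "سلام، امروز هوا خوب است."
    print(add_diacritics(sample_text))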