Mohamed-Sami committed on
Commit
c37e08c
1 Parent(s): cd0969a

Create preprocess.py

Browse files
Files changed (1) hide show
  1. preprocess.py +18 -0
preprocess.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re , string
2
+ from textacy.preprocessing.resources import (RE_EMAIL , RE_URL , RE_NUMBER ,
3
+ RE_NUMBER , RE_EMOJI , RE_SHORT_URL , RE_PHONE_NUMBER
4
+ )
5
+
6
+ NON_ARABIC_RE = re.compile(r"[%s]"%string.ascii_letters)  # matches one ASCII letter (a-z, A-Z); despite the name, other non-Arabic characters are not matched
7
+
8
def clean_text(text: str) -> str:
    """Remove unwanted data from ``text`` and return the cleaned string.

    Every match of the following patterns is replaced with the empty
    string, one pattern after another: e-mail addresses, URLs, shortened
    URLs, phone numbers, numbers, emoji and, last, the characters matched
    by ``NON_ARABIC_RE``. Whitespace around removed matches is left
    untouched.

    Args:
        text: The raw input string.

    Returns:
        ``text`` with all pattern matches deleted.
    """
    # Order matters: the most specific patterns must run first. If the
    # generic number pattern ran before the phone/URL patterns it would
    # delete the digit groups inside them, the specific pattern would then
    # no longer match, and separators or URL fragments would be left behind.
    patterns = (
        RE_EMAIL,
        RE_URL,
        RE_SHORT_URL,
        RE_PHONE_NUMBER,
        RE_NUMBER,
        RE_EMOJI,
        NON_ARABIC_RE,  # last: stripping letters earlier would break the matches above
    )

    for pattern in patterns:
        text = pattern.sub("", text)

    return text
17
+
18
+