File size: 4,041 Bytes
c63a5d2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
import re
def mask_email(txt):
    """
    Replace every e-mail-like token in a text with the ' [email] ' placeholder.

    A token is any run of non-whitespace characters that contains an '@'
    with at least one non-whitespace character on each side. The match is
    greedy, so punctuation glued to the address is masked along with it.

    Parameters
    -----------
    txt: str
        A piece of text that may contain e-mail addresses

    Returns
    -----------
    str
        the text with each matched token replaced by ' [email] '
    list
        the matched tokens, in order of appearance in ``txt``
    """
    email_re = re.compile(r'[^\s]+@[^\s]+', flags=re.I)
    found = email_re.findall(txt)
    masked = email_re.sub(' [email] ', txt)
    return masked, found
def mask_url(txt):
    """
    Replace every URL-like token in a text with the ' [url] ' placeholder.

    Two case-insensitive passes are made: first over tokens starting at
    'http' (up to the next whitespace), then over tokens starting at 'www'.
    The second pass runs on the already-masked text, so a 'www' that was
    part of an 'http...' token is not reported twice.

    Parameters
    -----------
    txt: str
        A piece of text that may contain URLs

    Returns
    -----------
    str
        the text with each matched token replaced by ' [url] '
    list
        the matched tokens: all 'http...' matches first, followed by the
        remaining 'www...' matches
    """
    sub = ' [url] '
    found = []
    # Raw strings keep '\S' as a regex escape instead of an (invalid)
    # Python string escape. Order matters: 'http' must be masked before 'www'.
    for pattern in (r'http\S+', r'www\S+'):
        found += re.findall(pattern, txt, flags=re.I)
        txt = re.sub(pattern, sub, txt, flags=re.I)
    return txt, found
def mask_oab(txt):
    """
    Replace OAB (Order of Attorneys of Brazil) registrations with ' [oab] '.

    Two case-insensitive layouts are recognised, in this order:

    1. 'OAB', an optional ':' or '-', the digits, then an optional '/' and
       up to two letters (e.g. 'OAB 12345/SP').
    2. 'OAB', an optional '/' and up to two letters, an optional ':' or
       '-', then the digits (e.g. 'OAB/SP 98765').

    The second layout is searched in the text left after masking the first.

    Parameters
    -----------
    txt: str
        A piece of text that may contain OAB registrations

    Returns
    -----------
    str
        the text with each matched registration replaced by ' [oab] '
    list
        the matched strings: layout-1 matches first, then layout-2 matches
    """
    sub = ' [oab] '
    patterns = (
        r'OAB\s?[:-]?\s?\d+\s?/?\s?[A-Z]?[A-Z]?',   # number before state
        r'OAB\s?/?\s?[A-Z]?[A-Z]?\s?[:-]?\s?\d+',   # state before number
    )
    found = []
    for pattern in patterns:
        found += re.findall(pattern, txt, flags=re.I)
        txt = re.sub(pattern, sub, txt, flags=re.I)
    return txt, found
def mask_data(txt):
    """
    Replace dates written as dd/mm/yyyy with the ' [data] ' placeholder.

    A date is two digits, '/', two digits, '/', four digits, with at most
    one whitespace character allowed on either side of each '/'.

    Parameters
    -----------
    txt: str
        A piece of text that may contain dates

    Returns
    -----------
    str
        the text with each matched date replaced by ' [data] '
    list
        the matched date strings, in order of appearance
    """
    # Raw string so the backslashes reach the regex engine untouched.
    # No IGNORECASE flag: the pattern contains no letters.
    pattern = r"\d{2}\s?\/\s?\d{2}\s?\/\s?\d{4}"
    sub = ' [data] '
    return re.sub(pattern, sub, txt), re.findall(pattern, txt)
def mask_processo(txt, num=15):
    """
    Replace lawsuit numbers with the ' [processo] ' placeholder.

    Any unbroken run of at least ``num`` digits is treated as a lawsuit
    number.

    Parameters
    -----------
    txt: str
        A piece of text that may contain lawsuit numbers
    num: int
        Minimum count of consecutive digits for a run to be masked
        (default 15)

    Returns
    -----------
    str
        the text with each matched digit run replaced by ' [processo] '
    list
        the matched digit runs, in order of appearance
    """
    # '{num,}' means "num or more" digits. Raw pieces keep '\d' as a regex
    # escape rather than an invalid Python string escape.
    pattern = r"\d{" + str(num) + r",}"
    sub = ' [processo] '
    return re.sub(pattern, sub, txt), re.findall(pattern, txt)
def mask_numero(txt):
    """
    Replace every run of digits with the ' [numero] ' placeholder.

    Parameters
    -----------
    txt: str
        A piece of text that may contain numbers

    Returns
    -----------
    str
        the text with each digit run replaced by ' [numero] '
    list
        the matched digit runs, in order of appearance
    """
    # Raw string so '\d' is a regex escape, not an invalid string escape.
    pattern = r"\d+"
    sub = ' [numero] '
    return re.sub(pattern, sub, txt), re.findall(pattern, txt)
def mask_valor(txt):
    """
    Replace monetary amounts in Brazilian reais with ' [valor] '.

    An amount is 'R$' (case-insensitive, with at most one whitespace
    character allowed after the 'R' and after the '$') followed by digits,
    optionally continued by any number of '.'/',' separated digit groups,
    e.g. 'R$ 50', 'R$1.000,00' or 'R$ 1.234.567,89'.

    Parameters
    -----------
    txt: str
        A piece of text that may contain monetary amounts

    Returns
    -----------
    str
        the text with each matched amount replaced by ' [valor] '
    list
        the matched amount strings, in order of appearance
    """
    # Digits followed by zero or more separator+digits groups. A separator
    # is only consumed when digits follow it, so sentence punctuation after
    # the amount ("R$ 100.") is left in the text. The earlier fixed
    # three-group form missed short amounts and truncated long ones.
    pattern = r"R\s?\$\s?\d+(?:[.,]\d+)*"
    sub = ' [valor] '
    return re.sub(pattern, sub, txt, flags=re.I), re.findall(pattern, txt, flags=re.I)