Projeto committed on
Commit
c63a5d2
1 Parent(s): deda615

Create mask_functions.py

Browse files
Files changed (1) hide show
  1. legalnlp/mask_functions.py +161 -0
legalnlp/mask_functions.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
def mask_email(txt):
    """
    Replace every e-mail-like token in a text with a placeholder.

    A token is treated as an e-mail when it is a run of non-whitespace
    characters that contains an ``@`` with at least one character on
    each side of it.

    Parameters
    -----------
    txt: str
        Text in which e-mail patterns are searched

    Returns
    -----------
    str
        the text with each match replaced by ' [email] '
    list
        the matched substrings, in order of appearance
    """
    email_regex = re.compile(r'[^\s]+@[^\s]+', flags=re.I)

    # Collect the matches from the untouched text, then mask them.
    found = email_regex.findall(txt)
    masked = email_regex.sub(' [email] ', txt)

    return masked, found
24
+
25
def mask_url(txt):
    """
    Find URL patterns in a text and mask them.

    Two case-insensitive patterns are applied in sequence: first any run
    of non-whitespace characters starting with ``http``, then any run
    starting with ``www``. The second pattern runs on the text already
    masked by the first, so a URL such as ``http://www.site.com`` is
    reported only once.

    Parameters
    -----------
    txt: str
        A piece of text containing the url pattern

    Returns
    -----------
    str
        the text with each match replaced by ' [url] '
    list
        the matched substrings: all ``http`` matches first, followed by
        all ``www`` matches
    """
    placeholder = ' [url] '
    found = []

    # Raw strings keep ``\S`` a regex escape instead of an invalid string
    # escape (which triggers a SyntaxWarning on recent Python versions).
    for pattern in (r'http\S+', r'www\S+'):
        found += re.findall(pattern, txt, flags=re.I)
        txt = re.sub(pattern, placeholder, txt, flags=re.I)

    return txt, found
49
+
50
def mask_oab(txt):
    """
    Find OAB (Order of Attorneys of Brazil) registration patterns and mask them.

    Two case-insensitive patterns are applied in sequence:

    1. ``OAB`` followed by the number and then an optional state code,
       e.g. ``OAB 123456/SP`` or ``OAB: 123456``;
    2. ``OAB`` followed by an optional state code and then the number,
       e.g. ``OAB/SP 123456``.

    The second pattern runs on the text already masked by the first.

    Parameters
    -----------
    txt: str
        A piece of text containing the OAB pattern

    Returns
    -----------
    str
        the text with each match replaced by ' [oab] '
    list
        the matched substrings: matches of the first pattern followed by
        matches of the second pattern
    """
    placeholder = ' [oab] '

    # Raw strings keep ``\s`` / ``\d`` regex escapes rather than invalid
    # string escapes.
    # NOTE: the two optional state letters are matched case-insensitively
    # and without a word boundary, so the first pattern also consumes up to
    # two letters of a following word (e.g. 'OAB 12345 advogado' matches
    # 'OAB 12345 ad'). Kept as-is to preserve the existing output.
    number_first = r'OAB\s?[:-]?\s?\d+\s?/?\s?[A-Z]?[A-Z]?'
    state_first = r'OAB\s?/?\s?[A-Z]?[A-Z]?\s?[:-]?\s?\d+'

    found = []
    for pattern in (number_first, state_first):
        found += re.findall(pattern, txt, flags=re.I)
        txt = re.sub(pattern, placeholder, txt, flags=re.I)

    return txt, found
75
+
76
def mask_data(txt):
    """
    Find date patterns in a text and mask them.

    A date is two digits, a slash, two digits, a slash and four digits
    (``dd/mm/yyyy``), with at most one optional whitespace character on
    either side of each slash. The digits are not validated as a real
    calendar date.

    Parameters
    -----------
    txt: str
        A piece of text containing the date

    Returns
    -----------
    str
        the text with each match replaced by ' [data] '
    list
        the matched substrings, in order of appearance
    """
    # Raw string: ``\d`` and ``\s`` must reach the regex engine untouched.
    pattern = r'\d{2}\s?/\s?\d{2}\s?/\s?\d{4}'
    sub = ' [data] '

    return re.sub(pattern, sub, txt, flags=re.I), re.findall(pattern, txt, flags=re.I)
97
+
98
def mask_processo(txt, num=15):
    """
    Find lawsuit number patterns in a text and mask them.

    Any run of at least ``num`` consecutive digits is treated as a
    lawsuit number.

    Parameters
    -----------
    txt: str
        A piece of text containing the lawsuit number pattern
    num: int
        Minimum number of consecutive digits for a run to be considered
        a lawsuit number (default 15)

    Returns
    -----------
    str
        the text with each match replaced by ' [processo] '
    list
        the matched substrings, in order of appearance
    """
    # Digit runs with `num` or more digits are considered lawsuit numbers.
    # Raw string keeps ``\d`` a regex escape, not an invalid string escape.
    pattern = r'\d{' + str(num) + r',}'
    sub = ' [processo] '

    return re.sub(pattern, sub, txt, flags=re.I), re.findall(pattern, txt, flags=re.I)
118
+
119
def mask_numero(txt):
    """
    Find number patterns in a text and mask them.

    Every run of one or more consecutive digits is treated as a number.

    Parameters
    -----------
    txt: str
        A piece of text containing the number pattern

    Returns
    -----------
    str
        the text with each match replaced by ' [numero] '
    list
        the matched digit runs, in order of appearance
    """
    # Raw string keeps ``\d`` a regex escape, not an invalid string escape.
    pattern = r'\d+'
    sub = ' [numero] '

    return re.sub(pattern, sub, txt, flags=re.I), re.findall(pattern, txt, flags=re.I)
140
+
141
def mask_valor(txt):
    """
    Find monetary value patterns (Brazilian reais) in a text and mask them.

    A value is ``R$`` (case-insensitive, with at most one optional
    whitespace character after the ``R`` and after the ``$``) followed by
    a run of digits and any number of further digit groups, each
    introduced by a single ``.`` or ``,``. Examples: ``R$ 50``,
    ``R$ 1.234,56``, ``R$ 1.000.000,00``.

    Parameters
    -----------
    txt: str
        A piece of text containing the value pattern

    Returns
    -----------
    str
        the text with each match replaced by ' [valor] '
    list
        the matched substrings, in order of appearance
    """
    # The earlier pattern chained three mandatory ``\d+`` runs, so amounts
    # with fewer than three digits ("R$ 50") were never matched and amounts
    # with more than two separators ("R$ 1.000.000,00") were only partly
    # matched. One digit run plus repeated separator+digits groups covers
    # both. A trailing separator with no digits after it is not consumed.
    pattern = r'R\s?\$\s?\d+(?:[.,]\d+)*'
    sub = ' [valor] '

    return re.sub(pattern, sub, txt, flags=re.I), re.findall(pattern, txt, flags=re.I)