imomayiz committed on
Commit
dc5bb62
·
1 Parent(s): cfd11ff

feat: a python script with functions used for processing and analysis of requests and interventions data

Browse files
Files changed (1) hide show
  1. src/data_analysis.py +242 -0
src/data_analysis.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This file contains some functions used to analyze the data from requests and interventions.
3
+ """
4
+
5
+ import re
6
+ import datetime as dt
7
+ import pandas as pd
8
+ import plotly.express as px
9
+ import plotly.graph_objects as go
10
+ from torch import Tensor
11
+ from transformers import AutoModel, AutoTokenizer
12
+ import torch.nn.functional as F
13
+
14
+
15
# Display labels (French / English / Arabic) of the supply categories.
_FOOD_AND_WATER = 'ALIMENTATION ET EAU / FOOD AND WATER / الغذاء والماء'
_MEDICAL_ASSISTANCE = 'ASSISTANCE MÉDICALE / MEDICAL ASSISTANCE / المساعدة الطبية'
_CLOTHES = 'VÊTEMENTS / CLOTHES / الملابس'
_RESCUE = 'SECOURS / RESCUE / الإنقاذ'
_SHELTER = 'REFUGE / SHELTER / المأوى'
_COVERS = 'COUVERTURES / COVERS / البطانيات'
_PHARMACEUTICALS = 'PHARMACEUTICALS / MEDICAMENTS / الأدوية'

# Lowercase keyword -> category label, used by the rule-based classifier
# (a keyword matches when it is a substring of the lowercased text).
SUPPLIES_TAGS = {
    'alimentation': _FOOD_AND_WATER,
    'eau': _FOOD_AND_WATER,
    'food': _FOOD_AND_WATER,
    'water': _FOOD_AND_WATER,
    'nourriture': _FOOD_AND_WATER,
    'medical': _MEDICAL_ASSISTANCE,
    'médical': _MEDICAL_ASSISTANCE,
    'doctor': _MEDICAL_ASSISTANCE,
    'vêtements': _CLOTHES,
    'clothes': _CLOTHES,
    'secours': _RESCUE,
    'rescue': _RESCUE,
    'refuge': _SHELTER,
    'shelter': _SHELTER,
    'couvertures': _COVERS,
    'covers': _COVERS,
    'pharmaceuticals': _PHARMACEUTICALS,
    'medicaments': _PHARMACEUTICALS,
    'pharmacy': _PHARMACEUTICALS,
    'medicine': _PHARMACEUTICALS,
    'blankets': _COVERS,
    'tents': _SHELTER,
    'couches': _PHARMACEUTICALS,
}

# Candidate labels for the embedding-based classifier; 'OTHER' is the fallback.
SUPPLIES_NEEDS_CATEGORIES = [
    _FOOD_AND_WATER,
    _MEDICAL_ASSISTANCE,
    _CLOTHES,
    _RESCUE,
    _SHELTER,
    _COVERS,
    # 'KITCHEN TOOLS / USTENSILES DE CUISINE / أدوات المطبخ',
    _PHARMACEUTICALS,
    'OTHER',
]

# Arabic supply terms -> English keyword.
TRANSLATION_DICT = {
    'أغطية': 'covers',
    'أسرة': 'beds',
    'وسادات': 'pillows',
    'مصابح': 'lamps',
    'خيام': 'tents',
    'ألعاب أطفال': 'toys',
    'قليل من المواد الغذائية': 'food',
    'افرشة': 'covers',
    'جلباب': 'clothes',
    'ملابس': 'clothes',
    'لديهم كل شيء': 'unknown',
}
64
+
65
+
66
def clean_text(text):
    """
    Strip formatting residue from ``text``.

    Removes left-to-right / right-to-left marks (U+200E, U+200F),
    non-breaking spaces (U+00A0) and parentheses; everything else is kept.
    """
    return re.sub(r'[\u200e\xa0()\u200f]', '', text)
73
+
74
+
75
def contains_arabic(text):
    """
    Check whether ``text`` contains at least one Arabic character.

    A character counts as Arabic when it lies in the Unicode block
    U+0600-U+06FF. Non-string values (e.g. ``None`` or NaN cells) yield
    ``False`` instead of raising.
    """
    # isinstance (rather than an exact type comparison) also accepts str
    # subclasses such as numpy.str_.
    if not isinstance(text, str):
        return False
    return re.search(r'[\u0600-\u06FF]+', text) is not None
83
+
84
+
85
def arabic_to_latin_punctuation(text):
    """
    Return ``text`` with Arabic punctuation swapped for its Latin counterpart.

    Handles the Arabic comma, semicolon, tatweel, question mark, percent sign
    and decimal separator; all other characters are left untouched.
    """
    replacements = (
        ('،', ','),
        ('؛', ';'),
        ('ـ', '_'),
        ('؟', '?'),
        ('٪', '%'),
        ('٫', '.'),
    )

    converted = text
    for source_char, target_char in replacements:
        converted = converted.replace(source_char, target_char)
    return converted
102
+
103
+
104
def _daily_counts(df: pd.DataFrame, date_col: str, start, end) -> pd.DataFrame:
    """Count the rows of ``df`` per ``date_col`` value for every day in [start, end].

    Days without any row get a count of 0. The result has two columns:
    'index' (the day) and 'count'.
    """
    per_day = df.groupby(date_col).size().rename('count')
    all_days = pd.date_range(start=start, end=end, freq='D')
    return (per_day
            .reindex(all_days, fill_value=0)
            .rename_axis('index')
            .reset_index())


def plot_timeline(df: pd.DataFrame, today: dt.datetime, date_col: str):
    """Plot the timeline of requests and interventions.

    Draws the number of rows per day as two line traces: one up to and
    including ``today`` and one for later dates, separated by a dashed
    vertical line at ``today``.

    Args:
        df: data with one row per request/intervention.
        today: reference moment; only its date part is used.
        date_col: name of the column holding the day of each row. The column
            is compared against ``today.date()``, so it presumably holds
            ``datetime.date`` values -- confirm against the caller.

    Returns:
        The plotly ``Figure``.
    """
    today_date = today.date()
    df_past = df[df[date_col] <= today_date]
    df_future = df[df[date_col] > today_date]

    # With no past rows the past series degenerates to a single zero at today.
    past_start = min(df_past[date_col]) if len(df_past) > 0 else today_date
    count_past = _daily_counts(df_past, date_col, start=past_start, end=today_date)

    # The future trace starts with today's count so that both lines connect.
    # The value must live in the 'count' column, which is the one plotted.
    bridge_data = pd.DataFrame({'index': [pd.Timestamp(today_date)],
                                'count': [count_past['count'].iloc[-1]]})
    future_parts = [bridge_data]
    if len(df_future) > 0:
        future_parts.append(
            _daily_counts(df_future,
                          date_col,
                          start=today_date + dt.timedelta(days=1),
                          end=max(df_future[date_col])))
    count_future = pd.concat(future_parts, ignore_index=True)

    # Plot
    fig = go.Figure()
    # past
    fig.add_trace(go.Scatter(x=count_past['index'],
                             y=count_past['count'],
                             mode='lines',
                             name='Past Interventions',
                             line=dict(color='blue')))
    # future
    fig.add_trace(go.Scatter(x=count_future['index'],
                             y=count_future['count'],
                             mode='lines',
                             name='Future Interventions',
                             line=dict(color='orange')))

    fig.add_vline(x=today_date, line_dash="dash", line_color="black")

    fig.update_layout(yaxis_title="#", xaxis_title='date')
    return fig
159
+
160
+
161
def classify_supplies_rule_based(text: str, keep_raw: bool = False):
    """Classify text into supplies categories from SUPPLIES_TAGS
    using a rule-based approach.

    A category is selected when one of its keywords is a substring of the
    lowercased text.

    Args:
        text: free-text description of the supplies.
        keep_raw: when True, the lowercased text itself is appended to the
            result (whether or not a keyword matched). When False and no
            keyword matched, the result is ``['OTHER']``.

    Returns:
        List of unique labels, in the order in which they were first matched.
    """
    lowercase_text = text.lower()  # case-insensitive matching

    classes = [category
               for keyword, category in SUPPLIES_TAGS.items()
               if keyword in lowercase_text]

    if keep_raw:
        classes.append(lowercase_text)
    elif not classes:
        classes.append('OTHER')

    # dict.fromkeys de-duplicates while keeping a deterministic order
    # (a set would yield a run-dependent ordering).
    return list(dict.fromkeys(classes))
178
+
179
+
180
def classify_multilingual_field_e5(df: pd.DataFrame,
                                   field_to_tag: str = 'supplies',
                                   categories: list = SUPPLIES_NEEDS_CATEGORIES):
    """
    Tag supplies/requests into categories using multilingual-e5-large model.

    Each value of ``field_to_tag`` is split on '.', ',' and the Arabic
    conjunction ' و'; every fragment is assigned the category whose embedding
    is closest to its own.

    Args:
        df: input data; it is not modified.
        field_to_tag: name of the text column to classify.
        categories: candidate category labels.

    Returns:
        A copy of ``df`` with a new column ``f'{field_to_tag}_category'``
        containing, for each row, the list of unique predicted categories.

    Requires CUDA and downloads the model on first use.
    """
    # Local import: the file only imports `Tensor` and `torch.nn.functional`,
    # neither of which binds the name `torch`.
    import torch

    def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
        # Mean of the token embeddings, ignoring padding positions.
        last_hidden = last_hidden_states.masked_fill(
            ~attention_mask[..., None].bool(), 0.0)
        return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]

    tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-large')
    model = AutoModel.from_pretrained('intfloat/multilingual-e5-large')
    model.cuda()

    processed_df = df.copy()
    values_to_classify = processed_df[field_to_tag]

    # Loop invariants.
    gt = [f"{s}" for s in categories]
    fragment_separator = re.compile(r"\.|,| و")

    # text -> predicted categories; doubles as a cache so that repeated
    # texts are only embedded once.
    mapped_inputs = dict()

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for text in values_to_classify:
            if text in mapped_inputs:
                continue

            qr = fragment_separator.split(text)
            input_texts = qr + gt

            # Tokenize the input texts
            batch_dict = tokenizer(
                input_texts, max_length=512, padding=True, truncation=True,
                return_tensors='pt')
            batch_dict = {k: v.cuda() for k, v in batch_dict.items()}

            outputs = model(**batch_dict)
            embeddings = average_pool(
                outputs.last_hidden_state, batch_dict['attention_mask'])

            # normalize embeddings, so the dot product is the cosine similarity
            embeddings = F.normalize(embeddings, p=2, dim=1)
            scores = embeddings[:len(qr)] @ embeddings[len(qr):].T

            best_per_fragment = scores.argmax(dim=1).tolist()
            mapped_inputs[text] = list(
                dict.fromkeys(categories[i] for i in best_per_fragment))

    # Map row by row: rows sharing the same text must all get a value, so the
    # dict values cannot be assigned positionally.
    processed_df[f'{field_to_tag}_category'] = values_to_classify.map(mapped_inputs)

    return processed_df
227
+
228
+
229
def plot_categories_share(raw_df: pd.DataFrame,
                          today: dt.datetime,
                          field: str = 'supplies'):
    """
    Plot the share of each category of requests/supplies.

    Args:
        raw_df: data containing the columns ``field`` and
            ``f'{field}_category'``; the latter holds a list of categories
            per row and is exploded so each category counts once per row.
        today: reference moment; its date is shown in the plot title.
        field: base name of the classified column.

    Returns:
        The plotly pie chart ``Figure``.
    """
    category_col = f'{field}_category'
    df = raw_df[[field, category_col]].explode(category_col)

    # size() on a regular groupby yields a Series, which can be renamed and
    # then turned into the two-column frame (category, n) that px.pie needs.
    pie_data = df.groupby(category_col).size().rename('n').reset_index()

    fig = px.pie(pie_data,
                 names=category_col,
                 values='n',
                 title=f'# per {field} category up till {today.date()}',
                 labels={category_col: f'{field}', 'n': '%'})
    return fig