Gosse Minnema committed on
Commit
b11ac48
·
1 Parent(s): 0f2a300

Add sociofillmore code, load dataset via private dataset repo

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Dockerfile +13 -0
  2. app.py +0 -4
  3. docker_commands.sh +8 -0
  4. requirements.txt +0 -0
  5. resources/RAI_sources_mr.xlsx +0 -0
  6. resources/active_frames_full.csv +1229 -0
  7. resources/crashes_frame_list.txt +14 -0
  8. resources/crashes_frame_to_roles.csv +11 -0
  9. resources/crashes_sources.csv +440 -0
  10. resources/deep_frame_cache.json +0 -0
  11. resources/dep_labels.txt +159 -0
  12. resources/femicide_frame_list.txt +23 -0
  13. resources/femicides_frame_to_roles.csv +16 -0
  14. resources/fn_frames_to_roles.json +0 -0
  15. resources/migration_frame_list.txt +56 -0
  16. sociofillmore/__init__.py +0 -0
  17. sociofillmore/__init__.pyc +0 -0
  18. sociofillmore/__pycache__/__init__.cpython-311.pyc +0 -0
  19. sociofillmore/__pycache__/__init__.cpython-37.pyc +0 -0
  20. sociofillmore/__pycache__/__init__.cpython-39.pyc +0 -0
  21. sociofillmore/common/__init__.py +0 -0
  22. sociofillmore/common/__pycache__/__init__.cpython-37.pyc +0 -0
  23. sociofillmore/common/__pycache__/__init__.cpython-39.pyc +0 -0
  24. sociofillmore/common/__pycache__/analyze_text.cpython-37.pyc +0 -0
  25. sociofillmore/common/__pycache__/analyze_text.cpython-39.pyc +0 -0
  26. sociofillmore/common/__pycache__/split_lome_files.cpython-39.pyc +0 -0
  27. sociofillmore/common/analyze_text.py +1046 -0
  28. sociofillmore/common/convert_comms.py +208 -0
  29. sociofillmore/common/filter_lang.py +32 -0
  30. sociofillmore/common/get_nltk_fn_roles.py +11 -0
  31. sociofillmore/common/pos_based_targetid.py +31 -0
  32. sociofillmore/common/split_lome_files.py +22 -0
  33. sociofillmore/crashes/__pycache__/utils.cpython-37.pyc +0 -0
  34. sociofillmore/crashes/__pycache__/utils.cpython-39.pyc +0 -0
  35. sociofillmore/crashes/generate_templates.py +277 -0
  36. sociofillmore/crashes/make_bechdel_dicts.py +90 -0
  37. sociofillmore/crashes/predict_bechdel.py +500 -0
  38. sociofillmore/crashes/split_data.py +240 -0
  39. sociofillmore/crashes/utils.py +16 -0
  40. sociofillmore/femicides/compare_lome_models.py +296 -0
  41. sociofillmore/femicides/evalita_err_analysis.py +182 -0
  42. sociofillmore/femicides/extract_texts.py +15 -0
  43. sociofillmore/femicides/split_data.py +235 -0
  44. sociofillmore/migration/cda_classify.py +338 -0
  45. sociofillmore/migration/cda_classify_.py +140 -0
  46. sociofillmore/migration/extract_political_ratings.py +17 -0
  47. sociofillmore/migration/preprocess.py +57 -0
  48. sociofillmore/migration/split_data.py +85 -0
  49. sociofillmore/migration/split_lome_predictions.py +54 -0
  50. sociofillmore/scoring/eval/__pycache__/analyze_final_questionnaire.cpython-37.pyc +0 -0
Dockerfile ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9
2
+ WORKDIR /app
3
+ ADD . /app
4
+ RUN mkdir /nltk_data
5
+ RUN mkdir /.allennlp
6
+ RUN mkdir /.cache
7
+ RUN mkdir /.local
8
+ RUN chmod -R 777 /nltk_data
9
+ RUN chmod -R 777 /.allennlp
10
+ RUN chmod -R 777 /.cache
11
+ RUN chmod -R 777 /.local
12
+ RUN pip install -v -r requirements.combined.txt
13
+ CMD ["sh", "docker_commands.sh"]
app.py DELETED
@@ -1,4 +0,0 @@
1
- import os
2
- from gradio_client import Client
3
-
4
- client = Client("responsibility-framing/sociofillmore", hf_token=os.getenv("HF_TOKEN"))
 
 
 
 
 
docker_commands.sh ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # cd spanfinder/
2
+ # python -m sociolome.lome_webserver &
3
+ # cd ..
4
+ git clone https://gossminn:$HF_TOKEN@huggingface.co/datasets/responsibility-framing/sociofillmore-datasets
5
+ cp -r sociofillmore-datasets/data .
6
+ cp -r sociofillmore-datasets/output .
7
+
8
+ python -m sociofillmore.webapp.app 0.0.0.0
requirements.txt ADDED
File without changes
resources/RAI_sources_mr.xlsx ADDED
Binary file (8.21 kB). View file
 
resources/active_frames_full.csv ADDED
@@ -0,0 +1,1229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ frame,active,notes
2
+ Abandonment,TRUE,
3
+ Abounding_with,FALSE,
4
+ Absorb_heat,FALSE,
5
+ Abundance,FALSE,
6
+ Abusing,TRUE,
7
+ Access_scenario,FALSE,
8
+ Accompaniment,FALSE,
9
+ Accomplishment,TRUE,
10
+ Accoutrements,FALSE,
11
+ Accuracy,FALSE,
12
+ Achieving_first,TRUE,
13
+ Active_substance,TRUE,
14
+ Activity,TRUE,
15
+ Activity_abandoned_state,TRUE,
16
+ Activity_done_state,TRUE,
17
+ Activity_finish,TRUE,
18
+ Activity_ongoing,TRUE,
19
+ Activity_pause,TRUE,
20
+ Activity_paused_state,TRUE,
21
+ Activity_prepare,TRUE,
22
+ Activity_ready_state,FALSE,
23
+ Activity_resume,TRUE,
24
+ Activity_start,TRUE,
25
+ Activity_stop,TRUE,
26
+ Actually_occurring_entity,FALSE,
27
+ Addiction,FALSE,
28
+ Adding_up,TRUE,
29
+ Adducing,TRUE,
30
+ Adjacency,FALSE,
31
+ Adjusting,TRUE,
32
+ Adopt_selection,TRUE,
33
+ Aesthetics,FALSE,
34
+ Affirm_or_deny,TRUE,
35
+ Age,FALSE,
36
+ Aggregate,FALSE,
37
+ Aging,FALSE,
38
+ Agree_or_refuse_to_act,TRUE,
39
+ Agriculture,TRUE,
40
+ Aiming,TRUE,
41
+ Alignment_image_schema,FALSE,
42
+ Alliance,FALSE,
43
+ Alternatives,FALSE,
44
+ Alternativity,FALSE,
45
+ Amalgamation,FALSE,
46
+ Amassing,TRUE,"according to def, Recipient arg ‘sometimes agentive’, but from EN examples seems unergative"
47
+ Ambient_temperature,FALSE,
48
+ Ammunition,FALSE,
49
+ Amounting_to,FALSE,
50
+ Animals,FALSE,
51
+ Annoyance,FALSE,
52
+ Appeal,TRUE,
53
+ Appellations,FALSE,
54
+ Apply_heat,TRUE,
55
+ Appointing,TRUE,
56
+ Architectural_part,FALSE,
57
+ Arithmetic,FALSE,
58
+ Arithmetic_commutative,FALSE,
59
+ Arithmetic_non-commutative,FALSE,
60
+ Armor,FALSE,
61
+ Arraignment,FALSE,
62
+ Arranging,TRUE,
63
+ Arrest,TRUE,
64
+ Arriving,FALSE,
65
+ Arson,TRUE,
66
+ Artifact,FALSE,
67
+ Artifact_subpart,FALSE,
68
+ Artificiality,FALSE,
69
+ Artistic_style,FALSE,
70
+ Assemble,TRUE,
71
+ Assessing,TRUE,
72
+ Assigned_location,FALSE,
73
+ Assistance,TRUE,
74
+ Asymmetric_reciprocality,TRUE,
75
+ Atonement,TRUE,
76
+ Attaching,TRUE,
77
+ Attack,TRUE,
78
+ Attempt,TRUE,
79
+ Attempt_action_scenario,FALSE,
80
+ Attempt_distant_interaction_scenario,FALSE,
81
+ Attempt_means,TRUE,
82
+ Attempt_obtain_food_scenario,FALSE,
83
+ Attempt_obtain_mineral_scenario,FALSE,
84
+ Attempt_suasion,TRUE,
85
+ Attempting_and_resolving_scenario,FALSE,
86
+ Attending,TRUE,
87
+ Attention,TRUE,
88
+ Attention_getting,FALSE,
89
+ Attitude_description,FALSE,
90
+ Attributed_information,FALSE,
91
+ Attributes,FALSE,
92
+ Authority,TRUE,
93
+ Avoiding,TRUE,
94
+ Awareness,TRUE,
95
+ Awareness_change_scenario,FALSE,
96
+ Awareness_situation,FALSE,
97
+ Awareness_status,FALSE,
98
+ Bail_decision,TRUE,
99
+ Basis_for_attribute,FALSE,
100
+ Be_in_agreement_on_action,TRUE,
101
+ Be_in_agreement_on_assessment,TRUE,
102
+ Be_on_alert,FALSE,
103
+ Be_subset_of,FALSE,
104
+ Be_translation_equivalent,FALSE,
105
+ Bearing_arms,TRUE,
106
+ Beat_opponent,TRUE,
107
+ Becoming,FALSE,
108
+ Becoming_a_member,TRUE,
109
+ Becoming_attached,FALSE,
110
+ Becoming_aware,TRUE,
111
+ Becoming_detached,FALSE,
112
+ Becoming_dry,FALSE,
113
+ Becoming_separated,FALSE,
114
+ Becoming_silent,TRUE,
115
+ Becoming_visible,FALSE,
116
+ Behind_the_scenes,TRUE,
117
+ Being_active,TRUE,
118
+ Being_at_risk,FALSE,
119
+ Being_attached,FALSE,
120
+ Being_awake,TRUE,
121
+ Being_born,FALSE,
122
+ Being_contained_within,FALSE,
123
+ Being_detached,FALSE,
124
+ Being_dry,FALSE,
125
+ Being_employed,TRUE,
126
+ Being_in_captivity,FALSE,
127
+ Being_in_category,FALSE,
128
+ Being_in_control,TRUE,
129
+ Being_in_effect,FALSE,
130
+ Being_in_operation,TRUE,
131
+ Being_incarcerated,TRUE,e.g. ‘serve time’
132
+ Being_included,FALSE,
133
+ Being_located,TRUE,e.g. ‘sit on’
134
+ Being_named,FALSE,
135
+ Being_necessary,FALSE,
136
+ Being_obligated,FALSE,
137
+ Being_obligatory,FALSE,"? only verb ‘behoove’, not sure about properties"
138
+ Being_operational,TRUE,"e.g. ‘works, is working’"
139
+ Being_pregnant,FALSE,
140
+ Being_questionable,FALSE,
141
+ Being_relevant,FALSE,"? ‘pertain’ seems ergative syntactically, but not sure"
142
+ Being_rotted,FALSE,
143
+ Being_up_to_it,FALSE,
144
+ Being_wet,FALSE,
145
+ Besieging,TRUE,
146
+ Beyond_compare,FALSE,
147
+ Billing,TRUE,
148
+ Biological_area,FALSE,
149
+ Biological_classification,FALSE,
150
+ Biological_entity,FALSE,
151
+ Biological_mechanisms,TRUE,
152
+ Biological_urge,FALSE,
153
+ Birth_scenario,FALSE,
154
+ Board_vehicle,TRUE,
155
+ Body_decoration,FALSE,
156
+ Body_description_holistic,FALSE,
157
+ Body_description_part,FALSE,
158
+ Body_mark,FALSE,
159
+ Body_movement,TRUE,
160
+ Body_parts,FALSE,
161
+ Bond_maturation,TRUE,
162
+ Borrowing,TRUE,
163
+ Boundary,FALSE,
164
+ Bounded_entity,FALSE,
165
+ Bounded_region,FALSE,
166
+ Bragging,TRUE,
167
+ Breaking_apart,FALSE,
168
+ Breaking_off,FALSE,
169
+ Breaking_out_captive,TRUE,
170
+ Breathing,TRUE,
171
+ Bringing,TRUE,
172
+ Building,TRUE,
173
+ Building_subparts,FALSE,
174
+ Buildings,FALSE,
175
+ Bungling,TRUE,
176
+ Burying,TRUE,
177
+ Business_closure,TRUE,'founder’ seems ambiguous (?foundered efforts? foundered business?) but ‘close doors’ is clearly unergative
178
+ Businesses,FALSE,
179
+ Cache,FALSE,
180
+ Calendric_unit,FALSE,
181
+ Candidness,FALSE,
182
+ Capability,TRUE,
183
+ Capacity,FALSE,verbs difficult to judge (e.g. ‘a stadium seats’ ??)
184
+ Capital_stock,FALSE,
185
+ Cardinal_numbers,FALSE,
186
+ Carry_goods,TRUE,
187
+ Catastrophe,TRUE,
188
+ Catching_fire,FALSE,
189
+ Categorization,TRUE,
190
+ Causation,TRUE,
191
+ Causation_scenario,TRUE,
192
+ Cause_benefit_or_detriment,TRUE,
193
+ Cause_bodily_experience,TRUE,
194
+ Cause_change,TRUE,
195
+ Cause_change_of_consistency,TRUE,
196
+ Cause_change_of_phase,TRUE,
197
+ Cause_change_of_position_on_a_scale,TRUE,
198
+ Cause_change_of_strength,TRUE,
199
+ Cause_emotion,TRUE,
200
+ Cause_expansion,TRUE,
201
+ Cause_fluidic_motion,TRUE,
202
+ Cause_harm,TRUE,
203
+ Cause_impact,TRUE,
204
+ Cause_motion,TRUE,
205
+ Cause_proliferation_in_number,TRUE,
206
+ Cause_temperature_change,TRUE,
207
+ Cause_to_amalgamate,TRUE,
208
+ Cause_to_be_dry,TRUE,
209
+ Cause_to_be_included,TRUE,
210
+ Cause_to_be_sharp,TRUE,
211
+ Cause_to_be_wet,TRUE,
212
+ Cause_to_burn,TRUE,
213
+ Cause_to_continue,TRUE,
214
+ Cause_to_end,TRUE,
215
+ Cause_to_experience,TRUE,
216
+ Cause_to_fragment,TRUE,
217
+ Cause_to_land,TRUE,
218
+ Cause_to_make_noise,TRUE,
219
+ Cause_to_make_progress,TRUE,
220
+ Cause_to_move_in_place,TRUE,
221
+ Cause_to_perceive,TRUE,
222
+ Cause_to_resume,TRUE,
223
+ Cause_to_rot,TRUE,
224
+ Cause_to_start,TRUE,
225
+ Cause_to_wake,TRUE,
226
+ Ceasing_to_be,FALSE,
227
+ Certainty,TRUE,
228
+ Change_accessibility,TRUE,
229
+ Change_direction,TRUE,"1st FE is called Theme, but verbs (‘turn’) pass test"
230
+ Change_event_duration,TRUE,
231
+ Change_event_time,TRUE,
232
+ Change_of_consistency,FALSE,
233
+ Change_of_leadership,TRUE,Selector is not 1st FE but seems to be the subj of all of the verbs in the frame
234
+ Change_of_phase,FALSE,'the lake froze’ → ‘the frozen lake’ => unaccusative
235
+ Change_of_phase_scenario,FALSE,
236
+ Change_of_quantity_of_possession,TRUE,
237
+ Change_of_temperature,FALSE,
238
+ Change_operational_state,TRUE,
239
+ Change_position_on_a_scale,FALSE,
240
+ Change_post-state,FALSE,
241
+ Change_posture,TRUE,
242
+ Change_resistance,TRUE,
243
+ Change_tool,TRUE,
244
+ Chaos,FALSE,
245
+ Chatting,TRUE,
246
+ Chemical-sense_description,TRUE,
247
+ Chemical_potency,FALSE,
248
+ Choosing,TRUE,
249
+ Circumscribed_existence,FALSE,
250
+ Citing,TRUE,
251
+ Claim_ownership,TRUE,
252
+ Clemency,FALSE,no verbs but describes active event
253
+ Closure,TRUE,
254
+ Clothing,FALSE,
255
+ Clothing_parts,FALSE,
256
+ Co-association,FALSE,
257
+ Cogitation,TRUE,
258
+ Cognitive_connection,TRUE,
259
+ Coincidence,FALSE,
260
+ Collaboration,TRUE,
261
+ Collocation_image_schema,FALSE,
262
+ Colonization,TRUE,
263
+ Color,FALSE,
264
+ Color_qualities,FALSE,
265
+ Come_down_with,TRUE,
266
+ Come_into_effect,FALSE,"not sure, ‘terms and conditions apply’ → ‘the applied terms and conditions’?"
267
+ Come_together,TRUE,
268
+ Coming_to_be,FALSE,
269
+ Coming_to_believe,TRUE,
270
+ Coming_up_with,TRUE,
271
+ Commemorative,FALSE,
272
+ Commerce_buy,TRUE,
273
+ Commerce_collect,TRUE,
274
+ Commerce_goods-transfer,TRUE,
275
+ Commerce_money-transfer,TRUE,
276
+ Commerce_pay,TRUE,
277
+ Commerce_scenario,TRUE,
278
+ Commerce_sell,TRUE,
279
+ Commercial_transaction,TRUE,
280
+ Commitment,TRUE,
281
+ Committing_crime,TRUE,
282
+ Commonality,FALSE,
283
+ Communicate_categorization,TRUE,
284
+ Communication,TRUE,
285
+ Communication_manner,TRUE,
286
+ Communication_means,TRUE,
287
+ Communication_noise,TRUE,
288
+ Communication_response,TRUE,
289
+ Commutation,TRUE,
290
+ Commutative_process,TRUE,"hard to judge, no examples"
291
+ Commutative_statement,FALSE,
292
+ Compatibility,FALSE,"Difficult, but it’s 'rhyming words’ not ‘rhymed words’; ‘matched’/’matching’ both possible but ‘matched’ implies an agent (probably a different frame)"
293
+ Competition,TRUE,
294
+ Complaining,TRUE,
295
+ Completeness,FALSE,
296
+ Compliance,TRUE,e.g. ‘obey’
297
+ Concessive,FALSE,
298
+ Condition_symptom_relation,TRUE,
299
+ Conditional_occurrence,FALSE,
300
+ Conditional_scenario,FALSE,
301
+ Conduct,TRUE,
302
+ Confronting_problem,TRUE,
303
+ Connecting_architecture,FALSE,
304
+ Connectors,FALSE,
305
+ Conquering,TRUE,
306
+ Contact_image_schema,FALSE,
307
+ Contacting,TRUE,
308
+ Container_focused_placing,TRUE,
309
+ Container_focused_removing,TRUE,
310
+ Containers,FALSE,
311
+ Containing,TRUE,
312
+ Containment_scenario,FALSE,
313
+ Contingency,TRUE,
314
+ Continued_state_of_affairs,FALSE,
315
+ Contrary_circumstances,FALSE,
316
+ Contrition,TRUE,
317
+ Control,TRUE,
318
+ Controller_object,FALSE,
319
+ Convey_importance,TRUE,
320
+ Convoy,FALSE,
321
+ Cooking_creation,TRUE,
322
+ Corporal_punishment,TRUE,
323
+ Correctness,FALSE,
324
+ Corroding,FALSE,
325
+ Corroding_caused,TRUE,
326
+ Cotheme,TRUE,"1st FE is called Theme, but verbs (e.g. ‘follow’) seem agentive"
327
+ Counterattack,TRUE,
328
+ Court_examination,TRUE,
329
+ Craft,FALSE,
330
+ Create_physical_artwork,TRUE,
331
+ Create_representation,TRUE,
332
+ Creating,TRUE,
333
+ Crime_scenario,FALSE,
334
+ Criminal_investigation,TRUE,
335
+ Criminal_process,FALSE,
336
+ Cure,TRUE,
337
+ Custom,FALSE,
338
+ Cutting,TRUE,
339
+ Cycle_of_existence_scenario,FALSE,
340
+ Cycle_of_life_and_death,FALSE,
341
+ Damaging,TRUE,
342
+ Daring,TRUE,
343
+ Dead_or_alive,FALSE,
344
+ Death,FALSE,
345
+ Deception_end,TRUE,
346
+ Deception_scenario,FALSE,
347
+ Deception_success,TRUE,
348
+ Deciding,TRUE,
349
+ Defending,TRUE,
350
+ Degree,FALSE,
351
+ Degree_of_processing,FALSE,
352
+ Delimitation_of_diversity,FALSE,
353
+ Delimited_state_scenario,FALSE,
354
+ Delivery,TRUE,
355
+ Deny_or_grant_permission,TRUE,
356
+ Departing,TRUE,Directional movement verbs
357
+ Deserving,TRUE,"verbs don’t seem to be able to be used intransitively, but seem ‘active-like’ (e.g. ‘justify’)"
358
+ Desirability,TRUE,? e.g. ‘the movie rocks/sucks’ (metaphorical but derives from active verb?)
359
+ Desirable_event,TRUE,
360
+ Desiring,TRUE,
361
+ Destiny,FALSE,
362
+ Destroying,TRUE,
363
+ Detaching,TRUE,
364
+ Detaining,TRUE,
365
+ Detonate_explosive,TRUE,
366
+ Differentiation,TRUE,
367
+ Difficulty,FALSE,
368
+ Dimension,FALSE,"only one verb (‘measure’), cannot be intransitive"
369
+ Direction,FALSE,
370
+ Directional_locative_relation,FALSE,
371
+ Disaster_scenario,FALSE,
372
+ Discussion,TRUE,
373
+ Disembarking,TRUE,Directional movement verbs
374
+ Disgraceful_situation,FALSE,
375
+ Dispersal,TRUE,
376
+ Distance_scenario,FALSE,
377
+ Distant_operated_IED,FALSE,
378
+ Distinctiveness,TRUE,'characterize’
379
+ Distributed_abundanced,FALSE,
380
+ Distributed_position,TRUE,?? not sure any of the verbs can ever be intransitive
381
+ Diversity,FALSE,
382
+ Documents,FALSE,
383
+ Dodging,TRUE,
384
+ Domain,FALSE,
385
+ Dominate_competitor,TRUE,
386
+ Dominate_situation,TRUE,
387
+ Domination,TRUE,
388
+ Dough_rising,FALSE,
389
+ Downing,TRUE,
390
+ Dressing,TRUE,
391
+ Drop_in_on,TRUE,
392
+ Dunking,TRUE,
393
+ Duplication,TRUE,
394
+ Duration_description,FALSE,
395
+ Duration_relation,FALSE,'it lasted’ → *’the lasted thing’ (but ??’the persisted thing’)
396
+ Duration_scenario,FALSE,
397
+ Dying,FALSE,
398
+ Dynamic_situation_scenario,FALSE,
399
+ Dynamism,FALSE,
400
+ Earnings_and_losses,TRUE,
401
+ Eclipse,TRUE,
402
+ Economy,FALSE,
403
+ Education_teaching,TRUE,
404
+ Electricity,FALSE,
405
+ Elusive_goal,FALSE,
406
+ Emanating,FALSE,
407
+ Emergency,FALSE,
408
+ Emergency_fire,FALSE,
409
+ Emitting,TRUE,
410
+ Emotion_directed,TRUE,
411
+ Emotion_heat,TRUE,
412
+ Emotions,FALSE,
413
+ Emotions_by_possibility,FALSE,
414
+ Emotions_by_stimulus,FALSE,
415
+ Emotions_of_mental_activity,TRUE,
416
+ Emotions_success_or_failure,FALSE,Contains no verbs but theoretically possible that it would?
417
+ Emphasizing,TRUE,
418
+ Employee_scenario,FALSE,
419
+ Employer_scenario,FALSE,
420
+ Employing,TRUE,
421
+ Employment_continue,FALSE,
422
+ Employment_end,FALSE,
423
+ Employment_scenario,FALSE,
424
+ Employment_start,FALSE,
425
+ Emptying,TRUE,
426
+ Encoding,TRUE,
427
+ Encounter,TRUE,
428
+ Endangering,TRUE,
429
+ Endeavor_failure,FALSE,
430
+ Enforcing,TRUE,
431
+ Enter_awareness,TRUE,
432
+ Entering_of_plea,TRUE,?
433
+ Entity,FALSE,
434
+ Entourage,FALSE,
435
+ Erasing,TRUE,
436
+ Escaping,TRUE,
437
+ Estimated_value,FALSE,
438
+ Estimating,TRUE,
439
+ Evading,TRUE,
440
+ Evaluative_comparison,TRUE,
441
+ Event,FALSE,
442
+ Event_endstate,FALSE,
443
+ Event_initial_state,FALSE,
444
+ Event_instance,FALSE,
445
+ Eventive_affecting,FALSE,
446
+ Eventive_cognizer_affecting,TRUE,
447
+ Evidence,TRUE,
448
+ Evoking,TRUE,
449
+ Examination,TRUE,
450
+ Exchange,TRUE,
451
+ Exchange_currency,TRUE,
452
+ Exclude_member,TRUE,
453
+ Excreting,TRUE,
454
+ Execute_plan,TRUE,
455
+ Execution,TRUE,
456
+ Exemplar,FALSE,
457
+ Exemplariness,FALSE,
458
+ Exercising,TRUE,
459
+ Existence,FALSE,
460
+ Expansion,FALSE,
461
+ Expectation,TRUE,
462
+ Expected_location_of_person,FALSE,
463
+ Expend_resource,TRUE,
464
+ Expensiveness,TRUE,
465
+ Experience_bodily_harm,FALSE,syntactically ambiguous? ‘I broke my leg’ / ‘mi sono rotto la gamba’ / ‘je me suis cassé la jambe’ → not sure how to classify this construction in romance languages?
466
+ Experiencer_focused_emotion,TRUE,
467
+ Experimentation,TRUE,
468
+ Expertise,TRUE,
469
+ Explaining_the_facts,TRUE,
470
+ Explosion,FALSE,
471
+ Exporting,TRUE,
472
+ Expressing_publicly,TRUE,
473
+ Extradition,TRUE,
474
+ Extreme_point,FALSE,
475
+ Extreme_value,FALSE,
476
+ Facial_expression,FALSE,
477
+ Fairness_evaluation,FALSE,
478
+ Fall_asleep,FALSE,
479
+ Fall_for,TRUE,"'buy’, ‘swallow’ seem clear transitive, ‘fall for’ could maybe be unaccusative??"
480
+ Fame,TRUE,
481
+ Familiarity,TRUE,
482
+ Fastener,FALSE,
483
+ Fear,TRUE,
484
+ Feeling,TRUE,
485
+ Feigning,TRUE,
486
+ Fields,FALSE,
487
+ Fighting_activity,FALSE,
488
+ Filling,TRUE,
489
+ Fining,TRUE,
490
+ Finish_competition,TRUE,
491
+ Finish_game,TRUE,
492
+ Fire_break,FALSE,
493
+ Fire_burning,TRUE,
494
+ Fire_emergency_scenario,FALSE,
495
+ Fire_end_scenario,FALSE,
496
+ Fire_going_out,FALSE,
497
+ Fire_stopping_scenario,FALSE,
498
+ Firefighting,TRUE,
499
+ Firing,TRUE,
500
+ Firing_point,FALSE,
501
+ First_experience,FALSE,
502
+ First_rank,FALSE,
503
+ Fleeing,TRUE,
504
+ Fluidic_motion,TRUE,
505
+ Food,FALSE,
506
+ Food_gathering,TRUE,
507
+ Foreign_or_domestic_country,FALSE,
508
+ Forging,TRUE,
509
+ Forgiveness,TRUE,
510
+ Forgoing,TRUE,
511
+ Forming_relationships,TRUE,
512
+ Fragmentation_scenario,FALSE,
513
+ Freeing_from_confinement,TRUE,
514
+ Frequency,FALSE,
515
+ Friction,TRUE,"difficult, but “Theme exerts pressure and experiences resistance” could be interpreted as somehow (quasi-)agentive"
516
+ Friendly_or_hostile,FALSE,
517
+ Front_for,TRUE,(front.v ?)
518
+ Frugality,TRUE,
519
+ Fugitive,FALSE,
520
+ Fullness,FALSE,
521
+ Function,TRUE,"difficult: inanimate theme, but “exists to perform Activity” so perhaps (metaphorically) can be seen as active?"
522
+ Funding,TRUE,
523
+ Gathering_up,TRUE,
524
+ Gesture,TRUE,
525
+ Get_a_job,TRUE,
526
+ Getting,TRUE,"somewhat difficult: most verbs (obtain, acquire, procure) imply active action, but the definition of the frame and verbs like “get” can be seen as passive.
527
+
528
+ Interesting: in Dutch there is “verkrijgen” (obtain) vs. “krijgen” (get) where this distinction is encoded morphologically"
529
+ Getting_scenario,FALSE,
530
+ Getting_triggered,FALSE,
531
+ Getting_underway,TRUE,
532
+ Getting_up,TRUE,
533
+ Getting_vehicle_underway,TRUE,
534
+ Give_impression,TRUE,not sure but these all seem (semantically) causative somehow? e.g. “smell good” ~= “cause pleasant taste perception”?
535
+ Giving,TRUE,
536
+ Giving_birth,TRUE,
537
+ Giving_in,TRUE,
538
+ Giving_scenario,FALSE,
539
+ Gizmo,FALSE,
540
+ Go_into_shape,FALSE,"per description “Theme goes into a shape without being made to do so by an agent”, but does that make it agentive?"
541
+ Goal,FALSE,
542
+ Going_back_on_a_commitment,TRUE,
543
+ Government_institution,FALSE,
544
+ Gradable_artistic_quality,FALSE,
545
+ Gradable_attributes,FALSE,
546
+ Gradable_proximity,FALSE,
547
+ Graph_shape,FALSE,
548
+ Grasp,TRUE,
549
+ Grinding,TRUE,
550
+ Grooming,TRUE,
551
+ Ground_up,FALSE,
552
+ Growing_food,TRUE,
553
+ Guest_and_host,FALSE,
554
+ Guilt_or_innocence,FALSE,
555
+ Gusto,FALSE,
556
+ Hair_configuration,FALSE,
557
+ Halt,TRUE,"motion verbs (‘has stopped’ etc)
558
+
559
+ 'stop’ seems to imply agent? ‘the cyclist stopped’, ‘the train stopped’ (metaphorical?), but ??the falling rock stopped (?)"
560
+ Have_as_requirement,TRUE,"metaphorical use of demand, require, take?"
561
+ Have_as_translation_equivalent,FALSE,
562
+ Have_associated,FALSE,metaphorical use of have?
563
+ Have_visitor_over,FALSE,
564
+ Having_commercial_agreement,FALSE,
565
+ Having_or_lacking_access,FALSE,"verb ‘access’ is agentive but seems to be misclassified in this frame (it means to enter, not to have access)"
566
+ Health_response,FALSE,
567
+ Hearsay,TRUE,
568
+ Heat_potential,FALSE,
569
+ Hedging,FALSE,
570
+ Heralding,TRUE,
571
+ Hiding_objects,TRUE,
572
+ Hindering,TRUE,
573
+ Hiring,TRUE,
574
+ Historic_event,FALSE,
575
+ History,FALSE,
576
+ History_scenario,FALSE,
577
+ Hit_or_miss,TRUE,
578
+ Hit_target,TRUE,
579
+ Holding_off_on,TRUE,
580
+ Hospitality,FALSE,
581
+ Hostile_encounter,TRUE,
582
+ Hunting,TRUE,
583
+ Hunting_scenario,FALSE,
584
+ Hunting_success_or_failure,TRUE,
585
+ Identicality,FALSE,
586
+ Identity,FALSE,
587
+ Identity_scenario,FALSE,
588
+ Idiosyncrasy,FALSE,
589
+ Image_schema,FALSE,
590
+ Imitating,TRUE,
591
+ Immobilization,TRUE,
592
+ Impact,TRUE,
593
+ Import_export_scenario,TRUE,
594
+ Importance,TRUE,
595
+ Importing,TRUE,
596
+ Imposing_obligation,TRUE,
597
+ Impression,FALSE,
598
+ Imprisonment,TRUE,
599
+ Improvement_or_decline,FALSE,
600
+ Improvised_explosive_device,FALSE,
601
+ Inclination,FALSE,
602
+ Inclusion,TRUE,
603
+ Inclusion_scenario,FALSE,
604
+ Increment,FALSE,
605
+ Indicating,TRUE,
606
+ Indigenous_origin,FALSE,
607
+ Individual_history,FALSE,
608
+ Ineffability,FALSE,
609
+ Infecting,TRUE,
610
+ Influencing_potential,FALSE,
611
+ Information,FALSE,
612
+ Information_display,FALSE,
613
+ Infrastructure,FALSE,
614
+ Ingest_substance,TRUE,
615
+ Ingestion,TRUE,
616
+ Ingredients,FALSE,
617
+ Inherent_purpose,FALSE,
618
+ Inhibit_motion_scenario,FALSE,
619
+ Inhibit_movement,TRUE,
620
+ Inspecting,TRUE,
621
+ Installing,TRUE,
622
+ Instance,FALSE,
623
+ Institutionalization,TRUE,
624
+ Institutions,FALSE,
625
+ Intentional_deception,TRUE,
626
+ Intentional_traversing,TRUE,
627
+ Intentionally_act,TRUE,
628
+ Intentionally_affect,TRUE,
629
+ Intentionally_create,TRUE,
630
+ Intercepting,TRUE,
631
+ Interior_profile_relation,FALSE,
632
+ Interrupt_process,TRUE,
633
+ Intoxicants,FALSE,
634
+ Intoxication,FALSE,
635
+ Invading,TRUE,
636
+ Invasion_scenario,FALSE,
637
+ Irregular_combatants,FALSE,
638
+ Isolated_places,FALSE,
639
+ Judgment,TRUE,
640
+ Judgment_communication,TRUE,
641
+ Judgment_direct_address,TRUE,
642
+ Judgment_of_intensity,FALSE,
643
+ Judicial_body,FALSE,
644
+ Jury_deliberation,TRUE,
645
+ Just_found_out,FALSE,
646
+ Justifying,TRUE,
647
+ Key,FALSE,
648
+ Kidnapping,TRUE,
649
+ Killing,TRUE,
650
+ Kinship,FALSE,
651
+ Knot_creation,TRUE,
652
+ Knot_creation_scenario,FALSE,
653
+ Labeling,TRUE,
654
+ Labor_product,FALSE,
655
+ Launch_process,TRUE,
656
+ Law,FALSE,
657
+ Law_enforcement_agency,FALSE,
658
+ Leadership,TRUE,
659
+ Leaving_traces,FALSE,
660
+ Left_to_do,FALSE,
661
+ Legal_rulings,TRUE,
662
+ Legality,FALSE,
663
+ Lending,TRUE,
664
+ Level_of_force_exertion,FALSE,
665
+ Level_of_force_resistance,FALSE,
666
+ Level_of_light,FALSE,
667
+ Light_movement,TRUE,metaphorical
668
+ Likelihood,FALSE,
669
+ Limitation,FALSE,
670
+ Limiting,TRUE,
671
+ Linguistic_meaning,TRUE,? metaphorical active?
672
+ Lively_place,TRUE,? ‘the park buzzes’ can be seen as metaphorically active?
673
+ Living_conditions,FALSE,
674
+ Locale,FALSE,
675
+ Locale_by_characteristic_entity,FALSE,
676
+ Locale_by_collocation,FALSE,
677
+ Locale_by_event,FALSE,
678
+ Locale_by_ownership,FALSE,
679
+ Locale_by_use,FALSE,
680
+ Locale_closure,FALSE,
681
+ Locating,TRUE,
682
+ Location_in_time,FALSE,
683
+ Location_of_light,FALSE,seems anticausative of Light_movement?
684
+ Location_on_path,FALSE,
685
+ Locative_relation,TRUE,
686
+ Locative_scenario,FALSE,
687
+ Lodging_scenario,FALSE,
688
+ Lose_possession,FALSE,
689
+ Lose_possession_scenario,FALSE,
690
+ Losing,TRUE,'losing sth’ could be either passive or active; in Dutch “heb verloren/ben verloren” are both possible (sensitive to the contrast?)
691
+ Losing_it,TRUE,
692
+ Losing_someone,TRUE,
693
+ Losing_track_of,FALSE,
694
+ Losing_track_of_perceiver,TRUE,
695
+ Losing_track_of_theme,TRUE,
696
+ Luck,FALSE,
697
+ Make_acquaintance,TRUE,
698
+ Make_agreement_on_action,TRUE,
699
+ Make_cognitive_connection,TRUE,
700
+ Make_compromise,TRUE,
701
+ Make_noise,TRUE,
702
+ Making_arrangements,TRUE,
703
+ Making_faces,TRUE,
704
+ Manipulate_into_doing,TRUE,
705
+ Manipulate_into_shape,TRUE,
706
+ Manipulation,TRUE,
707
+ Manner,FALSE,
708
+ Manner_of_life,TRUE,
709
+ Manufacturing,TRUE,
710
+ Margin_of_resolution,FALSE,
711
+ Mass_motion,FALSE,?
712
+ Mathematical_relationship,FALSE,
713
+ Means,FALSE,
714
+ Measurable_attributes,FALSE,
715
+ Measure_area,FALSE,
716
+ Measure_by_action,FALSE,
717
+ Measure_duration,FALSE,
718
+ Measure_mass,FALSE,
719
+ Measure_of_distance_and_length,FALSE,
720
+ Measure_scenario,FALSE,
721
+ Measure_volume,FALSE,
722
+ Measures,FALSE,
723
+ Medical_conditions,FALSE,
724
+ Medical_instruments,FALSE,
725
+ Medical_interaction_scenario,FALSE,
726
+ Medical_intervention,TRUE,
727
+ Medical_professionals,FALSE,
728
+ Medical_specialties,FALSE,
729
+ Medium,FALSE,
730
+ Meet_specifications,TRUE,
731
+ Meet_with,TRUE,
732
+ Meet_with_response,TRUE,? ‘meet with criticism’ ~= elicit criticism?
733
+ Member_of_military,FALSE,
734
+ Membership,FALSE,? belong
735
+ Memorization,TRUE,
736
+ Memory,TRUE,
737
+ Mental_activity,FALSE,
738
+ Mental_property,FALSE,
739
+ Mental_stimulus_exp_focus,FALSE,
740
+ Mental_stimulus_stimulus_focus,FALSE,
741
+ Mention,TRUE,
742
+ Military,FALSE,
743
+ Military_operation,TRUE,
744
+ Mining,TRUE,
745
+ Misdeed,TRUE,
746
+ Money,FALSE,
747
+ Morality_evaluation,FALSE,
748
+ Motion,FALSE,"? this frame is meant to have the non-active versions of the motion verbs (as opposed to Self_motion, Operate_vehicle etc), but in practice it seems many examples are actually active"
749
+ Motion_directional,FALSE,?
750
+ Motion_noise,FALSE,?
751
+ Motion_scenario,FALSE,
752
+ Moving_in_place,FALSE,?
753
+ Name_conferral,TRUE,
754
+ Namesake,FALSE,
755
+ Natural_features,FALSE,
756
+ Needing,TRUE,
757
+ Negation,FALSE,
758
+ Negative_conditional,FALSE,
759
+ Network,FALSE,
760
+ Noise_makers,FALSE,
761
+ Non-commutative_process,TRUE,active when Calculator (non-core) is present; there are no examples so not clear how often this is the case
762
+ Non-commutative_statement,FALSE,
763
+ Non-gradable_proximity,FALSE,
764
+ Noncombatant,FALSE,
765
+ Notability,FALSE,
766
+ Notification_of_charges,TRUE,
767
+ Nuclear_process,FALSE,
768
+ Objective_influence,TRUE,
769
+ Obligation_scenario,FALSE,
770
+ Obscurity,FALSE,
771
+ Obviousness,FALSE,
772
+ Occupy_rank,FALSE,'he ranks second’ ~= ‘he is ranked second’
773
+ Offenses,FALSE,
774
+ Offering,TRUE,
775
+ Offshoot,FALSE,
776
+ Omen,TRUE,Predictive_phenomenon (quasi-actively) provides cues for something
777
+ Ontogeny,FALSE,
778
+ Openness,FALSE,
779
+ Operate_vehicle,TRUE,
780
+ Operate_vehicle_scenario,FALSE,
781
+ Operating_a_system,TRUE,
782
+ Operational_testing,TRUE,
783
+ Opinion,TRUE,
784
+ Opportunity,FALSE,
785
+ Optical_image,FALSE,
786
+ Ordinal_numbers,FALSE,
787
+ Organization,FALSE,
788
+ Origin,FALSE,
789
+ Others_situation_as_stimulus,TRUE,
790
+ Out_of_existence,FALSE,
791
+ Pardon,TRUE,
792
+ Part_edge,FALSE,
793
+ Part_inner_outer,FALSE,
794
+ Part_ordered_segments,FALSE,
795
+ Part_orientational,FALSE,
796
+ Part_piece,FALSE,
797
+ Part_whole,FALSE,
798
+ Partiality,TRUE,
799
+ Participation,TRUE,
800
+ Partitive,FALSE,
801
+ Passing,TRUE,
802
+ Passing_off,TRUE,
803
+ Path_shape,FALSE,? are syntactically unergative
804
+ Path_traveled,FALSE,
805
+ Patrolling,TRUE,
806
+ Pattern,FALSE,
807
+ People,FALSE,
808
+ People_along_political_spectrum,FALSE,
809
+ People_by_age,FALSE,
810
+ People_by_jurisdiction,FALSE,
811
+ People_by_military_specialty,FALSE,
812
+ People_by_morality,FALSE,
813
+ People_by_origin,FALSE,
814
+ People_by_religion,FALSE,
815
+ People_by_residence,FALSE,
816
+ People_by_vocation,FALSE,
817
+ Perception,FALSE,
818
+ Perception_active,TRUE,
819
+ Perception_body,TRUE,? 'my head hurts’ → ‘my head causes pain sensation’ (?)
820
+ Perception_experience,TRUE,"? these are ‘passive experience’ verbs, but syntactically active/unergative; you can argue this kind of perception still implies active processing on the part of the experiencer"
821
+ Performers,FALSE,
822
+ Performers_and_roles,TRUE,"'feature.v’ is an odd one because agent/patient are reversed (NB only one annotated example, FEs do not seem correct)"
823
+ Performing_arts,FALSE,
824
+ Personal_relationship,TRUE,"Mostly nouns/adj, but verbs 'sleep with’, ‘befriend’ are active"
825
+ Personal_success,TRUE,
826
+ Physical_artworks,FALSE,
827
+ Physical_entity,FALSE,
828
+ Piracy,TRUE,
829
+ Placing,TRUE,
830
+ Placing_scenario,FALSE,
831
+ Planned_trajectory,FALSE,
832
+ Planting,TRUE,
833
+ Plants,FALSE,
834
+ Point_of_dispute,FALSE,
835
+ Political_actions,TRUE,
836
+ Political_locales,FALSE,
837
+ Popularity,FALSE,
838
+ Posing_as,TRUE,
839
+ Position_on_a_scale,FALSE,
840
+ Possession,TRUE,? owning as action?
841
+ Possibility,FALSE,
842
+ Post_getting,FALSE,
843
+ Post_giving,FALSE,
844
+ Post_lose_possession,FALSE,
845
+ Post_receiving,FALSE,
846
+ Post_transfer,FALSE,
847
+ Posture,TRUE,
848
+ Practice,TRUE,
849
+ Praiseworthiness,FALSE,
850
+ Prank,FALSE,
851
+ Pre_getting,FALSE,
852
+ Pre_giving,FALSE,
853
+ Pre_lose_possession,FALSE,
854
+ Pre_receiving,FALSE,
855
+ Pre_transfer,FALSE,
856
+ Precariousness,TRUE,only verb teeter.v
857
+ Precipitation,FALSE,
858
+ Predicament,FALSE,
859
+ Predicting,TRUE,
860
+ Preference,TRUE,
861
+ Preferred_alternative_scenario,FALSE,
862
+ Preliminaries,FALSE,
863
+ Presence,FALSE,
864
+ Presentation_of_mitigation,FALSE,
865
+ Preserving,TRUE,
866
+ Prevarication,TRUE,
867
+ Prevent_or_allow_possession,TRUE,
868
+ Preventing_or_letting,TRUE,
869
+ Price_per_unit,FALSE,
870
+ Prison,FALSE,
871
+ Probability,FALSE,
872
+ Process,FALSE,
873
+ Process_completed_state,FALSE,
874
+ Process_continue,FALSE,
875
+ Process_end,FALSE,
876
+ Process_initial_state,FALSE,
877
+ Process_pause,FALSE,
878
+ Process_resume,FALSE,
879
+ Process_start,FALSE,
880
+ Process_stop,FALSE,
881
+ Process_stopped_state,FALSE,
882
+ Process_uncompleted_state,FALSE,
883
+ Processing_materials,TRUE,
884
+ Procreative_sex,TRUE,
885
+ Product_delivery,FALSE,
886
+ Product_development,TRUE,
887
+ Product_development_scenario,FALSE,
888
+ Product_line,FALSE,
889
+ Progression,FALSE,
890
+ Prohibiting_or_licensing,TRUE,
891
+ Project,FALSE,
892
+ Proliferating_in_number,FALSE,
893
+ Prominence,FALSE,
894
+ Proper_reference,FALSE,
895
+ Proportion,FALSE,
896
+ Proportional_quantity,FALSE,
897
+ Protecting,TRUE,
898
+ Protest,TRUE,
899
+ Provide_lodging,TRUE,
900
+ Proximity_image_schema,FALSE,
901
+ Public_services,FALSE,
902
+ Publishing,TRUE,
903
+ Punctual_perception,FALSE,
904
+ Purpose,FALSE,
905
+ Putting_out_fire,TRUE,
906
+ Quantified_mass,FALSE,
907
+ Quantity,FALSE,
908
+ Quarreling,TRUE,
909
+ Questioning,TRUE,
910
+ Quitting,TRUE,
911
+ Quitting_a_place,TRUE,
912
+ Race_descriptor,FALSE,
913
+ Range,FALSE,
914
+ Rank,FALSE,
915
+ Ranked_expectation,FALSE,
916
+ Rape,TRUE,
917
+ Rashness,FALSE,
918
+ Rate_description,FALSE,
919
+ Rate_quantification,FALSE,
920
+ Ratification,TRUE,
921
+ Reading_activity,TRUE,
922
+ Reading_aloud,TRUE,
923
+ Reading_perception,TRUE,
924
+ Reason,FALSE,
925
+ Reasoning,TRUE,
926
+ Reassuring,TRUE,
927
+ Rebellion,TRUE,
928
+ Receive_visitor_scenario,FALSE,
929
+ Receiving,TRUE,
930
+ Receiving_scenario,FALSE,
931
+ Reciprocality,FALSE,
932
+ Recording,TRUE,
933
+ Records,FALSE,
934
+ Recovery,FALSE,
935
+ Redirecting,TRUE,
936
+ Reference_text,FALSE,
937
+ Referring_by_name,TRUE,
938
+ Reforming_a_system,TRUE,
939
+ Regard,TRUE,
940
+ Region_with_portal,FALSE,
941
+ Reject_leadership,TRUE,does not exist in NLTK
942
+ Rejuvenation,TRUE,
943
+ Relating_concepts,TRUE,?
944
+ Relation,FALSE,
945
+ Relation_between_individuals,FALSE,
946
+ Relational_location,FALSE,
947
+ Relational_natural_features,FALSE,
948
+ Relational_political_locales,FALSE,
949
+ Relational_quantity,FALSE,
950
+ Relative_time,FALSE,?
951
+ Releasing,TRUE,
952
+ Releasing_from_custody,FALSE,
953
+ Reliance,TRUE,?
954
+ Reliance_on_expectation,TRUE,
955
+ Religious_belief,TRUE,
956
+ Remainder,FALSE,
957
+ Remembering_experience,TRUE,
958
+ Remembering_information,TRUE,
959
+ Remembering_to_do,TRUE,
960
+ Removing,TRUE,
961
+ Removing_scenario,FALSE,
962
+ Render_nonfunctional,TRUE,
963
+ Renting,TRUE,
964
+ Renting_out,TRUE,
965
+ Renunciation,TRUE,
966
+ Reparation,TRUE,
967
+ Repayment,TRUE,
968
+ Repel,TRUE,
969
+ Replacing,TRUE,
970
+ Reporting,TRUE,
971
+ Representative,FALSE,
972
+ Representing,TRUE,?
973
+ Request,TRUE,
974
+ Request_entity,TRUE,
975
+ Required_event,TRUE,?
976
+ Requirement_scenario,FALSE,
977
+ Rescuing,TRUE,
978
+ Research,TRUE,
979
+ Reserving,TRUE,
980
+ Reshaping,TRUE,
981
+ Residence,TRUE,
982
+ Resolve_problem,TRUE,
983
+ Respond_to_proposal,TRUE,
984
+ Response,TRUE,
985
+ Response_scenario,FALSE,
986
+ Responsibility,FALSE,
987
+ Rest,FALSE,
988
+ Result_of_attempt_scenario,FALSE,
989
+ Resurrection,TRUE,? similar to self_motion?
990
+ Retaining,TRUE,
991
+ Reveal_secret,TRUE,
992
+ Revenge,TRUE,
993
+ Revolution,FALSE,
994
+ Rewards_and_punishments,TRUE,
995
+ Ride_vehicle,TRUE,?
996
+ Rising_to_a_challenge,TRUE,
997
+ Risk_scenario,FALSE,
998
+ Risky_situation,FALSE,
999
+ Rite,TRUE,
1000
+ Roadways,FALSE,
1001
+ Robbery,TRUE,
1002
+ Rope_manipulation,TRUE,
1003
+ Rotting,FALSE,
1004
+ Run_risk,TRUE,
1005
+ Sacrificing_for,TRUE,
1006
+ Satisfying,TRUE,
1007
+ Scarcity,FALSE,
1008
+ Scheduling,TRUE,
1009
+ Scope,FALSE,
1010
+ Scouring,TRUE,
1011
+ Scrutinizing_for,FALSE,
1012
+ Scrutiny,TRUE,
1013
+ Searching_scenario,FALSE,
1014
+ Secrecy_status,FALSE,
1015
+ See_through,TRUE,
1016
+ Seeking,TRUE,
1017
+ Seeking_to_achieve,TRUE,
1018
+ Self_control,TRUE,
1019
+ Self_motion,TRUE,?
1020
+ Sending,TRUE,
1021
+ Sensation,FALSE,
1022
+ Sent_items,FALSE,
1023
+ Sentencing,TRUE,
1024
+ Separating,TRUE,
1025
+ Sequence,FALSE,
1026
+ Serving_in_capacity,TRUE,
1027
+ Set_of_interrelated_entities,FALSE,
1028
+ Set_relation,FALSE,
1029
+ Setting_back_burn,FALSE,
1030
+ Setting_fire,TRUE,
1031
+ Setting_out,TRUE,
1032
+ Severity_of_offense,FALSE,
1033
+ Sex,TRUE,
1034
+ Sexual_reproduction_scenario,FALSE,
1035
+ Shaped_part,FALSE,
1036
+ Shapes,FALSE,
1037
+ Sharing,TRUE,
1038
+ Sharpness,FALSE,
1039
+ Shoot_projectiles,TRUE,
1040
+ Shooting_scenario,FALSE,
1041
+ Shopping,TRUE,
1042
+ Short_selling,TRUE,
1043
+ Sidereal_appearance,FALSE,?
1044
+ Sign,TRUE,
1045
+ Sign_agreement,TRUE,
1046
+ Silencing,TRUE,
1047
+ Similarity,TRUE,? mimic etc
1048
+ Simple_name,FALSE,
1049
+ Simple_naming,TRUE,
1050
+ Simultaneity,FALSE,
1051
+ Size,FALSE,
1052
+ Sleep,TRUE,
1053
+ Sleep_wake_cycle,FALSE,
1054
+ Smuggling,TRUE,
1055
+ Soaking,TRUE,
1056
+ Soaking_up,TRUE,? metaphorical
1057
+ Sociability,FALSE,
1058
+ Social_behavior_evaluation,FALSE,
1059
+ Social_connection,FALSE,
1060
+ Social_desirability,FALSE,
1061
+ Social_event,FALSE,
1062
+ Social_event_collective,FALSE,
1063
+ Social_event_individuals,TRUE,
1064
+ Social_interaction_evaluation,FALSE,
1065
+ Socially_significant_history_scenario,FALSE,
1066
+ Sole_instance,FALSE,
1067
+ Sound_level,FALSE,
1068
+ Sound_movement,FALSE,?
1069
+ Sounds,FALSE,
1070
+ Source_of_getting,FALSE,
1071
+ Source_path_goal,FALSE,
1072
+ Spatial_co-location,FALSE,
1073
+ Spatial_contact,FALSE,
1074
+ Speak_on_topic,TRUE,
1075
+ Specific_individual,FALSE,
1076
+ Speed_description,FALSE,
1077
+ Spelling_and_pronouncing,TRUE,
1078
+ Sports_jargon,FALSE,
1079
+ Stage_of_progress,FALSE,
1080
+ Standing_by,TRUE,
1081
+ State,FALSE,
1082
+ State_continue,FALSE,
1083
+ State_of_entity,FALSE,
1084
+ Statement,TRUE,
1085
+ Stimulate_emotion,TRUE,? metaphorical
1086
+ Stimulus_focus,FALSE,
1087
+ Stinginess,TRUE,
1088
+ Store,FALSE,
1089
+ Storing,TRUE,
1090
+ Strictness,FALSE,
1091
+ Studying,TRUE,
1092
+ Suasion,TRUE,
1093
+ Subjective_influence,TRUE,
1094
+ Subjective_temperature,FALSE,?
1095
+ Submitting_documents,TRUE,
1096
+ Subordinates_and_superiors,TRUE,
1097
+ Subsisting,TRUE,
1098
+ Substance,FALSE,
1099
+ Substance_by_phase,FALSE,
1100
+ Subversion,TRUE,
1101
+ Success_or_failure,TRUE,
1102
+ Successful_action,TRUE,
1103
+ Successfully_communicate_message,TRUE,
1104
+ Sufficiency,FALSE,
1105
+ Suicide_attack,TRUE,? there are no verbs but there is an agentive example (‘they kamikazed the base’)
1106
+ Suitability,FALSE,
1107
+ Summarizing,TRUE,
1108
+ Supply,TRUE,
1109
+ Supporting,TRUE,
1110
+ Surpassing,TRUE,metaphorical
1111
+ Surrendering,TRUE,
1112
+ Surrendering_possession,TRUE,
1113
+ Surrounding,TRUE,
1114
+ Surviving,TRUE,
1115
+ Suspicion,TRUE,
1116
+ Symmetrical_collective_reciprocality,FALSE,
1117
+ System,FALSE,
1118
+ System_complexity,FALSE,
1119
+ Take_place_of,TRUE,
1120
+ Taking,TRUE,
1121
+ Taking_captive,TRUE,
1122
+ Taking_sides,TRUE,
1123
+ Taking_time,TRUE,
1124
+ Talking_into,TRUE,
1125
+ Tasting,TRUE,
1126
+ Team,FALSE,
1127
+ Telling,TRUE,
1128
+ Temperature,FALSE,
1129
+ Temporal_collocation,FALSE,
1130
+ Temporal_pattern,FALSE,
1131
+ Temporal_subregion,FALSE,
1132
+ Temporary_group,FALSE,
1133
+ Temporary_leave,TRUE,
1134
+ Temporary_stay,TRUE,
1135
+ Temporary_transfer_scenario,FALSE,
1136
+ Terms_of_agreement,FALSE,
1137
+ Terrorism,FALSE,
1138
+ Text,FALSE,
1139
+ Text_creation,TRUE,
1140
+ Theft,TRUE,
1141
+ Thermodynamic_phase,FALSE,
1142
+ Thriving,TRUE,
1143
+ Thwarting,TRUE,
1144
+ Time_period_of_action,FALSE,
1145
+ Time_vector,FALSE,
1146
+ Timespan,FALSE,
1147
+ Timetable,FALSE,
1148
+ Tolerating,TRUE,
1149
+ Tool_purpose,FALSE,
1150
+ Topic,TRUE,
1151
+ Touring,TRUE,
1152
+ Toxic_substance,FALSE,
1153
+ Transfer,TRUE,
1154
+ Transfer_scenario,FALSE,
1155
+ Transition_to_a_quality,FALSE,
1156
+ Transition_to_a_situation,FALSE,
1157
+ Transition_to_a_state,FALSE,
1158
+ Transitive_action,FALSE,
1159
+ Translating,TRUE,
1160
+ Transportation_status,FALSE,
1161
+ Trap,FALSE,
1162
+ Travel,TRUE,
1163
+ Traversing,TRUE,
1164
+ Treating_and_mistreating,TRUE,
1165
+ Trendiness,FALSE,
1166
+ Trial,FALSE,
1167
+ Triggering,TRUE,
1168
+ Trust,TRUE,
1169
+ Try_defendant,TRUE,
1170
+ Trying_out,TRUE,
1171
+ Turning_out,FALSE,
1172
+ Type,FALSE,
1173
+ Typicality,FALSE,
1174
+ Unattributed_information,TRUE,'rumor.v’ only exists in passive form
1175
+ Undergo_change,FALSE,
1176
+ Undergo_transformation,FALSE,
1177
+ Undergoing,FALSE,
1178
+ Undergoing_scenario,FALSE,
1179
+ Undressing,TRUE,
1180
+ Unemployment_rate,FALSE,
1181
+ Use_firearm,TRUE,
1182
+ Use_vehicle,FALSE,
1183
+ Used_up,FALSE,
1184
+ Usefulness,TRUE,
1185
+ Using,TRUE,
1186
+ Using_resource,TRUE,
1187
+ Vehicle,FALSE,
1188
+ Vehicle_departure_initial_stage,TRUE,? metaphorical
1189
+ Vehicle_landing,TRUE,? metaphorical
1190
+ Vehicle_subpart,FALSE,
1191
+ Verdict,TRUE,
1192
+ Verification,TRUE,
1193
+ Version_sequence,FALSE,
1194
+ Victim_operated_IED,FALSE,
1195
+ Violence,FALSE,
1196
+ Visit_host,FALSE,empty
1197
+ Visit_host_arrival,FALSE,empty
1198
+ Visit_host_departure,FALSE,empty
1199
+ Visit_host_stay,FALSE,empty
1200
+ Visiting,TRUE,
1201
+ Visiting_scenario,FALSE,
1202
+ Visiting_scenario_arrival,FALSE,
1203
+ Visiting_scenario_departing,FALSE,
1204
+ Visiting_scenario_stay,FALSE,
1205
+ Visitor_and_host,FALSE,
1206
+ Visitor_arrival,FALSE,
1207
+ Visitor_departure,FALSE,
1208
+ Visitor_scenario,FALSE,
1209
+ Vocalizations,FALSE,
1210
+ Volubility,FALSE,
1211
+ Wagering,TRUE,
1212
+ Waiting,TRUE,
1213
+ Waking_up,TRUE,
1214
+ Want_suspect,FALSE,
1215
+ Warning,TRUE,
1216
+ Waver_between_options,TRUE,
1217
+ Wealthiness,FALSE,
1218
+ Weapon,FALSE,
1219
+ Wearing,TRUE,
1220
+ Weather,FALSE,
1221
+ Wholes_and_parts,FALSE,
1222
+ Willingness,FALSE,
1223
+ Win_prize,TRUE,
1224
+ Withdraw_from_participation,TRUE,
1225
+ Within_distance,FALSE,
1226
+ Word_relations,FALSE,
1227
+ Work,TRUE,
1228
+ Working_a_post,TRUE,
1229
+ Worry,TRUE,
resources/crashes_frame_list.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # based on Gosse's presentation in 1st meeting with Marco
2
+ Killing
3
+ Death
4
+ Impact
5
+ Catch_fire
6
+ Cause_harm
7
+
8
+ # extra to make more comparable with femicide data
9
+ Causation
10
+ Cause_motion
11
+ Dead_or_alive
12
+ Emotion_directed
13
+ Event
14
+ Experience_bodily_harm
resources/crashes_frame_to_roles.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ frame,role:perpetrator_like,role:victim_like,role:cause_like,notes
2
+ Catch_fire,-,-,-,
3
+ Causation,Causer,Affected,Cause,
4
+ Cause_harm,Agent,Victim,Cause,
5
+ Cause_motion,-,-,-,does not seem to usually refer to the main murder event
6
+ Dead_or_alive,-,Protagonist,Explanation,
7
+ Death,-,Protagonist,Cause,
8
+ Emotion_directed,-,-,-,does not seem to usually refer to the main murder event
9
+ Event,-,-,-,does not involve any participants
10
+ Experience_bodily_harm,Experiencer|Body_part,-,-,
11
+ Killing,Killer,Victim,Cause,
resources/crashes_sources.csv ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ProviderName,ProviderFreq,ProviderNameCorr,RegionalScope,ContentType,MediumType,Country,Province,Locality,MediaCollection,MediaOwner,Duplicate,Notes
2
+ noordhollandsdagblad.nl,620,Noordhollands Dagblad,Regional,GeneralNews,Newspaper,Netherlands,Noord-Holland,-,Mediahuis Nederland,Mediahuis,,
3
+ gelderlander.nl,553,De Gelderlander,Regional,GeneralNews,Newspaper,Netherlands,Gelderland,-,ADR Nieuwsmedia,DPG Media,,
4
+ nhnieuws.nl,479,NH Nieuws,Regional,GeneralNews,TV-Radio,Netherlands,Noord-Holland,-,-,[public],,
5
+ rtvoost.nl,409,RTV Oost,Regional,GeneralNews,TV-Radio,Netherlands,Overijssel,-,-,[public],,
6
+ www.ad.nl,399,Algemeen Dagblad,National,GeneralNews,Newspaper,Netherlands,-,-,ADR Nieuwsmedia,DPG Media,,
7
+ rtvutrecht.nl,374,RTV Utrecht,Regional,GeneralNews,TV-Radio,Netherlands,Utrecht,-,-,[public],,
8
+ destentor.nl,326,De Stentor,Regional,GeneralNews,Newspaper,Netherlands,Flevoland|Gelderland|Overijssel,-,ADR Nieuwsmedia,DPG Media,,
9
+ omroepbrabant.nl,227,Omroep Brabant,Regional,GeneralNews,TV-Radio,Netherlands,Noord-Brabant,-,-,[public],,
10
+ haarlemsdagblad.nl,219,Haarlems Dagblad,Local,GeneralNews,Newspaper,Netherlands,Noord-Holland,Haarlem,Mediahuis Nederland,Mediahuis,,
11
+ www.ed.nl,206,Eindhovens Dagblad,Local,GeneralNews,Newspaper,Netherlands,Noord-Brabant,Eindhoven,ADR Nieuwsmedia,DPG Media,,
12
+ www.bd.nl,198,Brabants Dagblad,Regional,GeneralNews,Newspaper,Netherlands,Noord-Brabant,-,ADR Nieuwsmedia,DPG Media,,
13
+ weblogzwolle.nl,191,Weblog Zwolle,Local,GeneralNews,OnlineOnly,Netherlands,Overijssel,Zwolle,-,[independent],,
14
+ www.at5.nl,173,AT5,Local,GeneralNews,TV-Radio,Netherlands,Noord-Holland,Amsterdam,-,[public],,
15
+ rtvfocuszwolle.nl,168,RTV Focus Zwolle,Local,GeneralNews,TV-Radio,Netherlands,Overijssel,Zwolle,-,[independent],,
16
+ hvzeeland.nl,163,HVZeeland.nl,Regional,EmergenciesNews,OnlineOnly,Netherlands,Zeeland,-,-,[independent],,
17
+ omroepgelderland.nl,142,Omroep Gelderland,Regional,GeneralNews,TV-Radio,Netherlands,Gelderland,-,-,[public],,
18
+ 1Limburg | Nieuws en sport uit Limburg,135,1Limburg,Regional,GeneralNews,OnlineOnly,Netherlands,Limburg_NL,-,-,[public],,"belongs to ""1L"" tv/radio channel, but separate brand"
19
+ www.hln.be,132,Het Laatste Nieuws,National,GeneralNews,Newspaper,Belgium,-,-,-,DPG Media,,
20
+ telegraaf.nl,124,De Telegraaf,National,GeneralNews,Newspaper,Netherlands,-,-,Mediahuis Nederland,Mediahuis,,
21
+ amstelveenz.nl,109,AmstelveenZ,Local,GeneralNews,Magazine,Netherlands,Noord-Holland,Amstelveen,-,[independent],,
22
+ tubantia.nl,105,Tubantia,Regional,GeneralNews,Newspaper,Netherlands,Overijssel,-,ADR Nieuwsmedia,DPG Media,,
23
+ leidschdagblad.nl,100,Leidsch Dagblad,Local,GeneralNews,Newspaper,Netherlands,Zuid-Holland,Leiden,-,DPG Media,,
24
+ bndestem.nl,92,BN DeStem,Regional,GeneralNews,Newspaper,Netherlands,Noord-Brabant|Zeeland,-,ADR Nieuwsmedia,DPG Media,,
25
+ nos.nl,92,NOS,National,GeneralNews,TV-Radio,Netherlands,-,-,NPO,[public],,
26
+ hartvannederland.nl,90,Hart van Nederland,National,GeneralNews,TV-Radio,Netherlands,-,-,SBS6,Talpa Network,,
27
+ Dagblad van het Noorden,85,Dagblad van het Noorden,Regional,GeneralNews,Newspaper,Netherlands,Drenthe|Groningen,-,NDC Mediagroep,Mediahuis,,
28
+ rtvdrenthe.nl,84,RTV Drenthe,Regional,GeneralNews,TV-Radio,Netherlands,Drenthe,-,-,[public],,
29
+ rtvnoord.nl,74,RTV Noord,Regional,GeneralNews,TV-Radio,Netherlands,Groningen,-,-,[public],,
30
+ 112groningen.nl,72,112Groningen.nl,Local,EmergenciesNews,OnlineOnly,Netherlands,Groningen,Groningen,-,[independent],,
31
+ www.nu.nl,70,NU.nl,National,GeneralNews,OnlineOnly,Netherlands,-,-,-,DPG Media,,
32
+ omroepwest.nl,67,Omroep West,Regional,GeneralNews,TV-Radio,Netherlands,Zuid-Holland,-,-,[public],,
33
+ RTV Rijnmond,62,RTV Rijnmond,Regional,GeneralNews,TV-Radio,Netherlands,Zuid-Holland,-,-,[public],,
34
+ www.pzc.nl,62,Provinciale Zeeuwse Courant,Regional,GeneralNews,Newspaper,Netherlands,Zeeland,-,ADR Nieuwsmedia,DPG Media,,
35
+ rijnmond.nl,61,RTV Rijnmond,Regional,GeneralNews,TV-Radio,Netherlands,Zuid-Holland,-,-,[public],"""RTV Rijnmond""",
36
+ 112Twente.nl,59,112Twente,Regional,GeneralNews,OnlineOnly,Netherlands,Overijssel,-,-,[independent],,
37
+ Het Nieuwsblad,52,Het Nieuwsblad,National,GeneralNews,Newspaper,Belgium,-,-,-,[independent],,
38
+ hbvl.be,50,Het Belang van Limburg,Regional,GeneralNews,Newspaper,Belgium,Limburg_BE,-,Mediahuis België,Mediahuis,,
39
+ www.vrt.be,48,VRT,National,GeneralNews,TV-Radio,Belgium,-,-,-,[public],,
40
+ De Limburger,47,De Limburger,Regional,GeneralNews,Newspaper,Netherlands,Limburg_NL,-,Mediahuis Limburg,Mediahuis,,
41
+ 112fryslan.nl,45,112 Fryslân,Regional,EmergenciesNews,OnlineOnly,Netherlands,Friesland,-,-,[independent],,
42
+ 112HM.nl,44,112HM.nl,Regional,EmergenciesNews,OnlineOnly,Netherlands,Zuid-Holland,-,-,[independent],,
43
+ omroepzeeland.nl,43,Omroep Zeeland,Regional,GeneralNews,TV-Radio,Netherlands,Zeeland,-,-,[public],,
44
+ GelreNieuws.nl,41,GelreNieuws,Regional,GeneralNews,OnlineOnly,Netherlands,Gelderland,-,-,Persbureau Heitink,,
45
+ twitter.com,39,Twitter,International,SocialMedia,OnlineOnly,-,-,-,-,-,,
46
+ Het Parool,39,Het Parool,National,GeneralNews,Newspaper,Netherlands,-,-,-,DPG Media,,strong local/regional focus but published nationally
47
+ nieuwsblad.be,38,Het Nieuwsblad,National,GeneralNews,Newspaper,Belgium,-,-,-,[independent],"""Het Nieuwsblad""",
48
+ dvhn.nl,33,Dagblad van het Noorden,Regional,GeneralNews,Newspaper,Netherlands,Drenthe|Groningen,-,NDC Mediagroep,Mediahuis,,
49
+ politie.nl,33,Politie,National,OrganizationSpecific,OnlineOnly,Netherlands,-,-,-,-,,official website of the Dutch National Police
50
+ 112twente.nl,32,112Twente,Regional,GeneralNews,OnlineOnly,Netherlands,Overijssel,-,-,[independent],"""112Twente.nl""",
51
+ hardnieuws.nl,32,Hardnieuws,National,EmergenciesNews,OnlineOnly,Netherlands,-,-,-,[independent],,
52
+ 112 Overijssel,32,112 Overijssel,Regional,EmergenciesNews,OnlineOnly,Netherlands,Overijssel,-,-,[independent],,
53
+ www.lc.nl,30,Leeuwarder Courant,Regional,GeneralNews,Newspaper,Netherlands,Friesland,-,-,[independent],,
54
+ rtlnieuws.nl,29,RTL Nieuws,National,GeneralNews,TV-Radio,Netherlands,-,-,RTL Nederland,RTL Group,,
55
+ rtva.nl,28,RTVA,Local,GeneralNews,TV-Radio,Netherlands,Noord-Holland,Amstelveen,-,[independent],,
56
+ Leeuwarder Courant,28,Leeuwarder Courant,Regional,GeneralNews,Newspaper,Netherlands,Friesland,-,-,[independent],"""www.lc.nl""",
57
+ Gazet van Antwerpen,26,Gazet van Antwerpen,Regional,GeneralNews,Newspaper,Belgium,Antwerpen,-,Mediahuis België,Mediahuis,,
58
+ District8.net,26,District8,Regional,GeneralNews,OnlineOnly,Netherlands,Zuid-Holland,-,-,[independent],,
59
+ Focus en WTV,24,Focus-WTV,Regional,GeneralNews,TV-Radio,Belgium,West-Vlaanderen,-,-,Roularta Media Group,,
60
+ studio040.nl,24,Studio 040,Local,GeneralNews,TV-Radio,Netherlands,Noord-Brabant,Eindhoven,-,[independent],,
61
+ 112-Overijssel,24,112 Overijssel,Regional,EmergenciesNews,OnlineOnly,Netherlands,Overijssel,-,-,[independent],,
62
+ omroepflevoland.nl,23,Omroep Flevoland,Regional,GeneralNews,TV-Radio,Netherlands,Flevoland,-,-,[public],,
63
+ De Utrechtse Internet Courant,20,De Utrechtse Internet Courant,Local,GeneralNews,OnlineOnly,Netherlands,Utrecht,Utrecht,-,[independent],,
64
+ www.wos.nl,19,WOS,Local,GeneralNews,TV-Radio,Netherlands,Zuid-Holland,Maassluis,-,[independent],,
65
+ wos.nl,19,WOS,Local,GeneralNews,TV-Radio,Netherlands,Zuid-Holland,Maassluis,-,[independent],"""www.wos.nl""",
66
+ OOG Radio en Televisie,18,OOG,Local,GeneralNews,TV-Radio,Netherlands,Groningen,Groningen,-,[independent],,
67
+ 112barneveld.nl,17,112 Barneveld,Local,EmergenciesNews,TV-Radio,Netherlands,Gelderland,Barneveld,-,112Press,,
68
+ 112hm.nl,17,112HM.nl,Regional,EmergenciesNews,OnlineOnly,Netherlands,Zuid-Holland,-,-,[independent],,
69
+ flashphoto.nl,17,FlashPhoto,Local,EmergenciesNews,OnlineOnly,Netherlands,Zuid-Holland,Rotterdam,-,[independent],,"specialized in photography, also (emergency) news"
70
+ TVOOST - Regionaal nieuws uit Oost-Vlaanderen,16,TV Oost,Regional,GeneralNews,TV-Radio,Belgium,Oost-Vlaanderen,-,-,Concentra,,
71
+ zwollenu.nl,15,ZwolleNu,Local,GeneralNews,OnlineOnly,Netherlands,Overijssel,Zwolle,-,[independent],,
72
+ 112ede.nl,15,112 Ede,Local,EmergenciesNews,OnlineOnly,Netherlands,Gelderland,Ede,-,112Press,,
73
+ 112brabant.nl,13,112 Brabant,Regional,EmergenciesNews,OnlineOnly,Netherlands,Noord-Brabant,-,-,[independent],,
74
+ TVL - Dagelijks nieuws uit Limburg,13,TVL,Regional,GeneralNews,TV-Radio,Belgium,Limburg_BE,-,-,Concentra,,
75
+ oogtv.nl,13,OOG,Local,GeneralNews,TV-Radio,Netherlands,Groningen,Groningen,-,[independent],"""OOG Radio en Televisie""",
76
+ zhzactueel.nl,12,ZHZ Actueel,Regional,EmergenciesNews,TV-Radio,Netherlands,Zuid-Holland,-,-,[independent],,
77
+ www.nrc.nl,12,NRC,National,GeneralNews,Newspaper,Netherlands,-,-,NRC Media,Mediahuis,,
78
+ stedendriehoek.net,12,Nieuwsblad Stedendriehoek,Regional,GeneralNews,Newspaper,Netherlands,Gelderland|Overijssel,-,-,[independent],,
79
+ ijmuidercourant.nl,11,IJmuider Courant,Local,GeneralNews,Newspaper,Netherlands,Noord-Holland,IJmuiden,Mediahuis Nederland,Mediahuis,,
80
+ Meternieuws.nl,10,Meter Nieuws,Regional,EmergenciesNews,OnlineOnly,Netherlands,Drenthe|Groningen|Overijssel,-,-,[independent],,
81
+ deswollenaer.nl,10,De Swollenaer,Local,GeneralNews,Newspaper,Netherlands,Overijssel,Zwolle,-,Brug Media,,
82
+ alkmaarcentraal.nl,10,Alkmaar Centraal,Local,GeneralNews,OnlineOnly,Netherlands,Noord-Holland,Alkmaar,-,[independent],,
83
+ 112Vandaag,10,112Vandaag,National,EmergenciesNews,OnlineOnly,Netherlands,-,-,-,[independent],,
84
+ mediatv.nl,10,MediaTV,National,EmergenciesNews,OnlineOnly,Netherlands,-,-,-,[independent],,
85
+ "gelderlander.nl, het laatste nieuws uit binnen- en buitenland, sport en show",10,De Gelderlander,Regional,GeneralNews,Newspaper,Netherlands,Gelderland,-,ADR Nieuwsmedia,DPG Media,"""gelderlander.nl""",
86
+ Weertdegekste.nl,9,,,,,,,,,,,
87
+ WâldNet,9,,,,,,,,,,,
88
+ transport-online.nl,9,,,,,,,,,,,
89
+ noordernieuws.nl,9,,,,,,,,,,,
90
+ regiopurmerend.nl,8,,,,,,,,,,,
91
+ https://www.vlaardingen24.nl,8,,,,,,,,,,,
92
+ Groninger Gezinsbode,8,,,,,,,,,,,
93
+ "Ring TV | Jouw zender, Jouw nieuws",8,,,,,,,,,,,
94
+ blikopnieuws.nl,8,,,,,,,,,,,
95
+ edestad.nl,8,,,,,,,,,,,
96
+ steenwijkercourant.nl,8,,,,,,,,,,,
97
+ nieuwsopbeeld.nl,8,,,,,,,,,,,
98
+ ROB-tv - Regionale Omroep Brabant,8,,,,,,,,,,,
99
+ barneveldsekrant.nl,8,,,,,,,,,,,
100
+ https://www.schiedam24.nl,8,,,,,,,,,,,
101
+ Sleutelstad.nl,7,,,,,,,,,,,
102
+ Unity NU is de nieuwssite voor de regio Leiden [www.unity.nu],7,,,,,,,,,,,
103
+ 112WestFriesland.nl,7,,,,,,,,,,,
104
+ 112vallei.nl,7,,,,,,,,,,,
105
+ Omroep Gelderland,7,,,,,,,,,,,
106
+ Het Belang van Limburg,7,,,,,,,,,,,
107
+ sleutelstad.nl,7,,,,,,,,,,,
108
+ Bredavandaag|HétnieuwsuitBreda,6,,,,,,,,,,,
109
+ alarmeringen.nl,6,,,,,,,,,,,
110
+ stedendriehoek.nl,6,,,,,,,,,,,
111
+ halstadcentraal.nl,6,,,,,,,,,,,
112
+ Westlanders.nu,6,,,,,,,,,,,
113
+ ATV - Antwerpse televisie,6,,,,,,,,,,,
114
+ Stefan Verkerk Fotografie en Webdesign,6,,,,,,,,,,,
115
+ De Gooi- en Eemlander,6,,,,,,,,,,,
116
+ alphens.nl,6,,,,,,,,,,,
117
+ 112nieuwsonline.nl,6,,,,,,,,,,,
118
+ zwollezuidnieuws.nl,6,,,,,,,,,,,
119
+ 1Limburg,5,,,,,,,,,,,
120
+ denoordoostpolder.nl,5,,,,,,,,,,,
121
+ 112provincieutrecht.nl,5,,,,,,,,,,,
122
+ rtvzaanstreek.nl,5,,,,,,,,,,,
123
+ nederweert24.nl,5,,,,,,,,,,,
124
+ Nieuws dat je raakt. 24/24u – Nnieuws.be,5,,,,,,,,,,,
125
+ nieuws.nl,5,,,,,,,,,,,
126
+ RTV Oost,5,,,,,,,,,,,
127
+ regio15.nl,5,,,,,,,,,,,
128
+ De Standaard,5,,,,,,,,,,,
129
+ flevopost.nl,5,,,,,,,,,,,
130
+ regionieuwshoogeveen.nl,5,,,,,,,,,,,
131
+ petershotnews.nl | Nieuws & fotografie,5,,,,,,,,,,,
132
+ Nieuws op Beeld - Altijd het laatste (112) nieuws vanuit de regio Rotterdam-Rijnmond!,5,,,,,,,,,,,
133
+ ZwolleZuidNieuws: alles wat Zwolle Zuid beweegt!,4,,,,,,,,,,,
134
+ Telegraaf,4,,,,,,,,,,,
135
+ RTV Utrecht,4,,,,,,,,,,,
136
+ regioleidscherijn.nl,4,,,,,,,,,,,
137
+ Hart van Nederland,4,,,,,,,,,,,
138
+ dagblad070.nl,4,,,,,,,,,,,
139
+ nuus.be,4,,,,,,,,,,,
140
+ onswestfriesland.nl,4,,,,,,,,,,,
141
+ waldnet.nl,4,,,,,,,,,,,
142
+ NU,4,,,,,,,,,,,
143
+ www.gva.be,4,,,,,,,,,,,
144
+ bunniksnieuws.nl,4,,,,,,,,,,,
145
+ dalfsennet.nl,4,,,,,,,,,,,
146
+ 112heuvelrug.nl,4,,,,,,,,,,,
147
+ hartvanlansingerland.nl,4,,,,,,,,,,,
148
+ "AD.nl, het laatste nieuws uit binnen- en buitenland, sport en show",4,,,,,,,,,,,
149
+ bruzz.be,4,,,,,,,,,,,
150
+ Vlissingen-Internetbode,3,,,,,,,,,,,
151
+ Blik op nieuws,3,,,,,,,,,,,
152
+ limburg24.nl,3,,,,,,,,,,,
153
+ www.gld.nl,3,,,,,,,,,,,
154
+ 112zwolle.nl,3,,,,,,,,,,,
155
+ omroepvenray.nl,3,,,,,,,,,,,
156
+ lokaalgelderland.nl,3,,,,,,,,,,,
157
+ destadgorinchem.nl,3,,,,,,,,,,,
158
+ 112veenendaal.nl,3,,,,,,,,,,,
159
+ denhaagfm.nl,3,,,,,,,,,,,
160
+ facebook.com,3,,,,,,,,,,,
161
+ 112midden-zeeland.nl,3,,,,,,,,,,,
162
+ de Volkskrant,3,,,,,,,,,,,
163
+ meppelercourant.nl,3,,,,,,,,,,,
164
+ Neustadt-Geflüster,3,,,,,,,,,,,
165
+ goudsdagblad.nl,3,,,,,,,,,,,
166
+ schie.nu,3,,,,,,,,,,,
167
+ oozo.nl,3,,,,,,,,,,,
168
+ www.rd.nl,3,,,,,,,,,,,
169
+ voorburgsdagblad.nl,3,,,,,,,,,,,
170
+ NieuwsOverijssel.nl,3,,,,,,,,,,,
171
+ ZwolleZuidNieuws: alles wat Zwolle-Zuid beweegt!,3,,,,,,,,,,,
172
+ 112inbeeld.nl,3,,,,,,,,,,,
173
+ bredavandaag.nl,3,,,,,,,,,,,
174
+ De Jutter | De Hofgeest,2,,,,,,,,,,,
175
+ Woerden.TV,2,,,,,,,,,,,
176
+ knipselkrant-curacao.com,2,,,,,,,,,,,
177
+ heerenveensecourant.nl,2,,,,,,,,,,,
178
+ ThePostOnline,2,,,,,,,,,,,
179
+ regio8.nl,2,,,,,,,,,,,
180
+ BarendrechtNU.nl,2,,,,,,,,,,,
181
+ "pzc.nl, het laatste nieuws uit binnen- en buitenland, sport en show",2,,,,,,,,,,,
182
+ weespernieuws.nl,2,,,,,,,,,,,
183
+ Amstelveenz,2,,,,,,,,,,,
184
+ stadtiel.nl,2,,,,,,,,,,,
185
+ gouweijsselnieuws.nl,2,,,,,,,,,,,
186
+ Nieuws op Beeld,2,,,,,,,,,,,
187
+ heerhugowaardcentraal.nl,2,,,,,,,,,,,
188
+ nieuwsbladdezaankanter.nl,2,,,,,,,,,,,
189
+ www.avs.be,2,,,,,,,,,,,
190
+ haarlemsweekblad.nl,2,,,,,,,,,,,
191
+ yomyom.net,2,,,,,,,,,,,
192
+ mooirooi.nl,2,,,,,,,,,,,
193
+ oisterwijknieuws.nl,2,,,,,,,,,,,
194
+ rtv-apeldoorn.nl,2,,,,,,,,,,,
195
+ 112amersfoort.nl,2,,,,,,,,,,,
196
+ dedemsvaartsecourant.nl,2,,,,,,,,,,,
197
+ ed.nl,2,,,,,,,,,,,
198
+ soestercourant.nl,2,,,,,,,,,,,
199
+ heemsteedsecourant.nl,2,,,,,,,,,,,
200
+ 112hoogezand.nl,2,,,,,,,,,,,
201
+ hetstreekblad.nl,2,,,,,,,,,,,
202
+ NRC,2,,,,,,,,,,,
203
+ 112nieuws.net,2,,,,,,,,,,,
204
+ De Limburger Mobile,2,,,,,,,,,,,
205
+ 0297.nl,2,,,,,,,,,,,
206
+ drachtstercourant.nl,2,,,,,,,,,,,
207
+ Sittard-Geleen,2,,,,,,,,,,,
208
+ hoogenlaag.nl,2,,,,,,,,,,,
209
+ drentsnieuws.nl,2,,,,,,,,,,,
210
+ brugnieuws.nl,2,,,,,,,,,,,
211
+ medemblikactueel.nl,2,,,,,,,,,,,
212
+ rechtspraak.nl,2,,,,,,,,,,,
213
+ gooieneembode.nl,2,,,,,,,,,,,
214
+ arenalokaal.nl,2,,,,,,,,,,,
215
+ DitisdeZaanstreek.nl,2,,,,,,,,,,,
216
+ hcnieuws.nl,2,,,,,,,,,,,
217
+ https://www.heerhugowaardsdagblad.nl/,2,,,,,,,,,,,
218
+ schagenfm.nl,2,,,,,,,,,,,
219
+ hv-almere.nl,2,,,,,,,,,,,
220
+ 112achterhoek-nieuws.nl,2,,,,,,,,,,,
221
+ peelenmaasvenray.nl,2,,,,,,,,,,,
222
+ frieslandactueel.nl,2,,,,,,,,,,,
223
+ www.rtv.be,2,,,,,,,,,,,
224
+ hoogeveenschecourant.nl,2,,,,,,,,,,,
225
+ Nieuws Apeldoorn Direct,2,,,,,,,,,,,
226
+ nieuwsuitberkelland.nl,2,,,,,,,,,,,
227
+ 112meerlanden.nl,2,,,,,,,,,,,
228
+ internetbode.nl,2,,,,,,,,,,,
229
+ nieuw-volendam.nl,2,,,,,,,,,,,
230
+ katwijkactueel.nl,2,,,,,,,,,,,
231
+ 112schiedam.nl,2,,,,,,,,,,,
232
+ compactmedia.nl,2,,,,,,,,,,,
233
+ culemborgsecourant.nl,2,,,,,,,,,,,
234
+ Alphens.nl,2,,,,,,,,,,,
235
+ 112ijmond.nl,2,,,,,,,,,,,
236
+ detoren.net,2,,,,,,,,,,,
237
+ gorkumsnieuws.nl,2,,,,,,,,,,,
238
+ Redactie24.be,2,,,,,,,,,,,
239
+ wnl.tv,2,,,,,,,,,,,
240
+ alarmeringdroid.nl,1,,,,,,,,,,,
241
+ HCNieuws,1,,,,,,,,,,,
242
+ frontpage.fok.nl,1,,,,,,,,,,,
243
+ 112vdg.nl,1,,,,,,,,,,,
244
+ Ede Stad,1,,,,,,,,,,,
245
+ my net rosh haayin,1,,,,,,,,,,,
246
+ Noordhollands Dagblad,1,,,,,,,,,,,
247
+ Zundert-Internetbode,1,,,,,,,,,,,
248
+ defeanster.nl,1,,,,,,,,,,,
249
+ heerhugowaardalife.nl,1,,,,,,,,,,,
250
+ inteylingen.nl,1,,,,,,,,,,,
251
+ The News Herald,1,,,,,,,,,,,
252
+ Rijswijk.TV,1,,,,,,,,,,,
253
+ Leidsch Dagblad,1,,,,,,,,,,,
254
+ mynetkrayot,1,,,,,,,,,,,
255
+ OldambtNu.nl,1,,,,,,,,,,,
256
+ instagram.com,1,,,,,,,,,,,
257
+ Bonaire.Nu,1,,,,,,,,,,,
258
+ nieuwsbladdekoerier.nl,1,,,,,,,,,,,
259
+ BergenopZoom-Internetbode,1,,,,,,,,,,,
260
+ 1twente.nl,1,,,,,,,,,,,
261
+ www.rtl.de,1,,,,,,,,,,,
262
+ tvvalkenburg.tv,1,,,,,,,,,,,
263
+ alarmfase1.nl,1,,,,,,,,,,,
264
+ gids.tv,1,,,,,,,,,,,
265
+ RTV Uitgeest,1,,,,,,,,,,,
266
+ De Telegraaf,1,,,,,,,,,,,
267
+ 112-dokkum.nl,1,,,,,,,,,,,
268
+ wijksnieuws.nl,1,,,,,,,,,,,
269
+ hetkontakt.nl,1,,,,,,,,,,,
270
+ landelijkeorganisatieverkeersslachtoffers.nl,1,,,,,,,,,,,
271
+ rtv.be,1,,,,,,,,,,,
272
+ indebuurt Ede,1,,,,,,,,,,,
273
+ 112 groningen.nl,1,,,,,,,,,,,
274
+ Ik hou van Arnhem,1,,,,,,,,,,,
275
+ 112hardenberg.nu,1,,,,,,,,,,,
276
+ stadwageningen.nl,1,,,,,,,,,,,
277
+ ridderkerksdagblad.nl,1,,,,,,,,,,,
278
+ geenstijl.nl,1,,,,,,,,,,,
279
+ dewoudenberger.nl,1,,,,,,,,,,,
280
+ https://www.alkmaarsdagblad.nl/,1,,,,,,,,,,,
281
+ nieuwsbladnof.nl,1,,,,,,,,,,,
282
+ Nieuwe Meerbode,1,,,,,,,,,,,
283
+ looopings.nl,1,,,,,,,,,,,
284
+ amstelveensnieuwsblad.nl,1,,,,,,,,,,,
285
+ texelsecourant.nl,1,,,,,,,,,,,
286
+ anwb.nl,1,,,,,,,,,,,
287
+ indebuurt Delft,1,,,,,,,,,,,
288
+ https://www.zutphen24.nl,1,,,,,,,,,,,
289
+ Teylingen,1,,,,,,,,,,,
290
+ 112Midden-Zeeland,1,,,,,,,,,,,
291
+ noorderkrant.nl,1,,,,,,,,,,,
292
+ onswestbrabant.nl,1,,,,,,,,,,,
293
+ lindanieuws.nl,1,,,,,,,,,,,
294
+ 112persfotografie.nl,1,,,,,,,,,,,
295
+ antilliaansdagblad.com,1,,,,,,,,,,,
296
+ Site-Knack-NL,1,,,,,,,,,,,
297
+ alblasserdamsnieuws.nl,1,,,,,,,,,,,
298
+ 112harderwijk.nl,1,,,,,,,,,,,
299
+ l1.nl,1,,,,,,,,,,,
300
+ Nederweert24,1,,,,,,,,,,,
301
+ Radio.NL,1,,,,,,,,,,,
302
+ LokaalGelderland,1,,,,,,,,,,,
303
+ hoekschnieuws.nl,1,,,,,,,,,,,
304
+ nieuwsbladgeldermalsen.nl,1,,,,,,,,,,,
305
+ Veenendaalse Krant,1,,,,,,,,,,,
306
+ 112-nederland.nl,1,,,,,,,,,,,
307
+ demorgen.be,1,,,,,,,,,,,
308
+ www.gic.nl,1,,,,,,,,,,,
309
+ Unity NU is de nieuwssite voor de regio Leiden,1,,,,,,,,,,,
310
+ Middelburg-Internetbode,1,,,,,,,,,,,
311
+ groot-waterland.nl,1,,,,,,,,,,,
312
+ regiobodeonline.nl,1,,,,,,,,,,,
313
+ Nudrenthe.nl | Boven op het Nieuws |,1,,,,,,,,,,,
314
+ Gocar.be,1,,,,,,,,,,,
315
+ KW.be - Nieuws uit West-Vlaanderen,1,,,,,,,,,,,
316
+ harenerweekblad.nl,1,,,,,,,,,,,
317
+ nbcnews.com,1,,,,,,,,,,,
318
+ Omroep Brabant,1,,,,,,,,,,,
319
+ 112apeldoorn.nl,1,,,,,,,,,,,
320
+ linda.nl,1,,,,,,,,,,,
321
+ assercourant.nl,1,,,,,,,,,,,
322
+ prorail.nl,1,,,,,,,,,,,
323
+ bbc.co.uk,1,,,,,,,,,,,
324
+ schipholregio.nl,1,,,,,,,,,,,
325
+ lequipe.fr,1,,,,,,,,,,,
326
+ Politie.nl,1,,,,,,,,,,,
327
+ welingelichtekringen.nl,1,,,,,,,,,,,
328
+ destadamersfoort.nl,1,,,,,,,,,,,
329
+ curacaonieuws.nu,1,,,,,,,,,,,
330
+ Incidenten Apeldoorn e.o.,1,,,,,,,,,,,
331
+ arubanieuws.nu,1,,,,,,,,,,,
332
+ Vrij Nederland,1,,,,,,,,,,,
333
+ Omroep Brabant,1,,,,,,,,,,,
334
+ hetdeventernieuws.nl,1,,,,,,,,,,,
335
+ Krimpenerwaard,1,,,,,,,,,,,
336
+ avrotros.nl,1,,,,,,,,,,,
337
+ elpais.com.co,1,,,,,,,,,,,
338
+ 112marum.nl,1,,,,,,,,,,,
339
+ https://www.denheldersdagblad.nl/,1,,,,,,,,,,,
340
+ ZHZActueel,1,,,,,,,,,,,
341
+ bashinform.ru,1,,,,,,,,,,,
342
+ FOK!,1,,,,,,,,,,,
343
+ bx1.be,1,,,,,,,,,,,
344
+ denhelderactueel.nl,1,,,,,,,,,,,
345
+ www.bbc.com,1,,,,,,,,,,,
346
+ eemskrant.nl,1,,,,,,,,,,,
347
+ Regio Leidsche Rijn,1,,,,,,,,,,,
348
+ Omroep Zeeland,1,,,,,,,,,,,
349
+ topics.nl,1,,,,,,,,,,,
350
+ HetKrantje-Online.nl,1,,,,,,,,,,,
351
+ https://www.langedijkerdagblad.nl/,1,,,,,,,,,,,
352
+ SpoorPro.nl,1,,,,,,,,,,,
353
+ radio2.be,1,,,,,,,,,,,
354
+ Metronieuws.nl,1,,,,,,,,,,,
355
+ caribischnetwerk.ntr.nl,1,,,,,,,,,,,
356
+ het-westerkwartier.nl,1,,,,,,,,,,,
357
+ rijschoolpro.nl,1,,,,,,,,,,,
358
+ rn7.nl,1,,,,,,,,,,,
359
+ Eemskrant,1,,,,,,,,,,,
360
+ HS-Krant,1,,,,,,,,,,,
361
+ grootheerenveen.nl,1,,,,,,,,,,,
362
+ RTV Zaanstreek,1,,,,,,,,,,,
363
+ joustercourant.nl,1,,,,,,,,,,,
364
+ 112vlissingen-souburg.nl,1,,,,,,,,,,,
365
+ 112 Groningen,1,,,,,,,,,,,
366
+ ZuidOosthoeker,1,,,,,,,,,,,
367
+ AD.nl,1,,,,,,,,,,,
368
+ Eemskrant | Nieuws uit de regio,1,,,,,,,,,,,
369
+ Steenwijkerland,1,,,,,,,,,,,
370
+ 112tv.nl,1,,,,,,,,,,,
371
+ Groningen,1,,,,,,,,,,,
372
+ Reno Gazette Journal,1,,,,,,,,,,,
373
+ haspengouwsnieuws.be,1,,,,,,,,,,,
374
+ stellingwerf.nl,1,,,,,,,,,,,
375
+ globo.com,1,,,,,,,,,,,
376
+ 112lansingerland.nu,1,,,,,,,,,,,
377
+ bicycling.com,1,,,,,,,,,,,
378
+ woldercourant.nl,1,,,,,,,,,,,
379
+ omroepalmere.nl,1,,,,,,,,,,,
380
+ Den Helder actueel,1,,,,,,,,,,,
381
+ rtvhattem.nl,1,,,,,,,,,,,
382
+ WNL,1,,,,,,,,,,,
383
+ Omroep Venray,1,,,,,,,,,,,
384
+ Dagblad070,1,,,,,,,,,,,
385
+ friesenieuwsflitsen.nl,1,,,,,,,,,,,
386
+ Kampen Online,1,,,,,,,,,,,
387
+ dailymail.co.uk,1,,,,,,,,,,,
388
+ https://112hm.nl/2021/07/21/ernstig-ongeval-op-de-hoogeveenseweg-hazerswoude-dorp-veroorzaakt-door-overstekende-hond/,1,,,,,,,,,,,
389
+ 112insteenwijkerland.nl,1,,,,,,,,,,,
390
+ varnws.nl,1,,,,,,,,,,,
391
+ actu.fr,1,,,,,,,,,,,
392
+ hetkompashardinxveld-giessendam.nl,1,,,,,,,,,,,
393
+ uitkijkpost.nl,1,,,,,,,,,,,
394
+ RN7,1,,,,,,,,,,,
395
+ NOS,1,,,,,,,,,,,
396
+ uitzendinggemist.net,1,,,,,,,,,,,
397
+ Nachrichten aus Leipzig - Leipziger Zeitung,1,,,,,,,,,,,
398
+ twentefm.nl,1,,,,,,,,,,,
399
+ Sergevanduijnhoven's Blog,1,,,,,,,,,,,
400
+ Barneveldse Krant,1,,,,,,,,,,,
401
+ leuvenactueel.be,1,,,,,,,,,,,
402
+ https://www.schagerdagblad.nl/,1,,,,,,,,,,,
403
+ coevordenhuisaanhuis.nl,1,,,,,,,,,,,
404
+ blinker.co.il,1,,,,,,,,,,,
405
+ Genderendigitaal,1,,,,,,,,,,,
406
+ De Gelderlander,1,,,,,,,,,,,
407
+ dagblad010.nl,1,,,,,,,,,,,
408
+ traumaheli-mmt.nl,1,,,,,,,,,,,
409
+ limburger.nl,1,,,,,,,,,,,
410
+ Roosendaal-Internetbode,1,,,,,,,,,,,
411
+ bommelerwaardgids.nl,1,,,,,,,,,,,
412
+ Alkmaar Centraal,1,,,,,,,,,,,
413
+ IJsselmondeNieuws en omstreken op facebook.com,1,,,,,,,,,,,
414
+ theguardian.com,1,,,,,,,,,,,
415
+ 112 Vlissingen & Souburg,1,,,,,,,,,,,
416
+ rtvpurmerend.nl,1,,,,,,,,,,,
417
+ Site-KW-NL,1,,,,,,,,,,,
418
+ 10yan.com,1,,,,,,,,,,,
419
+ petershotnews.nl,1,,,,,,,,,,,
420
+ Dumbarton and Vale of Leven Reporter,1,,,,,,,,,,,
421
+ cyclingweekly.com,1,,,,,,,,,,,
422
+ hanzestad.nl,1,,,,,,,,,,,
423
+ emmen.nu,1,,,,,,,,,,,
424
+ foxreno.com,1,,,,,,,,,,,
425
+ De Krant van Midden-Drenthe,1,,,,,,,,,,,
426
+ BBC,1,,,,,,,,,,,
427
+ 112drachten.nl,1,,,,,,,,,,,
428
+ brummensnieuws.nl,1,,,,,,,,,,,
429
+ Streetsblog New York City,1,,,,,,,,,,,
430
+ De Heemsteder,1,,,,,,,,,,,
431
+ indebuurt Utrecht,1,,,,,,,,,,,
432
+ westfriesweekblad.nl,1,,,,,,,,,,,
433
+ 1istochnik.ru,1,,,,,,,,,,,
434
+ kipa.co.il,1,,,,,,,,,,,
435
+ veluweland.nl,1,,,,,,,,,,,
436
+ DNN - Dresdner Neueste Nachrichten,1,,,,,,,,,,,
437
+ 112wijchensnieuws.nl,1,,,,,,,,,,,
438
+ delpher.nl,1,,,,,,,,,,,
439
+ indebuurt Doetinchem,1,,,,,,,,,,,
440
+ news4jax.com,1,,,,,,,,,,,
resources/deep_frame_cache.json ADDED
The diff for this file is too large to render. See raw diff
 
resources/dep_labels.txt ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ acl:relcl↑
2
+ acl:relcl↓
3
+ acl↑
4
+ acl↓
5
+ advcl↑
6
+ advcl↓
7
+ advmod↑
8
+ advmod↓
9
+ amod↑
10
+ amod↓
11
+ appos↑
12
+ appos↓
13
+ aux:pass↑
14
+ aux:pass↓
15
+ aux↑
16
+ aux↓
17
+ case↑
18
+ case↓
19
+ ccomp↑
20
+ ccomp↓
21
+ cc↑
22
+ cc↓
23
+ compound:prt↑
24
+ compound:prt↓
25
+ compound↑
26
+ compound↓
27
+ conj↑
28
+ conj↓
29
+ cop↑
30
+ cop↓
31
+ csubj↑
32
+ csubj↓
33
+ dep↑
34
+ dep↓
35
+ det:poss↑
36
+ det:poss↓
37
+ det:predet↑
38
+ det:predet↓
39
+ det↑
40
+ det↓
41
+ discourse↑
42
+ discourse↓
43
+ expl:impers↑
44
+ expl:impers↓
45
+ expl:pass↓
46
+ expl:pv↓
47
+ expl↑
48
+ expl↓
49
+ fixed↑
50
+ fixed↓
51
+ flat:foreign↑
52
+ flat:name↑
53
+ flat:name↓
54
+ flat↑
55
+ flat↓
56
+ iobj↑
57
+ iobj↓
58
+ mark↑
59
+ mark↓
60
+ nmod:poss↑
61
+ nmod:poss↓
62
+ nmod↑
63
+ nmod↓
64
+ nsubj:pass↑
65
+ nsubj:pass↓
66
+ nsubj↑
67
+ nsubj↓
68
+ nummod↑
69
+ nummod↓
70
+ obj↑
71
+ obj↓
72
+ obl:agent↑
73
+ obl:agent↓
74
+ obl↑
75
+ obl↓
76
+ orphan↓
77
+ parataxis↑
78
+ parataxis↓
79
+ punct↑
80
+ punct↓
81
+ vocative↑
82
+ vocative↓
83
+ xcomp↑
84
+ xcomp↓
85
+ ↑--acl:relcl↓
86
+ ↑--acl↓
87
+ ↑--advcl↓
88
+ ↑--advmod↓
89
+ ↑--amod↓
90
+ ↑--appos↓
91
+ ↑--aux:pass↓
92
+ ↑--aux↓
93
+ ↑--case↓
94
+ ↑--ccomp↓
95
+ ↑--cc↓
96
+ ↑--compound:prt↓
97
+ ↑--compound↓
98
+ ↑--conj↓
99
+ ↑--cop↓
100
+ ↑--csubj↓
101
+ ↑--dep↓
102
+ ↑--det:poss↓
103
+ ↑--det↓
104
+ ↑--discourse↓
105
+ ↑--expl:impers↓
106
+ ↑--expl:pass↓
107
+ ↑--expl↓
108
+ ↑--fixed↓
109
+ ↑--flat:foreign↓
110
+ ↑--flat:name↓
111
+ ↑--flat↓
112
+ ↑--iobj↓
113
+ ↑--mark↓
114
+ ↑--nmod:poss↓
115
+ ↑--nmod↓
116
+ ↑--nsubj:pass↓
117
+ ↑--nsubj↓
118
+ ↑--nummod↓
119
+ ↑--obj↓
120
+ ↑--obl:agent↓
121
+ ↑--obl↓
122
+ ↑--parataxis↓
123
+ ↑--xcomp↓
124
+ ↓--acl:relcl↓
125
+ ↓--acl↓
126
+ ↓--advcl↓
127
+ ↓--advmod↓
128
+ ↓--amod↓
129
+ ↓--appos↓
130
+ ↓--aux:pass↓
131
+ ↓--aux↓
132
+ ↓--case↓
133
+ ↓--ccomp↓
134
+ ↓--cc↓
135
+ ↓--compound:prt↓
136
+ ↓--compound↓
137
+ ↓--conj↓
138
+ ↓--cop↓
139
+ ↓--dep↓
140
+ ↓--det:poss↓
141
+ ↓--det↓
142
+ ↓--expl:impers↓
143
+ ↓--expl↓
144
+ ↓--fixed↓
145
+ ↓--flat:name↓
146
+ ↓--flat↓
147
+ ↓--iobj↓
148
+ ↓--mark↓
149
+ ↓--nmod:poss↓
150
+ ↓--nmod↓
151
+ ↓--nsubj:pass↓
152
+ ↓--nsubj↓
153
+ ↓--nummod↓
154
+ ↓--obj↓
155
+ ↓--obl:agent↓
156
+ ↓--obl↓
157
+ ↓--parataxis↓
158
+ ↓--xcomp↓
159
+
resources/femicide_frame_list.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 1) based on Gaetana & Marion's template file
2
+ Abusing#Violence
3
+ Attack#Violence
4
+ Cause_harm#Violence
5
+ Hit_target#Violence
6
+ Killing#Murder
7
+ Rape#Violence
8
+ Use_firearm#Violence
9
+
10
+ # 2) based on Gosse's paper/document (Table 2)
11
+ Attack#Violence
12
+ Causation
13
+ Cause_harm#Violence
14
+ Cause_motion
15
+ Emotion_directed
16
+ Event#Murder
17
+ Quarreling
18
+
19
+ Dead_or_alive#Murder
20
+ Death#Murder
21
+ Experience_bodily_harm
22
+ Killing#Murder
23
+ Catastrophe#Murder
resources/femicides_frame_to_roles.csv ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ frame,role:perpetrator_like,role:victim_like,role:cause_like,notes
2
+ Abusing,Abuser,Victim,-,
3
+ Attack,Assailant,Victim,-,
4
+ Causation,Causer,Affected,Cause,
5
+ Cause_harm,Agent,Victim,Cause,
6
+ Cause_motion,-,-,-,does not seem to usually refer to the main murder event
7
+ Dead_or_alive,-,Protagonist,Explanation,
8
+ Death,-,Protagonist,Cause,
9
+ Emotion_directed,-,-,-,does not seem to usually refer to the main murder event
10
+ Event,-,-,-,does not involve any participants
11
+ Experience_bodily_harm,Experiencer|Body_part,-,-,
12
+ Hit_target,Agent,Target,-,
13
+ Killing,Killer,Victim,Cause,
14
+ Quarreling,-,-,-,core roles (Arguers/Arguer1/Arguer2) could denote either Perpetrator or victim
15
+ Rape,Perpetrator,Victim,-,
16
+ Use_firearm,Agent,Goal,-,
resources/fn_frames_to_roles.json ADDED
The diff for this file is too large to render. See raw diff
 
resources/migration_frame_list.txt ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ARRIVING#Travel
2
+ CAUSE_MOTION#Travel
3
+ DEPARTING#Travel
4
+ FLUIDIC_MOTION#Travel
5
+ SELF_MOTION#Travel
6
+ TRAVEL#Travel
7
+ DISEMBARKING#Travel
8
+ RISKY_SITUATION#Travel
9
+ DEATH#Travel
10
+
11
+ CARDINAL_NUMBERS#Quantification
12
+ CHANGE_OF_QUANTITY_OF_POSSESSION#Quantification
13
+ CHANGE_POSITION_ON_A_SCALE#Quantification
14
+ FAMILIARITY#Quantification
15
+ INCREMENT#Quantification
16
+ PROLIFERATING_IN_NUMBER#Quantification
17
+ QUANTIFIED_MASS#Quantification
18
+ QUANTITY#Quantification
19
+
20
+ ABUSING#Crime
21
+ ARREST#Crime
22
+ COMMITTING_CRIME#Crime
23
+ INTENTIONAL_DECEPTION#Crime
24
+ KILLING#Crime
25
+ RAPE#Crime
26
+ ROBBERY#Crime
27
+ SMUGGLING#Crime
28
+ PROTEST#Crime
29
+ THEFT#Crime
30
+ CAUSE_HARM#Crime
31
+
32
+ HOSTILE_ENCOUNTER#Hostility
33
+ INVADING#Hostility
34
+ ATTACK#Hostility
35
+ WEAPON#Hostility
36
+
37
+ ARRANGING#Administration
38
+ MAKING_ARRANGEMENTS#Administration
39
+ DISCUSSION#Administration
40
+ EXECUTE_PLAN#Administration
41
+ LEADERSHIP#Administration
42
+ EXPEND_RESOURCE#Administration
43
+ GATHERING_UP#Administration
44
+ PLACING#Administration
45
+ POINT_OF_DISPUTE#Administration
46
+ INHIBIT_MOVEMENT#Administration
47
+ EXPENSIVENESS#Administration
48
+
49
+ ASSISTANCE#Humanizing
50
+ HIRING#Humanizing
51
+ INTENTIONALLY_CREATE#Humanizing
52
+ SOCIAL_EVENT#Humanizing
53
+ KINSHIP#Humanizing
54
+ COLLABORATION#Humanizing
55
+ EDUCATION_TEACHING#Humanizing
56
+ RESCUING#Humanizing
sociofillmore/__init__.py ADDED
File without changes
sociofillmore/__init__.pyc ADDED
Binary file (112 Bytes). View file
 
sociofillmore/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (187 Bytes). View file
 
sociofillmore/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (157 Bytes). View file
 
sociofillmore/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (169 Bytes). View file
 
sociofillmore/common/__init__.py ADDED
File without changes
sociofillmore/common/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (164 Bytes). View file
 
sociofillmore/common/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (176 Bytes). View file
 
sociofillmore/common/__pycache__/analyze_text.cpython-37.pyc ADDED
Binary file (22.8 kB). View file
 
sociofillmore/common/__pycache__/analyze_text.cpython-39.pyc ADDED
Binary file (23 kB). View file
 
sociofillmore/common/__pycache__/split_lome_files.cpython-39.pyc ADDED
Binary file (819 Bytes). View file
 
sociofillmore/common/analyze_text.py ADDED
@@ -0,0 +1,1046 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import json
3
+ import os
4
+ import sys
5
+ import argparse
6
+ import re
7
+ import tarfile
8
+ from collections import defaultdict
9
+ import dataclasses
10
+ from datetime import datetime
11
+ from typing import Any, Dict, List, Tuple, Optional
12
+
13
+ import pandas as pd
14
+ import spacy
15
+ from nltk.corpus import framenet as fn
16
+ from nltk.corpus.reader.framenet import FramenetError
17
+ from spacy.tokens import Token
18
+
19
+ from sociofillmore.crashes.utils import is_a_dutch_text
20
+
21
# Auxiliary verb forms, per language.
# NOTE(review): not referenced in the part of the file visible here; presumably
# used to recognise active-voice auxiliaries during syntactic analysis -- confirm.
ITALIAN_ACTIVE_AUX = ["avere", "ha", "ho", "hai", "avete", "hanno", "abbiamo"]
DUTCH_ACTIVE_AUX = ["heb", "hebben", "heeft"]

# Frames flagged in the `active` column of the resource CSV; consulted by
# `disambiguate_cxs_` to decide whether a frame counts as "active".
# NOTE: the CSV is read at import time, relative to the working directory.
active_frames_df = pd.read_csv("resources/active_frames_full.csv")
ACTIVE_FRAMES = active_frames_df[active_frames_df["active"]]["frame"].tolist()


# Dependency labels that never count as the realisation of a role
# (see `find_governed_roles`).
IGNORE_DEP_LABELS = ["punct"]



# Default list of "deep" frames for `map_to_deep_frame`. Order matters: when
# several parents lead to different deep frames, the one listed first wins.
DEEP_FRAMES = [
    "Transitive_action",
    "Causation",
    "Transition_to_a_state",
    "Event",
    "State",
]
# Legacy layout (one JSON file per dataset), kept for reference:
# SYNTAX_ANALYSIS_CACHE_FILES = {
#     "femicides/rai": "resources/rai_syntax_analysis_cache.json",
#     "femicides/rai_main": "resources/rai_main_syntax_analysis_cache.json",
#     "femicides/olv": "resources/olv_syntax_analysis_cache.json",
#     "crashes/thecrashes": "resources/thecrashes_syntax_analysis_cache.json",
#     "migration/pavia": "resources/migration_pavia_syntax_analysis_cache.json"
# }
# Dataset name -> directory in which `make_syntax_cache` writes its JSON files.
# "femicides/rai" and "femicides/rai_ALL" share the same directory.
SYNTAX_ANALYSIS_CACHE_FILES = {
    "femicides/rai": "output/femicides/syntax_cache/rai_ALL",
    "femicides/rai_main": "output/femicides/syntax_cache/rai_main",
    "femicides/rai_ALL": "output/femicides/syntax_cache/rai_ALL",
    "femicides/olv": "output/femicides/syntax_cache/olv",
    "crashes/thecrashes": "output/crashes/syntax_cache/thecrashes",
    "migration/pavia": "output/migration/syntax_cache/pavia",
}


# JSON file read by `load_deep_frames_cache` and (optionally) rewritten by
# `map_or_lookup_deep_frame`.
DEEP_FRAMES_CACHE_FILE = "resources/deep_frame_cache.json"

# NOTE(review): not referenced in the part of the file visible here.
DEP_LABEL_CACHE_FILE = "resources/dep_labels.txt"

# Construction labels. `disambiguate_cxs_` produces the "verbal:*" ones.
POSSIBLE_CONSTRUCTIONS = [
    "nonverbal",
    "verbal:active",
    "verbal:impersonal",
    "verbal:reflexive",
    "verbal:passive",
    "verbal:unaccusative",
    "other",
]
69
+
70
+
71
def load_deep_frames_cache():
    """Return the deep-frame cache stored in ``DEEP_FRAMES_CACHE_FILE``.

    :return: the parsed JSON content of the cache file, or an empty dict when
        the file does not exist
    """
    if not os.path.isfile(DEEP_FRAMES_CACHE_FILE):
        return {}

    print("Loading deep frame cache...")
    with open(DEEP_FRAMES_CACHE_FILE, encoding="utf-8") as cache_file:
        return json.load(cache_file)
79
+
80
+
81
+ # make spacy work with google app engine
82
+ # (see https://stackoverflow.com/questions/55228492/spacy-on-gae-standard-second-python-exceeds-memory-of-largest-instance)
83
+ # nlp = spacy.load("it_core_news_md")
84
+ nlp = None
85
+
86
+
87
@dataclasses.dataclass
class AnnotationSpan:
    """Tokens covered by one annotation (a frame target or a role filler)."""

    # Indices of the covered tokens; compared against the `lome_idx` of
    # syntax nodes in `find_governed_roles`.
    tokens_idx: List[int]
    # Surface strings of the covered tokens -- presumably parallel to
    # `tokens_idx`; TODO confirm against the code that builds the spans.
    tokens_str: List[str]
91
+
92
+
93
@dataclasses.dataclass
class FrameStructure:
    """One frame instance found in a sentence: its target and its roles."""

    # Name of the frame as predicted for the target.
    frame: str
    # Name compared against the deep-frame names in `disambiguate_cxs_`.
    deep_frame: str
    # Span of the frame-evoking tokens; its first token index is used as the
    # key into the per-sentence syntax analysis.
    target: Optional[AnnotationSpan]
    # (role name, span) pairs; iterated by `find_governed_roles`.
    roles: List[Tuple[str, AnnotationSpan]]
    # NOTE(review): presumably the same spans with role names translated to
    # those of `deep_frame` -- not established by the code visible here.
    deep_roles: List[Tuple[str, AnnotationSpan]]
100
+
101
+
102
def make_syntax_cache(dataset, skip_fn=None):
    """Run the syntactic analysis for a whole dataset and cache it on disk.

    Every tarball in the dataset's block directory is scanned for
    ``*.comm.json`` members. Each document is analysed sentence by sentence
    with `syntax_analyze`, and the result is stored under its document id in
    ``<cache dir>/<first two characters of the document id>.json``.

    :param dataset: one of the dataset names handled below
        (e.g. ``"femicides/rai"``, ``"crashes/thecrashes"``)
    :param skip_fn: optional predicate called with the document id; when it
        returns a truthy value the document is not analysed and ``None`` is
        stored for it instead
    :raises ValueError: if `dataset` is not supported
    :raises AttributeError: if a ``*.comm.json`` member name does not contain
        ``lome_<digits>.comm.json``
    """
    print(f"make_syntax_cache({dataset})")

    # dataset -> (directory with tarball blocks, corpus name, spaCy model)
    dataset_params = {
        "femicides/rai": (
            "output/femicides/lome/lome_0shot/multilabel_rai_blocks",
            "rai",
            "it_core_news_md",
        ),
        "femicides/rai_main": (
            "output/femicides/lome/lome_0shot/multilabel_rai_main_blocks",
            "rai_main",
            "it_core_news_md",
        ),
        "femicides/rai_ALL": (
            "output/femicides/lome/lome_0shot/multilabel_rai_ALL_blocks",
            "rai_ALL",
            "it_core_news_md",
        ),
        "femicides/olv": (
            "output/femicides/lome/lome_0shot/multilabel_olv_blocks",
            "olv",
            "it_core_news_md",
        ),
        "crashes/thecrashes": (
            "output/crashes/lome/lome_0shot/multilabel_thecrashes_blocks",
            "thecrashes",
            "nl_core_news_md",
        ),
        "migration/pavia": (
            "output/migration/lome/lome_0shot/multilabel_pavia_blocks",
            "pavia",
            "it_core_news_md",
        ),
    }
    try:
        corpus_tarball, corpus, spacy_model = dataset_params[dataset]
    except (KeyError, TypeError):
        raise ValueError("Unsupported dataset!") from None

    print("params:")
    print(f"\tcorpus_tarball: {corpus_tarball}")
    print(f"\tcorpus: {corpus}")
    print(f"\tspacy: {spacy_model}")

    print("processing files...")

    for block in os.listdir(corpus_tarball):
        print(block)

        with tarfile.open(os.path.join(corpus_tarball, block)) as tar_in:

            # make sure the output directory exists
            cache_location = SYNTAX_ANALYSIS_CACHE_FILES[dataset]
            os.makedirs(cache_location, exist_ok=True)

            lome_files = [
                member
                for member in tar_in.getmembers()
                if member.name.endswith(".comm.json")
            ]
            lome_files.sort(key=lambda member: member.name)

            for file in lome_files:
                print(f"\tprocessing file {file}")
                doc_id = re.search(r"lome_(\d+)\.comm\.json", file.name).group(1)

                if skip_fn is not None and skip_fn(doc_id):
                    print(f"\t\tskip_fn: skipping file {file}")
                    syntax_analyses = None
                else:
                    # Decode explicitly as UTF-8: without an encoding argument
                    # TextIOWrapper falls back to the platform default, which
                    # corrupts or rejects non-ASCII text on some systems.
                    file_obj = io.TextIOWrapper(
                        tar_in.extractfile(file), encoding="utf-8"
                    )
                    annotations = json.load(file_obj)
                    syntax_analyses = [
                        syntax_analyze(sentence, spacy_model)
                        for sentence in annotations
                    ]

                # documents are grouped by the first two characters of their id
                file_key = doc_id[:2]
                cache_file = f"{cache_location}/{file_key}.json"
                if os.path.isfile(cache_file):
                    with open(cache_file, encoding="utf-8") as f:
                        key_cache = json.load(f)
                else:
                    key_cache = {}
                key_cache[doc_id] = syntax_analyses
                # written after every document, so an interrupted run keeps
                # everything processed so far
                with open(cache_file, "w", encoding="utf-8") as f:
                    json.dump(key_cache, f)
187
+
188
+
189
def make_syntax_cache_key(filename):
    """Extract the document id used as syntax-cache key from a file path.

    The path must contain ``<digits>/lome_<doc id>.comm.json``. Both ``/``
    and ``\\`` are accepted as separators, so paths built with
    ``os.path.join`` on Windows work as well.

    :param filename: path of a LOME prediction file
    :return: the digits following ``lome_`` in the file name, as a string
    :raises AttributeError: if `filename` does not match the expected pattern
    """
    match = re.search(r"[/\\]\d+[/\\]lome_(\d+)\.comm\.json", filename)
    return match.group(1)
192
+
193
+
194
def clean_sentence_(sentence):
    """Drop empty/whitespace-only tokens from a sentence, in place.

    For every position whose entry in ``sentence["tokens"]`` is blank, the
    element at that position is removed from *every* list stored in
    `sentence`, so all annotation layers stay aligned with the tokens.

    :param sentence: dict of equally long lists, one of them under "tokens"
    """
    blank_positions = [
        position
        for position, token in enumerate(sentence["tokens"])
        if not token.strip()
    ]

    # go from the back so that the remaining positions stay valid
    for position in reversed(blank_positions):
        for annotation_list in sentence.values():
            annotation_list.pop(position)
208
+
209
+
210
def process_prediction_file(
    filename: str,
    dataset_name: str,
    syntax_cache: str,
    deep_frames_cache: dict,
    tmp_cache: Optional[dict] = None,
    file_obj: io.TextIOBase = None,
    syntax_cache_key: Optional[str] = None,
    deep_frames_list: Optional[List[str]] = None,
    spacy_model: str = "it_core_news_md",
    spacy_model_obj = None
) -> Tuple[List, ...]:
    """
    Process a predictions JSON file

    :param filename: path to the JSON file
    :param dataset_name: name of the dataset (not used in this function body)
    :param syntax_cache: directory written by `make_syntax_cache()`; if `None`, the syntactic analysis is computed
        on the fly with `syntax_analyze()`
    :param deep_frames_cache: frame -> deep frame cache, passed on to `process_fn_sentence()`
    :param tmp_cache: optional dict used as an in-memory copy of the most recently loaded cache file; it is
        emptied and refilled whenever another cache file has to be read
    :param file_obj: already opened object corresponding to `filename`. If given, `file_obj` will be used instead
        of loading it from `filename`. This is useful when reading the entire corpus from a tarball (which is what the
        SocioFillmore webapp does)
    :param syntax_cache_key: document id to look up in the syntax cache; derived from `filename` if not given
    :param deep_frames_list: passed on to `process_fn_sentence()`
    :param spacy_model: name of the spacy model to be used for syntactic analysis (only used without `syntax_cache`)
    :param spacy_model_obj: passed on to `syntax_analyze()` (only used without `syntax_cache`)
    :return: tuple `(sentences, fn_structures, syntax_analyses, role_analyses)`; the first, second and fourth are
        lists with one entry per sentence, the third is the syntactic analysis as computed or read from the cache
    """

    print("Processing", filename)

    if file_obj is not None:
        annotations = json.load(file_obj)
    else:
        with open(filename, encoding="utf-8") as f:
            annotations = json.load(f)

    if syntax_cache is None:
        # no cache: analyze every sentence now
        syntax_analyses = []
        for sentence in annotations:
            syntax_analyses.append(syntax_analyze(sentence, spacy_model, spacy_model_obj))

    else:
        if syntax_cache_key is None:
            syntax_cache_key = make_syntax_cache_key(filename)

        if tmp_cache is not None and syntax_cache_key in tmp_cache:
            syntax_analyses = tmp_cache[syntax_cache_key]

        else:
            # cache files are grouped by the first two characters of the document id
            with open(f"{syntax_cache}/{syntax_cache_key[:2]}.json", encoding="utf-8") as cache_file:
                grouped_analyses = json.load(cache_file)
                syntax_analyses = grouped_analyses[syntax_cache_key]
                if tmp_cache is not None:
                    # keep only the group that was just loaded
                    tmp_cache.clear()
                    tmp_cache.update(grouped_analyses)

    fn_structures: List[Dict[int, FrameStructure]] = []
    sentences: List[List[str]] = []
    role_analyses: List[Dict[int, Dict[str, str]]] = []

    for sent_idx, sentence in enumerate(annotations):

        # NB: modifies `sentence` in place (removes blank tokens)
        clean_sentence_(sentence)

        try:
            sent_structures = process_fn_sentence(
                sentence, deep_frames_cache, deep_frames_list=deep_frames_list
            )

        # seems to occur for one specific file in the migration set, TODO find out what happens
        except AttributeError:
            print("Error processing FN annotations")
            sent_structures = {}
        syntax = syntax_analyses[sent_idx]

        # disambiguate syntactic constructions
        for fs in sent_structures.values():
            # syntax is keyed by the (stringified) index of the first target token
            target_idx = str(fs.target.tokens_idx[0])
            if target_idx not in syntax:
                print(
                    f"Prediction file {filename}: Cannot find syntactic information for target at idx={target_idx}")
                continue
            # the last entry stored for the token is the one that gets disambiguated (in place)
            fs_syn = syntax[target_idx][-1]
            disambiguate_cxs_(fs, fs_syn)

        roles = process_syn_sem_roles(sent_structures, syntax)
        role_analyses.append(roles)
        sentences.append(sentence["tokens"])
        fn_structures.append(sent_structures)

    return sentences, fn_structures, syntax_analyses, role_analyses
296
+
297
+
298
def disambiguate_cxs_(struct: FrameStructure, tgt_syntax):
    """Resolve a placeholder construction label, in place.

    Labels in ``tgt_syntax["syn_construction"]`` that start with "_" are
    placeholders whose final value depends on the frame of the target. They
    are replaced by one of the "verbal:*" labels; other labels are left
    untouched.

    :param struct: frame structure of the target
    :param tgt_syntax: syntax entry of the target token (modified in place)
    :raises ValueError: if the placeholder is not a known one
    """
    placeholder = tgt_syntax["syn_construction"]
    # no "_" at the beginning: no disambiguation needed
    if not placeholder.startswith("_"):
        return

    # NB works only for the selected relevant frames! if any other frames are added, make sure to update this
    always_active = [
        "Transitive_action",
        "Causation",
        "Emotion_directed",
        "Quarreling",
        "Impact",
        "Committing_crime",
    ]
    if struct.deep_frame in always_active or struct.frame in ACTIVE_FRAMES:
        agentivity = "active"
    elif struct.frame == "Event":
        agentivity = "impersonal"
    else:
        agentivity = "unaccusative"

    if placeholder == "_verbal:ACTIVE":
        resolved = f"verbal:{agentivity}"
    elif placeholder in ["_verbal:ADPOS", "_verbal:OTH_PART"]:
        # an "active" frame in one of these constructions is read as a passive
        resolved = "verbal:passive" if agentivity == "active" else f"verbal:{agentivity}"
    else:
        raise ValueError(f"Unknown construction placeholder {placeholder}")

    tgt_syntax["syn_construction"] = resolved
327
+
328
+
329
def find_governed_roles(
    syn_self: Dict[str, Any],
    syn_children: List[Dict[str, Any]],
    roles: List[Tuple[str, AnnotationSpan]],
) -> Dict[str, str]:
    """Find the roles realised by a token itself or by one of its children.

    :param syn_self: syntax node of the token under consideration
    :param syn_children: syntax nodes of its direct children
    :param roles: (role name, span) pairs of the frame
    :return: dict from role name to ``None`` (the role covers the token
        itself) or to the child's dependency label followed by "↓". Only the
        first match per role is kept; nodes whose dependency label is in
        ``IGNORE_DEP_LABELS`` never match.
    """
    governed = {}

    for node in [syn_self] + syn_children:
        for role_name, role_span in roles:
            if node["lome_idx"] not in role_span.tokens_idx:
                continue

            dep_label = node["dependency"]
            if role_name in governed or dep_label in IGNORE_DEP_LABELS:
                continue

            governed[role_name] = None if node == syn_self else dep_label + "↓"

    return governed
348
+
349
+
350
def analyze_role_dependencies(
    fn_struct,
    syntax,
    role_analysis=None,
    tgt_idx=None,
    min_depth=-10,
    max_depth=10,
    depth=0,
    label_prefix="",
):
    """
    Describe, for each role of a frame, how it is syntactically connected to the frame target.

    Starting at the target token, the function recursively walks down through the children (negative `depth`)
    and then up through the first ancestor of each visited token (positive `depth`), calling
    `find_governed_roles()` at every token visited.

    :param fn_struct: frame structure whose roles are to be located
    :param syntax: syntax analysis of the sentence, keyed by stringified token index
    :param role_analysis: analysis found so far (used in recursive calls); it is copied, not modified
    :param tgt_idx: index of the token to inspect; defaults to the first token of the frame target
    :param min_depth: lowest (most negative) depth that is still explored
    :param max_depth: highest depth that is still explored
    :param depth: current depth: 0 at the target, negative below it, positive above it
    :param label_prefix: label of the step that led to the current token ("<dep>↓" or "<dep>↑")
    :return: dict mapping each role found to a `(label, depth_label)` tuple
    """

    if role_analysis is None:
        role_analysis = {}

    if tgt_idx is None:
        tgt_idx = fn_struct.target.tokens_idx[0]

    # outside of the allowed range: stop and hand back what we were given
    if depth > max_depth:
        return role_analysis

    if depth < min_depth:
        return role_analysis

    # work on a copy so that the caller's dict is not modified
    new_analysis = {}
    new_analysis.update(role_analysis)
    # the first entry stored for the token is the one used for navigation
    token_syntax = syntax[str(tgt_idx)][0]

    def update_analysis(mapping):
        # record roles found at the current token; roles found earlier are never overwritten
        for role, dep in mapping.items():
            if role not in new_analysis:
                if label_prefix:
                    if dep is None:
                        # the role covers the current token itself
                        label = label_prefix
                        depth_label = depth
                    else:
                        # the role is a child of the current token: one step further away
                        label = label_prefix + "--" + dep
                        depth_label = depth + 1 if depth > 0 else depth - 1
                else:
                    if dep is None:
                        # the role covers the target itself
                        label = "⋆"
                        depth_label = depth
                    else:
                        label = dep
                        depth_label = depth + 1 if depth > 0 else depth - 1
                new_analysis[role] = label, depth_label

    update_analysis(
        find_governed_roles(
            token_syntax, token_syntax["children"], fn_struct.roles)
    )

    # from the initial predicate: first try the children
    if depth <= 0:
        for child in token_syntax["children"]:
            child_analysis = analyze_role_dependencies(
                fn_struct,
                syntax,
                role_analysis=new_analysis,
                tgt_idx=child["lome_idx"],
                max_depth=max_depth,
                min_depth=min_depth,
                depth=depth - 1,
                label_prefix=child["dependency"] + "↓"
            )
            new_analysis.update(child_analysis)

    # ... then try the ancestors
    if depth >= 0:
        if not token_syntax["ancestors"]:
            return new_analysis

        # only the first entry of "ancestors" is followed
        first_ancestor = token_syntax["ancestors"][0]
        return analyze_role_dependencies(
            fn_struct,
            syntax,
            role_analysis=new_analysis,
            tgt_idx=first_ancestor["lome_idx"],
            max_depth=max_depth,
            min_depth=min_depth,
            depth=depth + 1,
            label_prefix=token_syntax["dependency"] + "↑",
        )

    else:
        return new_analysis
435
+
436
+
437
def process_syn_sem_roles(
    sent_structures: Dict[int, FrameStructure], syntax: Dict[str, List[Dict[str, Any]]]
) -> Dict[int, Dict[str, str]]:
    """Compute the role/dependency analysis of every frame in a sentence.

    :param sent_structures: frame structures of the sentence
    :param syntax: syntax analysis of the sentence
    :return: ``defaultdict(dict)`` from the index of each frame's first
        target token to the cleaned output of `analyze_role_dependencies`
    """
    deps_by_target = defaultdict(dict)

    for frame_struct in sent_structures.values():
        first_target_token = frame_struct.target.tokens_idx[0]
        raw_deps = analyze_role_dependencies(frame_struct, syntax, max_depth=10)
        deps_by_target[first_target_token] = clean_role_deps(raw_deps)

    return deps_by_target
448
+
449
+
450
def clean_role_deps(role_deps):
    """Shorten multi-step dependency labels.

    A label is a "--"-separated path. Every step except the last one is
    reduced to its final character (the direction arrow); the last step is
    kept in full. Single-step labels are returned unchanged.

    :param role_deps: dict from role to ``(label, depth)``
    :return: new dict from role to ``(shortened label, depth)``
    """
    shortened = {}
    for role, (dep_str, depth) in role_deps.items():
        *leading_steps, last_step = dep_str.split("--")
        arrows = [step[-1] for step in leading_steps]
        shortened[role] = "--".join(arrows + [last_step]), depth
    return shortened
460
+
461
+
462
def map_or_lookup_deep_frame(
    frame: str, deep_frames_cache, save_modified_cache=False, deep_frames_list=None
) -> Tuple[str, Dict[str, str]]:
    """Return the deep frame and role mapping of `frame`, using a cache.

    :param frame: name of the frame
    :param deep_frames_cache: dict from frame name to ``[deep frame, mapping]``;
        a missing entry is computed with `map_to_deep_frame` and added to it
    :param save_modified_cache: if true, write the whole cache to
        ``DEEP_FRAMES_CACHE_FILE`` after adding a new entry
    :param deep_frames_list: passed on to `map_to_deep_frame`
    :return: the cache entry for `frame`
    """
    if frame not in deep_frames_cache:
        deep_frame, mapping = map_to_deep_frame(
            frame, deep_frames_list=deep_frames_list
        )
        deep_frames_cache[frame] = [deep_frame, mapping]

        if save_modified_cache:
            with open(DEEP_FRAMES_CACHE_FILE, "w", encoding="utf-8") as cache_file:
                json.dump(deep_frames_cache, cache_file)

    return deep_frames_cache[frame]
476
+
477
+
478
def map_to_deep_frame(
    frame: str,
    target: Optional[str] = None,
    mapping: Optional[Dict[str, str]] = None,
    self_mapping: Optional[Dict[str, str]] = None,
    deep_frames_list: Optional[List[str]] = None,
) -> Tuple[str, Dict[str, str]]:
    """Follow FrameNet Inheritance relations upward until a deep frame is hit.

    :param frame: frame currently being inspected
    :param target: frame the search started from (set on the initial call)
    :param mapping: dict from roles of `target` to roles of `frame`
    :param self_mapping: identity mapping over the roles of `target`
    :param deep_frames_list: frames that count as "deep"; defaults to
        ``DEEP_FRAMES``. With several parents, the deep frame listed first wins.
    :return: ``(deep frame, role mapping)`` on success; ``(target,
        self_mapping)`` when no deep frame can be reached; ``(frame, {})``
        when `frame` cannot be looked up in FrameNet
    """
    if deep_frames_list is None:
        deep_frames_list = DEEP_FRAMES

    # look up in FrameNet
    try:
        fn_entry = fn.frame(frame)
    except (FramenetError, LookupError):
        return frame, {}

    # initial call: `target` == `frame`, mapping maps to self
    if target is None:
        target = frame
    if mapping is None or self_mapping is None:
        identity = {role: role for role in fn_entry.FE.keys()}
        mapping = self_mapping = identity

    # base case: our frame is a deep frame
    if frame in deep_frames_list:
        return frame, mapping

    # otherwise, look at the frames we inherit from
    inheritance_links = [
        relation
        for relation in fn_entry.frameRelations
        if relation.type.name == "Inheritance" and relation.Child == fn_entry
    ]
    parent_frames = [relation.Parent for relation in inheritance_links]

    # no parents --> failure, return original frame
    if not inheritance_links:
        return target, self_mapping

    # one parent: follow that parent
    if len(inheritance_links) == 1:
        lifted_mapping = define_fe_mapping(mapping, inheritance_links[0])
        return map_to_deep_frame(
            parent_frames[0].name, target, lifted_mapping, self_mapping, deep_frames_list
        )

    # more parents: collect the deep frames they lead to (first mapping wins)
    reachable = {}
    for relation, parent in zip(inheritance_links, parent_frames):
        lifted_mapping = define_fe_mapping(mapping, relation)
        reached_frame, reached_mapping = map_to_deep_frame(
            parent.name, target, lifted_mapping, self_mapping, deep_frames_list
        )
        if reached_frame in deep_frames_list and reached_frame not in reachable:
            reachable[reached_frame] = reached_mapping

    # pick according to the priority order of `deep_frames_list`
    for candidate in deep_frames_list:
        if candidate in reachable:
            return candidate, reachable[candidate]

    # nothing found, return original frame
    return target, self_mapping
547
+
548
+
549
def define_fe_mapping(mapping, parent_rel):
    """Translate a role mapping one step up an inheritance relation.

    `mapping` maps roles of the original target frame onto roles of the
    child frame of `parent_rel`. The result maps those same target roles onto
    roles of the parent frame; roles without a counterpart in the parent are
    dropped.
    """
    # child role name -> parent role name, as listed on the frame relation
    parent_role_of = {}
    for fe_rel in parent_rel.feRelations:
        parent_role_of[fe_rel.subFEName] = fe_rel.superFEName

    lifted = {}
    for role, current_role in mapping.items():
        if current_role in parent_role_of:
            lifted[role] = parent_role_of[current_role]
    return lifted
559
+
560
+
561
def is_at_root(syntax_info):
    """Tell whether a token is the dependency root or the subject of the root.

    `syntax_info` is a per-token syntax dict with a "dependency" label and an
    "ancestors" list; only the first listed ancestor is inspected for the
    subject case.
    """
    dependency = syntax_info["dependency"]

    # the actual root
    if dependency == "ROOT":
        return True

    # ... or the subject whose first listed ancestor is the root
    return dependency == "nsubj" and syntax_info["ancestors"][0]["dependency"] == "ROOT"
572
+
573
+
574
def get_tarball_blocks(dataset, lome_model="lome_0shot"):
    """Return the directory holding the per-block tarballs for `dataset`.

    Raises:
        ValueError: if `dataset` is not one of the supported dataset names.
    """
    # dataset name -> (domain directory, name of the blocks folder)
    block_folders = {
        "femicides/rai": ("femicides", "multilabel_rai_ALL_blocks"),
        "femicides/rai_main": ("femicides", "multilabel_rai_main_blocks"),
        "femicides/olv": ("femicides", "multilabel_olv_blocks"),
        "crashes/thecrashes": ("crashes", "multilabel_thecrashes_blocks"),
        "migration/pavia": ("migration", "multilabel_pavia_blocks"),
    }
    if dataset not in block_folders:
        raise ValueError("Unsupported dataset!")

    domain, folder = block_folders[dataset]
    return f"output/{domain}/lome/{lome_model}/{folder}"
587
+
588
+
589
def analyze_single_document(doc_id, event_id, lome_model, dataset, texts_df, deep_frames_cache):
    """Analyze one LOME prediction file stored inside a block tarball.

    Returns a list with one dict per sentence, holding the sentence, its
    frame structures (as plain dicts), the syntax and role analyses, and
    document metadata looked up in `texts_df`.
    """
    data_domain, data_corpus = dataset.split("/")
    syntax_cache = SYNTAX_ANALYSIS_CACHE_FILES[dataset]

    print(dataset)

    # NOTE: only the migration/pavia path honours `lome_model`; the other
    # datasets are hard-wired to "lome_0shot" (the original marks this as a hack)
    if dataset == "migration/pavia":  # this is a hack, fix it!
        pred_file_path = f"output/migration/lome/multilabel/{lome_model}/pavia/{event_id}/lome_{doc_id}.comm.json"
    elif dataset == "femicides/olv":
        pred_file_path = f"output/femicides/lome/lome_0shot/multilabel/olv/{event_id}/lome_{doc_id}.comm.json"
    else:
        pred_file_path = f"output/{data_domain}/lome/lome_0shot/multilabel/{data_corpus}/{event_id}/lome_{doc_id}.comm.json"
    print(f"Analyzing file {pred_file_path}")

    # re-derive the document id from the file name; its first two characters
    # select the tarball block
    file_stem = os.path.basename(pred_file_path).split(".")[0]
    doc_id = file_stem.split("_")[1]
    doc_key = doc_id[:2]
    tarball = get_tarball_blocks(dataset, lome_model) + f"/block_{doc_key}.tar"

    with tarfile.open(tarball, "r") as tar_f:
        # the member is read while the tarball is still open
        pred_file = io.TextIOWrapper(tar_f.extractfile(pred_file_path))

        sents, pred_structures, syntax_analyses, role_analyses = process_prediction_file(
            filename=pred_file_path,
            dataset_name=dataset,
            file_obj=pred_file,
            syntax_cache=syntax_cache,
            deep_frames_cache=deep_frames_cache
        )

        per_sentence = zip(sents, pred_structures, syntax_analyses, role_analyses)
        output = [
            {
                "sentence": sent,
                "fn_structures": [
                    dataclasses.asdict(fs) for fs in structs.values()
                ],
                "syntax": syntax,
                "roles": roles,
                "meta": {
                    "event_id": event_id,
                    "doc_id": doc_id,
                    "text_meta": get_text_meta(doc_id, texts_df),
                },
            }
            for sent, structs, syntax, roles in per_sentence
        ]
    return output
642
+
643
+
644
def get_text_meta(doc_id, texts_df):
    """Collect metadata for the document `doc_id` from `texts_df`.

    Args:
        doc_id: document identifier; converted with ``int`` and matched
            against the "text_id" column.
        texts_df: DataFrame with at least "text_id", "provider" and "title"
            columns; "url", "pubdate", "pubyear" and "days_after_event" are
            optional.

    Returns:
        Dict with keys "url", "pubdate", "provider", "title" and
        "days_after_event". Missing (NaN) values become ``None``, except for
        "days_after_event", which falls back to 0.

    Raises:
        IndexError: if no row matches `doc_id`.
    """
    row = texts_df[texts_df["text_id"] == int(doc_id)].iloc[0]

    if "pubdate" in row:
        pubdate = None if pd.isna(row["pubdate"]) else row["pubdate"]
    elif "pubyear" in row:
        # a missing year used to crash in int(); treat it like a missing pubdate
        pubdate = None if pd.isna(row["pubyear"]) else int(row["pubyear"])
    else:
        pubdate = None

    has_delta = "days_after_event" in row and not pd.isna(row["days_after_event"])

    return {
        "url": row["url"] if "url" in row else None,
        "pubdate": pubdate,
        "provider": row["provider"],
        "title": None if pd.isna(row["title"]) else row["title"],
        "days_after_event": int(row["days_after_event"]) if has_delta else 0,
    }
659
+
660
+
661
def process_fn_sentence(
    sentence, deep_frames_cache, post_process=True, deep_frames_list=None
):
    """Turn the token-level frame labels of one sentence into frame structures.

    Args:
        sentence: dict with parallel lists "tokens" and "frame_list"; each
            entry of "frame_list" holds the labels of one token, shaped like
            ``T:<frame>@<id>`` or ``B|I:<frame>:<role>@<id>``, optionally
            followed by ``@@<confidence>``.
        deep_frames_cache: cache passed on to ``map_or_lookup_deep_frame``.
        post_process: if true, punctuation tokens are removed from
            multi-token targets.
        deep_frames_list: optional deep-frame list passed on to
            ``map_or_lookup_deep_frame``.

    Returns:
        Dict mapping structure id to ``FrameStructure``.
    """
    # frame structures in the sentence
    sent_structures: Dict[int, FrameStructure] = {}

    # role spans currently being built up (per structure + role name)
    cur_spans: Dict[Tuple[int, str], AnnotationSpan] = {}

    for token_idx, (token_str, frame_annos) in enumerate(
        zip(sentence["tokens"], sentence["frame_list"])
    ):
        for fa in frame_annos:
            # remove "virtual root" nonsense token
            if "@@VIRTUAL_ROOT@@" in fa:
                continue
            fa = fa.split("@@")[0]  # remove confidence score if it's there
            anno, struct_id_str = fa.split("@")
            struct_id = int(struct_id_str)
            # NOTE: the frame name is read *before* the "::" clean-up below,
            # exactly as in the original code
            frame_name = anno.split(":")[1]
            deep_frame, deep_frame_mapping = map_or_lookup_deep_frame(
                frame_name, deep_frames_cache, deep_frames_list=deep_frames_list
            )
            if struct_id not in sent_structures:
                sent_structures[struct_id] = FrameStructure(
                    frame=frame_name,
                    deep_frame=deep_frame,
                    target=None,
                    roles=[],
                    deep_roles=[],
                )
            cur_struct = sent_structures[struct_id]

            # TODO: get rid of this hack
            anno = anno.replace("I::", "I:")
            anno = anno.replace("B::", "B:")

            anno_parts = anno.split(":")
            label_kind = anno_parts[0]

            if label_kind == "T":
                # target token: start the target span or extend it
                if cur_struct.target is None:
                    cur_struct.target = AnnotationSpan(
                        [token_idx], [token_str])
                else:
                    cur_struct.target.tokens_idx.append(token_idx)
                    cur_struct.target.tokens_str.append(token_str)
            elif label_kind == "B":
                # first token of a role: open a new span, shared between the
                # surface role and (if mapped) the deep role
                role_name = anno_parts[2]
                role_span = AnnotationSpan([token_idx], [token_str])
                cur_struct.roles.append((role_name, role_span))
                if role_name in deep_frame_mapping:
                    cur_struct.deep_roles.append(
                        (deep_frame_mapping[role_name], role_span)
                    )
                cur_spans[(struct_id, role_name)] = role_span
            elif label_kind == "I":
                # continuation of the most recently opened span for this role
                role_name = anno_parts[2]
                role_span = cur_spans[(struct_id, role_name)]
                role_span.tokens_str.append(token_str)
                role_span.tokens_idx.append(token_idx)

    # post-process: remove punctuation in targets
    if post_process:
        punctuation = ("``", "''", "`", "'", ".", ",", ";", ":")
        for fs in sent_structures.values():
            # a structure that only received role labels has no target yet;
            # skip it instead of failing on `None`
            if fs.target is None or len(fs.target.tokens_str) <= 1:
                continue
            kept = [
                (tok_str, tok_idx)
                for tok_str, tok_idx in zip(fs.target.tokens_str, fs.target.tokens_idx)
                if tok_str not in punctuation
            ]
            # update in place so the span's list objects stay the same
            fs.target.tokens_str[:] = [tok_str for tok_str, _ in kept]
            fs.target.tokens_idx[:] = [tok_idx for _, tok_idx in kept]

    return sent_structures
736
+
737
+
738
def map_back_spacy_lome_tokens(spacy_doc, lome_tokens):
    """Align spaCy token indices with the indices of the LOME tokens.

    The LOME tokens were joined with single spaces before being parsed, so a
    spaCy token followed by whitespace closes the current LOME token.

    Raises:
        ValueError: if there are more LOME tokens than spaCy tokens.
    """
    if len(lome_tokens) > len(spacy_doc):
        raise ValueError(
            f"Cannot re-tokenize (#lome={len(lome_tokens)} // #spacy={len(spacy_doc)})"
        )

    spacy_to_lome = {}
    lome_cursor = 0
    for spacy_idx, spacy_token in enumerate(spacy_doc):
        spacy_to_lome[spacy_idx] = lome_cursor
        # trailing whitespace: the next spaCy token starts a new LOME token
        lome_cursor += 1 if spacy_token.whitespace_ else 0
    return spacy_to_lome
753
+
754
+
755
def get_syn_category(spacy_token):
    """Map a spaCy token onto a coarse syntactic category label.

    Nouns, adjectives, adverbs and adpositions get "n", "adj", "adv" and "p";
    verbs get "v:fin", "v:part", "v:ger" or "v:inf" depending on their
    "VerbForm" feature; everything else is "other".
    """
    pos = spacy_token.pos_

    non_verbal = {"NOUN": "n", "ADJ": "adj", "ADV": "adv", "ADP": "p"}
    if pos in non_verbal:
        return non_verbal[pos]

    if pos == "VERB":
        verb_labels = (
            ("Fin", "v:fin"),
            ("Part", "v:part"),
            ("Ger", "v:ger"),
            ("Inf", "v:inf"),
        )
        for verb_form, label in verb_labels:
            # the feature must consist of exactly this one value
            if spacy_token.morph.get("VerbForm") == [verb_form]:
                return label

    return "other"
774
+
775
+
776
def syntax_analyze(sentence, spacy_model_name, spacy_model_obj=None) -> Dict[str, Dict[str, Any]]:
    """Parse a LOME sentence with spaCy and describe every token's syntax.

    Args:
        sentence: dict whose "tokens" entry lists the LOME tokens.
        spacy_model_name: model to load when no model object is supplied.
        spacy_model_obj: already loaded spaCy pipeline (preferred; avoids
            loading the model on every call).

    Returns:
        A ``defaultdict(list)`` keyed by the LOME token index as a string;
        each value lists one dict per spaCy token aligned to that LOME token.
    """
    lome_tokens = sentence["tokens"]

    # load spacy model locally (so that it works in GAE)
    nlp = spacy_model_obj if spacy_model_obj is not None else spacy.load(spacy_model_name)

    spacy_doc = nlp(" ".join(lome_tokens))
    spacy_to_lome_tokens = map_back_spacy_lome_tokens(spacy_doc, lome_tokens)

    def describe_related(related):
        # compact description of a child/ancestor token
        return {
            "token": related.text,
            "spacy_idx": related.i,
            "lome_idx": spacy_to_lome_tokens[related.i],
            "syn_category": get_syn_category(related),
            "dependency": related.dep_,
        }

    analysis = defaultdict(list)
    for spacy_idx, token in enumerate(spacy_doc):
        lome_idx = spacy_to_lome_tokens[spacy_idx]
        syn_category = get_syn_category(token)
        token_info = {
            "token": token.text,
            "dependency": token.dep_,
            "spacy_idx": spacy_idx,
            "lome_idx": lome_idx,
            "syn_category": syn_category,
            "syn_construction": get_syn_construction(token, syn_category),
            "children": [describe_related(child) for child in token.children],
            "ancestors": [describe_related(ancestor) for ancestor in token.ancestors],
        }
        # str key so that it doesn't change when converting to JSON
        analysis[str(lome_idx)].append(token_info)
    return analysis
831
+
832
+
833
def get_syn_construction(token: Token, syn_category: str) -> str:
    """Classify the syntactic construction a token occurs in.

    Non-verbal categories yield "nonverbal". Verbal categories are checked,
    in this order, for a reflexive child, an expletive child, a
    finite/gerund/infinitive form, and finally the participle sub-cases.
    Anything else yields "other".
    """
    if syn_category in ["n", "adj", "adv", "p"]:
        return "nonverbal"

    if not syn_category.startswith("v:"):
        return "other"

    # find reflexives
    if any(c.lemma_.lower() in ["si", "zich", "zichzelf"] for c in token.children):
        return "verbal:reflexive"

    # find impersonal constructions
    if any(c.dep_ == "expl" for c in token.children):
        return "verbal:impersonal"

    # all other finite verbs/gerunds/infinitives -> active construction
    if syn_category in ["v:fin", "v:ger", "v:inf"]:
        return "_verbal:ACTIVE"

    if syn_category == "v:part":
        if token.dep_ == "acl":
            return "_verbal:ADPOS"

        for child in token.children:
            # passive subj or auxiliary present: it's a passive
            if child.dep_ in ["nsubj:pass", "aux:pass"]:
                return "verbal:passive"

            # auxiliary "HAVE" (avere/hebben) present: it's an active
            is_active_aux = (
                child.dep_ == "aux"
                and child.lemma_.lower() in ITALIAN_ACTIVE_AUX + DUTCH_ACTIVE_AUX
            )
            if is_active_aux:
                return "verbal:active"

        return "_verbal:OTH_PART"

    return "other"
873
+
874
+
875
def get_syntax_info(struct: FrameStructure, syntax: Dict) -> Dict:
    """Return the syntax entry for the first token of the structure's target.

    `syntax` is keyed by LOME token index as a string; when several entries
    are stored under that key, the last one is used.
    """
    first_target_token = struct.target.tokens_idx[0]
    entries_for_target = syntax[str(first_target_token)]
    return entries_for_target[-1]
880
+
881
+
882
def enrich_texts_df(texts_df: pd.DataFrame, events_df: pd.DataFrame):
    """Add a "days_after_event" column to a copy of `texts_df`.

    For every text, the matching event is looked up through "event_id" /
    "event:id" and the number of days between the event date and the text's
    publication date is computed.

    Args:
        texts_df: texts with an "event_id" column and, optionally, a
            "pubdate" column formatted as ``%Y-%m-%d %H:%M:%S``.
        events_df: events with "event:id" and "event:date" (``%Y-%m-%d``).

    Returns:
        A new DataFrame with the extra column; the value is ``None`` when the
        event is unknown, a date is missing, or a date cannot be parsed.
    """
    time_delta_rows: List[Optional[int]] = []
    for idx, text_row in texts_df.iterrows():
        matching_events = events_df[events_df["event:id"] == text_row["event_id"]]
        if matching_events.empty:
            # Exactly one entry per text: the original fell through after
            # this append and added a second value for the same row.
            print(f"Skipping {idx} (IndexError)")
            time_delta_rows.append(None)
            continue
        event_row = matching_events.iloc[0]

        if "pubdate" not in text_row or pd.isna(text_row["pubdate"]) or pd.isna(event_row["event:date"]):
            time_delta_rows.append(None)
            continue

        try:
            pub_date = datetime.strptime(
                text_row["pubdate"], "%Y-%m-%d %H:%M:%S")
            event_date = datetime.strptime(
                event_row["event:date"], "%Y-%m-%d")
        except ValueError as e:
            print(
                f"\t\terror parsing dates, see below for more info:\n\t\t{e}")
            time_delta_rows.append(None)
        else:
            time_delta_rows.append((pub_date - event_date).days)

    return texts_df.assign(days_after_event=time_delta_rows)
908
+
909
+
910
def read_frames_of_interest(dataset) -> List[str]:
    """Read the frame list that belongs to `dataset`.

    Blank lines and lines starting with "#" are ignored; frame names are
    normalised to an upper-case first letter followed by lower case.

    Returns:
        The sorted, de-duplicated frame names.

    Raises:
        ValueError: if there is no frame list for `dataset`.
    """
    frame_list_files = {
        "femicides/rai": "resources/femicide_frame_list.txt",
        "femicides/olv": "resources/femicide_frame_list.txt",
        "crashes/thecrashes": "resources/crashes_frame_list.txt",
        "migration/pavia": "resources/migration_frame_list.txt",
    }
    if dataset not in frame_list_files:
        raise ValueError("Unsupported dataset")

    with open(frame_list_files[dataset], encoding="utf-8") as f:
        stripped_lines = (raw_line.strip() for raw_line in f)
        frames = {
            entry[0].upper() + entry[1:].lower()
            for entry in stripped_lines
            if entry and not entry.startswith("#")
        }
    return sorted(frames)
928
+
929
+
930
def make_dep_label_cache():
    """Collect every dependency label seen in the role analyses and cache it.

    Processes all ``*.comm.json`` members of the prediction tarballs of the
    three datasets below and writes the sorted set of labels, one per line,
    to ``DEP_LABEL_CACHE_FILE``.
    """
    tarballs = {
        "femicides/rai": "output/femicides/lome/lome_0shot/multilabel_rai.tar.gz",
        "crashes/thecrashes": "output/crashes/lome/lome_0shot/multilabel_thecrashes.tar.gz",
        "migration/pavia": "output/migration/lome/lome_0shot/multilabel_pavia.tar.gz",
    }
    italian_datasets = ("femicides/rai", "migration/pavia")

    labels = set()

    for dataset in ["femicides/rai", "crashes/thecrashes", "migration/pavia"]:
        tarball = tarballs[dataset]

        # Membership test: the original indexed the dataset *string* with a
        # tuple (`dataset[..., ...]`), which raises TypeError.
        spacy_model = (
            "it_core_news_md" if dataset in italian_datasets else "nl_core_news_md"
        )

        deep_frames_cache = load_deep_frames_cache(dataset)
        syntax_cache = SYNTAX_ANALYSIS_CACHE_FILES[dataset]

        with tarfile.open(tarball, "r:gz") as tar_f:
            member_names = [
                m.name for m in tar_f.getmembers() if m.name.endswith(".comm.json")
            ]
            for mem in member_names:
                print(mem)
                mem_obj = io.TextIOWrapper(tar_f.extractfile(mem))
                (_, _, _, role_analyses,) = process_prediction_file(
                    filename=mem,
                    dataset_name=dataset,
                    file_obj=mem_obj,
                    syntax_cache=syntax_cache,
                    deep_frames_cache=deep_frames_cache,
                    spacy_model=spacy_model,
                )
                if role_analyses is None:
                    print(f"\tSkipping file {mem}, no role analyses found")
                    continue
                # each role analysis value is a (dependency label, ...) pair
                for sent_ra in role_analyses:
                    for ra in sent_ra.values():
                        for dep, _ in ra.values():
                            labels.add(dep)

    with open(DEP_LABEL_CACHE_FILE, "w", encoding="utf-8") as f_out:
        for label in sorted(labels):
            f_out.write(label + os.linesep)
979
+
980
+
981
def analyze_external_file(file_in, file_out, spacy_model=None):
    """Analyze a stand-alone prediction file and dump the result as JSON.

    Args:
        file_in: path of the prediction file to analyze.
        file_out: path of the JSON file to write.
        spacy_model: forwarded to ``process_prediction_file`` as
            ``spacy_model_obj``. Defaults to ``None`` so that the command-line
            entry point, which only supplies the two paths, no longer fails
            with a TypeError.
            NOTE(review): assumes ``process_prediction_file`` falls back to
            its own default model when given ``None`` -- confirm.
    """
    deep_frames_cache = load_deep_frames_cache()
    (
        sents,
        pred_structures,
        syntax_analyses,
        role_analyses,
    ) = process_prediction_file(file_in, "", None, deep_frames_cache, spacy_model_obj=spacy_model)

    # one JSON record per sentence, frame structures flattened to plain dicts
    output = [
        {
            "sentence": sent,
            "fn_structures": [
                dataclasses.asdict(fs) for fs in structs.values()
            ],
            "syntax": syntax,
            "roles": roles
        }
        for sent, structs, syntax, roles in zip(
            sents, pred_structures, syntax_analyses, role_analyses
        )
    ]
    with open(file_out, "w", encoding="utf-8") as f_out:
        json.dump(output, f_out, indent=4)
1005
+
1006
+
1007
if __name__ == "__main__":
    # Command-line entry point: build caches or analyze a single external file.
    ap = argparse.ArgumentParser()
    ap.add_argument("command", choices=[
        "make_syntax_cache", "make_dep_label_cache", "analyze_file"
    ])
    ap.add_argument("dataset", choices=["femicides/rai", "femicides/rai_main", "femicides/rai_ALL",
                                        "femicides/olv", "crashes/thecrashes", "migration/pavia", "*"])
    ap.add_argument("--input_file", type=str, default="")
    ap.add_argument("--output_file", type=str, default="")
    args = ap.parse_args()

    if args.command == "make_syntax_cache":

        if args.dataset == "*":
            raise ValueError(
                "Please specificy a dataset for `make_syntax_cache`")

        if args.dataset == "crashes/thecrashes":
            # the crashes corpus also contains non-Dutch texts, which are skipped
            make_syntax_cache(
                "crashes/thecrashes", skip_fn=lambda f: not is_a_dutch_text(f)
            )
        else:
            # every other permitted dataset name is passed through unchanged
            make_syntax_cache(args.dataset)

    elif args.command == "make_dep_label_cache":
        make_dep_label_cache()

    elif args.command == "analyze_file":
        if not args.input_file or not args.output_file:
            ap.error("`analyze_file` requires --input_file and --output_file")
        # `analyze_external_file` takes a spaCy model as third argument; the
        # original call omitted it and raised a TypeError.
        # NOTE(review): None presumably lets the analysis fall back to its
        # default model -- confirm.
        analyze_external_file(args.input_file, args.output_file, None)
1044
+
1045
+
1046
+
sociofillmore/common/convert_comms.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Adapted from comm2multilabel.py from the Bert-for-FrameNet project (https://gitlab.com/gosseminnema/bert-for-framenet)
3
+ """
4
+
5
+ import dataclasses
6
+ import json
7
+ import os
8
+ import glob
9
+ import sys
10
+ from collections import defaultdict
11
+ from typing import List, Optional
12
+
13
+ import nltk
14
+ from concrete import Communication
15
+ from concrete.util import read_communication_from_file, lun, get_tokens
16
+
17
+
18
@dataclasses.dataclass
class FrameAnnotation:
    """Base annotation of one sentence: tokens with one POS entry each."""

    # surface tokens of the sentence
    tokens: List[str] = dataclasses.field(default_factory=list)
    # one part-of-speech entry per token
    pos: List[str] = dataclasses.field(default_factory=list)
22
+
23
+
24
@dataclasses.dataclass
class MultiLabelAnnotation(FrameAnnotation):
    """Sentence annotation with, per token, a list of frame labels and an optional LU."""

    # per token: all frame/role labels attached to it
    frame_list: List[List[str]] = dataclasses.field(default_factory=list)
    # per token: lexical unit, or None
    lu_list: List[Optional[str]] = dataclasses.field(default_factory=list)

    def to_txt(self):
        """Yield one "token pos labels lu" line per token ("_" marks empty fields)."""
        for i, tok in enumerate(self.tokens):
            frame_column = "|".join(self.frame_list[i]) or "_"
            lu_column = self.lu_list[i] or "_"
            yield f"{tok} {self.pos[i]} {frame_column} {lu_column}"

    @staticmethod
    def from_txt(sentence_lines):
        """Parse lines in the format written by `to_txt` back into an annotation."""
        tokens, pos, frame_list, lu_list = [], [], [], []
        for line in sentence_lines:

            # lines starting with a space are ignored
            if line.startswith(" "):
                continue

            columns = line.split()
            token, tag, frame_column, lu_column = (
                columns[0], columns[1], columns[2], columns[3]
            )
            tokens.append(token)
            pos.append(tag)
            # "_" stands for an empty label list / a missing lexical unit
            frame_list.append([] if frame_column == "_" else frame_column.split("|"))
            lu_list.append(None if lu_column == "_" else lu_column)
        return MultiLabelAnnotation(tokens, pos, frame_list, lu_list)

    def get_label_set(self):
        """Return the set of all labels used anywhere in the sentence."""
        return {label for tok_labels in self.frame_list for label in tok_labels}
69
+
70
+
71
def convert_file(file, language="english", confidence_filter=0.0):
    """Yield one `MultiLabelAnnotation` per sentence of a Concrete communication file.

    Args:
        file: path of the communication file to read.
        language: language passed on to the tokenizer used for splitting tokens.
        confidence_filter: situations/arguments with a lower confidence are dropped.
    """
    print("Reading input file...")
    comm = read_communication_from_file(file)

    print("Mapping sentences to situations...")
    tok_uuid_to_situation = map_sent_to_situation(comm)

    print("# sentences with situations:", len(tok_uuid_to_situation))

    all_sentences = (
        sentence
        for section in lun(comm.sectionList)
        for sentence in lun(section.sentenceList)
    )
    for sentence in all_sentences:
        tokens = get_tokens(sentence.tokenization)
        sentence_uuid = sentence.tokenization.uuid.uuidString
        situations = tok_uuid_to_situation[sentence_uuid]
        tok_to_annos = map_tokens_to_annotations(comm, situations, confidence_filter)

        frame_list, tok_list = prepare_ml_lists(language, tok_to_annos, tokens)

        # POS tags and lexical units are not available here: use placeholders
        placeholder_pos = ["_"] * len(tok_list)
        placeholder_lus = [None] * len(tok_list)
        yield MultiLabelAnnotation(tok_list, placeholder_pos, frame_list, placeholder_lus)
91
+
92
+
93
def prepare_ml_lists(language, tok_to_annos, tokens):
    """Build parallel token and label lists, re-splitting tokens that carry punctuation.

    Every piece of a split token inherits a copy of the token's labels.
    Afterwards, labels on punctuation pieces are dropped unless the next
    piece continues them, and a "B:" label repeated from the previous piece
    is turned into "I:".

    Returns:
        ``(frame_list, tok_list)``.
    """
    tok_list = []
    frame_list = []
    for tok_idx, tok in enumerate(tokens):
        # split tokens that include punctuation
        pieces = nltk.word_tokenize(tok.text, language=language)
        tok_list.extend(pieces)
        labels_for_tok = list(tok_to_annos.get(tok_idx, []))
        # each piece gets its own copy so it can be edited independently
        frame_list.extend(list(labels_for_tok) for _ in pieces)

    last_idx = len(tok_list) - 1

    # remove annotations from final punctuation & solve BIO weird stuff
    for idx, (tok, frame_annos) in enumerate(zip(tok_list, frame_list)):
        if tok in ",.:;\"'`«»":
            kept = []
            for fa in frame_annos:
                # what the label would look like if it continued on the next piece
                expected_next = fa if fa.startswith("T:") else "I" + fa[1:]
                if idx != last_idx and expected_next in frame_list[idx + 1]:
                    kept.append(fa)
            # edit in place: `frame_list[idx]` must see the change
            frame_annos[:] = kept

        for fa_idx, fa in enumerate(frame_annos):
            # exactly the same "B:" label on the previous piece: this is a continuation
            if fa.startswith("B:") and idx > 0 and fa in frame_list[idx - 1]:
                frame_annos[fa_idx] = "I" + fa[1:]

    return frame_list, tok_list
131
+
132
+
133
def map_tokens_to_annotations(comm: Communication, situations: List[str], confidence_filter: float):
    """Collect, per token index, the target and role labels of the given situations.

    Labels look like ``T:<frame>@<nn>@@<confidence>`` for targets and
    ``B|I:<frame>:<role>@<nn>@@<confidence>`` for role tokens, where ``<nn>``
    is the two-digit position of the situation in `situations`.

    Returns:
        A ``defaultdict(list)`` from token index to label strings.
    """
    tok_to_annos = defaultdict(list)
    for sit_idx, sit_uuid in enumerate(situations):
        situation = comm.situationMentionForUUID[sit_uuid]
        if situation.confidence < confidence_filter:
            continue

        frame_type = situation.situationKind
        tgt_tokens = situation.tokens.tokenIndexList

        if frame_type == "@@VIRTUAL_ROOT@@":
            continue

        # target tokens
        for tok_id in tgt_tokens:
            tok_to_annos[tok_id].append(f"T:{frame_type}@{sit_idx:02}@@{situation.confidence}")

        # role tokens, BIO-encoded per argument
        confident_args = (
            arg for arg in situation.argumentList
            if not arg.confidence < confidence_filter
        )
        for arg in confident_args:
            fe_type = arg.role
            fe_tokens = arg.entityMention.tokens.tokenIndexList
            for tok_n, tok_id in enumerate(fe_tokens):
                bio = "B" if tok_n == 0 else "I"
                tok_to_annos[tok_id].append(f"{bio}:{frame_type}:{fe_type}@{sit_idx:02}@@{arg.confidence}")
    return tok_to_annos
161
+
162
+
163
def map_sent_to_situation(comm):
    """Group situation-mention UUIDs by the UUID of the tokenization they refer to.

    Returns:
        A ``defaultdict(list)`` from tokenization UUID string to the UUID
        strings of the mentions found for it, in encounter order.
    """
    tok_uuid_to_situation = defaultdict(list)
    all_mentions = (
        mention
        for mention_set in comm.situationMentionSetList
        for mention in mention_set.mentionList
    )
    for mention in all_mentions:
        tokenization_uuid = mention.tokens.tokenizationId.uuidString
        tok_uuid_to_situation[tokenization_uuid].append(mention.uuid.uuidString)
    return tok_uuid_to_situation
169
+
170
+
171
def main():
    """Convert one communication file to multi-label JSON and text output.

    Command-line arguments (positional): input file, tokenizer language,
    output directory, confidence threshold.
    """
    file_in = sys.argv[1]
    language = sys.argv[2]
    output_directory = sys.argv[3]
    confidence_filter = float(sys.argv[4])
    # switch for writing one text file per migration source file instead of
    # a single JSON + text pair
    split_by_migration_files = False

    file_in_base = os.path.basename(file_in)
    file_out = f"{output_directory}/lome_{file_in_base}"
    multi_label_annos = list(convert_file(file_in, language=language, confidence_filter=confidence_filter))
    multi_label_json = [dataclasses.asdict(anno) for anno in multi_label_annos]

    if split_by_migration_files:
        suffix = ".orig.txt"

        def base_without_suffix(path):
            # `str.rstrip(".orig.txt")` strips a *set of characters*, not the
            # suffix, and can eat into the name itself; cut the suffix explicitly
            name = path.split("/")[-1]
            return name[: -len(suffix)] if name.endswith(suffix) else name

        files = glob.glob("output/migration/split_data/split_dev10_sep_txt_files/*.orig.txt")
        files.sort(key=lambda f: int(base_without_suffix(f)))

        for anno, file in zip(multi_label_annos, files):
            spl_file_out = f"{output_directory}/{base_without_suffix(file)}"
            with open(f"{spl_file_out}.txt", "w", encoding="utf-8") as f_txt:
                for line in anno.to_txt():
                    f_txt.write(line + os.linesep)
                f_txt.write(os.linesep)

    else:
        print(file_out)
        with open(f"{file_out}.json", "w", encoding="utf-8") as f_json:
            json.dump(multi_label_json, f_json, indent=4)

        # sentences are separated by an empty line
        with open(f"{file_out}.txt", "w", encoding="utf-8") as f_txt:
            for anno in multi_label_annos:
                for line in anno.to_txt():
                    f_txt.write(line + os.linesep)
                f_txt.write(os.linesep)
205
+
206
+
207
+ if __name__ == '__main__':
208
+ main()
sociofillmore/common/filter_lang.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import langdetect
2
+
3
+ import json
4
+
5
+
6
+ DATA_FILE = "data/thecrashes_data.json"
7
+
8
+
9
def main():
    """Print every text that the language detector classifies as English."""
    english_texts = (
        text for text in get_texts() if langdetect.detect(text) == "en"
    )
    for text in english_texts:
        print("\n<-------------------------------")
        print(text)
        print("------------------------------>\n")
16
+
17
+
18
def get_texts():
    """Load `DATA_FILE` and return "title + blank line + summary" for every article."""
    with open(DATA_FILE, encoding="utf-8") as f:
        events = json.load(f)

    return [
        article["title"] + "\n\n" + article["summary"]
        for event in events
        for article in event["articles"]
    ]
29
+
30
+
31
+ if __name__ == '__main__':
32
+ main()
sociofillmore/common/get_nltk_fn_roles.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nltk.corpus import framenet as fn
2
+ import json
3
+
4
# Export, for every FrameNet frame, the names of its frame elements (roles).
frames_to_roles = {
    frame.name: list(frame.FE.keys()) for frame in fn.frames()
}

with open("resources/fn_frames_to_roles.json", "w", encoding="utf-8") as f:
    json.dump(frames_to_roles, f)
sociofillmore/common/pos_based_targetid.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+
4
+ import spacy
5
+
6
+
7
+ TARGET_POS = [
8
+ "NOUN",
9
+ "VERB",
10
+ "ADJ",
11
+ "ADV"
12
+ ]
13
+
14
+
15
def do_frameid():
    """Mark every token with a part of speech in `TARGET_POS` as a predicate.

    Reads one text per line from the raw corpus file and writes one JSON
    object per line with the "tokens" and the indices of the "predicates".
    """
    nlp = spacy.load("it_core_news_md")

    corpus_path = "data/migration/corpus_titoli_all_raw.txt"
    output_path = "output/migration/pos_based_targetid/corpus_titoli_all_raw.jsonl"

    with open(corpus_path, encoding="utf-8") as f_in, \
            open(output_path, "w", encoding="utf-8") as f_out:
        for line in f_in:
            doc = nlp(line.strip())
            record = {
                "tokens": [token.text for token in doc],
                "predicates": [
                    position
                    for position, token in enumerate(doc)
                    if token.pos_ in TARGET_POS
                ],
            }
            f_out.write(json.dumps(record) + os.linesep)
28
+
29
+
30
+ if __name__ == "__main__":
31
+ do_frameid()
sociofillmore/common/split_lome_files.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import glob
3
+ import tarfile
4
+
5
+
6
def split_lome_files(lome_folder, output_folder):
    """Distribute LOME output files over tarballs keyed by a document-id prefix.

    Every ``*.comm.*`` file matched under `lome_folder` is appended to
    ``block_<key>.tar`` in `output_folder`, where ``<key>`` is the first two
    characters of the document id taken from the file name
    (``<prefix>_<doc_id>.…``).
    """
    for file in glob.glob(f"{lome_folder}/**/*.comm.*"):
        file_stem = os.path.basename(file).split(".")[0]
        doc_id = file_stem.split("_")[1]
        doc_key = doc_id[:2]

        print(file, "->", doc_key)

        # append mode: the block tarball is created on first use and extended later
        block_path = f"{output_folder}/block_{doc_key}.tar"
        with tarfile.open(block_path, "a") as tar_f:
            tar_f.add(file)
15
+
16
+
17
if __name__ == "__main__":
    # One corpus is split per run; the commented-out calls are the other
    # corpora. Swap the active line to process a different one.
    #split_lome_files("output/migration/lome/multilabel/lome_0shot/pavia/", "output/migration/lome/lome_0shot/multilabel_pavia_blocks")
    # split_lome_files("output/femicides/lome/lome_0shot/multilabel/rai/", "output/femicides/lome/lome_0shot/multilabel_rai_blocks")
    split_lome_files("output/femicides/lome/lome_0shot/multilabel/rai_ALL/", "output/femicides/lome/lome_0shot/multilabel_rai_ALL_blocks")
    # split_lome_files("output/femicides/lome/lome_0shot/multilabel/olv/", "output/femicides/lome/lome_0shot/multilabel_olv_blocks")
    # split_lome_files("output/crashes/lome/lome_0shot/multilabel/thecrashes/", "output/crashes/lome/lome_0shot/multilabel_thecrashes_blocks")
sociofillmore/crashes/__pycache__/utils.cpython-37.pyc ADDED
Binary file (629 Bytes). View file
 
sociofillmore/crashes/__pycache__/utils.cpython-39.pyc ADDED
Binary file (645 Bytes). View file
 
sociofillmore/crashes/generate_templates.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import json
3
+ import os
4
+
5
# Fixed seed so that the generated corpus is reproducible.
random.seed(2021)


# Target number of distinct sentences to generate.
NUM_SENTENCES = 100_000
# Generation stops early after this many consecutive duplicate sentences.
NUM_FAILS = 25


# Sentence types, named after the number of parties in the main clause.
SENT_TYPES = ("0_PTY", "1_PTY", "2_PTY")
SENT_1_PTY_TYPES = ("VICTIM", "OUTCOME", "DRIVE")
SENT_ACTIVE_TYPES = ("ACTIVE", "NON_ACTIVE")

# Sentence templates; `[[NAME]]` slots are filled in by `make_sentence`.
SENTS_0_PTY_OUTCOME = ("[[OUTCOME]] [[CIRCUMSTANCE]] [[PLACE]]",
                       "[[OUTCOME]] [[CIRCUMSTANCE]] [[TIME]]", "[[OUTCOME]] [[CIRCUMSTANCE]]")
SENTS_1_PTY_VICTIM = ("[[SUBJECT]] [[VERB_V2]] [[PLACE]]",
                      "[[SUBJECT]] [[TIME]] [[VERB_V2]]", "[[SUBJECT]] [[VERB_V2]]")
SENTS_1_PTY_OUTCOME = ("[[SUBJECT]] [[OUTCOME]] [[PLACE]] [[CIRCUMSTANCE]]",
                       "[[SUBJECT]] [[OUTCOME]] [[CIRCUMSTANCE]]")
SENTS_1_PTY_DRIVE = ("[[SUBJECT]] [[VP_DRIVE]] [[PLACE]]",
                     "[[SUBJECT]] [[VP_DRIVE]]")
SENTS_2_PTYS = ("[[SUBJECT]] [[VERB_V2]] [[VERB_P]] [[OTHER]] [[VERB_REST]] [[PLACE]]",
                "[[SUBJECT]] [[VERB_V2]] [[TIME]] [[VERB_P]] [[OTHER]] [[VERB_REST]]", "[[SUBJECT]] [[VERB_V2]] [[VERB_P]] [[OTHER]] [[VERB_REST]]")

# Slot fillers (Dutch headline fragments).
PLACES = ("op stationsplein", "in stadscentrum", "op kruispunt Westerhaven", "op A27", "op A10", "in Lelystad",
          "in Assen", "in Amsterdam", "bij Renkum", "in Schilderswijk", "bij knooppunt Lunetten", "op zuidelijke ringweg",
          "in de buurt van de Erasmusbrug", "op schoolplein Stedelijk Gymnasium", "bij afrit Rotterdam-Noord", "op Kanaleneiland")
TIMES = ("tijdens avondspits", "vrijdagavond",
         "dinsdagochtend", "donderdagnacht", "rond middaguur")
CIRCUMSTANCES = ("na ongeluk", "na aanrijding", "na botsing", "na crash")
# Circumstances that themselves mention a (human) agent.
CIRCUMSTANCES_AGT = (", dader ervandoor", ", dader ervandoor", ", dader rijdt door", ", bestuurder rijdt door")

OUTCOME_0_TYPES = ("TRAFFIC", "HUMAN")
OUTCOMES_0_TRAFFIC = ("verkeersopstopping", "file", "veel vertraging")
OUTCOMES_0_HUMAN = ("dode", "zwaargewonde", "gewonde", "drie gewonden")
OUTCOMES_1 = ("dood", "overleden", "zwaargewond", "lichtgewond", "ongedeerd")

# was ("WEAK_PTY", "DRIVER", "VERHICLE"): spelling of the vehicle type fixed
SUBJECT_TYPES = ("WEAK_PTY", "DRIVER", "VEHICLE")

VPS_DRIVE_ACTIVE = ("rijdt tegen boom", "veroorzaakt ongeluk")
VPS_DRIVE_NON_ACTIVE = ("verongelukt", "gecrasht", "uit de bocht gevlogen", "raakt gewond", "raakt gewond door klap")
EVENT_VERBS_1_VICTIM = ("aangereden", "geschept", "raakt gewond", "raakt gewond door klap")
# Two-party verbs are encoded as "VERB_V2|VERB_P|VERB_REST"; "_" marks an
# empty slot (removed again by `make_sentence`).
EVENT_VERBS_2_ACTIVE_ANY = ("raakt|_|_", "botst|op|_", "botst|tegen|_")
EVENT_VERBS_2_ACTIVE_DRIVE = ("rijdt|_|aan", "rijdt|_|dood", "schept|_|_")
EVENT_VERBS_2_NON_ACTIVE_DRIVER = (
    "aangereden|door|_", "geschept|door|_")
EVENT_VERBS_2_NON_ACTIVE_VEHICLE = (
    "aangereden|door|_", "geschept|door|_", "komt|onder|_")
EVENT_VERBS_2_NON_ACTIVE_ANY = (
    "geraakt|door|_",)


# Noun phrases; "[[PERSON]]" is replaced by a person noun at generation time.
WEAK_PTY_NPS = ("fietser", "skateboarder", "wielrenner", "rolschaatser", "jogger", "voetganger", "motorrijder",
                "fietskoerier", "[[PERSON]] op fiets", "[[PERSON]] op e-bike")
ANY_PERSON_NPS = ("vrouw", "man", "meisje", "jongen",
                  "bejaarde vrouw", "bejaarde man", "Duitser", "toerist")
CYCLIST_PERSON_NPS = ("postbode", "maaltijdbezorger", "politieagent")
DRIVER_NPS = ("automobilist", "automobiliste", "bestuurder", "dronken automobilist", "dronken bestuurder", "motorrijder",
              "minderjarige bestuurder", "trucker", "taxichauffeur", "[[PERSON]] in auto", "dronken [[PERSON]] in auto")
VEHICLE_NPS = ("auto", "personenauto", "vrachtwagen", "tractor", "auto met caravan", "scooter", "motor",
               "tram", "stadsbus", "lijn 10", "touringcar", "camper", "vorkheftruck")
65
+
66
+
67
def generate_weak_pty():
    """Return a random noun phrase for a 'weak' party.

    A phrase is drawn from ``WEAK_PTY_NPS``; when it contains a
    ``[[PERSON]]`` slot, the slot is filled with a random entry from
    ``ANY_PERSON_NPS + CYCLIST_PERSON_NPS``.
    """
    phrase = random.choice(WEAK_PTY_NPS)
    if "[[PERSON]]" not in phrase:
        return phrase
    filler = random.choice(ANY_PERSON_NPS + CYCLIST_PERSON_NPS)
    return phrase.replace("[[PERSON]]", filler)
74
+
75
+
76
def generate_driver():
    """Return a random noun phrase for a driver.

    A phrase is drawn from ``DRIVER_NPS``; when it contains a ``[[PERSON]]``
    slot, the slot is filled with a random entry from ``ANY_PERSON_NPS``.
    """
    phrase = random.choice(DRIVER_NPS)
    if "[[PERSON]]" not in phrase:
        return phrase
    filler = random.choice(ANY_PERSON_NPS)
    return phrase.replace("[[PERSON]]", filler)
83
+
84
+
85
def make_sentence(template, fields):
    """Fill a ``[[FIELD]]`` template and tidy the result into a headline.

    Args:
        template: string with ``[[NAME]]`` placeholders.
        fields: mapping from placeholder name to replacement text; names that
            do not occur in the template are ignored.

    Returns:
        The filled-in sentence with every ``_`` removed, runs of whitespace
        collapsed to single spaces, surrounding whitespace stripped and the
        first character upper-cased.  An empty result is returned as ``""``.
    """
    sentence = template
    for field, value in fields.items():
        sentence = sentence.replace(f"[[{field}]]", value)

    # "_" is the empty-slot marker used in the verb patterns; removing it
    # leaves doubled spaces behind, so all whitespace runs are collapsed
    sentence = " ".join(sentence.replace("_", "").split())

    if not sentence:
        # nothing left to capitalise (sentence[0] would raise IndexError)
        return sentence
    return sentence[0].upper() + sentence[1:]
92
+
93
+
94
def main():
    """Generate labelled synthetic crash headlines and write them as JSON lines.

    Sentences are sampled from the module-level templates until either
    ``NUM_SENTENCES`` distinct sentences exist or ``NUM_FAILS`` consecutive
    samples were duplicates.  Every sentence gets a label dict:

    * ``party_mentioned``: number of parties mentioned in the sentence,
    * ``party_human``: number of those that are referred to as humans,
    * ``active``: whether the main verb phrase is an active construction.

    The result is written to
    ``output/crashes/generate_templates/sentences.jsonl``.
    """
    sentences = {}

    dup_fails = 0
    while len(sentences) < NUM_SENTENCES and dup_fails < NUM_FAILS:
        fields = {}

        label = {"party_mentioned": 0, "party_human": 0, "active": False}

        fields["TIME"] = random.choice(TIMES)
        fields["PLACE"] = random.choice(PLACES)

        # True when the sampled CIRCUMSTANCE phrase names a (human) agent.
        # The agent is only counted once we know that the chosen template
        # really contains a [[CIRCUMSTANCE]] slot; several 1-party templates
        # do not, and counting unconditionally mislabels those sentences.
        agent_in_circumstance = False

        sent_type = random.choice(SENT_TYPES)
        if sent_type == "0_PTY":
            if random.random() < 0.5:
                fields["CIRCUMSTANCE"] = random.choice(CIRCUMSTANCES)
            else:
                fields["CIRCUMSTANCE"] = random.choice(CIRCUMSTANCES_AGT)
                agent_in_circumstance = True

            outcome_type = random.choice(OUTCOME_0_TYPES)
            if outcome_type == "TRAFFIC":
                fields["OUTCOME"] = random.choice(OUTCOMES_0_TRAFFIC)
            else:
                fields["OUTCOME"] = random.choice(OUTCOMES_0_HUMAN)
                label["party_mentioned"] += 1
                label["party_human"] += 1
            template = random.choice(SENTS_0_PTY_OUTCOME)

        elif sent_type == "1_PTY":
            if random.random() < 0.5:
                fields["CIRCUMSTANCE"] = random.choice(CIRCUMSTANCES)
            else:
                fields["CIRCUMSTANCE"] = random.choice(CIRCUMSTANCES_AGT)
                agent_in_circumstance = True

            sent_subtype = random.choice(SENT_1_PTY_TYPES)
            if sent_subtype == "VICTIM":
                label["party_mentioned"] += 1
                label["party_human"] += 1
                fields["SUBJECT"] = generate_weak_pty()
                fields["VERB_V2"] = random.choice(EVENT_VERBS_1_VICTIM)
                template = random.choice(SENTS_1_PTY_VICTIM)
            elif sent_subtype == "OUTCOME":
                subject_type = random.choice(["WEAK_PTY", "DRIVER"])
                fields["OUTCOME"] = random.choice(OUTCOMES_1)
                if subject_type == "WEAK_PTY":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["SUBJECT"] = generate_weak_pty()
                else:  # driver
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["SUBJECT"] = generate_driver()
                template = random.choice(SENTS_1_PTY_OUTCOME)
            else:  # drive
                subject_type = random.choice(["DRIVER", "VEHICLE"])
                active_type = random.choice(SENT_ACTIVE_TYPES)
                if active_type == "ACTIVE":
                    fields["VP_DRIVE"] = random.choice(VPS_DRIVE_ACTIVE)
                    label["active"] = True
                else:
                    fields["VP_DRIVE"] = random.choice(VPS_DRIVE_NON_ACTIVE)
                if subject_type == "DRIVER":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["SUBJECT"] = generate_driver()
                else:  # vehicle
                    label["party_mentioned"] += 1
                    fields["SUBJECT"] = random.choice(VEHICLE_NPS)
                template = random.choice(SENTS_1_PTY_DRIVE)

        else:  # 2 pty
            active_type = random.choice(SENT_ACTIVE_TYPES)
            if active_type == "ACTIVE":
                subject_type = random.choice(["WEAK_PTY", "DRIVER", "VEHICLE"])
                label["active"] = True

                if subject_type == "WEAK_PTY":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = random.choice(
                        EVENT_VERBS_2_ACTIVE_ANY).split("|")
                    fields["SUBJECT"] = generate_weak_pty()
                    other_type = random.choice(["WEAK_PTY", "VEHICLE"])
                elif subject_type == "DRIVER":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["SUBJECT"] = generate_driver()
                    if random.random() < 0.5:
                        fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = random.choice(
                            EVENT_VERBS_2_ACTIVE_ANY).split("|")
                        other_type = random.choice(["WEAK_PTY", "VEHICLE"])
                    else:
                        fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = random.choice(
                            EVENT_VERBS_2_ACTIVE_DRIVE).split("|")
                        other_type = "WEAK_PTY"
                else:  # vehicle
                    label["party_mentioned"] += 1
                    verb = random.choice(
                        EVENT_VERBS_2_ACTIVE_ANY + EVENT_VERBS_2_ACTIVE_DRIVE)
                    fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = verb.split("|")
                    fields["SUBJECT"] = random.choice(VEHICLE_NPS)
                    # `other_type` used to stay unassigned on this path
                    # (UnboundLocalError on the first hit, a stale value from
                    # an earlier iteration afterwards).  The choice mirrors
                    # the DRIVER branch: "drive" verbs take a weak party as
                    # object, the generic verbs take either.
                    # NOTE(review): confirm this is the intended distribution.
                    if verb in EVENT_VERBS_2_ACTIVE_DRIVE:
                        other_type = "WEAK_PTY"
                    else:
                        other_type = random.choice(["WEAK_PTY", "VEHICLE"])

                if other_type == "WEAK_PTY":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["OTHER"] = generate_weak_pty()
                elif other_type == "DRIVER":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["OTHER"] = generate_driver()
                else:  # vehicle
                    label["party_mentioned"] += 1
                    fields["OTHER"] = random.choice(VEHICLE_NPS)

            else:  # non-active
                other_type = random.choice(["WEAK_PTY", "DRIVER", "VEHICLE"])
                if other_type == "WEAK_PTY":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["OTHER"] = generate_weak_pty()
                    fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = random.choice(
                        EVENT_VERBS_2_NON_ACTIVE_ANY).split("|")
                    subject_type = random.choice(["WEAK_PTY", "DRIVER", "VEHICLE"])

                elif other_type == "DRIVER":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["OTHER"] = generate_driver()
                    if random.random() < 0.5:
                        fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = random.choice(
                            EVENT_VERBS_2_NON_ACTIVE_ANY).split("|")
                        subject_type = random.choice(["WEAK_PTY", "DRIVER", "VEHICLE"])
                    else:
                        fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = random.choice(
                            EVENT_VERBS_2_NON_ACTIVE_DRIVER).split("|")
                        # single-option choice kept on purpose: it still
                        # advances the seeded random stream
                        subject_type = random.choice(["WEAK_PTY"])

                else:  # "vehicle"
                    label["party_mentioned"] += 1
                    fields["OTHER"] = random.choice(VEHICLE_NPS)
                    if random.random() < 0.5:
                        fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = random.choice(
                            EVENT_VERBS_2_NON_ACTIVE_ANY).split("|")
                        subject_type = random.choice(["WEAK_PTY", "DRIVER", "VEHICLE"])
                    else:
                        fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = random.choice(
                            EVENT_VERBS_2_NON_ACTIVE_VEHICLE).split("|")
                        subject_type = random.choice(["WEAK_PTY"])

                if subject_type == "WEAK_PTY":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["SUBJECT"] = generate_weak_pty()
                elif subject_type == "DRIVER":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["SUBJECT"] = generate_driver()
                else:  # vehicle
                    label["party_mentioned"] += 1
                    fields["SUBJECT"] = random.choice(VEHICLE_NPS)

            template = random.choice(SENTS_2_PTYS)

        # count the hit-and-run agent only if it actually ends up in the text
        if agent_in_circumstance and "[[CIRCUMSTANCE]]" in template:
            label["party_mentioned"] += 1
            label["party_human"] += 1

        sentence = make_sentence(template, fields)

        if sentence not in sentences:
            sentences[sentence] = label
            dup_fails = 0
        else:
            dup_fails += 1

    out_path = "output/crashes/generate_templates/sentences.jsonl"
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f_out:
        for sentence, label in sentences.items():
            # "\n" rather than os.linesep: text mode already translates the
            # newline, os.linesep would become "\r\r\n" on Windows
            f_out.write(json.dumps({"sentence": sentence, "label": label}) + "\n")
        # NOTE(review): the original indentation of this write was lost; it
        # is assumed to emit one trailing blank line after all records.
        f_out.write("\n")


if __name__ == "__main__":
    main()
sociofillmore/crashes/make_bechdel_dicts.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+ import json
4
+
5
+ sys.path.append("./libs")
6
+ from OpenDutchWordnet import Wn_grid_parser
7
+
8
+
9
def find_all_le_hyponyms(instance, le_id):
    """Collect the ids of all lexical entries at or below a lexical entry.

    The result is the union of (a) every lexical entry sharing the synset of
    ``le_id`` and (b) every lexical entry found by
    ``find_all_synset_hyponyms`` starting from that synset.  Progress is
    printed to stdout.

    Args:
        instance: wordnet parser object providing ``les_find_le``,
            ``les_all_les_of_one_synset`` and ``synsets_find_synset``.
        le_id: id of the lexical entry to start from.

    Returns:
        A set of lexical-entry ids.
    """
    print(f"Starting from `{le_id}`...")
    entry = instance.les_find_le(le_id)
    synset_id = entry.get_synset_id()

    same_synset = set()
    for le in instance.les_all_les_of_one_synset(synset_id):
        same_synset.add(le.get_id())
    print(f"Siblings: {same_synset}")

    top_synset = instance.synsets_find_synset(synset_id)
    print(f"Top-level synset: `{entry.get_synset_id()}`...")

    below = find_all_synset_hyponyms(instance, top_synset)
    return same_synset | set(below)
19
+
20
+
21
def find_all_synset_hyponyms(instance, synset_el, _visited=None):
    """Recursively collect lexical-entry ids of all hyponym synsets.

    Follows ``has_hyponym`` relations from ``synset_el`` downwards and gathers
    the ids of every lexical entry belonging to a synset reached that way
    (the entries of ``synset_el`` itself are not included unless it is
    reachable from one of its own hyponyms).  Progress is printed to stdout.

    Args:
        instance: wordnet parser object providing
            ``les_all_les_of_one_synset`` and ``synsets_find_synset``.
        synset_el: synset element to start from.
        _visited: internal bookkeeping for the recursion (set of synset ids
            that were already expanded); callers should leave it unset.
            Assumes synset ids are hashable — presumably strings.

    Returns:
        A set of lexical-entry ids.
    """
    if _visited is None:
        _visited = set()

    print(f"Finding hyponyms of synset with gloss: `{synset_el.get_glosses()[:1]}`...")
    hypo_les = set()
    for rel in synset_el.get_relations("has_hyponym"):
        hypo_ss = rel.get_target()
        print(hypo_ss)

        # Expand every synset only once: this terminates on cyclic relations
        # and avoids re-walking subtrees shared by several parents.
        if hypo_ss in _visited:
            continue
        _visited.add(hypo_ss)

        ss_les = {le.get_id() for le in instance.les_all_les_of_one_synset(hypo_ss)}
        for i in ss_les:
            print(f"\tfound LE: {i}")
        hypo_les.update(ss_les)
        hypo_les.update(
            find_all_synset_hyponyms(instance, instance.synsets_find_synset(hypo_ss), _visited))
    return hypo_les
34
+
35
+
36
def find_siblings_and_hyperonym(instance, le_id):
    """Print the synset siblings of a lexical entry and its first hyperonym.

    Debugging helper: nothing is returned.  Printed are (1) the ids of all
    lexical entries in the synset of ``le_id``, (2) the glosses of the first
    ``has_hyperonym`` target of that synset and (3) the ids of the lexical
    entries in that hyperonym synset.  When the synset has no hyperonym
    relation, a short notice is printed instead of (2) and (3).

    Args:
        instance: wordnet parser object providing ``les_find_le``,
            ``les_all_les_of_one_synset`` and ``synsets_find_synset``.
        le_id: id of the lexical entry to inspect.
    """
    le_el = instance.les_find_le(le_id)
    le_ss = le_el.get_synset_id()
    siblings = {le.get_id() for le in instance.les_all_les_of_one_synset(le_ss)}
    print(siblings)

    synset_el = instance.synsets_find_synset(le_ss)
    hyper_rels = synset_el.get_relations("has_hyperonym")
    if not hyper_rels:
        # a synset without hyperonym used to fail with IndexError on `[0]`
        print(f"No hyperonym found for synset `{le_ss}`")
        return

    hyper = hyper_rels[0]
    hyper_ss = instance.synsets_find_synset(hyper.get_target())
    print(hyper_ss.get_glosses())
    print({le.get_id() for le in instance.les_all_les_of_one_synset(hyper.get_target())})
46
+
47
+
48
def main():
    """Build the vehicle/person word lists and store them as JSON.

    The lists combine wordnet hyponyms (keys prefixed with ``WN:``) with a few
    hand-picked extra words.  When
    ``output/crashes/predict_bechdel/lexical_dicts_ignore.json`` exists, every
    word listed there under the same category/subcategory is dropped.  The
    result is written to ``output/crashes/predict_bechdel/lexical_dicts.json``.
    """
    instance = Wn_grid_parser(Wn_grid_parser.odwn)

    def hyponyms_of(le_id):
        # sorted list of all lexical entries at or below `le_id`
        return sorted(find_all_le_hyponyms(instance, le_id))

    # find_all_le_hyponyms(instance, "slachtoffer-n-4")
    vehicles = {
        "WN:cars": hyponyms_of("automobiel-n-1"),
        "WN:motorbikes": hyponyms_of("motorfiets-n-1"),
        "WN:bikes": hyponyms_of("fiets-n-1"),
        "WN:buses": hyponyms_of("autobus-n-1"),
        "extra": sorted(["scootmobiel", "e-bike"]),
    }
    persons = {
        "WN:driver": hyponyms_of("bestuurder-n-2"),
        "WN:cyclist": hyponyms_of("fietser-n-1"),
        "WN:walker": hyponyms_of("loper-n-4"),
        "WN:pedestrian": hyponyms_of("voetganger-n-1"),
        "WN:victim": hyponyms_of("slachtoffer-n-4"),
        "extra": sorted(
            ["man", "vrouw", "jongen", "meisje", "persoon", "bejaarde", "maaltijdbezorger"]
        ),
    }
    dicts = {"vehicles": vehicles, "persons": persons}

    ignore_file = "output/crashes/predict_bechdel/lexical_dicts_ignore.json"
    if not os.path.isfile(ignore_file):
        cleaned_dicts = dicts
    else:
        with open(ignore_file, encoding="utf-8") as f_ign:
            ignore = json.load(f_ign)

        cleaned_dicts = {}
        for category, subcats in dicts.items():
            kept = {}
            for subcat, words in subcats.items():
                banned = ignore.get(category, {}).get(subcat, [])
                kept[subcat] = [word for word in words if word not in banned]
            cleaned_dicts[category] = kept

    with open("output/crashes/predict_bechdel/lexical_dicts.json", "w", encoding="utf-8") as f_out:
        json.dump(cleaned_dicts, f_out, indent=4)


if __name__ == "__main__":
    main()
sociofillmore/crashes/predict_bechdel.py ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ from typing import Dict, Iterable, List, Optional, Tuple
4
+ import json
5
+ import random
6
+ import argparse
7
+ from allennlp.data.fields.field import Field
8
+ from allennlp.data.fields.sequence_field import SequenceField
9
+ from allennlp.models.model import Model
10
+ from allennlp.nn.util import get_text_field_mask
11
+ from allennlp.predictors.predictor import Predictor
12
+
13
+ import pandas as pd
14
+ import spacy
15
+ import torch
16
+ from sklearn.preprocessing import MultiLabelBinarizer
17
+
18
+ from allennlp.common.util import pad_sequence_to_length
19
+ from allennlp.data import TextFieldTensors
20
+ from allennlp.data.vocabulary import Vocabulary
21
+ from allennlp.data import DatasetReader, TokenIndexer, Instance, Token
22
+ from allennlp.data.fields import TextField, LabelField
23
+ from allennlp.data.token_indexers.pretrained_transformer_indexer import (
24
+ PretrainedTransformerIndexer,
25
+ )
26
+ from allennlp.data.tokenizers.pretrained_transformer_tokenizer import (
27
+ PretrainedTransformerTokenizer,
28
+ )
29
+ from allennlp.models import BasicClassifier
30
+ from allennlp.modules.text_field_embedders.basic_text_field_embedder import (
31
+ BasicTextFieldEmbedder,
32
+ )
33
+ from allennlp.modules.token_embedders.pretrained_transformer_embedder import (
34
+ PretrainedTransformerEmbedder,
35
+ )
36
+ from allennlp.modules.seq2vec_encoders.bert_pooler import BertPooler
37
+ from allennlp.modules.seq2vec_encoders.cls_pooler import ClsPooler
38
+ from allennlp.training.checkpointer import Checkpointer
39
+ from allennlp.training.gradient_descent_trainer import GradientDescentTrainer
40
+ from allennlp.data.data_loaders.simple_data_loader import SimpleDataLoader
41
+ from allennlp.training.optimizers import AdamOptimizer
42
+ from allennlp.predictors.text_classifier import TextClassifierPredictor
43
+ from allennlp.training.callbacks.tensorboard import TensorBoardCallback
44
+ from torch import nn
45
+ from torch.nn.functional import binary_cross_entropy_with_logits
46
+
47
+
48
+ random.seed(1986)
49
+
50
+
51
+ SEQ_LABELS = ["humansMentioned", "vehiclesMentioned", "eventVerb", "activeEventVerb"]
52
+
53
+
54
+ # adapted from bert-for-framenet project
55
class SequenceMultiLabelField(Field):
    """Field holding a (possibly empty) list of labels for every sequence position."""

    def __init__(self,
                 labels: List[List[str]],
                 sequence_field: SequenceField,
                 binarizer: MultiLabelBinarizer,
                 label_namespace: str
                 ):
        self.labels = labels
        # filled in by `index`
        self._indexed_labels = None
        self._label_namespace = label_namespace
        self.sequence_field = sequence_field
        self.binarizer = binarizer

    @staticmethod
    def retokenize_tags(tags: List[List[str]],
                        offsets: List[Tuple[int, int]],
                        wp_primary_token: str = "last",
                        wp_secondary_tokens: str = "empty",
                        empty_value=lambda: []
                        ) -> List[List[str]]:
        """Spread word-level tags over wordpieces.

        ``offsets[i]`` is the ``(start, end)`` wordpiece span of word ``i``.
        A word covers ``end - start`` "extra" pieces plus one primary piece,
        which is the first or the last one depending on ``wp_primary_token``.
        The primary piece receives the word's tag; the extra pieces receive
        the same tag when ``wp_secondary_tokens == "repeat"`` and
        ``empty_value()`` otherwise.  One ``empty_value()`` is put in front
        and one at the end (for ``[CLS]`` and ``[SEP]``).
        """
        per_piece = [empty_value()]  # [CLS]

        for word_idx, span in enumerate(offsets):
            off_start, off_end = span
            word_tag = tags[word_idx]

            # e.g. "hello" --> "he" + "##ll" + "##o" --> 2 extra tokens
            extra = off_end - off_start

            if wp_primary_token == "first":
                per_piece.append(word_tag)

            filler = word_tag if wp_secondary_tokens == "repeat" else empty_value()
            per_piece.extend(extra * [filler])

            if wp_primary_token == "last":
                per_piece.append(word_tag)

        per_piece.append(empty_value())  # [SEP]
        return per_piece

    def count_vocab_items(self, counter: Dict[str, Dict[str, int]]):
        """Add one count per label occurrence to ``counter`` (in this field's namespace)."""
        namespace_counts = counter[self._label_namespace]
        for position_labels in self.labels:
            for name in position_labels:
                namespace_counts[name] += 1

    def get_padding_lengths(self) -> Dict[str, int]:
        """Report the length of the underlying sequence field as ``num_tokens``."""
        length = self.sequence_field.sequence_length()
        return {"num_tokens": length}

    def index(self, vocab: Vocabulary):
        """Convert label strings to vocabulary indices; labels raising ``KeyError`` are skipped."""
        converted: List[List[int]] = []
        for position_labels in self.labels:
            position_ids = []
            for label in position_labels:
                try:
                    label_id = vocab.get_token_index(label, self._label_namespace)
                except KeyError:
                    print(f"[WARNING] Ignore unknown label {label}")
                else:
                    position_ids.append(label_id)
            converted.append(position_ids)
        self._indexed_labels = converted

    def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor:
        """Return the binarized labels as a float tensor, padded to ``num_tokens`` rows."""
        # NOTE(review): the binarizer receives vocabulary *indices* here, while
        # `train_and_eval` fits it on the label *strings* in SEQ_LABELS —
        # confirm that both sides use the same label representation.
        rows = self.binarizer.transform(self._indexed_labels).tolist()

        def empty_row():
            # binarized form of "no labels at all"
            return list(self.binarizer.transform([[]])[0])

        target_len = padding_lengths["num_tokens"]
        padded = pad_sequence_to_length(rows, target_len, default_value=empty_row)
        return torch.tensor(padded, dtype=torch.float)

    def empty_field(self) -> 'Field':
        """Return a field without labels that shares binarizer and namespace."""
        blank = SequenceMultiLabelField(
            [], self.sequence_field.empty_field(), self.binarizer, self._label_namespace)
        blank._indexed_labels = []
        return blank
141
+
142
+
143
+ # adapted from bert-for-framenet project
144
class MultiSequenceLabelModel(Model):
    """Token-level multi-label tagger: embedder followed by a two-layer MLP.

    Every token embedding is mapped to ``decoder_output_size`` independent
    logits (one per label); training uses a masked binary cross-entropy loss.
    """

    def __init__(self, embedder: PretrainedTransformerEmbedder, decoder_output_size: int, hidden_size: int, vocab: Vocabulary, embedding_size: int = 768):
        """
        Args:
            embedder: module turning token ids into embeddings.
            decoder_output_size: number of labels predicted per token.
            hidden_size: width of the hidden layer of the MLP.
            vocab: vocabulary handed to the AllenNLP ``Model`` base class.
            embedding_size: dimensionality of the embedder output.
        """
        super().__init__(vocab)
        self.embedder = embedder
        self.out_features = decoder_output_size
        self.hidden_size = hidden_size
        self.layers = nn.Sequential(
            nn.Linear(in_features=embedding_size,
                      out_features=self.hidden_size),
            nn.ReLU(),
            nn.Linear(in_features=self.hidden_size,
                      out_features=self.out_features)
        )

    def forward(self, tokens: TextFieldTensors, label: Optional[torch.FloatTensor] = None) -> Dict[str, torch.Tensor]:
        """Compute per-token label logits and, when gold labels are given, the loss.

        Returns:
            A dict with ``"tag_logits"`` and — only if ``label`` is passed —
            ``"loss"``.
        """
        # NOTE(review): `tokens["token_ids"]` assumes the ids sit at the top
        # level of `tokens`; confirm against the embedder/indexer that is
        # actually passed in.
        embeddings = self.embedder(tokens["token_ids"])
        mask = get_text_field_mask(tokens).float()
        tag_logits = self.layers(embeddings)
        # expand the (batch, tokens) mask to one weight per predicted label
        mask = mask.reshape(mask.shape[0], mask.shape[1], 1).repeat(1, 1, self.out_features)
        output = {"tag_logits": tag_logits}
        if label is not None:
            # the mask is used as element weight, so padding does not
            # contribute to the loss
            loss = binary_cross_entropy_with_logits(tag_logits, label, mask)
            output["loss"] = loss
        # the output dict was built but never returned before
        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """No metrics are tracked; ``reset`` is accepted to match the base-class signature."""
        return {}

    def make_human_readable(self,
                            prediction,
                            label_namespace,
                            threshold=0.2,
                            sigmoid=True
                            ) -> List[str]:
        """Render a 2-D score tensor as one ``label:score|label:score`` string per row.

        Args:
            prediction: tensor with one row per token and one column per label.
            label_namespace: vocabulary namespace used to look up label names
                from column indices.
            threshold: a label is reported when its score is above this value.
            sigmoid: apply a sigmoid to ``prediction`` first (i.e. the input
                holds logits).

        Returns:
            One string per row; rows without any label above the threshold
            are rendered as ``"_"``.
        """
        if sigmoid:
            prediction = torch.sigmoid(prediction)

        predicted_labels: List[List[str]] = [[] for _ in range(len(prediction))]

        # collect every (row, label) cell whose score exceeds the threshold
        for coord in torch.nonzero(prediction > threshold):
            row, col = int(coord[0]), int(coord[1])
            label = self.vocab.get_token_from_index(col, label_namespace)
            predicted_labels[row].append(f"{label}:{prediction[row, col]:.3f}")

        return ["|".join(label_list) or "_" for label_list in predicted_labels]
193
+
194
+
195
class TrafficBechdelReader(DatasetReader):
    """Reads sentences annotated with per-token label lists.

    Each non-empty input line holds one sentence as comma-separated
    ``text:[label|label|...]`` parts, optionally wrapped in one pair of
    square brackets.  NOTE(review): the format is inferred from the parsing
    code below — confirm against the producer of the input file.
    """

    def __init__(self, token_indexers, tokenizer, binarizer):
        self.token_indexers = token_indexers
        self.tokenizer: PretrainedTransformerTokenizer = tokenizer
        self.binarizer = binarizer
        # raw (tokens, labels) of the most recent `_read`, in file order
        self.orig_data = []
        super().__init__()

    def _read(self, file_path) -> Iterable[Instance]:
        """Yield one instance per non-empty line of ``file_path``."""
        self.orig_data.clear()

        with open(file_path, encoding="utf-8") as f:
            for line in f:
                # drop the newline first: otherwise the closing bracket is
                # never at the end of the string and cannot be removed
                line = line.strip()

                # skip any empty lines
                if not line:
                    continue

                # remove exactly ONE enclosing bracket pair; lstrip/rstrip
                # would also eat the brackets of the first/last label list
                if line.startswith("[") and line.endswith("]]"):
                    line = line[1:-1]

                token_txts = []
                token_mlabels = []

                for sp in line.split(","):
                    # split at the last colon so that the text may contain ":"
                    sp_txt, sp_lbl_str = sp.rsplit(":", 1)
                    sp_lbl_str = sp_lbl_str.strip()
                    if sp_lbl_str.startswith("[") and sp_lbl_str.endswith("]"):
                        sp_lbl_str = sp_lbl_str[1:-1]
                    # "[]" ends up as "" here and yields an empty label list
                    sp_lbls = [lbl for lbl in sp_lbl_str.split("|") if lbl]

                    # if the text is a WordNet entry id ("word-n-1"), keep only the word
                    wn_match = re.match(r"^(.+)-n-\d+$", sp_txt)
                    if wn_match:
                        sp_txt = wn_match.group(1)

                    # multi-token text: every token gets the labels of its part
                    for tok in sp_txt.split():
                        token_txts.append(tok)
                        token_mlabels.append(sp_lbls)

                self.orig_data.append({
                    "sentence": token_txts,
                    "labels": token_mlabels,
                })
                yield self.text_to_instance(token_txts, token_mlabels)

    def text_to_instance(self, sentence: List[str], labels: Optional[List[List[str]]] = None) -> Instance:
        """Build an instance with a ``tokens`` field and, if given, a ``label`` field.

        Args:
            sentence: word-level tokens.
            labels: one label list per word; they are re-aligned to the
                wordpieces produced by the tokenizer.
        """
        tokens, offsets = self.tokenizer.intra_word_tokenize(sentence)

        text_field = TextField(tokens, self.token_indexers)
        fields = {"tokens": text_field}
        if labels is not None:
            labels_ = SequenceMultiLabelField.retokenize_tags(labels, offsets)
            label_field = SequenceMultiLabelField(labels_, text_field, self.binarizer, "labels")
            fields["label"] = label_field
        return Instance(fields)
251
+
252
+
253
def count_parties(sentence, lexical_dicts, nlp):
    """Count tokens of a sentence whose lemma is a known person or vehicle word.

    Args:
        sentence: raw sentence; it is lower-cased before analysis.
        lexical_dicts: mapping with the categories ``"persons"`` and
            ``"vehicles"``, each mapping subcategory names to word lists.
            Words of subcategories starting with ``"WN:"`` are wordnet entry
            ids (``word-n-1``) and are reduced to the bare word.
        nlp: callable returning an iterable of tokens with a ``lemma_``
            attribute (e.g. a spaCy pipeline).

    Returns:
        Tuple ``(num_humans, num_vehicles)``.
    """
    wn_entry = re.compile(r"^(.+)-n-\d+$")

    def category_words(category):
        # Build the lookup set once per call instead of re-parsing every
        # word list for every token.
        words = set()
        for subcategory, entries in lexical_dicts[category].items():
            if subcategory.startswith("WN:"):
                for entry in entries:
                    match = wn_entry.match(entry)
                    # entries that are not `-n-<k>` noun ids are kept as-is
                    # (they used to crash with AttributeError)
                    words.add(match.group(1) if match else entry)
            else:
                words.update(entries)
        return words

    person_words = category_words("persons")
    vehicle_words = category_words("vehicles")

    num_humans = 0
    num_vehicles = 0
    for token in nlp(sentence.lower()):
        lemma = token.lemma_
        if lemma in person_words:
            num_humans += 1
        if lemma in vehicle_words:
            num_vehicles += 1

    return num_humans, num_vehicles
275
+
276
+
277
def predict_rule_based(annotations="data/crashes/bechdel_annotations_dev_first_25.csv"):
    """Print lexicon-based human/vehicle counts for every annotated sentence.

    Args:
        annotations: CSV file with a ``sentence`` column.
    """
    data_crashes = pd.read_csv(annotations)

    lexicon_path = "output/crashes/predict_bechdel/lexical_dicts.json"
    with open(lexicon_path, encoding="utf-8") as f:
        lexical_dicts = json.load(f)

    nlp = spacy.load("nl_core_news_md")

    for sentence in data_crashes["sentence"]:
        counts = count_parties(sentence, lexical_dicts, nlp)
        num_humans, num_vehicles = counts
        print(sentence)
        print(f"\thumans={num_humans}, vehicles={num_vehicles}")
289
+
290
+
291
def evaluate_crashes(predictor, attrib, annotations="data/crashes/bechdel_annotations_dev_first_25.csv", out_file="output/crashes/predict_bechdel/predictions_crashes25.csv"):
    """Score a predictor on annotated crash headlines and save its predictions.

    Args:
        predictor: object whose ``predict(sentence)`` returns a dict with a
            ``"label"`` string.
        attrib: ``"all"`` to evaluate the joint label
            (``party_mentioned=..|party_human=..|active=..``), otherwise one
            of ``"party_mentioned"``, ``"party_human"``, ``"active"``.
        annotations: CSV with the columns ``sentence``, ``mentioned``,
            ``as_human`` and ``active``.
        out_file: CSV path receiving sentence, gold label and prediction.

    Accuracies are printed to stdout.  The partial and per-attribute scores
    are only computed for ``attrib == "all"`` and stay 0 otherwise.
    """
    data_crashes = pd.read_csv(annotations)
    labels_crashes = [
        {
            "party_mentioned": str(row["mentioned"]),
            "party_human": str(row["as_human"]),
            "active": str(True) if str(row["active"]).lower() == "true" else str(False)
        }
        for _, row in data_crashes.iterrows()
    ]
    predictions_crashes = [predictor.predict(sentence) for sentence in data_crashes["sentence"]]

    crashes_out = []
    correct = 0
    partial_2_attrs = 0
    partial_1_attr = 0
    correct_mentions = 0
    correct_humans = 0
    correct_active = 0

    for sentence, label, prediction in zip(data_crashes["sentence"], labels_crashes, predictions_crashes):
        predicted = prediction["label"]
        if attrib == "all":
            gold = "|".join([f"{k}={v}" for k, v in label.items()])
        else:
            # was `label["attrib"]`: the literal key never exists
            gold = label[attrib]

        if gold == predicted:
            correct += 1

        if attrib == "all":
            gold_parts = gold.split("|")
            pred_parts = predicted.split("|")

            # number of attribute=value pairs shared by gold and prediction
            shared = len(set(gold_parts) & set(pred_parts))
            if shared >= 2:
                partial_2_attrs += 1
            if shared >= 1:
                partial_1_attr += 1

            # pad malformed (too short) predictions so that the positional
            # comparison below cannot raise IndexError
            pred_parts += [None] * (len(gold_parts) - len(pred_parts))
            if gold_parts[0] == pred_parts[0]:
                correct_mentions += 1
            if gold_parts[1] == pred_parts[1]:
                correct_humans += 1
            if gold_parts[2] == pred_parts[2]:
                correct_active += 1

        crashes_out.append(
            {"sentence": sentence, "gold": gold, "prediction": predicted})

    # avoid ZeroDivisionError for an empty annotation file
    total = len(data_crashes) or 1
    print("ACC_crashes (strict) = ", correct/total)
    print("ACC_crashes (partial:2) = ", partial_2_attrs/total)
    print("ACC_crashes (partial:1) = ", partial_1_attr/total)
    print("ACC_crashes (mentions) = ", correct_mentions/total)
    print("ACC_crashes (humans) = ", correct_humans/total)
    print("ACC_crashes (active) = ", correct_active/total)

    pd.DataFrame(crashes_out).to_csv(out_file)
350
+
351
+
352
def filter_events_for_bechdel():
    """Export the headlines of all events involving at most two transport modes.

    Reads ``data/crashes/thecrashes_data_all_text.json``, keeps the articles
    of every event whose persons use no more than two distinct
    ``transportationmode`` values and writes one row per article to
    ``output/crashes/predict_bechdel/filtered_headlines.csv``.  The total and
    the filtered article counts are printed.
    """
    with open("data/crashes/thecrashes_data_all_text.json", encoding="utf-8") as f:
        events = json.load(f)

    total_articles = 0
    data_out = []
    for ev in events:
        articles = ev["articles"]
        total_articles += len(articles)

        persons = ev["persons"]
        num_persons = len(persons)
        num_transport_modes = len({p["transportationmode"] for p in persons})

        if num_transport_modes > 2:
            continue

        data_out.extend(
            {"event_id": ev["id"], "article_id": art["id"], "headline": art["title"],
             "num_persons": num_persons, "num_transport_modes": num_transport_modes}
            for art in articles
        )

    print("Total articles = ", total_articles)

    print("Filtered articles: ", len(data_out))
    pd.DataFrame(data_out).to_csv("output/crashes/predict_bechdel/filtered_headlines.csv")
376
+
377
+
378
def train_and_eval(train=True):
    """Train (or load) the sequence-labelling model and write dev-set predictions.

    Args:
        train: when true, a new model is trained; otherwise a stored state
            dict is loaded.

    The data is read from ``output/prolog/bechdel_headlines.txt``, shuffled
    and split into train/dev/test (5% dev, 25% test, rest train).  Predictions
    for the dev part are written to
    ``output/crashes/predict_bechdel/predictions_dev.csv``.
    """

    # use_gpu = False
    use_gpu = True
    # None lets the trainer pick the GPU; -1 means CPU
    cuda_device = None if use_gpu and torch.cuda.is_available() else -1

    transformer = "GroNLP/bert-base-dutch-cased"
    # transformer = "xlm-roberta-large"
    token_indexers = {"tokens": PretrainedTransformerIndexer(transformer)}
    tokenizer = PretrainedTransformerTokenizer(transformer)

    binarizer = MultiLabelBinarizer()
    binarizer.fit([SEQ_LABELS])
    reader = TrafficBechdelReader(token_indexers, tokenizer, binarizer)
    instances = list(reader.read("output/prolog/bechdel_headlines.txt"))
    orig_data = reader.orig_data
    # shuffle instances and raw data together so that they stay aligned
    zipped = list(zip(instances, orig_data))
    random.shuffle(zipped)
    instances_ = [i[0] for i in zipped]
    orig_data_ = [i[1] for i in zipped]

    num_dev = round(0.05 * len(instances_))
    num_test = round(0.25 * len(instances_))
    num_train = len(instances_) - num_dev - num_test
    print("LEN(train/dev/test)=", num_train, num_dev, num_test)

    instances_train = instances_[:num_train]
    instances_dev = instances_[num_train:num_train + num_dev]
    # instances_test = instances_[num_train+num_dev:num_train:]

    # orig_train = orig_data_[:num_train]
    orig_dev = orig_data_[num_train:num_train + num_dev]

    vocab = Vocabulary.from_instances(instances_train + instances_dev)

    embedder = BasicTextFieldEmbedder(
        {"tokens": PretrainedTransformerEmbedder(transformer)})
    model = MultiSequenceLabelModel(embedder, len(SEQ_LABELS), 1000, vocab)
    # Only move the model when a GPU is really used: `model.cuda(-1)` fails
    # on machines without CUDA.
    if cuda_device != -1:
        model = model.cuda(cuda_device)

    # checkpoint_dir = f"output/crashes/predict_bechdel/model_{attrib}/"
    checkpoint_dir = "/scratch/p289731/predict_bechdel/model_seqlabel/"
    serialization_dir = "/scratch/p289731/predict_bechdel/serialization_seqlabel/"

    if train:
        # fails on purpose-or-not when the directories already exist
        os.makedirs(checkpoint_dir)
        os.makedirs(serialization_dir)
        tensorboard = TensorBoardCallback(
            serialization_dir, should_log_learning_rate=True)
        checkpointer = Checkpointer(serialization_dir=checkpoint_dir)
        optimizer = AdamOptimizer(
            [(n, p) for n, p in model.named_parameters() if p.requires_grad],
            lr=1e-5
        )
        train_loader = SimpleDataLoader(
            instances_train, batch_size=8, shuffle=True)
        dev_loader = SimpleDataLoader(
            instances_dev, batch_size=8, shuffle=False)
        train_loader.index_with(vocab)
        dev_loader.index_with(vocab)

        print("\t\tTraining BERT model")
        trainer = GradientDescentTrainer(
            model,
            optimizer,
            train_loader,
            validation_data_loader=dev_loader,
            # patience=32,
            patience=2,
            # num_epochs=1,
            checkpointer=checkpointer,
            cuda_device=cuda_device,
            serialization_dir=serialization_dir,
            callbacks=[tensorboard]
        )
        trainer.train()
    else:
        # NOTE(review): weights are loaded from `serialization_all`, while
        # training above writes to `serialization_seqlabel` — confirm that
        # this is the intended checkpoint.
        state_dict = torch.load(
            "/scratch/p289731/predict_bechdel/serialization_all/best.th", map_location=cuda_device)
        model.load_state_dict(state_dict)

    print("\t\tProducing predictions...")

    predictor = Predictor(model, reader)
    predictions_dev = [predictor.predict_instance(i) for i in instances_dev]

    data_out = []
    for sentence, prediction in zip(orig_dev, predictions_dev):
        # `predict_instance` returns a JSON-like dict; `make_human_readable`
        # needs the per-token logits as a tensor
        tag_logits = torch.tensor(prediction["tag_logits"])
        readable = model.make_human_readable(tag_logits, "labels")
        text = sentence["sentence"]
        gold = sentence["labels"]
        predicted = readable
        data_out.append(
            {"sentence": text, "gold": gold, "predicted": predicted})
    df_out = pd.DataFrame(data_out)
    df_out.to_csv("output/crashes/predict_bechdel/predictions_dev.csv")

    # print()

    # print("First 25 crashes:")
    # evaluate_crashes(predictor, attrib, annotations="data/crashes/bechdel_annotations_dev_first_25.csv",
    # out_file="output/crashes/predict_bechdel/predictions_first_25.csv")
    # print()
    # print("Next 75 crashes:")
    # evaluate_crashes(predictor, attrib, annotations="data/crashes/bechdel_annotations_dev_next_75.csv",
    # out_file="output/crashes/predict_bechdel/predictions_next_75.csv")
485
+
486
+
487
if __name__ == "__main__":
    # command-line entry point: one positional argument selects the action
    actions = {
        "train": lambda: train_and_eval(train=True),
        "predict": lambda: train_and_eval(train=False),
        "rules": lambda: predict_rule_based(),
        "filter": lambda: filter_events_for_bechdel(),
    }

    ap = argparse.ArgumentParser()
    ap.add_argument("action", choices=["train", "predict", "rules", "filter"])

    args = ap.parse_args()

    actions[args.action]()
sociofillmore/crashes/split_data.py ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import random
4
+ import shutil
5
+ from datetime import datetime
6
+
7
+ import langdetect
8
+ import nltk
9
+ import pandas as pd
10
+ from langdetect import DetectorFactory, LangDetectException
11
+
12
+ DATA_FILE = "data/crashes/thecrashes_data_all_text.json"
13
+
14
+ DEV_PORTION = .10
15
+
16
+ random.seed(2001)
17
+ DetectorFactory.seed = 0
18
+
19
+
20
def is_a_real_time(timestamp):
    """Tell whether a timestamp carries an actual time of day.

    Returns False when hour, minute and second are all zero, True otherwise.
    """
    # 00:00:00 (midnight) is the "empty" timestamp, so it does not count as a time
    return (timestamp.hour, timestamp.minute, timestamp.second) != (0, 0, 0)
27
+
28
+
29
def main():
    """Script entry point: run the event processing."""
    process_events()
31
+
32
+
33
def detect_language(article):
    """Detect the language of an article with langdetect.

    The first non-empty field among "alltext", "summary" and "title" is used as
    the text sample.

    Args:
        article: dict with the keys "id", "alltext", "summary" and "title".

    Returns:
        The value returned by ``langdetect.detect`` for the sample, or
        "UNK_LANG" when there is nothing to classify or detection fails.
    """
    sample = article["alltext"] or article["summary"] or article["title"]

    # An empty or missing sample can never be classified; handle it here so that
    # a None value does not reach langdetect (which would fail with a TypeError
    # instead of the LangDetectException handled below).
    if sample:
        try:
            return langdetect.detect(sample)
        except LangDetectException:
            pass

    print(f"\tCould not detect language for text_id={article['id']}")
    print(f"\tSample={sample}")
    print()
    return "UNK_LANG"
48
+
49
+
50
def extract_text_info(event):
    """Collect sentence lines, line identifiers and metadata for all articles of an event.

    Args:
        event: dict with an "id" and a list of "articles"; every article is a dict
            with the keys "id", "publishedtime", "url", "sitename", "title",
            "summary" and "alltext".

    Returns:
        A tuple ``(ev_text_lines, ev_id_lines, ev_meta_rows)``. The first two hold
        one list per article (sentences, and a matching
        "event <id>\\ttext <id>\\t<part>" identifier per sentence); the third holds
        one metadata dict per article.
    """
    ev_text_lines = []
    ev_id_lines = []
    ev_meta_rows = []

    for article in event["articles"]:
        text_id = article["id"]
        try:
            pubdate = datetime.fromisoformat(article["publishedtime"]).strftime("%Y-%m-%d %H:%M:%S")
        except (ValueError, TypeError):
            # ValueError: malformed date string; TypeError: no string at all (e.g. None)
            print(f"\t\tcould not parse date {article['publishedtime']}")
            pubdate = None
        url = article["url"]
        provider = article["sitename"]
        title = article["title"]
        language = detect_language(article)
        ev_meta_rows.append({
            "event_id": event["id"],
            "text_id": text_id,
            "pubdate": pubdate,
            "language": language,
            "url": url,
            "provider": provider,
            "title": title
        })

        text_lines = []
        id_lines = []

        # title, summary and body are segmented the same way; only the tag differs
        parts = (("title", title), ("summary", article["summary"]), ("body", article["alltext"]))
        for part_name, part_text in parts:
            for line in segment(part_text, language):
                text_lines.append(line)
                id_lines.append(f"event {event['id']}\ttext {text_id}\t{part_name}")

        ev_text_lines.append(text_lines)
        ev_id_lines.append(id_lines)

    return ev_text_lines, ev_id_lines, ev_meta_rows
98
+
99
+
100
def segment(text, language):
    """Split a text into a list of sentences.

    Args:
        text: the text to split; may be empty or None.
        language: two-letter language code of the text.

    Returns:
        A list of sentence strings. Empty or missing text gives an empty list;
        Hebrew and Vietnamese text is returned unsplit as a one-element list.
    """
    if not text:
        return []

    # don't split Hebrew and Vietnamese (because we don't have a segmenter for it).
    # Wrap the text in a list: callers iterate over the result, and iterating over
    # a bare string would yield single characters instead of sentences.
    if language in ["he", "vi"]:
        return [text]

    lang_map = {
        "nl": "dutch",
        "en": "english",
        "es": "spanish",
        "de": "german",
        "fr": "french",
        "ru": "russian",
        "pt": "portuguese"
    }

    nltk_lang = lang_map.get(language)

    # what to do with languages without sent tokenizer in NLTK (apart from Hebrew):
    if not nltk_lang:
        if language == "af":
            # treat Afrikaans as Dutch
            nltk_lang = "dutch"
        else:
            print(f"Found an article with unsupported language={language}, falling back to English NLTK")
            nltk_lang = "english"

    return nltk.sent_tokenize(text, nltk_lang)
127
+
128
+
129
def write_to_text_by_event(text_lines, text_meta_lines, event_id, split_to_dir, split):
    """Write every article of an event to its own text file, one line per entry.

    Files are written to ``<split_to_dir[split]>/<event_id>/<text_id>.txt``; the
    event directory is created when needed.

    Args:
        text_lines: one list of lines per article.
        text_meta_lines: one metadata dict per article (same order as
            ``text_lines``); only "text_id" is read.
        event_id: identifier of the event, used as directory name.
        split_to_dir: mapping from split name to base output directory.
        split: name of the split to write to.
    """
    event_dir = f"{split_to_dir[split]}/{event_id}"
    os.makedirs(event_dir, exist_ok=True)
    for art_lines, row in zip(text_lines, text_meta_lines):
        text_file = f"{event_dir}/{row['text_id']}.txt"
        with open(text_file, "w", encoding="utf-8") as f:
            # Text mode already translates "\n" into the platform line ending;
            # writing os.linesep here would produce "\r\r\n" on Windows.
            f.writelines(line + "\n" for line in art_lines)
137
+
138
+
139
def process_events():
    """Read all crash events from DATA_FILE and write the "all", "dev" and "main" splits.

    Every event goes into "all"; in addition it goes into "dev" with probability
    DEV_PORTION (one ``random.random()`` draw per event) and into "main" otherwise.
    For each split the function writes, below ``output/crashes/split_data``:
    concatenated sentence and identifier files, one text file per article grouped
    by event, a CSV with event rows and a CSV with article metadata rows.
    """
    out_dir = "output/crashes/split_data"
    splits = ("all", "dev", "main")
    prefixes = {"all": "all", "dev": "split_dev10", "main": "split_main"}

    print("Loading data file...")
    with open(DATA_FILE, encoding="utf-8") as f:
        data = json.load(f)

    event_rows = {split: [] for split in splits}
    text_rows = {split: [] for split in splits}

    # make sure the output directory exists before anything is written into it
    os.makedirs(out_dir, exist_ok=True)

    # make empty text files
    text_file_basenames = {split: f"{out_dir}/{prefixes[split]}.texts" for split in splits}
    for bn in text_file_basenames.values():
        for ext in [".text.txt", ".ids.txt"]:
            with open(f"{bn}{ext}", "w", encoding="utf-8"):
                pass

    # clear & make text file directories
    text_files_by_event_dir = {}
    for split in splits:
        text_dir = f"{out_dir}/{prefixes[split]}_texts_by_event"
        text_files_by_event_dir[split] = text_dir
        if os.path.exists(text_dir):
            shutil.rmtree(text_dir)
        os.mkdir(text_dir)

    # helper function for writing text files
    def append_to_txt(txt_file, lines):
        with open(txt_file, "a", encoding="utf-8") as f_out:
            for art_lines in lines:
                # "\n" (not os.linesep): text mode does the platform translation itself
                f_out.writelines(line + "\n" for line in art_lines)

    # helper function that registers one event (rows + text files) in one split
    def add_to_split(split, event_id, event_row, text_lines, text_id_lines, text_meta_rows):
        event_rows[split].append(event_row)
        text_rows[split].extend(text_meta_rows)
        append_to_txt(text_file_basenames[split] + ".text.txt", text_lines)
        append_to_txt(text_file_basenames[split] + ".ids.txt", text_id_lines)
        write_to_text_by_event(text_lines, text_meta_rows, event_id, text_files_by_event_dir, split)

    print("Processing events...")
    for event in data:
        event_id = event["id"]
        print(f"\tevent_id={event_id}")
        try:
            timestamp = datetime.fromisoformat(event["date"])
        except (ValueError, TypeError):
            # ValueError: malformed date string; TypeError: no string at all (e.g. None)
            timestamp = None

        event_row = {
            "event:id": event_id,
            "event:date": timestamp.strftime("%Y-%m-%d") if timestamp else None,
            "event:time": timestamp.strftime("%H-%M-%S") if timestamp and is_a_real_time(timestamp) else None,
            "event:coordinates": f"{event['latitude'], event['longitude']}",
            "vehicle_involved": 1 if any(p for p in event["persons"] if p["transportationmode"] in range(5, 14)) else 0
        }

        for health, health_code in (("dead", 3), ("injured", 2)):
            all_with_health = [p for p in event["persons"] if p["health"] == health_code]
            event_row[f"outcomes:{health}:total"] = len(all_with_health)
            event_row[f"outcomes:{health}:child"] = len([p for p in all_with_health if p["child"] == 1])
            for mode, mode_codes in (("pedestrian", [1]), ("cyclist", [2]), ("vehicle", range(5, 14))):
                event_row[f"outcomes:{health}:{mode}"] = len([p for p in all_with_health
                                                             if p["transportationmode"] in mode_codes])

        text_lines, text_id_lines, text_meta_rows = extract_text_info(event)

        add_to_split("all", event_id, event_row, text_lines, text_id_lines, text_meta_rows)

        # exactly one random draw per event keeps the seeded split reproducible
        target_split = "dev" if random.random() < DEV_PORTION else "main"
        add_to_split(target_split, event_id, event_row, text_lines, text_id_lines, text_meta_rows)

    for split in splits:
        pd.DataFrame(event_rows[split]).to_csv(f"{out_dir}/{prefixes[split]}.events.csv")
        pd.DataFrame(text_rows[split]).to_csv(f"{out_dir}/{prefixes[split]}.texts.meta.csv")
237
+
238
+
239
+ if __name__ == '__main__':
240
+ main()
sociofillmore/crashes/utils.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ texts_meta = pd.read_csv("output/crashes/split_data/split_dev10.texts.meta.csv", index_col=0, dtype={"text_id": str})
4
+
5
+
6
def is_a_dutch_text(doc_id, exclude_frisian=True):
    """Check in ``texts_meta`` whether the text with the given id is labelled Dutch.

    Only the first metadata row matching ``doc_id`` is consulted. Unknown ids
    give False.

    Args:
        doc_id: value to match against the "text_id" column.
        exclude_frisian: when True, texts whose provider is "omropfryslan.nl"
            are rejected regardless of their language label.
    """
    matches = texts_meta[texts_meta["text_id"] == doc_id]
    if len(matches) < 1:
        return False

    # exclude newsproviders publishing mainly in Frisian
    # (NB these texts are recognized as Dutch by langdetect, hence the need for a provider filter)
    if exclude_frisian and matches["provider"].iloc[0] == "omropfryslan.nl":
        return False

    return bool(matches["language"].iloc[0] == "nl")
sociofillmore/femicides/compare_lome_models.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ import glob
3
+ import json
4
+ import os
5
+ import re
6
+ import random
7
+ import sys
8
+ from typing import List, Dict, Tuple
9
+
10
+ import pandas as pd
11
+ import numpy as np
12
+
13
+ from sociofillmore.common.analyze_text import load_caches, process_fn_sentence, FrameStructure, read_frames_of_interest
14
+
15
+ RANDOM_SEED = 9718
16
+ NUM_EVALUATION_SENTENCES = 150
17
+
18
+ EVALITA_MODEL = "lome_evalita_plus_fn"
19
+ # EVALITA_MODEL = "lome_evalita_plus_fn_0conf"
20
+ OUT_FOLDER = f"0shot__vs__{EVALITA_MODEL.split('_', maxsplit=1)[1]}"
21
+ print(OUT_FOLDER)
22
+
23
+
24
+ random.seed(RANDOM_SEED)
25
+
26
+
27
def map_predicates_to_frames(structures: List[FrameStructure]) -> Dict[str, str]:
    """Map each predicate (its target tokens joined with "_") to its frame.

    When several structures share the same predicate key, the last one wins.
    """
    return {"_".join(st.target.tokens_str): st.frame for st in structures}
33
+
34
+
35
def make_evaluation_sample(diffs_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Draw the evaluation sample and build the annotator and experiment sheets.

    NUM_EVALUATION_SENTENCES rows are sampled from ``diffs_df`` (seeded with
    RANDOM_SEED). For every row the two model predictions are assigned to the
    slots "prediction_1"/"prediction_2" in random order, so that annotators
    cannot tell which model produced which prediction.

    Args:
        diffs_df: rows with at least the columns "sentence", "predicate",
            "predicted_frame_0shot" and "predicted_frame_evalita".

    Returns:
        ``(annotator_sheet, with_exp_info)``: the sheet shown to annotators
        (sentence, predicate, both predictions, empty answer columns) and the
        full sample including which model sits in which slot.
    """

    def make_experimental_columns(row: pd.Series):
        if random.choice((True, False)):
            left_col = "predicted_frame_0shot"
            right_col = "predicted_frame_evalita"
        else:
            left_col = "predicted_frame_evalita"
            right_col = "predicted_frame_0shot"

        exp_info = pd.Series({
            "prediction_1": row[left_col],
            "prediction_2": row[right_col],
            "model_1": left_col,
            "model_2": right_col
        })

        # Series.append was removed in pandas 2.0; pd.concat is the replacement
        return pd.concat([row, exp_info])

    sample = diffs_df.sample(n=NUM_EVALUATION_SENTENCES,
                             random_state=RANDOM_SEED).reset_index(drop=True)
    with_exp_info = sample.apply(make_experimental_columns, axis=1)
    # explicit copy: the answer columns are added to an independent frame rather
    # than to a slice of with_exp_info
    annotator_sheet = with_exp_info[[
        "sentence", "predicate", "prediction_1", "prediction_2"]].copy()
    # add answer columns
    for answer_field in ["1_is_best", "2_is_best", "both_are_good", "both_are_bad", "missing_frame"]:
        annotator_sheet[f"answer::{answer_field}"] = np.nan

    return annotator_sheet, with_exp_info
66
+
67
+
68
def make_annotation_experiment():
    """Compare the frame predictions of the zero-shot and the EVALITA LOME model.

    Walks over all zero-shot output files, loads the matching EVALITA_MODEL file
    for each, and compares the predicted frame for every predicate found by at
    least one of the two models. Counts and difference ratios are printed, both
    for all frames and for the frames of interest only.

    NOTE: writing the difference tables and the annotation sheets is currently
    disabled (see the commented-out block at the end).
    """
    _, deep_frame_cache = load_caches("femicides/rai")
    frames_of_interest = read_frames_of_interest("femicides/rai")

    all_differences = []
    foi_differences = []  # foi='frame of interest'

    # number of predicates that have been annotated by at least one model
    num_all_predictions = 0
    num_foi_predictions = 0

    num_z_shot_all_predictions = 0
    num_z_shot_foi_predictions = 0

    num_evalita_all_predictions = 0
    num_evalita_foi_predictions = 0

    def ratio(part, total):
        # avoid a ZeroDivisionError when no predictions were found at all
        return part / total if total else float("nan")

    for ev_dir in sorted(glob.glob("output/femicides/lome/lome_0shot/multilabel/rai/*")):
        ev_id = os.path.basename(ev_dir).rstrip("/")
        print(f"event={ev_id}")
        for doc_file in sorted(glob.glob(f"{ev_dir}/*.comm.json")):
            # match on the file name only, so the check does not depend on the
            # path separator; skip files that do not follow the naming scheme
            doc_match = re.fullmatch(r"lome_(\d+)\.comm\.json", os.path.basename(doc_file))
            if doc_match is None:
                print(f"\tskipping file with unexpected name: {doc_file}")
                continue
            doc_id = doc_match.group(1)
            print(f"\tdoc={doc_id}")

            with open(doc_file, encoding="utf-8") as f:
                z_shot_annotations = json.load(f)

            with open(doc_file.replace("/lome_0shot/", f"/{EVALITA_MODEL}/"), encoding="utf-8") as f:
                evalita_annotations = json.load(f)

            for sent_idx, (z_shot_sent, evalita_sent) in enumerate(zip(z_shot_annotations, evalita_annotations)):
                z_shot_structs = process_fn_sentence(
                    z_shot_sent, deep_frame_cache)
                evalita_structs = process_fn_sentence(
                    evalita_sent, deep_frame_cache)

                z_shot_frames = {s.frame for s in z_shot_structs.values()}
                evalita_frames = {s.frame for s in evalita_structs.values()}
                overlapping_frames = z_shot_frames.intersection(evalita_frames)

                print(f"\t\tsent #{sent_idx}: {len(z_shot_frames)}x lome_0shot frames, "
                      f"{len(evalita_frames)}x evalita frames, {len(overlapping_frames)}x overlapping")

                z_shot_preds_to_frames = map_predicates_to_frames(
                    z_shot_structs.values())
                evalita_preds_to_frames = map_predicates_to_frames(
                    evalita_structs.values())
                all_predicates = sorted(set(z_shot_preds_to_frames.keys()).union(
                    evalita_preds_to_frames.keys()))

                for predicate in all_predicates:
                    print(f"\t\t\tpredicate={predicate}")
                    z_shot_frame = z_shot_preds_to_frames.get(predicate)
                    evalita_frame = evalita_preds_to_frames.get(predicate)
                    has_relevant_frame = z_shot_frame in frames_of_interest or evalita_frame in frames_of_interest

                    if z_shot_frame is not None:
                        num_z_shot_all_predictions += 1
                        if z_shot_frame in frames_of_interest:
                            num_z_shot_foi_predictions += 1

                    if evalita_frame is not None:
                        num_evalita_all_predictions += 1
                        if evalita_frame in frames_of_interest:
                            num_evalita_foi_predictions += 1

                    num_all_predictions += 1
                    if has_relevant_frame:
                        num_foi_predictions += 1

                    if z_shot_frame != evalita_frame:
                        diff = {
                            "ev_id": ev_id,
                            "doc_id": doc_id,
                            "sent_idx": sent_idx,
                            "sentence": " ".join(z_shot_sent["tokens"]),
                            "predicate": predicate,
                            "predicted_frame_0shot": z_shot_frame or "_",
                            "predicted_frame_evalita": evalita_frame or "_"
                        }
                        all_differences.append(diff)
                        if has_relevant_frame:
                            foi_differences.append(diff)

            print()

        print()

    print(f"num_z_shot_all_predictions = {num_z_shot_all_predictions}")
    print(f"num_z_shot_foi_predictions = {num_z_shot_foi_predictions}")
    print(f"num_evalita_all_predictions = {num_evalita_all_predictions}")
    print(f"num_evalita_foi_predictions = {num_evalita_foi_predictions}")

    print(
        f"all_differences: {len(all_differences)}/{num_all_predictions}"
        f"={ratio(len(all_differences), num_all_predictions)}")
    print(
        f"foi_differences: {len(foi_differences)}/{num_foi_predictions}"
        f"={ratio(len(foi_differences), num_foi_predictions)}")

    # all_diffs_df = pd.DataFrame(all_differences)
    # foi_diffs_df = pd.DataFrame(foi_differences)

    # all_diffs_df.to_csv("output/femicides/compare_lome_models/all_differences.csv")
    # foi_diffs_df.to_csv("output/femicides/compare_lome_models/foi_differences.csv")

    # annotator_sheet, experiment_sheet = make_evaluation_sample(foi_diffs_df)
    # annotator_sheet.to_csv("output/femicides/compare_lome_models/annotator_sheet.csv")
    # experiment_sheet.to_csv("output/femicides/compare_lome_models/experiment_sheet.csv")
175
+
176
+
177
def analyze_annotations():
    """Merge the annotator's answers with the model assignment and save the result.

    Reads the annotated spreadsheet and the experiment sheet of OUT_FOLDER, joins
    the "model_1"/"model_2" columns onto the annotations, collapses the answer
    columns with ``combine_labels`` and writes ``annotator_sheet_processed.csv``.
    """
    out_dir = f"output/femicides/compare_lome_models/{OUT_FOLDER}"

    annotations = pd.read_excel("resources/sara_lome_annotations.xlsx", index_col=0)
    experiment = pd.read_csv(f"{out_dir}/experiment_sheet.csv", index_col=0)

    # attach which model produced which prediction, then collapse the answers
    processed = annotations.join(experiment[["model_1", "model_2"]]).apply(combine_labels, axis=1)

    print(processed.head())
    processed.to_csv(f"{out_dir}/annotator_sheet_processed.csv")
186
+
187
+
188
def combine_labels(row: pd.Series) -> pd.Series:
    """Collapse the five "answer::*" columns of an annotation row into one "answer".

    The answer columns are checked in a fixed order and the first one marked
    with "X" decides the label. "1_is_best"/"2_is_best" are translated into
    "<model>_is_best", where <model> is the last "_"-separated part of the
    "model_1"/"model_2" value.

    Args:
        row: annotation row with "model_1", "model_2" and the "answer::*" columns.

    Returns:
        The row without the "answer::*" columns and with a new "answer" entry.

    Raises:
        ValueError: if none of the answer columns is marked with "X".
    """
    model_1 = row["model_1"].split("_")[-1]
    model_2 = row["model_2"].split("_")[-1]

    # checked in this order; the first column marked "X" wins
    answer_columns = (
        ("answer::1_is_best", f"{model_1}_is_best"),
        ("answer::2_is_best", f"{model_2}_is_best"),
        ("answer::both_are_good", "both_are_good"),
        ("answer::both_are_bad", "both_are_bad"),
        ("answer::missing_frame", "missing_frame"),
    )
    for column, label in answer_columns:
        if row[column] == "X":
            answer = label
            break
    else:
        raise ValueError(f"Missing annotation in row {row}")

    row_ = row.drop([k for k in row.keys() if k.startswith("answer::")])
    # Series.append was removed in pandas 2.0; pd.concat is the replacement
    return pd.concat([row_, pd.Series({"answer": answer})])
208
+
209
+
210
def prep_svm_challenge():
    """Turn the processed human annotations into a small frame-id test set.

    Steps:
      1. count the frame labels in the EVALITA training data and save the counts;
      2. for every annotated row where at least one model was judged correct,
         take the approved prediction as gold label (skipping labels that do not
         occur in the EVALITA training data) and score both models against it;
      3. write the retained examples as JSON lines to ``svm_challenge.jsonl``.
    """
    annotated_df = pd.read_csv(
        "output/femicides/compare_lome_models/0shot__vs__evalita_plus_fn/annotator_sheet_processed.csv", index_col=0)

    evalita_train_data = []
    with open("../stupid-svm-frameid/data/evalita_jsonl/evalita_train.jsonl", encoding="utf-8") as f_in:
        for line in f_in:
            evalita_train_data.append(json.loads(line))

    evalita_frame_labels = defaultdict(int)
    for sentence in evalita_train_data:
        for annotation in sentence["annotations"]:
            evalita_frame_labels[annotation["label"]] += 1
    evalita_train_counts = pd.DataFrame(
        list(evalita_frame_labels.items()), columns=["label", "count"]).sort_values(by="count")
    evalita_train_counts.to_csv("output/femicides/compare_lome_models/evalita_trainset_counts.csv")

    print("Evalita frame labels:", sorted(evalita_frame_labels.keys()))

    out = []
    zshot_score = 0
    evalita_score = 0

    for _, row in annotated_df.iterrows():
        answer = row["answer"]
        if answer not in ["0shot_is_best", "evalita_is_best", "both_are_good"]:
            continue

        tokens = row["sentence"].split()
        predicate = row["predicate"].split("_")[0]  # to keep things simple, only look at first token of predicate
        predicate_idx = [i for i, tok in enumerate(tokens) if tok == predicate][0]

        # work out which prediction slot belongs to which model
        # (assumes model_1 is always one of the two "predicted_frame_*" names)
        if row["model_1"] == "predicted_frame_0shot":
            zshot_label, evalita_label = row["prediction_1"], row["prediction_2"]
        else:
            zshot_label, evalita_label = row["prediction_2"], row["prediction_1"]

        # the gold label is the prediction the annotator approved of
        if answer == "0shot_is_best":
            label = zshot_label
        elif answer == "evalita_is_best":
            label = evalita_label
        else:
            label = row["prediction_1"]

        if label not in evalita_frame_labels:
            print("\tskipping gold frame label not present in EVALITA: ", label)
            continue

        if zshot_label == label:
            zshot_score += 1
        if evalita_label == label:
            evalita_score += 1

        out.append({"tokens": tokens, "annotations": [{"label": label, "span": [predicate_idx, predicate_idx], "lu": None, "children": []}]})

    # avoid a ZeroDivisionError when no annotation was retained
    num_out = len(out)
    print(f"Found {num_out} relevant annotations")
    print("0-shot score: ", zshot_score / num_out if num_out else float("nan"))
    print("evalita score: ", evalita_score / num_out if num_out else float("nan"))

    with open("output/femicides/compare_lome_models/svm_challenge.jsonl", "w", encoding="utf-8") as f_out:
        # "\n" (not os.linesep): text mode does the platform translation itself
        for line in out:
            f_out.write(json.dumps(line) + "\n")
        f_out.write("\n")
284
+
285
+
286
+
287
if __name__ == '__main__':
    # Map every command-line action onto the function that carries it out
    actions = {
        "make": make_annotation_experiment,
        "analyze": analyze_annotations,
        "prep_svm_challenge": prep_svm_challenge,
    }

    # Validate explicitly instead of with `assert`: assertions are stripped when
    # Python runs with -O, and a missing argument should give a usage message
    # rather than an IndexError.
    if len(sys.argv) < 2 or sys.argv[1] not in actions:
        sys.exit(f"usage: {sys.argv[0]} [{'|'.join(actions)}]")

    action = sys.argv[1]
    actions[action]()
sociofillmore/femicides/evalita_err_analysis.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from itertools import product
2
+
3
+ import pandas as pd
4
+ import numpy as np
5
+ from scipy.spatial.distance import cosine
6
+
7
+ from nltk.corpus import framenet as fn
8
+
9
+ from sociofillmore.common.analyze_text import read_frames_of_interest
10
+
11
+
12
+ COSINE_THRESH = [0.1, 0.2, 0.3, 0.4, 0.5]
13
+
14
+
15
+ PREDICTION_FILES = {
16
+ "evalita-dev": {
17
+ "stupid-svm": "../stupid-svm-frameid/evalita_predictions.csv",
18
+ "lome-en": "misc/frame_prediction_output_lome-en_dev.csv",
19
+ "lome-it": "misc/frame_prediction_output_lome-it-best_dev.csv",
20
+ },
21
+ "evalita-test": {
22
+ "stupid-svm": "../stupid-svm-frameid/evalita_predictions_test.csv",
23
+ "lome-en": "misc/frame_prediction_output_lome-en_test.csv",
24
+ "lome-it": "misc/frame_prediction_output_lome-it-best_test.csv",
25
+ },
26
+ "rai_femicides": {
27
+ "stupid-svm": "../stupid-svm-frameid/rai_predictions.csv",
28
+ "lome-en": "misc/frame_prediction_output_lome-en_rai.csv",
29
+ "lome-it": "misc/frame_prediction_output_lome-it-best_rai.csv",
30
+ },
31
+ }
32
+
33
+
34
def load_embeddings(embedding_file):
    """Load frame embeddings from a whitespace-separated text file.

    Every non-blank line holds: the frame name, a second column that is not
    used here, and then the vector components.

    Args:
        embedding_file: path of the embedding file (read as UTF-8).

    Returns:
        ``(vectors, frames_to_idxs)``: a float64 array with one row per line and
        a dict from frame name to row index. If a frame name occurs more than
        once, the index of its last occurrence is kept.
    """
    frame_vocab = []
    vectors = []

    with open(embedding_file, encoding="utf-8") as f:
        for line in f:
            columns = line.split()
            if not columns:
                # tolerate blank lines (e.g. a trailing newline at the end of the file)
                continue

            frame_vocab.append(columns[0])
            vectors.append(np.array([float(i) for i in columns[2:]]))

    frames_to_idxs = {frame: i for i, frame in enumerate(frame_vocab)}

    return np.array(vectors, dtype=np.float64), frames_to_idxs
55
+
56
+
57
def femicide_frame_distances(embeddings, frame_to_idx):
    """Print the cosine distance for every ordered pair of femicide frames.

    Args:
        embeddings: indexable collection of frame vectors.
        frame_to_idx: mapping from frame name to its index in ``embeddings``.
    """
    frames = read_frames_of_interest("femicides/rai")
    print("Cosines: ")
    for first, second in product(frames, frames):
        first_vec = embeddings[frame_to_idx[first]]
        second_vec = embeddings[frame_to_idx[second]]
        print(f"\t{first}-{second}: {cosine(first_vec, second_vec):.4f}")
63
+
64
+
65
def embedding_scores(predictions, embeddings, frame_to_idx):
    """Print accuracy, "close call" rates and the average cosine distance.

    A wrong prediction is a close call for a threshold in COSINE_THRESH when the
    cosine distance between the predicted and the gold frame vector is below
    that threshold.

    Args:
        predictions: dataframe with the columns "frame_pred" and "frame_gold".
        embeddings: indexable collection of frame vectors.
        frame_to_idx: mapping from frame name to its index in ``embeddings``.
    """
    num_correct = 0
    near_misses = dict.fromkeys(COSINE_THRESH, 0)
    dist_sum = 0.0

    for predicted, gold in zip(predictions["frame_pred"], predictions["frame_gold"]):
        dist = cosine(embeddings[frame_to_idx[predicted]], embeddings[frame_to_idx[gold]])
        if predicted == gold:
            num_correct += 1
        else:
            for threshold in COSINE_THRESH:
                if dist < threshold:
                    near_misses[threshold] += 1
        dist_sum += dist

    print("#correct: ", num_correct / len(predictions))
    print("#close calls: ")
    for threshold in COSINE_THRESH:
        print("\t", threshold, (near_misses[threshold]) / len(predictions))
    print("#correct or close: ")
    for threshold in COSINE_THRESH:
        print("\t", threshold, (num_correct + near_misses[threshold]) / len(predictions))
    print("avg cosine dist: ", dist_sum / len(predictions))
92
+
93
+
94
def generalization_exp(predictions, evalita_train_counts, fn_frames, femicide_frames):
    """Print size and accuracy of the predictions for four gold-frame subsets.

    The subsets are, in this order: all predictions (ALL), gold frames listed in
    the "label" column of ``evalita_train_counts`` (IFN), gold frames in
    ``fn_frames`` (BFN) and gold frames in ``femicide_frames`` (RAI).

    Args:
        predictions: dataframe with the columns "frame_gold" and "frame_pred".
        evalita_train_counts: dataframe with a "label" column.
        fn_frames: collection of frame names.
        femicide_frames: collection of frame names.
    """
    gold = predictions["frame_gold"]
    subsets = [
        predictions,
        predictions[gold.isin(evalita_train_counts["label"])],
        predictions[gold.isin(fn_frames)],
        predictions[gold.isin(femicide_frames)],
    ]

    def accuracy(preds):
        # an empty subset has no defined accuracy; report nan instead of
        # failing with a ZeroDivisionError
        if len(preds) == 0:
            return float("nan")
        return len(preds[preds["frame_gold"] == preds["frame_pred"]]) / len(preds)

    print("LEN (ALL/IFN/BFN/RAI:)")
    print("\t".join(str(len(preds)) for preds in subsets))

    print("ACC (ALL/IFN/BFN/RAI:)")
    print("\t".join(str(accuracy(preds)) for preds in subsets))
123
+
124
+
125
def main():
    """Run the generalization experiment for every dataset/model in PREDICTION_FILES.

    For the "rai_femicides" dataset only predictions whose gold frame is a
    femicide frame of interest are evaluated.
    """
    # shared inputs are loaded once, before the loops
    evalita_train_counts = pd.read_csv(
        "output/femicides/compare_lome_models/evalita_trainset_counts.csv"
    )
    fn_frames = {fr.name for fr in fn.frames()}
    femicide_frames = read_frames_of_interest("femicides/rai")

    for dataset, model_files in PREDICTION_FILES.items():
        print(f"==={dataset}===")
        for model, predictions_file in model_files.items():

            print(f"---{model}---")

            predictions = pd.read_csv(predictions_file, index_col=0)
            print("Total predictions:", len(predictions))

            # NOTE: the embedding-based error analysis (load_embeddings /
            # embedding_scores on predictions restricted to FrameNet frames)
            # is not run here.

            if dataset == "rai_femicides":
                predictions = predictions[predictions["frame_gold"].isin(femicide_frames)]

            generalization_exp(
                predictions, evalita_train_counts, fn_frames, femicide_frames
            )

        print()
        print()
179
+
180
+
181
+ if __name__ == "__main__":
182
+ main()
sociofillmore/femicides/extract_texts.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+
4
def extract_texts():
    """Write every source text of the RAI femicide spreadsheet to its own file.

    Each row becomes ``output/femicides/extract_text/source_<ID>.txt`` holding
    the title, an empty line and the text.
    """
    sources = pd.read_excel("data/femicides/rai/EventiFemminicidio_from2015to2017_fonti.xlsx")
    print(sources)
    for _, source in sources.iterrows():
        out_file = f"output/femicides/extract_text/source_{source['ID']}.txt"
        content = f"{source['title']}\n\n{source['text']}"
        with open(out_file, "w", encoding="utf-8") as f_out:
            f_out.write(content)
12
+
13
+
14
+ if __name__ == '__main__':
15
+ extract_texts()
sociofillmore/femicides/split_data.py ADDED
@@ -0,0 +1,235 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Split the dataset into a "DEV10" portion (10% of events) for initial experimentation; and "MAIN", the rest of the
3
+ dataset, to be used later
4
+ """
5
+ import os
6
+ import json
7
+ import random
8
+ import argparse
9
+ from collections import defaultdict
10
+ from typing import List, Tuple, Dict, Any
11
+
12
+ import pandas as pd
13
+ import nltk
14
+
15
+ random.seed(1996)
16
+
17
+
18
def split_rai_femicides():
    """Split the RAI femicide events into DEV10 and MAIN and write all outputs.

    The event ids are shuffled; the first 78 form DEV10 and the rest MAIN. The
    function writes the victim/event correspondence tables as JSON, the event
    tables of both splits as CSV and XLSX, and finally filters the source texts
    for each split.
    """
    out_dir = "output/femicides/split_data/rai"

    # process the excel file
    print("Processing excel file...")
    femicide_events = pd.read_excel("data/femicides/rai/EventiFemminicidio_from2015to2017.edited_colnames.xlsx",
                                    sheet_name="dati", header=0)
    event_ids, victim_to_event_id, event_id_to_victims, victim_duplicate_counts = read_events(femicide_events)

    # save information about correspondences between victims and events
    # (we will need this later to retrieve the correct texts for each event, because the XLSX with texts uses victim
    # names as keys)
    write_dict_to_json((
        (victim_to_event_id, "victim_to_event_id"),
        (event_id_to_victims, "event_id_to_victims"),
        (victim_duplicate_counts, "victim_duplicate_counts")
    ))

    # shuffle and split
    print("Shuffling and splitting...")
    shuffled_event_ids = list(event_ids)
    random.shuffle(shuffled_event_ids)
    dev10_idx, main_idx = shuffled_event_ids[:78], shuffled_event_ids[78:]
    dev10_df, main_df = create_split_df(dev10_idx, femicide_events)

    # write split dataframes
    for split_name, split_df in (("dev10", dev10_df), ("main", main_df)):
        split_df.to_csv(f"{out_dir}/split_{split_name}.events.csv")
        split_df.to_excel(f"{out_dir}/split_{split_name}.events.xlsx")

    # write filtered victim data
    def victims_of(split_ids):
        return {e: victims for e, victims in event_id_to_victims.items() if e in split_ids}

    dev10_victims = victims_of(dev10_idx)
    main_victims = victims_of(main_idx)
    write_dict_to_json((
        (dev10_victims, "event_id_to_victims.dev10"),
        (main_victims, "event_id_to_victims.main"),
    ))

    # retrieve texts for filtered data
    print("Filtering & writing texts...")
    texts_df = pd.read_excel("data/femicides/rai/EventiFemminicidio_from2015to2017_fonti.xlsx")
    for split_name, split_victims in (("dev10", dev10_victims), ("main", main_victims)):
        filter_texts(split_name, texts_df, split_victims, victim_duplicate_counts)
62
+
63
+
64
def split_olv_femicides():
    """Split the OLV femicide events and texts into DEV10 (10% of events) and MAIN.

    The event ids are shuffled and the first 10% (rounded) form DEV10. For both
    splits the event and text tables are written as CSV and XLSX, and every text
    is written to ``split_<name>_texts_by_event/<event_id>/<text_id>.txt``: the
    title sentences, an empty line, then the non-empty sentences of the full text.
    """
    texts_df = pd.read_csv("data/femicides/olv/texts_scrape_match_scrape_2021-10-28.csv")
    events_df = pd.read_csv("data/femicides/olv/events_scrape_match_scrape_2021-10-28.csv")

    event_ids = events_df["event:id"].tolist()
    random.shuffle(event_ids)

    num_dev_events = round(len(event_ids) * 0.10)
    dev10_ids = event_ids[:num_dev_events]
    dev10_df, main_df = create_split_df(dev10_ids, events_df)

    # split texts
    dev10_texts_df = texts_df[texts_df["event_id"].isin(dev10_ids)]
    main_texts_df = texts_df[~texts_df["event_id"].isin(dev10_ids)]

    # write to files
    # (loop variables get their own names so they do not shadow events_df/texts_df)
    for split_events, split_texts, split_name in ((dev10_df, dev10_texts_df, "dev10"), (main_df, main_texts_df, "main")):
        out_prefix = f"output/femicides/split_data/olv/split_{split_name}"
        split_events.to_csv(f"{out_prefix}.events.csv")
        split_texts.to_csv(f"{out_prefix}.texts.csv")
        split_events.to_excel(f"{out_prefix}.events.xlsx")
        split_texts.to_excel(f"{out_prefix}.texts.xlsx")

        for _, row in split_texts.iterrows():
            event_id = row["event_id"]
            text_id = row["text_id"]
            event_dir = f"{out_prefix}_texts_by_event/{event_id}"
            os.makedirs(event_dir, exist_ok=True)
            with open(f"{event_dir}/{text_id}.txt", "w", encoding="utf-8") as f_by_event:
                # "\n" (not os.linesep): text mode does the platform translation itself
                title = row["title"] if not pd.isna(row["title"]) else ""
                for line in nltk.sent_tokenize(title, language="italian"):
                    f_by_event.write(line + "\n")
                f_by_event.write("\n")
                fulltext = row["fulltext"] if not pd.isna(row["fulltext"]) else ""
                if not fulltext:
                    print(f"WARNING: empty fulltext in text_id={text_id}")
                for line in nltk.sent_tokenize(fulltext, language="italian"):
                    line = line.strip()
                    if not line:
                        continue
                    f_by_event.write(line + "\n")
104
+
105
+
106
def write_dict_to_json(filtered_dicts_to_save, output_dir="output/femicides/split_data/rai"):
    """Write each ``(dict, name)`` pair to ``<output_dir>/<name>.json``.

    Args:
        filtered_dicts_to_save: iterable of ``(dict_data, dict_name)`` pairs.
        output_dir: directory receiving the files; created if it is missing.
            Defaults to the RAI split directory used by this script.

    The JSON is written with 4-space indentation and sorted keys.
    """
    os.makedirs(output_dir, exist_ok=True)
    for dict_data, dict_name in filtered_dicts_to_save:
        json_path = os.path.join(output_dir, f"{dict_name}.json")
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(dict_data, f, indent=4, sort_keys=True)
110
+
111
+
112
def create_split_df(dev10: List[int], femicide_events: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Partition the events table into a dev10 frame and a main frame.

    Args:
        dev10: event ids that belong to the dev10 split.
        femicide_events: events table with an ``"event:id"`` column.

    Returns:
        ``(dev10_df, main_df)``. Rows whose ``"event:id"`` is missing are
        dropped from both. Both frames keep the input's column layout, also
        when a split ends up empty.
    """
    # Set membership keeps the per-row test O(1) instead of scanning the list.
    dev10_ids = set(dev10)
    dev10_rows = []
    main_rows = []
    for _, row in femicide_events.iterrows():
        event_id = row["event:id"]
        if pd.isna(event_id):
            continue
        bucket = dev10_rows if int(event_id) in dev10_ids else main_rows
        bucket.append(row)

    columns = femicide_events.columns
    dev10_df = pd.DataFrame(dev10_rows, columns=columns)
    main_df = pd.DataFrame(main_rows, columns=columns)
    return dev10_df, main_df
127
+
128
+
129
def read_events(events_df):
    """Collect event ids and (disambiguated) victim names from the events table.

    Rows without an ``"event:id"`` are skipped. A victim whose name is missing
    or equal to ``"non rilevato"`` is renamed ``UNKNOWN_<event id>``. Every
    occurrence of a name gets a running number so identical names can be told
    apart.

    Returns:
        ``(event_ids, victim_to_event_id, event_id_to_victims,
        victim_duplicate_counts)`` where ``victim_to_event_id`` is keyed by
        ``"<name>/<running number>"`` and ``event_id_to_victims`` maps an event
        id to its ``(name, running number)`` pairs.
    """
    collected_ids: List[int] = []
    event_by_victim: Dict[str, int] = {}
    victims_by_event: Dict[int, List[Tuple[str, int]]] = defaultdict(list)
    name_counts: Dict[str, int] = defaultdict(int)

    for raw_id, name in zip(events_df["event:id"], events_df["victim:name"]):
        if pd.isna(raw_id):
            continue
        numeric_id = int(raw_id)

        # unspecified name --> "UNKNOWN_X"
        if name == "non rilevato" or pd.isna(name):
            name = f"UNKNOWN_{numeric_id}"

        # disambiguate victims with duplicate names
        name_counts[name] += 1
        occurrence = name_counts[name]

        collected_ids.append(numeric_id)
        event_by_victim[f"{name}/{occurrence}"] = numeric_id
        victims_by_event[numeric_id].append((name, occurrence))

    return collected_ids, event_by_victim, victims_by_event, name_counts
154
+
155
+
156
def filter_texts(split_name: str,
                 texts_df: pd.DataFrame,
                 event_idx_to_victims: Dict[int, List[Tuple[str, int]]],
                 victim_duplicate_counts: Dict[str, int]):
    """Write the articles of one RAI split, matched to events by victim name.

    Victim names that occur more than once in ``victim_duplicate_counts`` are
    ambiguous and are left out; articles are then matched to an event through
    their ``"vittima"`` column. For the matched articles this writes, under
    ``output/femicides/split_data/rai/``:

    * ``split_<name>.texts.text.txt`` / ``split_<name>.texts.ids.txt``: one
      sentence per line and, line-aligned, the event id, text id and whether
      the sentence comes from the title or the body;
    * ``split_<name>_texts_by_event/<event id>/<text id>.txt``: title
      sentences, an empty line, then the body sentences;
    * ``split_<name>.texts.meta.csv`` / ``.xlsx``: one metadata row per article.

    Args:
        split_name: name of the split, used in the output file names.
        texts_df: articles table with the columns ``vittima``, ``ID``, ``link``,
            ``pubdate``, ``provider``, ``title`` and ``text``.
        event_idx_to_victims: event id -> list of ``(victim name, running number)``.
        victim_duplicate_counts: victim name -> number of occurrences.
    """
    print(f"\tfilter_texts: filtering split {split_name}")

    # first filter victims
    victim_to_event_idx = {}
    for e_id, victims in event_idx_to_victims.items():
        for victim_name, _victim_dup_id in victims:
            if victim_duplicate_counts[victim_name] > 1:
                print(f"\tfilter_texts: removing ambiguous victim name '{victim_name}'")
                continue
            victim_to_event_idx[victim_name] = e_id

    out_root = "output/femicides/split_data/rai"
    meta_rows: List[Dict[str, Any]] = []
    # All files are opened in text mode, so "\n" is used as the line terminator:
    # text mode already translates it, and os.linesep would give "\r\r\n" on Windows.
    with open(f"{out_root}/split_{split_name}.texts.text.txt", "w", encoding="utf-8") as f_txt, \
            open(f"{out_root}/split_{split_name}.texts.ids.txt", "w", encoding="utf-8") as f_id:
        for _, row in texts_df.iterrows():
            raw_victim = row["vittima"]
            if pd.isna(raw_victim):
                # no victim name -> the article cannot be matched to an event
                continue
            text_victim = raw_victim.strip()
            if text_victim not in victim_to_event_idx:
                continue

            e_id = victim_to_event_idx[text_victim]
            text_id = int(row["ID"])
            title = row["title"]

            meta_rows.append({
                "event_id": e_id,
                "text_id": text_id,
                "url": row["link"],
                "pubdate": row["pubdate"],
                "provider": row["provider"],
                "title": title
            })

            # Missing titles/bodies are treated as empty text, as in the OLV split.
            title_text = "" if pd.isna(title) else title
            body_text = "" if pd.isna(row["text"]) else row["text"]
            body_text_lines = nltk.sent_tokenize(body_text, language="italian")
            title_lines = nltk.sent_tokenize(title_text, language="italian")

            for line in title_lines:
                f_txt.write(line + "\n")
                f_id.write(f"event {e_id}\ttext {text_id}\ttitle" + "\n")

            event_dir = os.path.join(out_root, f"split_{split_name}_texts_by_event", str(e_id))
            os.makedirs(event_dir, exist_ok=True)
            with open(os.path.join(event_dir, f"{text_id}.txt"), "w", encoding="utf-8") as f_by_event:
                for line in title_lines:
                    f_by_event.write(line + "\n")
                f_by_event.write("\n")
                for line in body_text_lines:
                    line = line.strip()
                    if not line:
                        continue
                    f_txt.write(line + "\n")
                    f_by_event.write(line + "\n")
                    f_id.write(f"event {e_id}\ttext {text_id}\tbody" + "\n")

    meta_df = pd.DataFrame(meta_rows)
    meta_df.to_csv(f"{out_root}/split_{split_name}.texts.meta.csv")
    meta_df.to_excel(f"{out_root}/split_{split_name}.texts.meta.xlsx")
    print()
224
+
225
+
226
if __name__ == '__main__':
    # Command-line entry point: the single positional argument picks the corpus.
    ap = argparse.ArgumentParser()
    ap.add_argument("dataset", choices=["rai", "olv"])
    args = ap.parse_args()

    run_split = split_rai_femicides if args.dataset == "rai" else split_olv_femicides
    run_split()
sociofillmore/migration/cda_classify.py ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Learn to classify the manually annotated CDA attributes (frames, 'riferimento', orientation)
3
+ """
4
+
5
+ import sys
6
+
7
+ import torch
8
+
9
+ from allennlp.data.vocabulary import Vocabulary
10
+ from allennlp.data import DatasetReader, TokenIndexer, Instance, Token
11
+ from allennlp.data.fields import TextField, LabelField
12
+ from allennlp.data.token_indexers.pretrained_transformer_indexer import (
13
+ PretrainedTransformerIndexer,
14
+ )
15
+ from allennlp.data.tokenizers.pretrained_transformer_tokenizer import (
16
+ PretrainedTransformerTokenizer,
17
+ )
18
+ from allennlp.models import BasicClassifier
19
+ from allennlp.modules.text_field_embedders.basic_text_field_embedder import (
20
+ BasicTextFieldEmbedder,
21
+ )
22
+ from allennlp.modules.token_embedders.pretrained_transformer_embedder import (
23
+ PretrainedTransformerEmbedder,
24
+ )
25
+ from allennlp.modules.seq2vec_encoders.bert_pooler import BertPooler
26
+ from allennlp.training.checkpointer import Checkpointer
27
+ from allennlp.training.gradient_descent_trainer import GradientDescentTrainer
28
+ from allennlp.data.data_loaders.simple_data_loader import SimpleDataLoader
29
+ from allennlp.training.optimizers import AdamOptimizer
30
+ from allennlp.predictors.text_classifier import TextClassifierPredictor
31
+
32
+ from sklearn.svm import SVC
33
+ from sklearn.feature_extraction.text import CountVectorizer
34
+ from sklearn.metrics import precision_recall_fscore_support
35
+ from sklearn.tree import DecisionTreeClassifier
36
+ from sklearn.dummy import DummyClassifier
37
+
38
+ import pandas as pd
39
+ import numpy as np
40
+ import spacy
41
+
42
+ import json
43
+ import os
44
+ from typing import Dict, Iterable
45
+
46
+
47
class MigrationReader(DatasetReader):
    """Dataset reader that turns headlines (and optional labels) into instances."""

    def __init__(self, token_indexers, tokenizer):
        """Store the tokenizer and token indexers used to build text fields.

        Args:
            token_indexers: mapping passed to every ``TextField``.
            tokenizer: object whose ``tokenize(sentence)`` yields the tokens.
        """
        # Let the base class set up its own state before adding ours.
        super().__init__()
        self.token_indexers = token_indexers
        self.tokenizer = tokenizer

    def text_to_instance(self, sentence, label=None) -> Instance:
        """Build an instance with a ``"tokens"`` field and, if given, a ``"label"`` field."""
        text_field = TextField(self.tokenizer.tokenize(sentence), self.token_indexers)
        fields = {"tokens": text_field}
        if label is not None:
            fields["label"] = LabelField(label)
        return Instance(fields)

    def read_instances(
        self, text: pd.Series, labels: pd.Series
    ) -> Iterable[Instance]:
        """Yield one instance per ``(sentence, label)`` pair, in order."""
        for sentence, label in zip(text, labels):
            yield self.text_to_instance(sentence, label)
67
+
68
+
69
def train(attrib, use_gpu=False):
    """Train and evaluate classifiers for one manually annotated CDA attribute.

    Every entry of ``setups`` is a triple ``(text_options, vect_options,
    model_info)``. For each of them a model is trained on the training
    headlines and evaluated on the dev headlines; the dev predictions go to
    ``output/migration/cda_classify/predictions_<attrib>_<setup index>.csv``.
    After all setups have run, the micro-averaged and per-class
    precision/recall/F-scores are written to
    ``output/migration/cda_classify/scores_<attrib>.json``.

    Args:
        attrib: label to predict; one of ``"cda_frame"``, ``"riferimento"``,
            ``"orientation"`` or ``"fake"`` (see ``load_data``).
        use_gpu: request GPU training for the transformer setup. The request is
            ignored when CUDA is not available.

    Raises:
        ValueError: if a setup names a model type other than ``"bert"``,
            ``"svm"``, ``"tree"`` or ``"dummy"``.
    """
    assert attrib in ["cda_frame", "riferimento", "orientation", "fake"]

    # load data
    print("Loading data...")
    x_train, y_train, x_dev, y_dev = load_data(attrib)
    print(f"\t\ttrain size: {len(x_train)}")
    print(f"\t\tdev size: {len(x_dev)}")

    # try different setups
    print("Running training setups...")
    scores = []
    setups = [
        # process_text defaults: remove_punct=True, lowercase=True, lemmatize=False, remove_stop=False
        # Examples of feature-based setups (currently switched off):
        # ({}, {}, {"type": "svm", "options": {"kernel": "linear", "C": 1.0}}),
        # ({"lemmatize": True, "remove_stop": True}, {}, {"type": "svm", "options": {"kernel": "linear", "C": 0.8}}),
        # ({"lemmatize": True, "remove_stop": True}, {"embed": False}, {"type": "svm", "options": {"kernel": "linear", "C": 0.8}}),
        # ({"lemmatize": True, "remove_stop": True}, {"embed": False}, {"type": "dummy", "options": {}}),
        # ({"lemmatize": True, "remove_stop": True}, {"embed": False}, {"type": "tree", "options": {}}),
        (
            {},
            {},
            {
                "type": "bert",
                "options": {"transformer": "Musixmatch/umberto-commoncrawl-cased-v1"},
            },
        ),
    ]

    nlp = spacy.load("it_core_news_md")

    for s_idx, (text_options, vect_options, model_info) in enumerate(setups):
        model_type = model_info["type"]

        if model_type == "bert":
            print("\t\tPreparing BERT model...")

            # Only touch CUDA when it was requested AND is present; otherwise
            # stay on CPU (-1) and never call model.cuda().
            gpu_available = use_gpu and torch.cuda.is_available()
            cuda_device = None if gpu_available else -1

            transformer = model_info["options"]["transformer"]
            token_indexers = {"tokens": PretrainedTransformerIndexer(transformer)}
            tokenizer = PretrainedTransformerTokenizer(transformer)

            reader = MigrationReader(token_indexers, tokenizer)
            train_instances = list(reader.read_instances(x_train, y_train))
            dev_instances = list(reader.read_instances(x_dev, y_dev))
            vocab = Vocabulary.from_instances(train_instances + dev_instances)
            print(vocab.get_vocab_size("tags"))

            embedder = BasicTextFieldEmbedder(
                {"tokens": PretrainedTransformerEmbedder(transformer)}
            )
            seq2vec = BertPooler(transformer)
            model = BasicClassifier(vocab, embedder, seq2vec, namespace="tags")
            if gpu_available:
                model = model.cuda()

            checkpoint_dir = f"/scratch/p289731/cda_classify/model_{attrib}/checkpoints/"
            serialization_dir = f"/scratch/p289731/cda_classify/model_{attrib}/serialize/"
            # No exist_ok: directories left by an earlier run make this raise
            # FileExistsError instead of silently reusing them.
            os.makedirs(checkpoint_dir)
            os.makedirs(serialization_dir)
            checkpointer = Checkpointer(checkpoint_dir)
            optimizer = AdamOptimizer(
                [(n, p) for n, p in model.named_parameters() if p.requires_grad],
                lr=1e-6
            )
            train_loader = SimpleDataLoader(train_instances, batch_size=8, shuffle=True)
            dev_loader = SimpleDataLoader(dev_instances, batch_size=8, shuffle=False)
            train_loader.index_with(vocab)
            dev_loader.index_with(vocab)

            print("\t\tTraining BERT model")
            trainer = GradientDescentTrainer(
                model,
                optimizer,
                train_loader,
                validation_data_loader=dev_loader,
                patience=32,
                checkpointer=checkpointer,
                cuda_device=cuda_device,
                serialization_dir=serialization_dir
            )
            trainer.train()

            print("\t\tProducing predictions...")
            predictor = TextClassifierPredictor(model, reader)
            predictions = [predictor.predict(sentence) for sentence in x_dev]
            y_dev_pred = [p["label"] for p in predictions]
            class_labels = list(vocab.get_token_to_index_vocabulary("labels").keys())

        elif model_type in ["svm", "tree", "dummy"]:
            # extract features
            print("\t\tExtracting features...")
            x_train_fts, vectorizer = extract_features(
                x_train, nlp, text_options, **vect_options
            )
            x_dev_fts, _ = extract_features(
                x_dev, nlp, text_options, **vect_options, vectorizer=vectorizer
            )

            # "embed" is optional in vect_options (extract_features defaults it
            # to False), so it must not be indexed directly.
            if not vect_options.get("embed", False):
                print(f"\t\t\tnum features: {len(vectorizer.vocabulary_)}")
            else:
                assert model_type != "tree", "Decision tree does not support embedding input"

            print("\t\tTraining the model...")
            if model_type == "svm":
                model = SVC(**model_info["options"])
            elif model_type == "tree":
                model = DecisionTreeClassifier()
            else:
                model = DummyClassifier()
            model.fit(x_train_fts, y_train)

            # evaluate on dev
            print("\t\tValidating the model...")
            y_dev_pred = model.predict(x_dev_fts)
            class_labels = model.classes_

        else:
            raise ValueError(f"Unknown model type in setup #{s_idx}: {model_type!r}")

        p_micro, r_micro, f_micro, _ = precision_recall_fscore_support(
            y_dev, y_dev_pred, average="micro"
        )
        p_classes, r_classes, f_classes, _ = precision_recall_fscore_support(
            y_dev, y_dev_pred, average=None, labels=class_labels, zero_division=0
        )
        print(
            f"\t\t\tOverall scores (micro-averaged):\tP={p_micro}\tR={r_micro}\tF={f_micro}"
        )

        scores.append(
            {
                "micro": {"p": p_micro, "r": r_micro, "f": f_micro},
                "classes": {
                    "p": list(zip(class_labels, p_classes)),
                    "r": list(zip(class_labels, r_classes)),
                    "f": list(zip(class_labels, f_classes)),
                },
            }
        )

        prediction_df = pd.DataFrame(
            zip(x_dev, y_dev, y_dev_pred), columns=["headline", "gold", "prediction"]
        )
        prediction_df.to_csv(
            f"output/migration/cda_classify/predictions_{attrib}_{s_idx:02}.csv"
        )

    with open(
        f"output/migration/cda_classify/scores_{attrib}.json", "w", encoding="utf-8"
    ) as f_scores:
        json.dump(scores, f_scores, indent=4)
232
+
233
+
234
def load_data(attrib):
    """Load the train/dev headlines together with the labels for ``attrib``.

    ``"cda_frame"``, ``"riferimento"`` and ``"orientation"`` select the
    ``frame``, ``riferimento`` and ``orientation`` columns. Any other value
    selects the "fake" sanity-check task, whose label says whether the headline
    contains ``"rifugiato"``.

    Returns:
        ``(x_train, y_train, x_dev, y_dev)``.
    """
    train_data = pd.read_csv("output/migration/preprocess/annotations_train.csv")
    dev_data = pd.read_csv("output/migration/preprocess/annotations_dev.csv")

    x_train, x_dev = train_data["Titolo"], dev_data["Titolo"]

    label_columns = {
        "cda_frame": "frame",
        "riferimento": "riferimento",
        "orientation": "orientation",
    }
    column = label_columns.get(attrib)

    if column is not None:
        y_train, y_dev = train_data[column], dev_data[column]
    else:
        # fake task to test setup
        def fake_labels(headlines):
            return pd.Series(["true" if "rifugiato" in exa else "false" for exa in headlines])

        y_train, y_dev = fake_labels(x_train), fake_labels(x_dev)

    return x_train, y_train, x_dev, y_dev
257
+
258
+
259
def extract_features(
    headlines,
    nlp,
    text_options,
    embed=False,
    min_freq=1,
    max_freq=1.0,
    ngram_range=(1, 1),
    vectorizer=None,
):
    """Turn headlines into a feature matrix.

    With ``embed`` set, each headline becomes the vector produced by
    ``process_text(..., embed=True)`` and ``vectorizer`` is handed back
    untouched. Otherwise the processed tokens are joined with spaces and
    counted: a new ``CountVectorizer`` is fitted when ``vectorizer`` is None,
    else the given one is only applied.

    Returns:
        ``(features, vectorizer)``.
    """
    if embed:
        headline_vectors = list(process_text(headlines, nlp, embed=True, **text_options))
        return np.array(headline_vectors), vectorizer

    documents = [" ".join(words) for words in process_text(headlines, nlp, **text_options)]

    if vectorizer is not None:
        # reuse the vocabulary fitted earlier
        return vectorizer.transform(documents), vectorizer

    vectorizer = CountVectorizer(
        lowercase=False,
        analyzer="word",
        min_df=min_freq,
        max_df=max_freq,
        ngram_range=ngram_range,
    )
    features = vectorizer.fit_transform(documents)
    return features, vectorizer
290
+
291
+
292
def process_text(
    headlines,
    nlp,
    embed=False,
    remove_punct=True,
    lowercase=True,
    lemmatize=False,
    remove_stop=False,
):
    """Yield, per headline, either its processed tokens or its mean word vector.

    Args:
        headlines: iterable of headline strings.
        nlp: callable that maps a headline to its tokens (a spaCy pipeline).
        embed: if true, yield one vector per headline instead of a token list.
        remove_punct: drop tokens tagged ``PUNCT``, ``SYM`` or ``X``.
        lowercase: lowercase the yielded strings; has no effect with ``embed``.
        lemmatize: use lemmas instead of surface forms.
        remove_stop: drop stop words.

    Yields:
        Without ``embed``: a list of strings. With ``embed``: the mean of the
        kept tokens' vectors, or a random vector when no token is left.
    """
    excluded_pos = ["PUNCT", "SYM", "X"]
    for sent in headlines:
        kept = [
            t
            for t in nlp(sent)
            if (not remove_stop or not t.is_stop)
            and (not remove_punct or t.pos_ not in excluded_pos)
        ]

        if embed:
            if lemmatize:
                vectors = [t.vocab[t.lemma].vector for t in kept]
            else:
                vectors = [t.vector for t in kept if t.has_vector]

            # Lowercasing is deliberately not applied here: the items are
            # vectors, not strings.
            if not vectors:
                # NOTE(review): 300 is hard-coded; presumably the vector width
                # of the spaCy model in use. Confirm before switching models.
                yield np.random.rand(300)
            else:
                yield np.mean(np.array(vectors), axis=0)
        else:
            words = [t.lemma_ if lemmatize else t.text for t in kept]
            if lowercase:
                words = [w.lower() for w in words]
            yield words
331
+
332
+
333
if __name__ == "__main__":
    # Usage: python cda_classify.py [gpu]
    # A missing argument means "no GPU" instead of crashing with an IndexError.
    use_gpu = len(sys.argv) > 1 and sys.argv[1] == "gpu"
    # Other attributes that can be trained: "fake" (setup test), "riferimento", "orientation".
    train(attrib="cda_frame", use_gpu=use_gpu)
sociofillmore/migration/cda_classify_.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Learn to classify the manually annotated CDA attributes (frames, 'riferimento', orientation)
3
+ """
4
+
5
+ GLOVE_MODEL = "/net/aistaff/gminnema/thesis_data/data/glove-it/glove_WIKI"
6
+
7
+
8
+ from sklearn.svm import SVC
9
+ from sklearn.feature_extraction.text import CountVectorizer
10
+ from sklearn.metrics import precision_recall_fscore_support
11
+ import gensim
12
+ import pandas as pd
13
+ import spacy
14
+
15
+ import json
16
+
17
+
18
def train(attrib):
    """Train and evaluate the configured setups for one CDA attribute.

    Each entry of ``setups`` is a triple ``(text_options, vect_options,
    model)``. For every setup the model is fitted on the training headlines and
    evaluated on the dev headlines; dev predictions are written to
    ``output/migration/cda_classify/predictions_<setup index>.csv`` and, after
    all setups, the scores to ``output/migration/cda_classify/scores.json``.

    Args:
        attrib: one of ``"cda_frame"``, ``"riferimento"`` or ``"orientation"``.
    """
    assert attrib in ["cda_frame", "riferimento", "orientation"]

    # load data
    print("Loading data...")
    x_train, y_train, x_dev, y_dev = load_data(attrib)
    print(f"\t\ttrain size: {len(x_train)}")
    print(f"\t\tdev size: {len(x_dev)}")

    # try different setups
    print("Running training setups...")
    scores = []
    setups = [
        # tokenize defaults: remove_punct=True, lowercase=True, lemmatize=False, remove_stop=False
        # Setups tried earlier (bag-of-words features):
        # ({}, {}, SVC(kernel='linear')),
        # ({"lemmatize": True, "remove_stop": True}, {}, SVC(kernel='linear')),
        # ({"lemmatize": True, "remove_stop": True}, {"min_freq": 5}, SVC(kernel='linear')),
        # ({"lemmatize": True, "remove_stop": True}, {"min_freq": 5, "max_freq": .70}, SVC(kernel='linear')),
        # ({"lemmatize": True, "remove_stop": True}, {}, SVC(kernel='linear', C=0.6)),
        # ({"lemmatize": True, "remove_stop": True}, {}, SVC(kernel='linear', C=0.7)),
        # ({"lemmatize": True, "remove_stop": True}, {}, SVC(kernel='linear', C=0.8)),
        # ({"lemmatize": True, "remove_stop": True}, {}, SVC(kernel="rbf")),
        ({"lemmatize": True, "remove_stop": True}, {"embed": "glove"}, SVC(kernel='linear', C=0.8)),
    ]

    nlp = spacy.load("it_core_news_md")

    for s_idx, (text_options, vect_options, model) in enumerate(setups):

        print(f"\tSetup #{s_idx}")

        # extract features
        print("\t\tExtracting features...")
        x_train_fts, vectorizer = extract_features(x_train, nlp, text_options, **vect_options)
        x_dev_fts, _ = extract_features(x_dev, nlp, text_options, **vect_options, vectorizer=vectorizer)
        # Only a fitted CountVectorizer has a vocabulary; an embedding-based
        # vectorizer does not, so the count is reported only when available.
        vocabulary = getattr(vectorizer, "vocabulary_", None)
        if vocabulary is not None:
            print(f"\t\t\tnum features: {len(vocabulary)}")

        print("\t\tTraining the model...")
        model.fit(x_train_fts, y_train)

        # evaluate on dev
        print("\t\tValidating the model...")
        y_dev_pred = model.predict(x_dev_fts)
        p_micro, r_micro, f_micro, _ = precision_recall_fscore_support(
            y_dev, y_dev_pred, average="micro")
        p_classes, r_classes, f_classes, _ = precision_recall_fscore_support(
            y_dev, y_dev_pred, average=None, labels=model.classes_, zero_division=0)
        print(
            f"\t\t\tOverall scores (micro-averaged):\tP={p_micro}\tR={r_micro}\tF={f_micro}"
        )

        scores.append({
            "micro": {
                "p": p_micro,
                "r": r_micro,
                "f": f_micro
            },
            "classes": {
                "p": list(zip(model.classes_, p_classes)),
                "r": list(zip(model.classes_, r_classes)),
                "f": list(zip(model.classes_, f_classes)),
            }
        })

        prediction_df = pd.DataFrame(zip(x_dev, y_dev, y_dev_pred), columns=["headline", "gold", "prediction"])
        prediction_df.to_csv(f"output/migration/cda_classify/predictions_{s_idx:02}.csv")

    with open("output/migration/cda_classify/scores.json", "w", encoding="utf-8") as f_scores:
        json.dump(scores, f_scores, indent=4)
89
+
90
+
91
def load_data(attrib):
    """Load the train/dev headlines together with the labels for ``attrib``.

    ``"cda_frame"`` selects the ``frame`` column and ``"riferimento"`` the
    ``riferimento`` column; any other value selects ``orientation``.

    Returns:
        ``(x_train, y_train, x_dev, y_dev)`` where the ``x`` values are the
        ``Titolo`` columns of the train and dev annotation files.
    """
    train_data = pd.read_csv(
        "output/migration/preprocess/annotations_train.csv")
    dev_data = pd.read_csv("output/migration/preprocess/annotations_dev.csv")

    x_train = train_data["Titolo"]
    x_dev = dev_data["Titolo"]

    if attrib == "cda_frame":
        label_column = "frame"
    elif attrib == "riferimento":
        label_column = "riferimento"
    else:
        label_column = "orientation"

    # Labels are read for BOTH splits from the same column. The orientation
    # branch used to overwrite x_train and leave y_train unassigned.
    y_train = train_data[label_column]
    y_dev = dev_data[label_column]
    return x_train, y_train, x_dev, y_dev
109
+
110
+
111
def extract_features(headlines, nlp, text_options, min_freq=1, max_freq=1.0, embed=None, vectorizer=None):
    """Turn headlines into a bag-of-words feature matrix.

    Args:
        headlines: iterable of headline strings.
        nlp: spaCy pipeline passed on to ``tokenize``.
        text_options: keyword arguments for ``tokenize``.
        min_freq: ``min_df`` of a newly created ``CountVectorizer``.
        max_freq: ``max_df`` of a newly created ``CountVectorizer``.
        embed: name of an embedding model to use instead of word counts.
            Not implemented yet.
        vectorizer: an already fitted vectorizer to reuse; when None a new one
            is fitted on ``headlines``.

    Returns:
        ``(features, vectorizer)``.

    Raises:
        NotImplementedError: if a new vectorizer is needed and ``embed`` is set.
    """
    tokenized = [" ".join(sent) for sent in tokenize(headlines, nlp, **text_options)]
    if vectorizer is None:
        if embed is not None:
            # The embedding branch was left unfinished (a dangling
            # "gensim.models." expression); fail with a clear message instead
            # of making the whole module unimportable.
            raise NotImplementedError(
                f"embedding-based features ({embed!r}) are not implemented yet"
            )
        vectorizer = CountVectorizer(lowercase=False, analyzer="word", min_df=min_freq, max_df=max_freq)
        vectorized = vectorizer.fit_transform(tokenized)
    else:
        vectorized = vectorizer.transform(tokenized)
    return vectorized, vectorizer
122
+
123
+
124
def tokenize(headlines, nlp, remove_punct=True, lowercase=True, lemmatize=False, remove_stop=False):
    """Yield one list of token strings per headline.

    Tokens tagged ``PUNCT``, ``SYM`` or ``X`` are dropped when ``remove_punct``
    is set, stop words when ``remove_stop`` is set. The remaining tokens are
    rendered as lemma or surface text and optionally lowercased.
    """
    skipped_pos = ["PUNCT", "SYM", "X"]
    for headline in headlines:
        words = []
        for tok in nlp(headline):
            if remove_stop and tok.is_stop:
                continue
            if remove_punct and tok.pos_ in skipped_pos:
                continue
            word = tok.lemma_ if lemmatize else tok.text
            words.append(word.lower() if lowercase else word)
        yield words
137
+
138
+
139
if __name__ == "__main__":
    # Script entry point: train on the CDA frame labels.
    train(attrib="cda_frame")
sociofillmore/migration/extract_political_ratings.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+
4
def main():
    """Extract one political/religious rating per news provider.

    Takes, for each ``Testata`` in the headline corpus, the first available
    ``Orientamento politico`` and ``Religiosa?`` values and writes them to
    ``data/migration/provider_pol_rel_ratings.csv`` as ``political_stance`` and
    ``religious`` (true when the value is ``"religiosa"``), indexed by
    ``provider``.
    """
    corpus_df = pd.read_excel("data/migration/corpus_titoli_2013_2021_with_most_recent_years.xlsx")
    per_provider = corpus_df.groupby("Testata").first()

    ratings = pd.DataFrame({
        "political_stance": per_provider["Orientamento politico"],
        "religious": per_provider["Religiosa?"] == "religiosa",
    })
    ratings.index.name = "provider"

    ratings.to_csv("data/migration/provider_pol_rel_ratings.csv")
14
+
15
+
16
if __name__ == "__main__":
    # Script entry point.
    main()
sociofillmore/migration/preprocess.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+
4
+ import random
5
+
6
+ random.seed(1996)
7
+
8
+
9
+ CORPUS_ANNOTATED = "data/migration/corpus_with_frames_and_orientation.csv"
10
+ CORPUS_ALL = "data/migration/corpus_all.csv"
11
+
12
+ RATIO_DEV = 0.05
13
+ RATIO_TEST = 0.25
14
+
15
+
16
def preprocess_annotated():
    """Randomly split the annotated headlines into train, dev and test files.

    Each row draws one number from the module-level ``random`` state: below
    ``RATIO_DEV`` it goes to dev, below ``RATIO_DEV + RATIO_TEST`` to test,
    otherwise to train. The three parts are written to
    ``output/migration/preprocess/annotations_<split>.csv``.
    """
    print("Loading corpus...")
    df = pd.read_csv(CORPUS_ANNOTATED, encoding="latin1")
    print(f"\tfound {len(df)} annotated headlines")

    print("Making random train/dev/test split...")
    positions = {"train": [], "dev": [], "test": []}
    test_upper_bound = RATIO_DEV + RATIO_TEST
    for position in range(len(df)):
        draw = random.random()
        if draw < RATIO_DEV:
            chosen = "dev"
        elif draw < test_upper_bound:
            chosen = "test"
        else:
            chosen = "train"
        positions[chosen].append(position)

    split_order = ("train", "dev", "test")
    for split in split_order:
        print(f"\tassigned {len(positions[split])} samples to {split}")

    for split in split_order:
        df.iloc[positions[split]].to_csv(f"output/migration/preprocess/annotations_{split}.csv")
46
+
47
+
48
def preprocess_all():
    """Placeholder: loads the corpus and walks over its rows without doing anything."""
    # NOTE(review): this reads CORPUS_ANNOTATED although the function name
    # suggests CORPUS_ALL -- confirm which file is intended.
    corpus = pd.read_csv(CORPUS_ANNOTATED, encoding="latin1")
    for _index, _row in corpus.iterrows():
        continue
52
+
53
+
54
+
55
if __name__ == "__main__":
    # Script entry point. preprocess_annotated() is the alternative step
    # (train/dev/test split of the annotated headlines); it is switched off.
    preprocess_all()
sociofillmore/migration/split_data.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+ import datetime
3
+
4
+ import pandas as pd
5
+
6
+ random.seed(1996)
7
+
8
+
9
+ DEV_RATIO = 0.10
10
+
11
+
12
def choose_best_casing(orig, predicted):
    """Pick the better-cased version of a headline.

    Args:
        orig: the headline as it appears in the corpus.
        predicted: the truecased version of the same headline.

    Returns:
        ``predicted`` when more than half of the characters of ``orig`` are
        uppercase (an all-caps headline carries no casing information),
        otherwise ``orig``.
    """
    # Count the uppercase characters of the ORIGINAL text. The previous version
    # inspected orig.upper() and took the length of a list of booleans, which
    # counted every character.
    num_upper_chars = sum(1 for c in orig if c.isupper())
    if num_upper_chars > 0.5 * len(orig):
        return predicted
    return orig
17
+
18
+
19
def split_data():
    """Split the migration headline corpus into a dev10 and a main part.

    Every row of ``data/migration/corpus_all.csv`` becomes one event and one
    text (both identified by the row position) and is paired with the truecased
    title found on the same line of the truecase file. A row goes to ``dev10``
    when a draw from the module-level ``random`` state is below ``DEV_RATIO``,
    otherwise to ``main``. For each row three text files are written (best,
    original and truecased title); at the end the events and text metadata of
    both splits are written as CSV.
    """
    # (file name suffix, key in text_data) of the three per-headline files
    title_variants = (("best", "title"), ("orig", "title_orig"), ("truecase", "title_truecased"))
    provider_prefix = "*T_"

    events = {"main": [], "dev10": []}
    texts = {"main": [], "dev10": []}

    with open("data/migration/corpus_titoli_all_raw.truecase_bilstm.txt", encoding="utf-8") as f:
        titles_tc = [line.strip() for line in f]

    df_all = pd.read_csv("data/migration/corpus_all.csv", encoding="latin-1")
    for idx, (_, row) in enumerate(df_all.iterrows()):

        if idx % 1000 == 0:
            print("Processing line:", idx)

        year = int(row["Anno"])

        # Remove the "*T_" marker as a PREFIX. str.lstrip("*T_") treats its
        # argument as a set of characters and would also eat the first letters
        # of provider names that start with "T", "_" or "*".
        provider = row["Testata"]
        if provider.startswith(provider_prefix):
            provider = provider[len(provider_prefix):]

        event_data = {
            "event:id": idx,
            "event:year": year,
        }
        text_data = {
            "event_id": idx,
            "text_id": idx,
            "pubyear": year,
            "language": "Italian",
            "provider": provider,
            "title": choose_best_casing(row["Titolo"], titles_tc[idx]),
            "title_truecased": titles_tc[idx],
            "title_orig": row["Titolo"]
        }

        split_name = "dev10" if random.random() < DEV_RATIO else "main"
        events[split_name].append(event_data)
        texts[split_name].append(text_data)

        for suffix, key in title_variants:
            out_path = f"output/migration/split_data/split_{split_name}_sep_txt_files/{idx}.{suffix}.txt"
            with open(out_path, "w", encoding="utf-8") as f_out:
                f_out.write(text_data[key])

    pd.DataFrame(events["main"]).to_csv("output/migration/split_data/split_main.events.csv")
    pd.DataFrame(texts["main"]).to_csv("output/migration/split_data/split_main.texts.meta.csv")
    pd.DataFrame(events["dev10"]).to_csv("output/migration/split_data/split_dev10.events.csv")
    pd.DataFrame(texts["dev10"]).to_csv("output/migration/split_data/split_dev10.texts.meta.csv")
82
+
83
+
84
if __name__ == "__main__":
    # Script entry point.
    split_data()
sociofillmore/migration/split_lome_predictions.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+
4
+ import pandas as pd
5
+
6
+
7
def main(
    input_json,
    input_txt,
    output_dir,
    meta_csv="output/migration/split_data/split_dev10.texts.meta.csv",
):
    """Split concatenated LOME predictions into one directory per text.

    The predictions are matched to texts purely by position: the i-th value
    of the ``text_id`` column in ``meta_csv`` is paired with the i-th item of
    the JSON prediction list and the i-th blank-line-separated chunk of the
    plain-text prediction file. For every such triple, the files
    ``lome_<text_id>.comm.json`` (a one-element JSON list) and
    ``lome_<text_id>.comm.txt`` are written to ``<output_dir>/<text_id>/``.

    Args:
        input_json: Path to a JSON file containing a list of predictions.
        input_txt: Path to a text file whose predictions are separated by
            blank lines (``"\\n\\n"``).
        output_dir: Directory under which the per-text directories are
            created (created on demand).
        meta_csv: Path to a CSV file with a ``text_id`` column that gives the
            order of the texts. Defaults to the dev10 split metadata.
    """
    meta_df = pd.read_csv(meta_csv)
    text_ids = meta_df["text_id"].to_list()

    with open(input_json, encoding="utf-8") as f:
        json_predictions = json.load(f)

    with open(input_txt, encoding="utf-8") as f:
        txt_predictions = f.read().split("\n\n")

    # NOTE: zip() stops at the shortest of the three sequences, so surplus
    # ids/predictions (e.g. the empty chunk after a trailing blank line in
    # the text file) are silently ignored.
    for t_id, json_p, txt_p in zip(text_ids, json_predictions, txt_predictions):

        # Lightweight progress indicator.
        if int(t_id) % 100 == 0:
            print(t_id)

        prediction_dir = f"{output_dir}/{t_id}"
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(prediction_dir, exist_ok=True)
        prediction_file_json = f"{prediction_dir}/lome_{t_id}.comm.json"
        prediction_file_txt = f"{prediction_dir}/lome_{t_id}.comm.txt"

        # Each per-text JSON file keeps the list wrapper of the input format.
        with open(prediction_file_json, "w", encoding="utf-8") as f_out:
            json.dump([json_p], f_out)

        # Restore the blank-line separator that split() removed.
        with open(prediction_file_txt, "w", encoding="utf-8") as f_out:
            f_out.write(txt_p + "\n\n")
32
+
33
+
34
if __name__ == "__main__":
    # Earlier invocations, kept for reference:
    #
    # main(
    #     input_json="output/migration/lome/lome_0shot/lome_lome_0shot_migration_all_tc.comm.json",
    #     input_txt="output/migration/lome/lome_0shot/lome_lome_0shot_migration_all_tc.comm.txt",
    #     output_dir="output/migration/lome/multilabel/lome_0shot/pavia"
    # )
    # main(
    #     input_json="output/migration/lome/lome_0shot/lome_lome_0shot_migration_all_best-truecase.comm.json",
    #     input_txt="output/migration/lome/lome_0shot/lome_lome_0shot_migration_all_best-truecase.comm.txt",
    #     output_dir="output/migration/lome/multilabel/lome_0shot/pavia"
    # )
    # main(
    #     input_json="output/migration/lome/lome_zs-tgt_ev-frm/data-in.concat.combined_zs_ev.tc_bilstm.json",
    #     input_txt="output/migration/lome/lome_zs-tgt_ev-frm/data-in.concat.combined_zs_ev.tc_bilstm.txt",
    #     output_dir="output/migration/lome/multilabel/lome_zs-tgt_ev_frm/pavia"
    # )

    # Currently active configuration: the concatenated migration predictions.
    # NOTE: the input paths are absolute and specific to the original
    # author's machine.
    run_config = {
        "input_json": "/home/gossminn/WorkSyncs/Code/fn-for-social-frames/output/migration/lome/lome_migration_concat.comm.json",
        "input_txt": "/home/gossminn/WorkSyncs/Code/fn-for-social-frames/output/migration/lome/lome_migration_concat.comm.txt",
        "output_dir": "output/migration/lome/multilabel/lome_0shot/pavia",
    }
    main(**run_config)
sociofillmore/scoring/eval/__pycache__/analyze_final_questionnaire.cpython-37.pyc ADDED
Binary file (2.64 kB). View file