Gosse Minnema
committed on
Commit
·
b11ac48
1
Parent(s):
0f2a300
Add sociofillmore code, load dataset via private dataset repo
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- Dockerfile +13 -0
- app.py +0 -4
- docker_commands.sh +8 -0
- requirements.txt +0 -0
- resources/RAI_sources_mr.xlsx +0 -0
- resources/active_frames_full.csv +1229 -0
- resources/crashes_frame_list.txt +14 -0
- resources/crashes_frame_to_roles.csv +11 -0
- resources/crashes_sources.csv +440 -0
- resources/deep_frame_cache.json +0 -0
- resources/dep_labels.txt +159 -0
- resources/femicide_frame_list.txt +23 -0
- resources/femicides_frame_to_roles.csv +16 -0
- resources/fn_frames_to_roles.json +0 -0
- resources/migration_frame_list.txt +56 -0
- sociofillmore/__init__.py +0 -0
- sociofillmore/__init__.pyc +0 -0
- sociofillmore/__pycache__/__init__.cpython-311.pyc +0 -0
- sociofillmore/__pycache__/__init__.cpython-37.pyc +0 -0
- sociofillmore/__pycache__/__init__.cpython-39.pyc +0 -0
- sociofillmore/common/__init__.py +0 -0
- sociofillmore/common/__pycache__/__init__.cpython-37.pyc +0 -0
- sociofillmore/common/__pycache__/__init__.cpython-39.pyc +0 -0
- sociofillmore/common/__pycache__/analyze_text.cpython-37.pyc +0 -0
- sociofillmore/common/__pycache__/analyze_text.cpython-39.pyc +0 -0
- sociofillmore/common/__pycache__/split_lome_files.cpython-39.pyc +0 -0
- sociofillmore/common/analyze_text.py +1046 -0
- sociofillmore/common/convert_comms.py +208 -0
- sociofillmore/common/filter_lang.py +32 -0
- sociofillmore/common/get_nltk_fn_roles.py +11 -0
- sociofillmore/common/pos_based_targetid.py +31 -0
- sociofillmore/common/split_lome_files.py +22 -0
- sociofillmore/crashes/__pycache__/utils.cpython-37.pyc +0 -0
- sociofillmore/crashes/__pycache__/utils.cpython-39.pyc +0 -0
- sociofillmore/crashes/generate_templates.py +277 -0
- sociofillmore/crashes/make_bechdel_dicts.py +90 -0
- sociofillmore/crashes/predict_bechdel.py +500 -0
- sociofillmore/crashes/split_data.py +240 -0
- sociofillmore/crashes/utils.py +16 -0
- sociofillmore/femicides/compare_lome_models.py +296 -0
- sociofillmore/femicides/evalita_err_analysis.py +182 -0
- sociofillmore/femicides/extract_texts.py +15 -0
- sociofillmore/femicides/split_data.py +235 -0
- sociofillmore/migration/cda_classify.py +338 -0
- sociofillmore/migration/cda_classify_.py +140 -0
- sociofillmore/migration/extract_political_ratings.py +17 -0
- sociofillmore/migration/preprocess.py +57 -0
- sociofillmore/migration/split_data.py +85 -0
- sociofillmore/migration/split_lome_predictions.py +54 -0
- sociofillmore/scoring/eval/__pycache__/analyze_final_questionnaire.cpython-37.pyc +0 -0
Dockerfile
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.9
|
2 |
+
WORKDIR /app
|
3 |
+
ADD . /app
|
4 |
+
RUN mkdir /nltk_data
|
5 |
+
RUN mkdir /.allennlp
|
6 |
+
RUN mkdir /.cache
|
7 |
+
RUN mkdir /.local
|
8 |
+
RUN chmod -R 777 /nltk_data
|
9 |
+
RUN chmod -R 777 /.allennlp
|
10 |
+
RUN chmod -R 777 /.cache
|
11 |
+
RUN chmod -R 777 /.local
|
12 |
+
RUN pip install -v -r requirements.combined.txt
|
13 |
+
CMD ["sh", "docker_commands.sh"]
|
app.py
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
from gradio_client import Client
|
3 |
-
|
4 |
-
client = Client("responsibility-framing/sociofillmore", hf_token=os.getenv("HF_TOKEN"))
|
|
|
|
|
|
|
|
|
|
docker_commands.sh
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# cd spanfinder/
|
2 |
+
# python -m sociolome.lome_webserver &
|
3 |
+
# cd ..
|
4 |
+
git clone https://gossminn:$HF_TOKEN@huggingface.co/datasets/responsibility-framing/sociofillmore-datasets
|
5 |
+
cp -r sociofillmore-datasets/data .
|
6 |
+
cp -r sociofillmore-datasets/output .
|
7 |
+
|
8 |
+
python -m sociofillmore.webapp.app 0.0.0.0
|
requirements.txt
ADDED
File without changes
|
resources/RAI_sources_mr.xlsx
ADDED
Binary file (8.21 kB). View file
|
|
resources/active_frames_full.csv
ADDED
@@ -0,0 +1,1229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
frame,active,notes
|
2 |
+
Abandonment,TRUE,
|
3 |
+
Abounding_with,FALSE,
|
4 |
+
Absorb_heat,FALSE,
|
5 |
+
Abundance,FALSE,
|
6 |
+
Abusing,TRUE,
|
7 |
+
Access_scenario,FALSE,
|
8 |
+
Accompaniment,FALSE,
|
9 |
+
Accomplishment,TRUE,
|
10 |
+
Accoutrements,FALSE,
|
11 |
+
Accuracy,FALSE,
|
12 |
+
Achieving_first,TRUE,
|
13 |
+
Active_substance,TRUE,
|
14 |
+
Activity,TRUE,
|
15 |
+
Activity_abandoned_state,TRUE,
|
16 |
+
Activity_done_state,TRUE,
|
17 |
+
Activity_finish,TRUE,
|
18 |
+
Activity_ongoing,TRUE,
|
19 |
+
Activity_pause,TRUE,
|
20 |
+
Activity_paused_state,TRUE,
|
21 |
+
Activity_prepare,TRUE,
|
22 |
+
Activity_ready_state,FALSE,
|
23 |
+
Activity_resume,TRUE,
|
24 |
+
Activity_start,TRUE,
|
25 |
+
Activity_stop,TRUE,
|
26 |
+
Actually_occurring_entity,FALSE,
|
27 |
+
Addiction,FALSE,
|
28 |
+
Adding_up,TRUE,
|
29 |
+
Adducing,TRUE,
|
30 |
+
Adjacency,FALSE,
|
31 |
+
Adjusting,TRUE,
|
32 |
+
Adopt_selection,TRUE,
|
33 |
+
Aesthetics,FALSE,
|
34 |
+
Affirm_or_deny,TRUE,
|
35 |
+
Age,FALSE,
|
36 |
+
Aggregate,FALSE,
|
37 |
+
Aging,FALSE,
|
38 |
+
Agree_or_refuse_to_act,TRUE,
|
39 |
+
Agriculture,TRUE,
|
40 |
+
Aiming,TRUE,
|
41 |
+
Alignment_image_schema,FALSE,
|
42 |
+
Alliance,FALSE,
|
43 |
+
Alternatives,FALSE,
|
44 |
+
Alternativity,FALSE,
|
45 |
+
Amalgamation,FALSE,
|
46 |
+
Amassing,TRUE,"according to def, Recipient arg ‘sometimes agentive’, but from EN examples seems unergative"
|
47 |
+
Ambient_temperature,FALSE,
|
48 |
+
Ammunition,FALSE,
|
49 |
+
Amounting_to,FALSE,
|
50 |
+
Animals,FALSE,
|
51 |
+
Annoyance,FALSE,
|
52 |
+
Appeal,TRUE,
|
53 |
+
Appellations,FALSE,
|
54 |
+
Apply_heat,TRUE,
|
55 |
+
Appointing,TRUE,
|
56 |
+
Architectural_part,FALSE,
|
57 |
+
Arithmetic,FALSE,
|
58 |
+
Arithmetic_commutative,FALSE,
|
59 |
+
Arithmetic_non-commutative,FALSE,
|
60 |
+
Armor,FALSE,
|
61 |
+
Arraignment,FALSE,
|
62 |
+
Arranging,TRUE,
|
63 |
+
Arrest,TRUE,
|
64 |
+
Arriving,FALSE,
|
65 |
+
Arson,TRUE,
|
66 |
+
Artifact,FALSE,
|
67 |
+
Artifact_subpart,FALSE,
|
68 |
+
Artificiality,FALSE,
|
69 |
+
Artistic_style,FALSE,
|
70 |
+
Assemble,TRUE,
|
71 |
+
Assessing,TRUE,
|
72 |
+
Assigned_location,FALSE,
|
73 |
+
Assistance,TRUE,
|
74 |
+
Asymmetric_reciprocality,TRUE,
|
75 |
+
Atonement,TRUE,
|
76 |
+
Attaching,TRUE,
|
77 |
+
Attack,TRUE,
|
78 |
+
Attempt,TRUE,
|
79 |
+
Attempt_action_scenario,FALSE,
|
80 |
+
Attempt_distant_interaction_scenario,FALSE,
|
81 |
+
Attempt_means,TRUE,
|
82 |
+
Attempt_obtain_food_scenario,FALSE,
|
83 |
+
Attempt_obtain_mineral_scenario,FALSE,
|
84 |
+
Attempt_suasion,TRUE,
|
85 |
+
Attempting_and_resolving_scenario,FALSE,
|
86 |
+
Attending,TRUE,
|
87 |
+
Attention,TRUE,
|
88 |
+
Attention_getting,FALSE,
|
89 |
+
Attitude_description,FALSE,
|
90 |
+
Attributed_information,FALSE,
|
91 |
+
Attributes,FALSE,
|
92 |
+
Authority,TRUE,
|
93 |
+
Avoiding,TRUE,
|
94 |
+
Awareness,TRUE,
|
95 |
+
Awareness_change_scenario,FALSE,
|
96 |
+
Awareness_situation,FALSE,
|
97 |
+
Awareness_status,FALSE,
|
98 |
+
Bail_decision,TRUE,
|
99 |
+
Basis_for_attribute,FALSE,
|
100 |
+
Be_in_agreement_on_action,TRUE,
|
101 |
+
Be_in_agreement_on_assessment,TRUE,
|
102 |
+
Be_on_alert,FALSE,
|
103 |
+
Be_subset_of,FALSE,
|
104 |
+
Be_translation_equivalent,FALSE,
|
105 |
+
Bearing_arms,TRUE,
|
106 |
+
Beat_opponent,TRUE,
|
107 |
+
Becoming,FALSE,
|
108 |
+
Becoming_a_member,TRUE,
|
109 |
+
Becoming_attached,FALSE,
|
110 |
+
Becoming_aware,TRUE,
|
111 |
+
Becoming_detached,FALSE,
|
112 |
+
Becoming_dry,FALSE,
|
113 |
+
Becoming_separated,FALSE,
|
114 |
+
Becoming_silent,TRUE,
|
115 |
+
Becoming_visible,FALSE,
|
116 |
+
Behind_the_scenes,TRUE,
|
117 |
+
Being_active,TRUE,
|
118 |
+
Being_at_risk,FALSE,
|
119 |
+
Being_attached,FALSE,
|
120 |
+
Being_awake,TRUE,
|
121 |
+
Being_born,FALSE,
|
122 |
+
Being_contained_within,FALSE,
|
123 |
+
Being_detached,FALSE,
|
124 |
+
Being_dry,FALSE,
|
125 |
+
Being_employed,TRUE,
|
126 |
+
Being_in_captivity,FALSE,
|
127 |
+
Being_in_category,FALSE,
|
128 |
+
Being_in_control,TRUE,
|
129 |
+
Being_in_effect,FALSE,
|
130 |
+
Being_in_operation,TRUE,
|
131 |
+
Being_incarcerated,TRUE,e.g. ‘serve time’
|
132 |
+
Being_included,FALSE,
|
133 |
+
Being_located,TRUE,e.g. ‘sit on’
|
134 |
+
Being_named,FALSE,
|
135 |
+
Being_necessary,FALSE,
|
136 |
+
Being_obligated,FALSE,
|
137 |
+
Being_obligatory,FALSE,"? only verb ‘behoove’, not sure about properties"
|
138 |
+
Being_operational,TRUE,"e.g. ‘works, is working’"
|
139 |
+
Being_pregnant,FALSE,
|
140 |
+
Being_questionable,FALSE,
|
141 |
+
Being_relevant,FALSE,"? ‘pertain’ seems ergative syntactically, but not sure"
|
142 |
+
Being_rotted,FALSE,
|
143 |
+
Being_up_to_it,FALSE,
|
144 |
+
Being_wet,FALSE,
|
145 |
+
Besieging,TRUE,
|
146 |
+
Beyond_compare,FALSE,
|
147 |
+
Billing,TRUE,
|
148 |
+
Biological_area,FALSE,
|
149 |
+
Biological_classification,FALSE,
|
150 |
+
Biological_entity,FALSE,
|
151 |
+
Biological_mechanisms,TRUE,
|
152 |
+
Biological_urge,FALSE,
|
153 |
+
Birth_scenario,FALSE,
|
154 |
+
Board_vehicle,TRUE,
|
155 |
+
Body_decoration,FALSE,
|
156 |
+
Body_description_holistic,FALSE,
|
157 |
+
Body_description_part,FALSE,
|
158 |
+
Body_mark,FALSE,
|
159 |
+
Body_movement,TRUE,
|
160 |
+
Body_parts,FALSE,
|
161 |
+
Bond_maturation,TRUE,
|
162 |
+
Borrowing,TRUE,
|
163 |
+
Boundary,FALSE,
|
164 |
+
Bounded_entity,FALSE,
|
165 |
+
Bounded_region,FALSE,
|
166 |
+
Bragging,TRUE,
|
167 |
+
Breaking_apart,FALSE,
|
168 |
+
Breaking_off,FALSE,
|
169 |
+
Breaking_out_captive,TRUE,
|
170 |
+
Breathing,TRUE,
|
171 |
+
Bringing,TRUE,
|
172 |
+
Building,TRUE,
|
173 |
+
Building_subparts,FALSE,
|
174 |
+
Buildings,FALSE,
|
175 |
+
Bungling,TRUE,
|
176 |
+
Burying,TRUE,
|
177 |
+
Business_closure,TRUE,'founder’ seems ambiguous (?foundered efforts? foundered business?) but ‘close doors’ is clearly unergative
|
178 |
+
Businesses,FALSE,
|
179 |
+
Cache,FALSE,
|
180 |
+
Calendric_unit,FALSE,
|
181 |
+
Candidness,FALSE,
|
182 |
+
Capability,TRUE,
|
183 |
+
Capacity,FALSE,verbs difficult to judge (e.g. ‘a stadium seats’ ??)
|
184 |
+
Capital_stock,FALSE,
|
185 |
+
Cardinal_numbers,FALSE,
|
186 |
+
Carry_goods,TRUE,
|
187 |
+
Catastrophe,TRUE,
|
188 |
+
Catching_fire,FALSE,
|
189 |
+
Categorization,TRUE,
|
190 |
+
Causation,TRUE,
|
191 |
+
Causation_scenario,TRUE,
|
192 |
+
Cause_benefit_or_detriment,TRUE,
|
193 |
+
Cause_bodily_experience,TRUE,
|
194 |
+
Cause_change,TRUE,
|
195 |
+
Cause_change_of_consistency,TRUE,
|
196 |
+
Cause_change_of_phase,TRUE,
|
197 |
+
Cause_change_of_position_on_a_scale,TRUE,
|
198 |
+
Cause_change_of_strength,TRUE,
|
199 |
+
Cause_emotion,TRUE,
|
200 |
+
Cause_expansion,TRUE,
|
201 |
+
Cause_fluidic_motion,TRUE,
|
202 |
+
Cause_harm,TRUE,
|
203 |
+
Cause_impact,TRUE,
|
204 |
+
Cause_motion,TRUE,
|
205 |
+
Cause_proliferation_in_number,TRUE,
|
206 |
+
Cause_temperature_change,TRUE,
|
207 |
+
Cause_to_amalgamate,TRUE,
|
208 |
+
Cause_to_be_dry,TRUE,
|
209 |
+
Cause_to_be_included,TRUE,
|
210 |
+
Cause_to_be_sharp,TRUE,
|
211 |
+
Cause_to_be_wet,TRUE,
|
212 |
+
Cause_to_burn,TRUE,
|
213 |
+
Cause_to_continue,TRUE,
|
214 |
+
Cause_to_end,TRUE,
|
215 |
+
Cause_to_experience,TRUE,
|
216 |
+
Cause_to_fragment,TRUE,
|
217 |
+
Cause_to_land,TRUE,
|
218 |
+
Cause_to_make_noise,TRUE,
|
219 |
+
Cause_to_make_progress,TRUE,
|
220 |
+
Cause_to_move_in_place,TRUE,
|
221 |
+
Cause_to_perceive,TRUE,
|
222 |
+
Cause_to_resume,TRUE,
|
223 |
+
Cause_to_rot,TRUE,
|
224 |
+
Cause_to_start,TRUE,
|
225 |
+
Cause_to_wake,TRUE,
|
226 |
+
Ceasing_to_be,FALSE,
|
227 |
+
Certainty,TRUE,
|
228 |
+
Change_accessibility,TRUE,
|
229 |
+
Change_direction,TRUE,"1st FE is called Theme, but verbs (‘turn’) pass test"
|
230 |
+
Change_event_duration,TRUE,
|
231 |
+
Change_event_time,TRUE,
|
232 |
+
Change_of_consistency,FALSE,
|
233 |
+
Change_of_leadership,TRUE,Selector is not 1st FE but seems to be the subj of all of the verbs in the frame
|
234 |
+
Change_of_phase,FALSE,'the lake froze’ → ‘the frozen lake’ => unaccusative
|
235 |
+
Change_of_phase_scenario,FALSE,
|
236 |
+
Change_of_quantity_of_possession,TRUE,
|
237 |
+
Change_of_temperature,FALSE,
|
238 |
+
Change_operational_state,TRUE,
|
239 |
+
Change_position_on_a_scale,FALSE,
|
240 |
+
Change_post-state,FALSE,
|
241 |
+
Change_posture,TRUE,
|
242 |
+
Change_resistance,TRUE,
|
243 |
+
Change_tool,TRUE,
|
244 |
+
Chaos,FALSE,
|
245 |
+
Chatting,TRUE,
|
246 |
+
Chemical-sense_description,TRUE,
|
247 |
+
Chemical_potency,FALSE,
|
248 |
+
Choosing,TRUE,
|
249 |
+
Circumscribed_existence,FALSE,
|
250 |
+
Citing,TRUE,
|
251 |
+
Claim_ownership,TRUE,
|
252 |
+
Clemency,FALSE,no verbs but describes active event
|
253 |
+
Closure,TRUE,
|
254 |
+
Clothing,FALSE,
|
255 |
+
Clothing_parts,FALSE,
|
256 |
+
Co-association,FALSE,
|
257 |
+
Cogitation,TRUE,
|
258 |
+
Cognitive_connection,TRUE,
|
259 |
+
Coincidence,FALSE,
|
260 |
+
Collaboration,TRUE,
|
261 |
+
Collocation_image_schema,FALSE,
|
262 |
+
Colonization,TRUE,
|
263 |
+
Color,FALSE,
|
264 |
+
Color_qualities,FALSE,
|
265 |
+
Come_down_with,TRUE,
|
266 |
+
Come_into_effect,FALSE,"not sure, ‘terms and conditions apply’ → ‘the applied terms and conditions’?"
|
267 |
+
Come_together,TRUE,
|
268 |
+
Coming_to_be,FALSE,
|
269 |
+
Coming_to_believe,TRUE,
|
270 |
+
Coming_up_with,TRUE,
|
271 |
+
Commemorative,FALSE,
|
272 |
+
Commerce_buy,TRUE,
|
273 |
+
Commerce_collect,TRUE,
|
274 |
+
Commerce_goods-transfer,TRUE,
|
275 |
+
Commerce_money-transfer,TRUE,
|
276 |
+
Commerce_pay,TRUE,
|
277 |
+
Commerce_scenario,TRUE,
|
278 |
+
Commerce_sell,TRUE,
|
279 |
+
Commercial_transaction,TRUE,
|
280 |
+
Commitment,TRUE,
|
281 |
+
Committing_crime,TRUE,
|
282 |
+
Commonality,FALSE,
|
283 |
+
Communicate_categorization,TRUE,
|
284 |
+
Communication,TRUE,
|
285 |
+
Communication_manner,TRUE,
|
286 |
+
Communication_means,TRUE,
|
287 |
+
Communication_noise,TRUE,
|
288 |
+
Communication_response,TRUE,
|
289 |
+
Commutation,TRUE,
|
290 |
+
Commutative_process,TRUE,"hard to judge, no examples"
|
291 |
+
Commutative_statement,FALSE,
|
292 |
+
Compatibility,FALSE,"Difficult, but it’s 'rhyming words’ not ‘rhymed words’; ‘matched’/’matching’ both possible but ‘matched’ implies an agent (probably a different frame)"
|
293 |
+
Competition,TRUE,
|
294 |
+
Complaining,TRUE,
|
295 |
+
Completeness,FALSE,
|
296 |
+
Compliance,TRUE,e.g. ‘obey’
|
297 |
+
Concessive,FALSE,
|
298 |
+
Condition_symptom_relation,TRUE,
|
299 |
+
Conditional_occurrence,FALSE,
|
300 |
+
Conditional_scenario,FALSE,
|
301 |
+
Conduct,TRUE,
|
302 |
+
Confronting_problem,TRUE,
|
303 |
+
Connecting_architecture,FALSE,
|
304 |
+
Connectors,FALSE,
|
305 |
+
Conquering,TRUE,
|
306 |
+
Contact_image_schema,FALSE,
|
307 |
+
Contacting,TRUE,
|
308 |
+
Container_focused_placing,TRUE,
|
309 |
+
Container_focused_removing,TRUE,
|
310 |
+
Containers,FALSE,
|
311 |
+
Containing,TRUE,
|
312 |
+
Containment_scenario,FALSE,
|
313 |
+
Contingency,TRUE,
|
314 |
+
Continued_state_of_affairs,FALSE,
|
315 |
+
Contrary_circumstances,FALSE,
|
316 |
+
Contrition,TRUE,
|
317 |
+
Control,TRUE,
|
318 |
+
Controller_object,FALSE,
|
319 |
+
Convey_importance,TRUE,
|
320 |
+
Convoy,FALSE,
|
321 |
+
Cooking_creation,TRUE,
|
322 |
+
Corporal_punishment,TRUE,
|
323 |
+
Correctness,FALSE,
|
324 |
+
Corroding,FALSE,
|
325 |
+
Corroding_caused,TRUE,
|
326 |
+
Cotheme,TRUE,"1st FE is called Theme, but verbs (e.g. ‘follow’) seem agentive"
|
327 |
+
Counterattack,TRUE,
|
328 |
+
Court_examination,TRUE,
|
329 |
+
Craft,FALSE,
|
330 |
+
Create_physical_artwork,TRUE,
|
331 |
+
Create_representation,TRUE,
|
332 |
+
Creating,TRUE,
|
333 |
+
Crime_scenario,FALSE,
|
334 |
+
Criminal_investigation,TRUE,
|
335 |
+
Criminal_process,FALSE,
|
336 |
+
Cure,TRUE,
|
337 |
+
Custom,FALSE,
|
338 |
+
Cutting,TRUE,
|
339 |
+
Cycle_of_existence_scenario,FALSE,
|
340 |
+
Cycle_of_life_and_death,FALSE,
|
341 |
+
Damaging,TRUE,
|
342 |
+
Daring,TRUE,
|
343 |
+
Dead_or_alive,FALSE,
|
344 |
+
Death,FALSE,
|
345 |
+
Deception_end,TRUE,
|
346 |
+
Deception_scenario,FALSE,
|
347 |
+
Deception_success,TRUE,
|
348 |
+
Deciding,TRUE,
|
349 |
+
Defending,TRUE,
|
350 |
+
Degree,FALSE,
|
351 |
+
Degree_of_processing,FALSE,
|
352 |
+
Delimitation_of_diversity,FALSE,
|
353 |
+
Delimited_state_scenario,FALSE,
|
354 |
+
Delivery,TRUE,
|
355 |
+
Deny_or_grant_permission,TRUE,
|
356 |
+
Departing,TRUE,Directional movement verbs
|
357 |
+
Deserving,TRUE,"verbs don’t seem to be able to be used intransitively, but seem ‘active-like’ (e.g. ‘justify’)"
|
358 |
+
Desirability,TRUE,? e.g. ‘the movie rocks/sucks’ (metaphorical but derives from active verb?)
|
359 |
+
Desirable_event,TRUE,
|
360 |
+
Desiring,TRUE,
|
361 |
+
Destiny,FALSE,
|
362 |
+
Destroying,TRUE,
|
363 |
+
Detaching,TRUE,
|
364 |
+
Detaining,TRUE,
|
365 |
+
Detonate_explosive,TRUE,
|
366 |
+
Differentiation,TRUE,
|
367 |
+
Difficulty,FALSE,
|
368 |
+
Dimension,FALSE,"only one verb (‘measure’), cannot be intransitive"
|
369 |
+
Direction,FALSE,
|
370 |
+
Directional_locative_relation,FALSE,
|
371 |
+
Disaster_scenario,FALSE,
|
372 |
+
Discussion,TRUE,
|
373 |
+
Disembarking,TRUE,Directional movement verbs
|
374 |
+
Disgraceful_situation,FALSE,
|
375 |
+
Dispersal,TRUE,
|
376 |
+
Distance_scenario,FALSE,
|
377 |
+
Distant_operated_IED,FALSE,
|
378 |
+
Distinctiveness,TRUE,'characterize’
|
379 |
+
Distributed_abundanced,FALSE,
|
380 |
+
Distributed_position,TRUE,?? not sure any of the verbs can ever be intransitive
|
381 |
+
Diversity,FALSE,
|
382 |
+
Documents,FALSE,
|
383 |
+
Dodging,TRUE,
|
384 |
+
Domain,FALSE,
|
385 |
+
Dominate_competitor,TRUE,
|
386 |
+
Dominate_situation,TRUE,
|
387 |
+
Domination,TRUE,
|
388 |
+
Dough_rising,FALSE,
|
389 |
+
Downing,TRUE,
|
390 |
+
Dressing,TRUE,
|
391 |
+
Drop_in_on,TRUE,
|
392 |
+
Dunking,TRUE,
|
393 |
+
Duplication,TRUE,
|
394 |
+
Duration_description,FALSE,
|
395 |
+
Duration_relation,FALSE,'it lasted’ → *’the lasted thing’ (but ??’the persisted thing’)
|
396 |
+
Duration_scenario,FALSE,
|
397 |
+
Dying,FALSE,
|
398 |
+
Dynamic_situation_scenario,FALSE,
|
399 |
+
Dynamism,FALSE,
|
400 |
+
Earnings_and_losses,TRUE,
|
401 |
+
Eclipse,TRUE,
|
402 |
+
Economy,FALSE,
|
403 |
+
Education_teaching,TRUE,
|
404 |
+
Electricity,FALSE,
|
405 |
+
Elusive_goal,FALSE,
|
406 |
+
Emanating,FALSE,
|
407 |
+
Emergency,FALSE,
|
408 |
+
Emergency_fire,FALSE,
|
409 |
+
Emitting,TRUE,
|
410 |
+
Emotion_directed,TRUE,
|
411 |
+
Emotion_heat,TRUE,
|
412 |
+
Emotions,FALSE,
|
413 |
+
Emotions_by_possibility,FALSE,
|
414 |
+
Emotions_by_stimulus,FALSE,
|
415 |
+
Emotions_of_mental_activity,TRUE,
|
416 |
+
Emotions_success_or_failure,FALSE,Contains no verbs but theoretically possible that it would?
|
417 |
+
Emphasizing,TRUE,
|
418 |
+
Employee_scenario,FALSE,
|
419 |
+
Employer_scenario,FALSE,
|
420 |
+
Employing,TRUE,
|
421 |
+
Employment_continue,FALSE,
|
422 |
+
Employment_end,FALSE,
|
423 |
+
Employment_scenario,FALSE,
|
424 |
+
Employment_start,FALSE,
|
425 |
+
Emptying,TRUE,
|
426 |
+
Encoding,TRUE,
|
427 |
+
Encounter,TRUE,
|
428 |
+
Endangering,TRUE,
|
429 |
+
Endeavor_failure,FALSE,
|
430 |
+
Enforcing,TRUE,
|
431 |
+
Enter_awareness,TRUE,
|
432 |
+
Entering_of_plea,TRUE,?
|
433 |
+
Entity,FALSE,
|
434 |
+
Entourage,FALSE,
|
435 |
+
Erasing,TRUE,
|
436 |
+
Escaping,TRUE,
|
437 |
+
Estimated_value,FALSE,
|
438 |
+
Estimating,TRUE,
|
439 |
+
Evading,TRUE,
|
440 |
+
Evaluative_comparison,TRUE,
|
441 |
+
Event,FALSE,
|
442 |
+
Event_endstate,FALSE,
|
443 |
+
Event_initial_state,FALSE,
|
444 |
+
Event_instance,FALSE,
|
445 |
+
Eventive_affecting,FALSE,
|
446 |
+
Eventive_cognizer_affecting,TRUE,
|
447 |
+
Evidence,TRUE,
|
448 |
+
Evoking,TRUE,
|
449 |
+
Examination,TRUE,
|
450 |
+
Exchange,TRUE,
|
451 |
+
Exchange_currency,TRUE,
|
452 |
+
Exclude_member,TRUE,
|
453 |
+
Excreting,TRUE,
|
454 |
+
Execute_plan,TRUE,
|
455 |
+
Execution,TRUE,
|
456 |
+
Exemplar,FALSE,
|
457 |
+
Exemplariness,FALSE,
|
458 |
+
Exercising,TRUE,
|
459 |
+
Existence,FALSE,
|
460 |
+
Expansion,FALSE,
|
461 |
+
Expectation,TRUE,
|
462 |
+
Expected_location_of_person,FALSE,
|
463 |
+
Expend_resource,TRUE,
|
464 |
+
Expensiveness,TRUE,
|
465 |
+
Experience_bodily_harm,FALSE,syntactically ambiguous? ‘I broke my leg’ / ‘mi sono rotto la gamba’ / ‘je me suis cassé la jambe’ → not sure how to classify this construction in romance languages?
|
466 |
+
Experiencer_focused_emotion,TRUE,
|
467 |
+
Experimentation,TRUE,
|
468 |
+
Expertise,TRUE,
|
469 |
+
Explaining_the_facts,TRUE,
|
470 |
+
Explosion,FALSE,
|
471 |
+
Exporting,TRUE,
|
472 |
+
Expressing_publicly,TRUE,
|
473 |
+
Extradition,TRUE,
|
474 |
+
Extreme_point,FALSE,
|
475 |
+
Extreme_value,FALSE,
|
476 |
+
Facial_expression,FALSE,
|
477 |
+
Fairness_evaluation,FALSE,
|
478 |
+
Fall_asleep,FALSE,
|
479 |
+
Fall_for,TRUE,"'buy’, ‘swallow’ seem clear transitive, ‘fall for’ could maybe be unaccusative??"
|
480 |
+
Fame,TRUE,
|
481 |
+
Familiarity,TRUE,
|
482 |
+
Fastener,FALSE,
|
483 |
+
Fear,TRUE,
|
484 |
+
Feeling,TRUE,
|
485 |
+
Feigning,TRUE,
|
486 |
+
Fields,FALSE,
|
487 |
+
Fighting_activity,FALSE,
|
488 |
+
Filling,TRUE,
|
489 |
+
Fining,TRUE,
|
490 |
+
Finish_competition,TRUE,
|
491 |
+
Finish_game,TRUE,
|
492 |
+
Fire_break,FALSE,
|
493 |
+
Fire_burning,TRUE,
|
494 |
+
Fire_emergency_scenario,FALSE,
|
495 |
+
Fire_end_scenario,FALSE,
|
496 |
+
Fire_going_out,FALSE,
|
497 |
+
Fire_stopping_scenario,FALSE,
|
498 |
+
Firefighting,TRUE,
|
499 |
+
Firing,TRUE,
|
500 |
+
Firing_point,FALSE,
|
501 |
+
First_experience,FALSE,
|
502 |
+
First_rank,FALSE,
|
503 |
+
Fleeing,TRUE,
|
504 |
+
Fluidic_motion,TRUE,
|
505 |
+
Food,FALSE,
|
506 |
+
Food_gathering,TRUE,
|
507 |
+
Foreign_or_domestic_country,FALSE,
|
508 |
+
Forging,TRUE,
|
509 |
+
Forgiveness,TRUE,
|
510 |
+
Forgoing,TRUE,
|
511 |
+
Forming_relationships,TRUE,
|
512 |
+
Fragmentation_scenario,FALSE,
|
513 |
+
Freeing_from_confinement,TRUE,
|
514 |
+
Frequency,FALSE,
|
515 |
+
Friction,TRUE,"difficult, but “Theme exerts pressure and experiences resistance” could be interpreted as somehow (quasi-)agentive"
|
516 |
+
Friendly_or_hostile,FALSE,
|
517 |
+
Front_for,TRUE,(front.v ?)
|
518 |
+
Frugality,TRUE,
|
519 |
+
Fugitive,FALSE,
|
520 |
+
Fullness,FALSE,
|
521 |
+
Function,TRUE,"difficult: inanimate theme, but “exists to perform Activity” so perhaps (metaphorically) can be seen as active?"
|
522 |
+
Funding,TRUE,
|
523 |
+
Gathering_up,TRUE,
|
524 |
+
Gesture,TRUE,
|
525 |
+
Get_a_job,TRUE,
|
526 |
+
Getting,TRUE,"somewhat difficult: most verbs (obtain, acquire, procure) imply active action, but the definition of the frame and verbs like “get” can be seen as passive.
|
527 |
+
|
528 |
+
Interesting: in Dutch there is “verkrijgen” (obtain) vs. “krijgen” (get) where this distinction is encoded morphologically"
|
529 |
+
Getting_scenario,FALSE,
|
530 |
+
Getting_triggered,FALSE,
|
531 |
+
Getting_underway,TRUE,
|
532 |
+
Getting_up,TRUE,
|
533 |
+
Getting_vehicle_underway,TRUE,
|
534 |
+
Give_impression,TRUE,not sure but these all seem (semantically) causative somehow? e.g. “smell good” ~= “cause pleasant taste perception”?
|
535 |
+
Giving,TRUE,
|
536 |
+
Giving_birth,TRUE,
|
537 |
+
Giving_in,TRUE,
|
538 |
+
Giving_scenario,FALSE,
|
539 |
+
Gizmo,FALSE,
|
540 |
+
Go_into_shape,FALSE,"per description “Theme goes into a shape without being made to do so by an agent”, but does that make it agentive?"
|
541 |
+
Goal,FALSE,
|
542 |
+
Going_back_on_a_commitment,TRUE,
|
543 |
+
Government_institution,FALSE,
|
544 |
+
Gradable_artistic_quality,FALSE,
|
545 |
+
Gradable_attributes,FALSE,
|
546 |
+
Gradable_proximity,FALSE,
|
547 |
+
Graph_shape,FALSE,
|
548 |
+
Grasp,TRUE,
|
549 |
+
Grinding,TRUE,
|
550 |
+
Grooming,TRUE,
|
551 |
+
Ground_up,FALSE,
|
552 |
+
Growing_food,TRUE,
|
553 |
+
Guest_and_host,FALSE,
|
554 |
+
Guilt_or_innocence,FALSE,
|
555 |
+
Gusto,FALSE,
|
556 |
+
Hair_configuration,FALSE,
|
557 |
+
Halt,TRUE,"motion verbs (‘has stopped’ etc)
|
558 |
+
|
559 |
+
'stop’ seems to imply agent? ‘the cyclist stopped’, ‘the train stopped’ (metaphorical?), but ??the falling rock stopped (?)"
|
560 |
+
Have_as_requirement,TRUE,"metaphorical use of demand, require, take?"
|
561 |
+
Have_as_translation_equivalent,FALSE,
|
562 |
+
Have_associated,FALSE,metaphorical use of have?
|
563 |
+
Have_visitor_over,FALSE,
|
564 |
+
Having_commercial_agreement,FALSE,
|
565 |
+
Having_or_lacking_access,FALSE,"verb ‘access’ is agentive but seems to be misclassified in this frame (it means to enter, not to have access)"
|
566 |
+
Health_response,FALSE,
|
567 |
+
Hearsay,TRUE,
|
568 |
+
Heat_potential,FALSE,
|
569 |
+
Hedging,FALSE,
|
570 |
+
Heralding,TRUE,
|
571 |
+
Hiding_objects,TRUE,
|
572 |
+
Hindering,TRUE,
|
573 |
+
Hiring,TRUE,
|
574 |
+
Historic_event,FALSE,
|
575 |
+
History,FALSE,
|
576 |
+
History_scenario,FALSE,
|
577 |
+
Hit_or_miss,TRUE,
|
578 |
+
Hit_target,TRUE,
|
579 |
+
Holding_off_on,TRUE,
|
580 |
+
Hospitality,FALSE,
|
581 |
+
Hostile_encounter,TRUE,
|
582 |
+
Hunting,TRUE,
|
583 |
+
Hunting_scenario,FALSE,
|
584 |
+
Hunting_success_or_failure,TRUE,
|
585 |
+
Identicality,FALSE,
|
586 |
+
Identity,FALSE,
|
587 |
+
Identity_scenario,FALSE,
|
588 |
+
Idiosyncrasy,FALSE,
|
589 |
+
Image_schema,FALSE,
|
590 |
+
Imitating,TRUE,
|
591 |
+
Immobilization,TRUE,
|
592 |
+
Impact,TRUE,
|
593 |
+
Import_export_scenario,TRUE,
|
594 |
+
Importance,TRUE,
|
595 |
+
Importing,TRUE,
|
596 |
+
Imposing_obligation,TRUE,
|
597 |
+
Impression,FALSE,
|
598 |
+
Imprisonment,TRUE,
|
599 |
+
Improvement_or_decline,FALSE,
|
600 |
+
Improvised_explosive_device,FALSE,
|
601 |
+
Inclination,FALSE,
|
602 |
+
Inclusion,TRUE,
|
603 |
+
Inclusion_scenario,FALSE,
|
604 |
+
Increment,FALSE,
|
605 |
+
Indicating,TRUE,
|
606 |
+
Indigenous_origin,FALSE,
|
607 |
+
Individual_history,FALSE,
|
608 |
+
Ineffability,FALSE,
|
609 |
+
Infecting,TRUE,
|
610 |
+
Influencing_potential,FALSE,
|
611 |
+
Information,FALSE,
|
612 |
+
Information_display,FALSE,
|
613 |
+
Infrastructure,FALSE,
|
614 |
+
Ingest_substance,TRUE,
|
615 |
+
Ingestion,TRUE,
|
616 |
+
Ingredients,FALSE,
|
617 |
+
Inherent_purpose,FALSE,
|
618 |
+
Inhibit_motion_scenario,FALSE,
|
619 |
+
Inhibit_movement,TRUE,
|
620 |
+
Inspecting,TRUE,
|
621 |
+
Installing,TRUE,
|
622 |
+
Instance,FALSE,
|
623 |
+
Institutionalization,TRUE,
|
624 |
+
Institutions,FALSE,
|
625 |
+
Intentional_deception,TRUE,
|
626 |
+
Intentional_traversing,TRUE,
|
627 |
+
Intentionally_act,TRUE,
|
628 |
+
Intentionally_affect,TRUE,
|
629 |
+
Intentionally_create,TRUE,
|
630 |
+
Intercepting,TRUE,
|
631 |
+
Interior_profile_relation,FALSE,
|
632 |
+
Interrupt_process,TRUE,
|
633 |
+
Intoxicants,FALSE,
|
634 |
+
Intoxication,FALSE,
|
635 |
+
Invading,TRUE,
|
636 |
+
Invasion_scenario,FALSE,
|
637 |
+
Irregular_combatants,FALSE,
|
638 |
+
Isolated_places,FALSE,
|
639 |
+
Judgment,TRUE,
|
640 |
+
Judgment_communication,TRUE,
|
641 |
+
Judgment_direct_address,TRUE,
|
642 |
+
Judgment_of_intensity,FALSE,
|
643 |
+
Judicial_body,FALSE,
|
644 |
+
Jury_deliberation,TRUE,
|
645 |
+
Just_found_out,FALSE,
|
646 |
+
Justifying,TRUE,
|
647 |
+
Key,FALSE,
|
648 |
+
Kidnapping,TRUE,
|
649 |
+
Killing,TRUE,
|
650 |
+
Kinship,FALSE,
|
651 |
+
Knot_creation,TRUE,
|
652 |
+
Knot_creation_scenario,FALSE,
|
653 |
+
Labeling,TRUE,
|
654 |
+
Labor_product,FALSE,
|
655 |
+
Launch_process,TRUE,
|
656 |
+
Law,FALSE,
|
657 |
+
Law_enforcement_agency,FALSE,
|
658 |
+
Leadership,TRUE,
|
659 |
+
Leaving_traces,FALSE,
|
660 |
+
Left_to_do,FALSE,
|
661 |
+
Legal_rulings,TRUE,
|
662 |
+
Legality,FALSE,
|
663 |
+
Lending,TRUE,
|
664 |
+
Level_of_force_exertion,FALSE,
|
665 |
+
Level_of_force_resistance,FALSE,
|
666 |
+
Level_of_light,FALSE,
|
667 |
+
Light_movement,TRUE,metaphorical
|
668 |
+
Likelihood,FALSE,
|
669 |
+
Limitation,FALSE,
|
670 |
+
Limiting,TRUE,
|
671 |
+
Linguistic_meaning,TRUE,? metaphorical active?
|
672 |
+
Lively_place,TRUE,? ‘the park buzzes’ can be seen as metaphorically active?
|
673 |
+
Living_conditions,FALSE,
|
674 |
+
Locale,FALSE,
|
675 |
+
Locale_by_characteristic_entity,FALSE,
|
676 |
+
Locale_by_collocation,FALSE,
|
677 |
+
Locale_by_event,FALSE,
|
678 |
+
Locale_by_ownership,FALSE,
|
679 |
+
Locale_by_use,FALSE,
|
680 |
+
Locale_closure,FALSE,
|
681 |
+
Locating,TRUE,
|
682 |
+
Location_in_time,FALSE,
|
683 |
+
Location_of_light,FALSE,seems anticausative of Light_movement?
|
684 |
+
Location_on_path,FALSE,
|
685 |
+
Locative_relation,TRUE,
|
686 |
+
Locative_scenario,FALSE,
|
687 |
+
Lodging_scenario,FALSE,
|
688 |
+
Lose_possession,FALSE,
|
689 |
+
Lose_possession_scenario,FALSE,
|
690 |
+
Losing,TRUE,'losing sth’ could be both passive or active; in Dutch “heb verloren/ben verloren” both possible (sensitive to the contrast?)
|
691 |
+
Losing_it,TRUE,
|
692 |
+
Losing_someone,TRUE,
|
693 |
+
Losing_track_of,FALSE,
|
694 |
+
Losing_track_of_perceiver,TRUE,
|
695 |
+
Losing_track_of_theme,TRUE,
|
696 |
+
Luck,FALSE,
|
697 |
+
Make_acquaintance,TRUE,
|
698 |
+
Make_agreement_on_action,TRUE,
|
699 |
+
Make_cognitive_connection,TRUE,
|
700 |
+
Make_compromise,TRUE,
|
701 |
+
Make_noise,TRUE,
|
702 |
+
Making_arrangements,TRUE,
|
703 |
+
Making_faces,TRUE,
|
704 |
+
Manipulate_into_doing,TRUE,
|
705 |
+
Manipulate_into_shape,TRUE,
|
706 |
+
Manipulation,TRUE,
|
707 |
+
Manner,FALSE,
|
708 |
+
Manner_of_life,TRUE,
|
709 |
+
Manufacturing,TRUE,
|
710 |
+
Margin_of_resolution,FALSE,
|
711 |
+
Mass_motion,FALSE,?
|
712 |
+
Mathematical_relationship,FALSE,
|
713 |
+
Means,FALSE,
|
714 |
+
Measurable_attributes,FALSE,
|
715 |
+
Measure_area,FALSE,
|
716 |
+
Measure_by_action,FALSE,
|
717 |
+
Measure_duration,FALSE,
|
718 |
+
Measure_mass,FALSE,
|
719 |
+
Measure_of_distance_and_length,FALSE,
|
720 |
+
Measure_scenario,FALSE,
|
721 |
+
Measure_volume,FALSE,
|
722 |
+
Measures,FALSE,
|
723 |
+
Medical_conditions,FALSE,
|
724 |
+
Medical_instruments,FALSE,
|
725 |
+
Medical_interaction_scenario,FALSE,
|
726 |
+
Medical_intervention,TRUE,
|
727 |
+
Medical_professionals,FALSE,
|
728 |
+
Medical_specialties,FALSE,
|
729 |
+
Medium,FALSE,
|
730 |
+
Meet_specifications,TRUE,
|
731 |
+
Meet_with,TRUE,
|
732 |
+
Meet_with_response,TRUE,? ‘meet with criticism’ ~= elicit criticism?
|
733 |
+
Member_of_military,FALSE,
|
734 |
+
Membership,FALSE,? belong
|
735 |
+
Memorization,TRUE,
|
736 |
+
Memory,TRUE,
|
737 |
+
Mental_activity,FALSE,
|
738 |
+
Mental_property,FALSE,
|
739 |
+
Mental_stimulus_exp_focus,FALSE,
|
740 |
+
Mental_stimulus_stimulus_focus,FALSE,
|
741 |
+
Mention,TRUE,
|
742 |
+
Military,FALSE,
|
743 |
+
Military_operation,TRUE,
|
744 |
+
Mining,TRUE,
|
745 |
+
Misdeed,TRUE,
|
746 |
+
Money,FALSE,
|
747 |
+
Morality_evaluation,FALSE,
|
748 |
+
Motion,FALSE,"? this frame is meant to have the non-active versions of the motion verbs (as opposed to Self_motion, Operate_vehicle etc), but in practice it seems many examples are actually active"
|
749 |
+
Motion_directional,FALSE,?
|
750 |
+
Motion_noise,FALSE,?
|
751 |
+
Motion_scenario,FALSE,
|
752 |
+
Moving_in_place,FALSE,?
|
753 |
+
Name_conferral,TRUE,
|
754 |
+
Namesake,FALSE,
|
755 |
+
Natural_features,FALSE,
|
756 |
+
Needing,TRUE,
|
757 |
+
Negation,FALSE,
|
758 |
+
Negative_conditional,FALSE,
|
759 |
+
Network,FALSE,
|
760 |
+
Noise_makers,FALSE,
|
761 |
+
Non-commutative_process,TRUE,active when Calculator (non-core) is present; there are no examples so not clear how often this is the case
|
762 |
+
Non-commutative_statement,FALSE,
|
763 |
+
Non-gradable_proximity,FALSE,
|
764 |
+
Noncombatant,FALSE,
|
765 |
+
Notability,FALSE,
|
766 |
+
Notification_of_charges,TRUE,
|
767 |
+
Nuclear_process,FALSE,
|
768 |
+
Objective_influence,TRUE,
|
769 |
+
Obligation_scenario,FALSE,
|
770 |
+
Obscurity,FALSE,
|
771 |
+
Obviousness,FALSE,
|
772 |
+
Occupy_rank,FALSE,'he ranks second’ ~= ‘he is ranked second’
|
773 |
+
Offenses,FALSE,
|
774 |
+
Offering,TRUE,
|
775 |
+
Offshoot,FALSE,
|
776 |
+
Omen,TRUE,Predictive_phenomenon (quasi-actively) provides cues for something
|
777 |
+
Ontogeny,FALSE,
|
778 |
+
Openness,FALSE,
|
779 |
+
Operate_vehicle,TRUE,
|
780 |
+
Operate_vehicle_scenario,FALSE,
|
781 |
+
Operating_a_system,TRUE,
|
782 |
+
Operational_testing,TRUE,
|
783 |
+
Opinion,TRUE,
|
784 |
+
Opportunity,FALSE,
|
785 |
+
Optical_image,FALSE,
|
786 |
+
Ordinal_numbers,FALSE,
|
787 |
+
Organization,FALSE,
|
788 |
+
Origin,FALSE,
|
789 |
+
Others_situation_as_stimulus,TRUE,
|
790 |
+
Out_of_existence,FALSE,
|
791 |
+
Pardon,TRUE,
|
792 |
+
Part_edge,FALSE,
|
793 |
+
Part_inner_outer,FALSE,
|
794 |
+
Part_ordered_segments,FALSE,
|
795 |
+
Part_orientational,FALSE,
|
796 |
+
Part_piece,FALSE,
|
797 |
+
Part_whole,FALSE,
|
798 |
+
Partiality,TRUE,
|
799 |
+
Participation,TRUE,
|
800 |
+
Partitive,FALSE,
|
801 |
+
Passing,TRUE,
|
802 |
+
Passing_off,TRUE,
|
803 |
+
Path_shape,FALSE,? are syntactically unergative
|
804 |
+
Path_traveled,FALSE,
|
805 |
+
Patrolling,TRUE,
|
806 |
+
Pattern,FALSE,
|
807 |
+
People,FALSE,
|
808 |
+
People_along_political_spectrum,FALSE,
|
809 |
+
People_by_age,FALSE,
|
810 |
+
People_by_jurisdiction,FALSE,
|
811 |
+
People_by_military_specialty,FALSE,
|
812 |
+
People_by_morality,FALSE,
|
813 |
+
People_by_origin,FALSE,
|
814 |
+
People_by_religion,FALSE,
|
815 |
+
People_by_residence,FALSE,
|
816 |
+
People_by_vocation,FALSE,
|
817 |
+
Perception,FALSE,
|
818 |
+
Perception_active,TRUE,
|
819 |
+
Perception_body,TRUE,? 'my head hurts’ → ‘my head causes pain sensation’ (?)
|
820 |
+
Perception_experience,TRUE,"? these are ‘passive experience’ verbs, but syntactically active/unergative; you can argue this kind of perception still implies active processing on the part of the experiencer"
|
821 |
+
Performers,FALSE,
|
822 |
+
Performers_and_roles,TRUE,"'feature.v’ is an odd one because agent/patient are reversed (NB only one annotated example, FEs do not seem correct)"
|
823 |
+
Performing_arts,FALSE,
|
824 |
+
Personal_relationship,TRUE,"Mostly nouns/adj, but verbs 'sleep with’, ‘befriend’ are active"
|
825 |
+
Personal_success,TRUE,
|
826 |
+
Physical_artworks,FALSE,
|
827 |
+
Physical_entity,FALSE,
|
828 |
+
Piracy,TRUE,
|
829 |
+
Placing,TRUE,
|
830 |
+
Placing_scenario,FALSE,
|
831 |
+
Planned_trajectory,FALSE,
|
832 |
+
Planting,TRUE,
|
833 |
+
Plants,FALSE,
|
834 |
+
Point_of_dispute,FALSE,
|
835 |
+
Political_actions,TRUE,
|
836 |
+
Political_locales,FALSE,
|
837 |
+
Popularity,FALSE,
|
838 |
+
Posing_as,TRUE,
|
839 |
+
Position_on_a_scale,FALSE,
|
840 |
+
Possession,TRUE,? owning as action?
|
841 |
+
Possibility,FALSE,
|
842 |
+
Post_getting,FALSE,
|
843 |
+
Post_giving,FALSE,
|
844 |
+
Post_lose_possession,FALSE,
|
845 |
+
Post_receiving,FALSE,
|
846 |
+
Post_transfer,FALSE,
|
847 |
+
Posture,TRUE,
|
848 |
+
Practice,TRUE,
|
849 |
+
Praiseworthiness,FALSE,
|
850 |
+
Prank,FALSE,
|
851 |
+
Pre_getting,FALSE,
|
852 |
+
Pre_giving,FALSE,
|
853 |
+
Pre_lose_possession,FALSE,
|
854 |
+
Pre_receiving,FALSE,
|
855 |
+
Pre_transfer,FALSE,
|
856 |
+
Precariousness,TRUE,only verb teeter.v
|
857 |
+
Precipitation,FALSE,
|
858 |
+
Predicament,FALSE,
|
859 |
+
Predicting,TRUE,
|
860 |
+
Preference,TRUE,
|
861 |
+
Preferred_alternative_scenario,FALSE,
|
862 |
+
Preliminaries,FALSE,
|
863 |
+
Presence,FALSE,
|
864 |
+
Presentation_of_mitigation,FALSE,
|
865 |
+
Preserving,TRUE,
|
866 |
+
Prevarication,TRUE,
|
867 |
+
Prevent_or_allow_possession,TRUE,
|
868 |
+
Preventing_or_letting,TRUE,
|
869 |
+
Price_per_unit,FALSE,
|
870 |
+
Prison,FALSE,
|
871 |
+
Probability,FALSE,
|
872 |
+
Process,FALSE,
|
873 |
+
Process_completed_state,FALSE,
|
874 |
+
Process_continue,FALSE,
|
875 |
+
Process_end,FALSE,
|
876 |
+
Process_initial_state,FALSE,
|
877 |
+
Process_pause,FALSE,
|
878 |
+
Process_resume,FALSE,
|
879 |
+
Process_start,FALSE,
|
880 |
+
Process_stop,FALSE,
|
881 |
+
Process_stopped_state,FALSE,
|
882 |
+
Process_uncompleted_state,FALSE,
|
883 |
+
Processing_materials,TRUE,
|
884 |
+
Procreative_sex,TRUE,
|
885 |
+
Product_delivery,FALSE,
|
886 |
+
Product_development,TRUE,
|
887 |
+
Product_development_scenario,FALSE,
|
888 |
+
Product_line,FALSE,
|
889 |
+
Progression,FALSE,
|
890 |
+
Prohibiting_or_licensing,TRUE,
|
891 |
+
Project,FALSE,
|
892 |
+
Proliferating_in_number,FALSE,
|
893 |
+
Prominence,FALSE,
|
894 |
+
Proper_reference,FALSE,
|
895 |
+
Proportion,FALSE,
|
896 |
+
Proportional_quantity,FALSE,
|
897 |
+
Protecting,TRUE,
|
898 |
+
Protest,TRUE,
|
899 |
+
Provide_lodging,TRUE,
|
900 |
+
Proximity_image_schema,FALSE,
|
901 |
+
Public_services,FALSE,
|
902 |
+
Publishing,TRUE,
|
903 |
+
Punctual_perception,FALSE,
|
904 |
+
Purpose,FALSE,
|
905 |
+
Putting_out_fire,TRUE,
|
906 |
+
Quantified_mass,FALSE,
|
907 |
+
Quantity,FALSE,
|
908 |
+
Quarreling,TRUE,
|
909 |
+
Questioning,TRUE,
|
910 |
+
Quitting,TRUE,
|
911 |
+
Quitting_a_place,TRUE,
|
912 |
+
Race_descriptor,FALSE,
|
913 |
+
Range,FALSE,
|
914 |
+
Rank,FALSE,
|
915 |
+
Ranked_expectation,FALSE,
|
916 |
+
Rape,TRUE,
|
917 |
+
Rashness,FALSE,
|
918 |
+
Rate_description,FALSE,
|
919 |
+
Rate_quantification,FALSE,
|
920 |
+
Ratification,TRUE,
|
921 |
+
Reading_activity,TRUE,
|
922 |
+
Reading_aloud,TRUE,
|
923 |
+
Reading_perception,TRUE,
|
924 |
+
Reason,FALSE,
|
925 |
+
Reasoning,TRUE,
|
926 |
+
Reassuring,TRUE,
|
927 |
+
Rebellion,TRUE,
|
928 |
+
Receive_visitor_scenario,FALSE,
|
929 |
+
Receiving,TRUE,
|
930 |
+
Receiving_scenario,FALSE,
|
931 |
+
Reciprocality,FALSE,
|
932 |
+
Recording,TRUE,
|
933 |
+
Records,FALSE,
|
934 |
+
Recovery,FALSE,
|
935 |
+
Redirecting,TRUE,
|
936 |
+
Reference_text,FALSE,
|
937 |
+
Referring_by_name,TRUE,
|
938 |
+
Reforming_a_system,TRUE,
|
939 |
+
Regard,TRUE,
|
940 |
+
Region_with_portal,FALSE,
|
941 |
+
Reject_leadership,TRUE,does not exist in NLTK
|
942 |
+
Rejuvenation,TRUE,
|
943 |
+
Relating_concepts,TRUE,?
|
944 |
+
Relation,FALSE,
|
945 |
+
Relation_between_individuals,FALSE,
|
946 |
+
Relational_location,FALSE,
|
947 |
+
Relational_natural_features,FALSE,
|
948 |
+
Relational_political_locales,FALSE,
|
949 |
+
Relational_quantity,FALSE,
|
950 |
+
Relative_time,FALSE,?
|
951 |
+
Releasing,TRUE,
|
952 |
+
Releasing_from_custody,FALSE,
|
953 |
+
Reliance,TRUE,?
|
954 |
+
Reliance_on_expectation,TRUE,
|
955 |
+
Religious_belief,TRUE,
|
956 |
+
Remainder,FALSE,
|
957 |
+
Remembering_experience,TRUE,
|
958 |
+
Remembering_information,TRUE,
|
959 |
+
Remembering_to_do,TRUE,
|
960 |
+
Removing,TRUE,
|
961 |
+
Removing_scenario,FALSE,
|
962 |
+
Render_nonfunctional,TRUE,
|
963 |
+
Renting,TRUE,
|
964 |
+
Renting_out,TRUE,
|
965 |
+
Renunciation,TRUE,
|
966 |
+
Reparation,TRUE,
|
967 |
+
Repayment,TRUE,
|
968 |
+
Repel,TRUE,
|
969 |
+
Replacing,TRUE,
|
970 |
+
Reporting,TRUE,
|
971 |
+
Representative,FALSE,
|
972 |
+
Representing,TRUE,?
|
973 |
+
Request,TRUE,
|
974 |
+
Request_entity,TRUE,
|
975 |
+
Required_event,TRUE,?
|
976 |
+
Requirement_scenario,FALSE,
|
977 |
+
Rescuing,TRUE,
|
978 |
+
Research,TRUE,
|
979 |
+
Reserving,TRUE,
|
980 |
+
Reshaping,TRUE,
|
981 |
+
Residence,TRUE,
|
982 |
+
Resolve_problem,TRUE,
|
983 |
+
Respond_to_proposal,TRUE,
|
984 |
+
Response,TRUE,
|
985 |
+
Response_scenario,FALSE,
|
986 |
+
Responsibility,FALSE,
|
987 |
+
Rest,FALSE,
|
988 |
+
Result_of_attempt_scenario,FALSE,
|
989 |
+
Resurrection,TRUE,? similar to self_motion?
|
990 |
+
Retaining,TRUE,
|
991 |
+
Reveal_secret,TRUE,
|
992 |
+
Revenge,TRUE,
|
993 |
+
Revolution,FALSE,
|
994 |
+
Rewards_and_punishments,TRUE,
|
995 |
+
Ride_vehicle,TRUE,?
|
996 |
+
Rising_to_a_challenge,TRUE,
|
997 |
+
Risk_scenario,FALSE,
|
998 |
+
Risky_situation,FALSE,
|
999 |
+
Rite,TRUE,
|
1000 |
+
Roadways,FALSE,
|
1001 |
+
Robbery,TRUE,
|
1002 |
+
Rope_manipulation,TRUE,
|
1003 |
+
Rotting,FALSE,
|
1004 |
+
Run_risk,TRUE,
|
1005 |
+
Sacrificing_for,TRUE,
|
1006 |
+
Satisfying,TRUE,
|
1007 |
+
Scarcity,FALSE,
|
1008 |
+
Scheduling,TRUE,
|
1009 |
+
Scope,FALSE,
|
1010 |
+
Scouring,TRUE,
|
1011 |
+
Scrutinizing_for,FALSE,
|
1012 |
+
Scrutiny,TRUE,
|
1013 |
+
Searching_scenario,FALSE,
|
1014 |
+
Secrecy_status,FALSE,
|
1015 |
+
See_through,TRUE,
|
1016 |
+
Seeking,TRUE,
|
1017 |
+
Seeking_to_achieve,TRUE,
|
1018 |
+
Self_control,TRUE,
|
1019 |
+
Self_motion,TRUE,?
|
1020 |
+
Sending,TRUE,
|
1021 |
+
Sensation,FALSE,
|
1022 |
+
Sent_items,FALSE,
|
1023 |
+
Sentencing,TRUE,
|
1024 |
+
Separating,TRUE,
|
1025 |
+
Sequence,FALSE,
|
1026 |
+
Serving_in_capacity,TRUE,
|
1027 |
+
Set_of_interrelated_entities,FALSE,
|
1028 |
+
Set_relation,FALSE,
|
1029 |
+
Setting_back_burn,FALSE,
|
1030 |
+
Setting_fire,TRUE,
|
1031 |
+
Setting_out,TRUE,
|
1032 |
+
Severity_of_offense,FALSE,
|
1033 |
+
Sex,TRUE,
|
1034 |
+
Sexual_reproduction_scenario,FALSE,
|
1035 |
+
Shaped_part,FALSE,
|
1036 |
+
Shapes,FALSE,
|
1037 |
+
Sharing,TRUE,
|
1038 |
+
Sharpness,FALSE,
|
1039 |
+
Shoot_projectiles,TRUE,
|
1040 |
+
Shooting_scenario,FALSE,
|
1041 |
+
Shopping,TRUE,
|
1042 |
+
Short_selling,TRUE,
|
1043 |
+
Sidereal_appearance,FALSE,?
|
1044 |
+
Sign,TRUE,
|
1045 |
+
Sign_agreement,TRUE,
|
1046 |
+
Silencing,TRUE,
|
1047 |
+
Similarity,TRUE,? mimic etc
|
1048 |
+
Simple_name,FALSE,
|
1049 |
+
Simple_naming,TRUE,
|
1050 |
+
Simultaneity,FALSE,
|
1051 |
+
Size,FALSE,
|
1052 |
+
Sleep,TRUE,
|
1053 |
+
Sleep_wake_cycle,FALSE,
|
1054 |
+
Smuggling,TRUE,
|
1055 |
+
Soaking,TRUE,
|
1056 |
+
Soaking_up,TRUE,? metaphorical
|
1057 |
+
Sociability,FALSE,
|
1058 |
+
Social_behavior_evaluation,FALSE,
|
1059 |
+
Social_connection,FALSE,
|
1060 |
+
Social_desirability,FALSE,
|
1061 |
+
Social_event,FALSE,
|
1062 |
+
Social_event_collective,FALSE,
|
1063 |
+
Social_event_individuals,TRUE,
|
1064 |
+
Social_interaction_evaluation,FALSE,
|
1065 |
+
Socially_significant_history_scenario,FALSE,
|
1066 |
+
Sole_instance,FALSE,
|
1067 |
+
Sound_level,FALSE,
|
1068 |
+
Sound_movement,FALSE,?
|
1069 |
+
Sounds,FALSE,
|
1070 |
+
Source_of_getting,FALSE,
|
1071 |
+
Source_path_goal,FALSE,
|
1072 |
+
Spatial_co-location,FALSE,
|
1073 |
+
Spatial_contact,FALSE,
|
1074 |
+
Speak_on_topic,TRUE,
|
1075 |
+
Specific_individual,FALSE,
|
1076 |
+
Speed_description,FALSE,
|
1077 |
+
Spelling_and_pronouncing,TRUE,
|
1078 |
+
Sports_jargon,FALSE,
|
1079 |
+
Stage_of_progress,FALSE,
|
1080 |
+
Standing_by,TRUE,
|
1081 |
+
State,FALSE,
|
1082 |
+
State_continue,FALSE,
|
1083 |
+
State_of_entity,FALSE,
|
1084 |
+
Statement,TRUE,
|
1085 |
+
Stimulate_emotion,TRUE,? metaphorical
|
1086 |
+
Stimulus_focus,FALSE,
|
1087 |
+
Stinginess,TRUE,
|
1088 |
+
Store,FALSE,
|
1089 |
+
Storing,TRUE,
|
1090 |
+
Strictness,FALSE,
|
1091 |
+
Studying,TRUE,
|
1092 |
+
Suasion,TRUE,
|
1093 |
+
Subjective_influence,TRUE,
|
1094 |
+
Subjective_temperature,FALSE,?
|
1095 |
+
Submitting_documents,TRUE,
|
1096 |
+
Subordinates_and_superiors,TRUE,
|
1097 |
+
Subsisting,TRUE,
|
1098 |
+
Substance,FALSE,
|
1099 |
+
Substance_by_phase,FALSE,
|
1100 |
+
Subversion,TRUE,
|
1101 |
+
Success_or_failure,TRUE,
|
1102 |
+
Successful_action,TRUE,
|
1103 |
+
Successfully_communicate_message,TRUE,
|
1104 |
+
Sufficiency,FALSE,
|
1105 |
+
Suicide_attack,TRUE,? there are no verbs but there is an agentive example (‘they kamikazed the base’)
|
1106 |
+
Suitability,FALSE,
|
1107 |
+
Summarizing,TRUE,
|
1108 |
+
Supply,TRUE,
|
1109 |
+
Supporting,TRUE,
|
1110 |
+
Surpassing,TRUE,metaphorical
|
1111 |
+
Surrendering,TRUE,
|
1112 |
+
Surrendering_possession,TRUE,
|
1113 |
+
Surrounding,TRUE,
|
1114 |
+
Surviving,TRUE,
|
1115 |
+
Suspicion,TRUE,
|
1116 |
+
Symmetrical_collective_reciprocality,FALSE,
|
1117 |
+
System,FALSE,
|
1118 |
+
System_complexity,FALSE,
|
1119 |
+
Take_place_of,TRUE,
|
1120 |
+
Taking,TRUE,
|
1121 |
+
Taking_captive,TRUE,
|
1122 |
+
Taking_sides,TRUE,
|
1123 |
+
Taking_time,TRUE,
|
1124 |
+
Talking_into,TRUE,
|
1125 |
+
Tasting,TRUE,
|
1126 |
+
Team,FALSE,
|
1127 |
+
Telling,TRUE,
|
1128 |
+
Temperature,FALSE,
|
1129 |
+
Temporal_collocation,FALSE,
|
1130 |
+
Temporal_pattern,FALSE,
|
1131 |
+
Temporal_subregion,FALSE,
|
1132 |
+
Temporary_group,FALSE,
|
1133 |
+
Temporary_leave,TRUE,
|
1134 |
+
Temporary_stay,TRUE,
|
1135 |
+
Temporary_transfer_scenario,FALSE,
|
1136 |
+
Terms_of_agreement,FALSE,
|
1137 |
+
Terrorism,FALSE,
|
1138 |
+
Text,FALSE,
|
1139 |
+
Text_creation,TRUE,
|
1140 |
+
Theft,TRUE,
|
1141 |
+
Thermodynamic_phase,FALSE,
|
1142 |
+
Thriving,TRUE,
|
1143 |
+
Thwarting,TRUE,
|
1144 |
+
Time_period_of_action,FALSE,
|
1145 |
+
Time_vector,FALSE,
|
1146 |
+
Timespan,FALSE,
|
1147 |
+
Timetable,FALSE,
|
1148 |
+
Tolerating,TRUE,
|
1149 |
+
Tool_purpose,FALSE,
|
1150 |
+
Topic,TRUE,
|
1151 |
+
Touring,TRUE,
|
1152 |
+
Toxic_substance,FALSE,
|
1153 |
+
Transfer,TRUE,
|
1154 |
+
Transfer_scenario,FALSE,
|
1155 |
+
Transition_to_a_quality,FALSE,
|
1156 |
+
Transition_to_a_situation,FALSE,
|
1157 |
+
Transition_to_a_state,FALSE,
|
1158 |
+
Transitive_action,FALSE,
|
1159 |
+
Translating,TRUE,
|
1160 |
+
Transportation_status,FALSE,
|
1161 |
+
Trap,FALSE,
|
1162 |
+
Travel,TRUE,
|
1163 |
+
Traversing,TRUE,
|
1164 |
+
Treating_and_mistreating,TRUE,
|
1165 |
+
Trendiness,FALSE,
|
1166 |
+
Trial,FALSE,
|
1167 |
+
Triggering,TRUE,
|
1168 |
+
Trust,TRUE,
|
1169 |
+
Try_defendant,TRUE,
|
1170 |
+
Trying_out,TRUE,
|
1171 |
+
Turning_out,FALSE,
|
1172 |
+
Type,FALSE,
|
1173 |
+
Typicality,FALSE,
|
1174 |
+
Unattributed_information,TRUE,'rumor.v’ only exists in passive form
|
1175 |
+
Undergo_change,FALSE,
|
1176 |
+
Undergo_transformation,FALSE,
|
1177 |
+
Undergoing,FALSE,
|
1178 |
+
Undergoing_scenario,FALSE,
|
1179 |
+
Undressing,TRUE,
|
1180 |
+
Unemployment_rate,FALSE,
|
1181 |
+
Use_firearm,TRUE,
|
1182 |
+
Use_vehicle,FALSE,
|
1183 |
+
Used_up,FALSE,
|
1184 |
+
Usefulness,TRUE,
|
1185 |
+
Using,TRUE,
|
1186 |
+
Using_resource,TRUE,
|
1187 |
+
Vehicle,FALSE,
|
1188 |
+
Vehicle_departure_initial_stage,TRUE,? metaphorical
|
1189 |
+
Vehicle_landing,TRUE,? metaphorical
|
1190 |
+
Vehicle_subpart,FALSE,
|
1191 |
+
Verdict,TRUE,
|
1192 |
+
Verification,TRUE,
|
1193 |
+
Version_sequence,FALSE,
|
1194 |
+
Victim_operated_IED,FALSE,
|
1195 |
+
Violence,FALSE,
|
1196 |
+
Visit_host,FALSE,empty
|
1197 |
+
Visit_host_arrival,FALSE,empty
|
1198 |
+
Visit_host_departure,FALSE,empty
|
1199 |
+
Visit_host_stay,FALSE,empty
|
1200 |
+
Visiting,TRUE,
|
1201 |
+
Visiting_scenario,FALSE,
|
1202 |
+
Visiting_scenario_arrival,FALSE,
|
1203 |
+
Visiting_scenario_departing,FALSE,
|
1204 |
+
Visiting_scenario_stay,FALSE,
|
1205 |
+
Visitor_and_host,FALSE,
|
1206 |
+
Visitor_arrival,FALSE,
|
1207 |
+
Visitor_departure,FALSE,
|
1208 |
+
Visitor_scenario,FALSE,
|
1209 |
+
Vocalizations,FALSE,
|
1210 |
+
Volubility,FALSE,
|
1211 |
+
Wagering,TRUE,
|
1212 |
+
Waiting,TRUE,
|
1213 |
+
Waking_up,TRUE,
|
1214 |
+
Want_suspect,FALSE,
|
1215 |
+
Warning,TRUE,
|
1216 |
+
Waver_between_options,TRUE,
|
1217 |
+
Wealthiness,FALSE,
|
1218 |
+
Weapon,FALSE,
|
1219 |
+
Wearing,TRUE,
|
1220 |
+
Weather,FALSE,
|
1221 |
+
Wholes_and_parts,FALSE,
|
1222 |
+
Willingness,FALSE,
|
1223 |
+
Win_prize,TRUE,
|
1224 |
+
Withdraw_from_participation,TRUE,
|
1225 |
+
Within_distance,FALSE,
|
1226 |
+
Word_relations,FALSE,
|
1227 |
+
Work,TRUE,
|
1228 |
+
Working_a_post,TRUE,
|
1229 |
+
Worry,TRUE,
|
resources/crashes_frame_list.txt
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# based on Gosse's presentation in 1st meeting with Marco
|
2 |
+
Killing
|
3 |
+
Death
|
4 |
+
Impact
|
5 |
+
Catch_fire
|
6 |
+
Cause_harm
|
7 |
+
|
8 |
+
# extra to make more comparable with femicide data
|
9 |
+
Causation
|
10 |
+
Cause_motion
|
11 |
+
Dead_or_alive
|
12 |
+
Emotion_directed
|
13 |
+
Event
|
14 |
+
Experience_bodily_harm
|
resources/crashes_frame_to_roles.csv
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
frame,role:perpetrator_like,role:victim_like,role:cause_like,notes
|
2 |
+
Catch_fire,-,-,-,
|
3 |
+
Causation,Causer,Affected,Cause,
|
4 |
+
Cause_harm,Agent,Victim,Cause,
|
5 |
+
Cause_motion,-,-,-,does not seem to usually refer to the main murder event
|
6 |
+
Dead_or_alive,-,Protagonist,Explanation,
|
7 |
+
Death,-,Protagonist,Cause,
|
8 |
+
Emotion_directed,-,-,-,does not seem to usually refer to the main murder event
|
9 |
+
Event,-,-,-,does not involve any participants
|
10 |
+
Experience_bodily_harm,Experiencer|Body_part,-,-,
|
11 |
+
Killing,Killer,Victim,Cause,
|
resources/crashes_sources.csv
ADDED
@@ -0,0 +1,440 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ProviderName,ProviderFreq,ProviderNameCorr,RegionalScope,ContentType,MediumType,Country,Province,Locality,MediaCollection,MediaOwner,Duplicate,Notes
|
2 |
+
noordhollandsdagblad.nl,620,Noordhollands Dagblad,Regional,GeneralNews,Newspaper,Netherlands,Noord-Holland,-,Mediahuis Nederland,Mediahuis,,
|
3 |
+
gelderlander.nl,553,De Gelderlander,Regional,GeneralNews,Newspaper,Netherlands,Gelderland,-,ADR Nieuwsmedia,DPG Media,,
|
4 |
+
nhnieuws.nl,479,NH Nieuws,Regional,GeneralNews,TV-Radio,Netherlands,Noord-Holland,-,-,[public],,
|
5 |
+
rtvoost.nl,409,RTV Oost,Regional,GeneralNews,TV-Radio,Netherlands,Overijssel,-,-,[public],,
|
6 |
+
www.ad.nl,399,Algemeen Dagblad,National,GeneralNews,Newspaper,Netherlands,-,-,ADR Nieuwsmedia,DPG Media,,
|
7 |
+
rtvutrecht.nl,374,RTV Utrecht,Regional,GeneralNews,TV-Radio,Netherlands,Utrecht,-,-,[public],,
|
8 |
+
destentor.nl,326,De Stentor,Regional,GeneralNews,Newspaper,Netherlands,Flevoland|Gelderland|Overijssel,-,ADR Nieuwsmedia,DPG Media,,
|
9 |
+
omroepbrabant.nl,227,Omroep Brabant,Regional,GeneralNews,TV-Radio,Netherlands,Noord-Brabant,-,-,[public],,
|
10 |
+
haarlemsdagblad.nl,219,Haarlems Dagblad,Local,GeneralNews,Newspaper,Netherlands,Noord-Holland,Haarlem,Mediahuis Nederland,Mediahuis,,
|
11 |
+
www.ed.nl,206,Eindhovens Dagblad,Local,GeneralNews,Newspaper,Netherlands,Noord-Brabant,Eindhoven,ADR Nieuwsmedia,DPG Media,,
|
12 |
+
www.bd.nl,198,Brabants Dagblad,Regional,GeneralNews,Newspaper,Netherlands,Noord-Brabant,-,ADR Nieuwsmedia,DPG Media,,
|
13 |
+
weblogzwolle.nl,191,Weblog Zwolle,Local,GeneralNews,OnlineOnly,Netherlands,Overijssel,Zwolle,-,[independent],,
|
14 |
+
www.at5.nl,173,AT5,Local,GeneralNews,TV-Radio,Netherlands,Noord-Holland,Amsterdam,-,[public],,
|
15 |
+
rtvfocuszwolle.nl,168,RTV Focus Zwolle,Local,GeneralNews,TV-Radio,Netherlands,Overijssel,Zwolle,-,[independent],,
|
16 |
+
hvzeeland.nl,163,HVZeeland.nl,Regional,EmergenciesNews,OnlineOnly,Netherlands,Zeeland,-,-,[independent],,
|
17 |
+
omroepgelderland.nl,142,Omroep Gelderland,Regional,GeneralNews,TV-Radio,Netherlands,Gelderland,-,-,[public],,
|
18 |
+
1Limburg | Nieuws en sport uit Limburg,135,1Limburg,Regional,GeneralNews,OnlineOnly,Netherlands,Limburg_NL,-,-,[public],,"belongs to ""1L"" tv/radio channel, but separate brand"
|
19 |
+
www.hln.be,132,Het Laatste Nieuws,National,GeneralNews,Newspaper,Belgium,-,-,-,DPG Media,,
|
20 |
+
telegraaf.nl,124,De Telegraaf,National,GeneralNews,Newspaper,Netherlands,-,-,Mediahuis Nederland,Mediahuis,,
|
21 |
+
amstelveenz.nl,109,AmstelveenZ,Local,GeneralNews,Magazine,Netherlands,Noord-Holland,Amstelveen,-,[independent],,
|
22 |
+
tubantia.nl,105,Tubantia,Regional,GeneralNews,Magazine,Netherlands,Overijssel,-,ADR Nieuwsmedia,DPG Media,,
|
23 |
+
leidschdagblad.nl,100,Leidsch Dagblad,Local,GeneralNews,Magazine,Netherlands,Zuid-Holland,Leiden,-,DPG Media,,
|
24 |
+
bndestem.nl,92,BN DeStem,Regional,GeneralNews,Newspaper,Netherlands,Noord-Brabant|Zeeland,-,ADR Nieuwsmedia,DPG Media,,
|
25 |
+
nos.nl,92,NOS,National,GeneralNews,TV-Radio,Netherlands,-,-,NPO,[public],,
|
26 |
+
hartvannederland.nl,90,Hart van Nederland,National,GeneralNews,TV-Radio,Netherlands,-,-,SBS6,Talpa Network,,
|
27 |
+
Dagblad van het Noorden,85,Dagblad van het Noorden,Regional,GeneralNews,Newspaper,Netherlands,Drenthe|Groningen,-,NDC Mediagroep,Mediahuis,,
|
28 |
+
rtvdrenthe.nl,84,RTV Drenthe,Regional,GeneralNews,TV-Radio,Netherlands,Drenthe,-,-,[public],,
|
29 |
+
rtvnoord.nl,74,RTV Noord,Regional,GeneralNews,TV-Radio,Netherlands,Groningen,-,-,[public],,
|
30 |
+
112groningen.nl,72,112Groningen.nl,Local,EmergenciesNews,OnlineOnly,Netherlands,Groningen,Groningen,-,[independent],,
|
31 |
+
www.nu.nl,70,NU.nl,National,GeneralNews,OnlineOnly,Netherlands,-,-,-,DPG Media,,
|
32 |
+
omroepwest.nl,67,Omroep West,Regional,GeneralNews,TV-Radio,Netherlands,Zuid-Holland,-,-,[public],,
|
33 |
+
RTV Rijnmond,62,RTV Rijnmond,Regional,GeneralNews,TV-Radio,Netherlands,Zuid-Holland,-,-,[public],,
|
34 |
+
www.pzc.nl,62,Provinciale Zeeuwse Courant,Regional,GeneralNews,Newspaper,Netherlands,Zeeland,-,-,[public],,
|
35 |
+
rijnmond.nl,61,RTV Rijnmond,Regional,GeneralNews,TV-Radio,Netherlands,Zuid-Holland,-,-,[public],"""RTV Rijnmond""",
|
36 |
+
112Twente.nl,59,112Twente,Regional,GeneralNews,OnlineOnly,Netherlands,Overijssel,-,-,[independent],,
|
37 |
+
Het Nieuwsblad,52,Het Nieuwsblad,National,GeneralNews,Newspaper,Belgium,-,-,-,[independent],,
|
38 |
+
hbvl.be,50,Het Belang van Limburg,Regional,GeneralNews,Newspaper,Belgium,Limburg_BE,-,Mediahuis België,Mediahuis,,
|
39 |
+
www.vrt.be,48,VRT,National,GeneralNews,TV-Radio,Belgium,-,-,-,[public],,
|
40 |
+
De Limburger,47,De Limburger,Regional,GeneralNews,Newspaper,Netherlands,Limburg_NL,-,Mediahuis Limburg,Mediahuis,,
|
41 |
+
112fryslan.nl,45,112 Fryslân,Regional,EmergenciesNews,OnlineOnly,Netherlands,Friesland,-,-,[independent],,
|
42 |
+
112HM.nl,44,112HM.nl,Regional,EmergenciesNews,OnlineOnly,Netherlands,Zuid-Holland,-,-,[independent],,
|
43 |
+
omroepzeeland.nl,43,Omroep Zeeland,Regional,GeneralNews,TV-Radio,Netherlands,Zeeland,-,-,[public],,
|
44 |
+
GelreNieuws.nl,41,GelreNieuws,Regional,GeneralNews,OnlineOnly,Netherlands,Gelderland,-,-,Persbureau Heitink,,
|
45 |
+
twitter.com,39,Twitter,International,SocialMedia,OnlineOnly,-,-,-,-,-,,
|
46 |
+
Het Parool,39,Het Parool,National,GeneralNews,Newspaper,Netherlands,-,-,-,DPG Media,,strong local/regional focus but published nationally
|
47 |
+
nieuwsblad.be,38,Het Nieuwsblad,National,GeneralNews,Newspaper,Belgium,-,-,-,[independent],"""Het Nieuwsblad""",
|
48 |
+
dvhn.nl,33,Dagblad van het Noorden,Regional,GeneralNews,Newspaper,Netherlands,Drenthe|Groningen,-,NDC Mediagroep,Mediahuis,,
|
49 |
+
politie.nl,33,Politie,National,OrganizationSpecific,OnlineOnly,Netherlands,-,-,-,-,,official website of the Dutch National Police
|
50 |
+
112twente.nl,32,112Twente,Regional,GeneralNews,OnlineOnly,Netherlands,Overijssel,-,-,[independent],"""112Twente.nl""",
|
51 |
+
hardnieuws.nl,32,Hardnieuws,National,EmergenciesNews,OnlineOnly,Netherlands,-,-,-,[independent],,
|
52 |
+
112 Overijssel,32,112 Overijssel,Regional,EmergenciesNews,OnlineOnly,Netherlands,Overijssel,-,-,[independent],,
|
53 |
+
www.lc.nl,30,Leeuwarder Courant,Regional,GeneralNews,Newspaper,Netherlands,Friesland,-,-,[independent],,
|
54 |
+
rtlnieuws.nl,29,RTL Nieuws,National,GeneralNews,TV-Radio,Netherlands,-,-,RTL Nederland,RTL Group,,
|
55 |
+
rtva.nl,28,RTVA,Local,GeneralNews,TV-Radio,Netherlands,Noord-Holland,Amstelveen,-,[independent],,
|
56 |
+
Leeuwarder Courant,28,Leeuwarder Courant,Regional,GeneralNews,Newspaper,Netherlands,Friesland,-,-,[independent],"""www.lc.nl""",
|
57 |
+
Gazet van Antwerpen,26,Gazet van Antwerpen,Regional,GeneralNews,Newspaper,Belgium,Antwerpen,-,Mediahuis België,Mediahuis,,
|
58 |
+
District8.net,26,District8,Regional,GeneralNews,OnlineOnly,Netherlands,Zuid-Holland,-,-,[independent],,
|
59 |
+
Focus en WTV,24,Focus-WTV,Regional,GeneralNews,TV-Radio,Belgium,West-Vlaanderen,-,-,Roularta Media Group,,
|
60 |
+
studio040.nl,24,Studio 040,Local,GeneralNews,TV-Radio,Netherlands,Noord-Brabant,Eindhoven,-,[independent],,
|
61 |
+
112-Overijssel,24,112 Overijssel,Regional,EmergenciesNews,OnlineOnly,Netherlands,Overijssel,-,-,[independent],,
|
62 |
+
omroepflevoland.nl,23,Omroep Flevoland,Regional,GeneralNews,TV-Radio,Netherlands,Flevoland,-,-,[public],,
|
63 |
+
De Utrechtse Internet Courant,20,De Utrechtse Internet Courant,Local,GeneralNews,OnlineOnly,Netherlands,Utrecht,Utrecht,-,[independent],,
|
64 |
+
www.wos.nl,19,WOS,Local,GeneralNews,TV-Radio,Netherlands,Zuid-Holland,Maassluis,-,[independent],,
|
65 |
+
wos.nl,19,WOS,Local,GeneralNews,TV-Radio,Netherlands,Zuid-Holland,Maassluis,-,[independent],"""www.wos.nl""",
|
66 |
+
OOG Radio en Televisie,18,OOG,Local,GeneralNews,TV-Radio,Netherlands,Groningen,Groningen,-,[independent],,
|
67 |
+
112barneveld.nl,17,112 Barneveld,Local,EmergenciesNews,TV-Radio,Netherlands,Gelderland,Barneveld,-,112Press,,
|
68 |
+
112hm.nl,17,112HM.nl,Regional,EmergenciesNews,OnlineOnly,Netherlands,Zuid-Holland,-,-,[independent],,
|
69 |
+
flashphoto.nl,17,FlashPhoto,Local,EmergenciesNews,OnlineOnly,Netherlands,Zuid-Holland,Rotterdam,-,[independent],,"specialized in photography, also (emergency) news"
|
70 |
+
TVOOST - Regionaal nieuws uit Oost-Vlaanderen,16,TV Oost,Regional,GeneralNews,TV-Radio,Belgium,Oost-Vlaanderen,-,-,Concentra,,
|
71 |
+
zwollenu.nl,15,ZwolleNu,Local,GeneralNews,OnlineOnly,Netherlands,Overijssel,Zwolle,-,[independent],,
|
72 |
+
112ede.nl,15,112 Ede,Local,EmergenciesNews,OnlineOnly,Netherlands,Gelderland,Ede,-,112Press,,
|
73 |
+
112brabant.nl,13,112 Brabant,Regional,EmergenciesNews,OnlineOnly,Netherlands,Noord-Brabant,-,-,[independent],,
|
74 |
+
TVL - Dagelijks nieuws uit Limburg,13,TVL,Regional,GeneralNews,TV-Radio,Belgium,Limburg_BE,-,-,Concentra,,
|
75 |
+
oogtv.nl,13,OOG,Local,GeneralNews,TV-Radio,Netherlands,Groningen,Groningen,-,[independent],"""OOG""",
|
76 |
+
zhzactueel.nl,12,ZHZ Actueel,Regional,EmergenciesNews,TV-Radio,Netherlands,Zuid-Holland,-,-,[independent],,
|
77 |
+
www.nrc.nl,12,NRC,National,GeneralNews,Newspaper,Netherlands,-,-,NRC Media,Mediahuis,,
|
78 |
+
stedendriehoek.net,12,Nieuwsblad Stedendriehoek,Regional,GeneralNews,Newspaper,Netherlands,Gelderland|Overijssel,-,-,[independent],,
|
79 |
+
ijmuidercourant.nl,11,IJmuider Courant,Local,GeneralNews,Newspaper,Netherlands,Noord-Holland,IJmuiden,Mediahuis Nederland,Mediahuis,,
|
80 |
+
Meternieuws.nl,10,Meter Nieuws,Regional,EmergenciesNews,OnlineOnly,Netherlands,Drenthe|Groningen|Overijssel,-,-,[independent],,
|
81 |
+
deswollenaer.nl,10,De Swollenaer,Local,GeneralNews,Newspaper,Netherlands,Overijssel,Zwolle,-,Brug Media,,
|
82 |
+
alkmaarcentraal.nl,10,Alkmaar Centraal,Local,GeneralNews,OnlineOnly,Netherlands,Noord-Holland,Alkmaar,-,[independent],,
|
83 |
+
112Vandaag,10,112Vandaag,National,EmergenciesNews,OnlineOnly,Netherlands,-,-,-,[independent],,
|
84 |
+
mediatv.nl,10,MediaTV,National,EmergenciesNews,OnlineOnly,Netherlands,-,-,-,[independent],,
|
85 |
+
"gelderlander.nl, het laatste nieuws uit binnen- en buitenland, sport en show",10,De Gelderlander,Regional,GeneralNews,Newspaper,Netherlands,Gelderland,-,ADR Nieuwsmedia,DPG Media,"""gelderlander.nl""",
|
86 |
+
Weertdegekste.nl,9,,,,,,,,,,,
|
87 |
+
WâldNet,9,,,,,,,,,,,
|
88 |
+
transport-online.nl,9,,,,,,,,,,,
|
89 |
+
noordernieuws.nl,9,,,,,,,,,,,
|
90 |
+
regiopurmerend.nl,8,,,,,,,,,,,
|
91 |
+
https://www.vlaardingen24.nl,8,,,,,,,,,,,
|
92 |
+
Groninger Gezinsbode,8,,,,,,,,,,,
|
93 |
+
"Ring TV | Jouw zender, Jouw nieuws",8,,,,,,,,,,,
|
94 |
+
blikopnieuws.nl,8,,,,,,,,,,,
|
95 |
+
edestad.nl,8,,,,,,,,,,,
|
96 |
+
steenwijkercourant.nl,8,,,,,,,,,,,
|
97 |
+
nieuwsopbeeld.nl,8,,,,,,,,,,,
|
98 |
+
ROB-tv - Regionale Omroep Brabant,8,,,,,,,,,,,
|
99 |
+
barneveldsekrant.nl,8,,,,,,,,,,,
|
100 |
+
https://www.schiedam24.nl,8,,,,,,,,,,,
|
101 |
+
Sleutelstad.nl,7,,,,,,,,,,,
|
102 |
+
Unity NU is de nieuwssite voor de regio Leiden [www.unity.nu],7,,,,,,,,,,,
|
103 |
+
112WestFriesland.nl,7,,,,,,,,,,,
|
104 |
+
112vallei.nl,7,,,,,,,,,,,
|
105 |
+
Omroep Gelderland,7,,,,,,,,,,,
|
106 |
+
Het Belang van Limburg,7,,,,,,,,,,,
|
107 |
+
sleutelstad.nl,7,,,,,,,,,,,
|
108 |
+
Bredavandaag|HétnieuwsuitBreda,6,,,,,,,,,,,
|
109 |
+
alarmeringen.nl,6,,,,,,,,,,,
|
110 |
+
stedendriehoek.nl,6,,,,,,,,,,,
|
111 |
+
halstadcentraal.nl,6,,,,,,,,,,,
|
112 |
+
Westlanders.nu,6,,,,,,,,,,,
|
113 |
+
ATV - Antwerpse televisie,6,,,,,,,,,,,
|
114 |
+
Stefan Verkerk Fotografie en Webdesign,6,,,,,,,,,,,
|
115 |
+
De Gooi- en Eemlander,6,,,,,,,,,,,
|
116 |
+
alphens.nl,6,,,,,,,,,,,
|
117 |
+
112nieuwsonline.nl,6,,,,,,,,,,,
|
118 |
+
zwollezuidnieuws.nl,6,,,,,,,,,,,
|
119 |
+
1Limburg,5,,,,,,,,,,,
|
120 |
+
denoordoostpolder.nl,5,,,,,,,,,,,
|
121 |
+
112provincieutrecht.nl,5,,,,,,,,,,,
|
122 |
+
rtvzaanstreek.nl,5,,,,,,,,,,,
|
123 |
+
nederweert24.nl,5,,,,,,,,,,,
|
124 |
+
Nieuws dat je raakt. 24/24u – Nnieuws.be,5,,,,,,,,,,,
|
125 |
+
nieuws.nl,5,,,,,,,,,,,
|
126 |
+
RTV Oost,5,,,,,,,,,,,
|
127 |
+
regio15.nl,5,,,,,,,,,,,
|
128 |
+
De Standaard,5,,,,,,,,,,,
|
129 |
+
flevopost.nl,5,,,,,,,,,,,
|
130 |
+
regionieuwshoogeveen.nl,5,,,,,,,,,,,
|
131 |
+
petershotnews.nl | Nieuws & fotografie,5,,,,,,,,,,,
|
132 |
+
Nieuws op Beeld - Altijd het laatste (112) nieuws vanuit de regio Rotterdam-Rijnmond!,5,,,,,,,,,,,
|
133 |
+
ZwolleZuidNieuws: alles wat Zwolle Zuid beweegt!,4,,,,,,,,,,,
|
134 |
+
Telegraaf,4,,,,,,,,,,,
|
135 |
+
RTV Utrecht,4,,,,,,,,,,,
|
136 |
+
regioleidscherijn.nl,4,,,,,,,,,,,
|
137 |
+
Hart van Nederland,4,,,,,,,,,,,
|
138 |
+
dagblad070.nl,4,,,,,,,,,,,
|
139 |
+
nuus.be,4,,,,,,,,,,,
|
140 |
+
onswestfriesland.nl,4,,,,,,,,,,,
|
141 |
+
waldnet.nl,4,,,,,,,,,,,
|
142 |
+
NU,4,,,,,,,,,,,
|
143 |
+
www.gva.be,4,,,,,,,,,,,
|
144 |
+
bunniksnieuws.nl,4,,,,,,,,,,,
|
145 |
+
dalfsennet.nl,4,,,,,,,,,,,
|
146 |
+
112heuvelrug.nl,4,,,,,,,,,,,
|
147 |
+
hartvanlansingerland.nl,4,,,,,,,,,,,
|
148 |
+
"AD.nl, het laatste nieuws uit binnen- en buitenland, sport en show",4,,,,,,,,,,,
|
149 |
+
bruzz.be,4,,,,,,,,,,,
|
150 |
+
Vlissingen-Internetbode,3,,,,,,,,,,,
|
151 |
+
Blik op nieuws,3,,,,,,,,,,,
|
152 |
+
limburg24.nl,3,,,,,,,,,,,
|
153 |
+
www.gld.nl,3,,,,,,,,,,,
|
154 |
+
112zwolle.nl,3,,,,,,,,,,,
|
155 |
+
omroepvenray.nl,3,,,,,,,,,,,
|
156 |
+
lokaalgelderland.nl,3,,,,,,,,,,,
|
157 |
+
destadgorinchem.nl,3,,,,,,,,,,,
|
158 |
+
112veenendaal.nl,3,,,,,,,,,,,
|
159 |
+
denhaagfm.nl,3,,,,,,,,,,,
|
160 |
+
facebook.com,3,,,,,,,,,,,
|
161 |
+
112midden-zeeland.nl,3,,,,,,,,,,,
|
162 |
+
de Volkskrant,3,,,,,,,,,,,
|
163 |
+
meppelercourant.nl,3,,,,,,,,,,,
|
164 |
+
Neustadt-Geflüster,3,,,,,,,,,,,
|
165 |
+
goudsdagblad.nl,3,,,,,,,,,,,
|
166 |
+
schie.nu,3,,,,,,,,,,,
|
167 |
+
oozo.nl,3,,,,,,,,,,,
|
168 |
+
www.rd.nl,3,,,,,,,,,,,
|
169 |
+
voorburgsdagblad.nl,3,,,,,,,,,,,
|
170 |
+
NieuwsOverijssel.nl,3,,,,,,,,,,,
|
171 |
+
ZwolleZuidNieuws: alles wat Zwolle-Zuid beweegt!,3,,,,,,,,,,,
|
172 |
+
112inbeeld.nl,3,,,,,,,,,,,
|
173 |
+
bredavandaag.nl,3,,,,,,,,,,,
|
174 |
+
De Jutter | De Hofgeest,2,,,,,,,,,,,
|
175 |
+
Woerden.TV,2,,,,,,,,,,,
|
176 |
+
knipselkrant-curacao.com,2,,,,,,,,,,,
|
177 |
+
heerenveensecourant.nl,2,,,,,,,,,,,
|
178 |
+
ThePostOnline,2,,,,,,,,,,,
|
179 |
+
regio8.nl,2,,,,,,,,,,,
|
180 |
+
BarendrechtNU.nl,2,,,,,,,,,,,
|
181 |
+
"pzc.nl, het laatste nieuws uit binnen- en buitenland, sport en show",2,,,,,,,,,,,
|
182 |
+
weespernieuws.nl,2,,,,,,,,,,,
|
183 |
+
Amstelveenz,2,,,,,,,,,,,
|
184 |
+
stadtiel.nl,2,,,,,,,,,,,
|
185 |
+
gouweijsselnieuws.nl,2,,,,,,,,,,,
|
186 |
+
Nieuws op Beeld,2,,,,,,,,,,,
|
187 |
+
heerhugowaardcentraal.nl,2,,,,,,,,,,,
|
188 |
+
nieuwsbladdezaankanter.nl,2,,,,,,,,,,,
|
189 |
+
www.avs.be,2,,,,,,,,,,,
|
190 |
+
haarlemsweekblad.nl,2,,,,,,,,,,,
|
191 |
+
yomyom.net,2,,,,,,,,,,,
|
192 |
+
mooirooi.nl,2,,,,,,,,,,,
|
193 |
+
oisterwijknieuws.nl,2,,,,,,,,,,,
|
194 |
+
rtv-apeldoorn.nl,2,,,,,,,,,,,
|
195 |
+
112amersfoort.nl,2,,,,,,,,,,,
|
196 |
+
dedemsvaartsecourant.nl,2,,,,,,,,,,,
|
197 |
+
ed.nl,2,,,,,,,,,,,
|
198 |
+
soestercourant.nl,2,,,,,,,,,,,
|
199 |
+
heemsteedsecourant.nl,2,,,,,,,,,,,
|
200 |
+
112hoogezand.nl,2,,,,,,,,,,,
|
201 |
+
hetstreekblad.nl,2,,,,,,,,,,,
|
202 |
+
NRC,2,,,,,,,,,,,
|
203 |
+
112nieuws.net,2,,,,,,,,,,,
|
204 |
+
De Limburger Mobile,2,,,,,,,,,,,
|
205 |
+
0297.nl,2,,,,,,,,,,,
|
206 |
+
drachtstercourant.nl,2,,,,,,,,,,,
|
207 |
+
Sittard-Geleen,2,,,,,,,,,,,
|
208 |
+
hoogenlaag.nl,2,,,,,,,,,,,
|
209 |
+
drentsnieuws.nl,2,,,,,,,,,,,
|
210 |
+
brugnieuws.nl,2,,,,,,,,,,,
|
211 |
+
medemblikactueel.nl,2,,,,,,,,,,,
|
212 |
+
rechtspraak.nl,2,,,,,,,,,,,
|
213 |
+
gooieneembode.nl,2,,,,,,,,,,,
|
214 |
+
arenalokaal.nl,2,,,,,,,,,,,
|
215 |
+
DitisdeZaanstreek.nl,2,,,,,,,,,,,
|
216 |
+
hcnieuws.nl,2,,,,,,,,,,,
|
217 |
+
https://www.heerhugowaardsdagblad.nl/,2,,,,,,,,,,,
|
218 |
+
schagenfm.nl,2,,,,,,,,,,,
|
219 |
+
hv-almere.nl,2,,,,,,,,,,,
|
220 |
+
112achterhoek-nieuws.nl,2,,,,,,,,,,,
|
221 |
+
peelenmaasvenray.nl,2,,,,,,,,,,,
|
222 |
+
frieslandactueel.nl,2,,,,,,,,,,,
|
223 |
+
www.rtv.be,2,,,,,,,,,,,
|
224 |
+
hoogeveenschecourant.nl,2,,,,,,,,,,,
|
225 |
+
Nieuws Apeldoorn Direct,2,,,,,,,,,,,
|
226 |
+
nieuwsuitberkelland.nl,2,,,,,,,,,,,
|
227 |
+
112meerlanden.nl,2,,,,,,,,,,,
|
228 |
+
internetbode.nl,2,,,,,,,,,,,
|
229 |
+
nieuw-volendam.nl,2,,,,,,,,,,,
|
230 |
+
katwijkactueel.nl,2,,,,,,,,,,,
|
231 |
+
112schiedam.nl,2,,,,,,,,,,,
|
232 |
+
compactmedia.nl,2,,,,,,,,,,,
|
233 |
+
culemborgsecourant.nl,2,,,,,,,,,,,
|
234 |
+
Alphens.nl,2,,,,,,,,,,,
|
235 |
+
112ijmond.nl,2,,,,,,,,,,,
|
236 |
+
detoren.net,2,,,,,,,,,,,
|
237 |
+
gorkumsnieuws.nl,2,,,,,,,,,,,
|
238 |
+
Redactie24.be,2,,,,,,,,,,,
|
239 |
+
wnl.tv,2,,,,,,,,,,,
|
240 |
+
alarmeringdroid.nl,1,,,,,,,,,,,
|
241 |
+
HCNieuws,1,,,,,,,,,,,
|
242 |
+
frontpage.fok.nl,1,,,,,,,,,,,
|
243 |
+
112vdg.nl,1,,,,,,,,,,,
|
244 |
+
Ede Stad,1,,,,,,,,,,,
|
245 |
+
my net rosh haayin,1,,,,,,,,,,,
|
246 |
+
Noordhollands Dagblad,1,,,,,,,,,,,
|
247 |
+
Zundert-Internetbode,1,,,,,,,,,,,
|
248 |
+
defeanster.nl,1,,,,,,,,,,,
|
249 |
+
heerhugowaardalife.nl,1,,,,,,,,,,,
|
250 |
+
inteylingen.nl,1,,,,,,,,,,,
|
251 |
+
The News Herald,1,,,,,,,,,,,
|
252 |
+
Rijswijk.TV,1,,,,,,,,,,,
|
253 |
+
Leidsch Dagblad,1,,,,,,,,,,,
|
254 |
+
mynetkrayot,1,,,,,,,,,,,
|
255 |
+
OldambtNu.nl,1,,,,,,,,,,,
|
256 |
+
instagram.com,1,,,,,,,,,,,
|
257 |
+
Bonaire.Nu,1,,,,,,,,,,,
|
258 |
+
nieuwsbladdekoerier.nl,1,,,,,,,,,,,
|
259 |
+
BergenopZoom-Internetbode,1,,,,,,,,,,,
|
260 |
+
1twente.nl,1,,,,,,,,,,,
|
261 |
+
www.rtl.de,1,,,,,,,,,,,
|
262 |
+
tvvalkenburg.tv,1,,,,,,,,,,,
|
263 |
+
alarmfase1.nl,1,,,,,,,,,,,
|
264 |
+
gids.tv,1,,,,,,,,,,,
|
265 |
+
RTV Uitgeest,1,,,,,,,,,,,
|
266 |
+
De Telegraaf,1,,,,,,,,,,,
|
267 |
+
112-dokkum.nl,1,,,,,,,,,,,
|
268 |
+
wijksnieuws.nl,1,,,,,,,,,,,
|
269 |
+
hetkontakt.nl,1,,,,,,,,,,,
|
270 |
+
landelijkeorganisatieverkeersslachtoffers.nl,1,,,,,,,,,,,
|
271 |
+
rtv.be,1,,,,,,,,,,,
|
272 |
+
indebuurt Ede,1,,,,,,,,,,,
|
273 |
+
112 groningen.nl,1,,,,,,,,,,,
|
274 |
+
Ik hou van Arnhem,1,,,,,,,,,,,
|
275 |
+
112hardenberg.nu,1,,,,,,,,,,,
|
276 |
+
stadwageningen.nl,1,,,,,,,,,,,
|
277 |
+
ridderkerksdagblad.nl,1,,,,,,,,,,,
|
278 |
+
geenstijl.nl,1,,,,,,,,,,,
|
279 |
+
dewoudenberger.nl,1,,,,,,,,,,,
|
280 |
+
https://www.alkmaarsdagblad.nl/,1,,,,,,,,,,,
|
281 |
+
nieuwsbladnof.nl,1,,,,,,,,,,,
|
282 |
+
Nieuwe Meerbode,1,,,,,,,,,,,
|
283 |
+
looopings.nl,1,,,,,,,,,,,
|
284 |
+
amstelveensnieuwsblad.nl,1,,,,,,,,,,,
|
285 |
+
texelsecourant.nl,1,,,,,,,,,,,
|
286 |
+
anwb.nl,1,,,,,,,,,,,
|
287 |
+
indebuurt Delft,1,,,,,,,,,,,
|
288 |
+
https://www.zutphen24.nl,1,,,,,,,,,,,
|
289 |
+
Teylingen,1,,,,,,,,,,,
|
290 |
+
112Midden-Zeeland,1,,,,,,,,,,,
|
291 |
+
noorderkrant.nl,1,,,,,,,,,,,
|
292 |
+
onswestbrabant.nl,1,,,,,,,,,,,
|
293 |
+
lindanieuws.nl,1,,,,,,,,,,,
|
294 |
+
112persfotografie.nl,1,,,,,,,,,,,
|
295 |
+
antilliaansdagblad.com,1,,,,,,,,,,,
|
296 |
+
Site-Knack-NL,1,,,,,,,,,,,
|
297 |
+
alblasserdamsnieuws.nl,1,,,,,,,,,,,
|
298 |
+
112harderwijk.nl,1,,,,,,,,,,,
|
299 |
+
l1.nl,1,,,,,,,,,,,
|
300 |
+
Nederweert24,1,,,,,,,,,,,
|
301 |
+
Radio.NL,1,,,,,,,,,,,
|
302 |
+
LokaalGelderland,1,,,,,,,,,,,
|
303 |
+
hoekschnieuws.nl,1,,,,,,,,,,,
|
304 |
+
nieuwsbladgeldermalsen.nl,1,,,,,,,,,,,
|
305 |
+
Veenendaalse Krant,1,,,,,,,,,,,
|
306 |
+
112-nederland.nl,1,,,,,,,,,,,
|
307 |
+
demorgen.be,1,,,,,,,,,,,
|
308 |
+
www.gic.nl,1,,,,,,,,,,,
|
309 |
+
Unity NU is de nieuwssite voor de regio Leiden,1,,,,,,,,,,,
|
310 |
+
Middelburg-Internetbode,1,,,,,,,,,,,
|
311 |
+
groot-waterland.nl,1,,,,,,,,,,,
|
312 |
+
regiobodeonline.nl,1,,,,,,,,,,,
|
313 |
+
Nudrenthe.nl | Boven op het Nieuws |,1,,,,,,,,,,,
|
314 |
+
Gocar.be,1,,,,,,,,,,,
|
315 |
+
KW.be - Nieuws uit West-Vlaanderen,1,,,,,,,,,,,
|
316 |
+
harenerweekblad.nl,1,,,,,,,,,,,
|
317 |
+
nbcnews.com,1,,,,,,,,,,,
|
318 |
+
Omroep Brabant,1,,,,,,,,,,,
|
319 |
+
112apeldoorn.nl,1,,,,,,,,,,,
|
320 |
+
linda.nl,1,,,,,,,,,,,
|
321 |
+
assercourant.nl,1,,,,,,,,,,,
|
322 |
+
prorail.nl,1,,,,,,,,,,,
|
323 |
+
bbc.co.uk,1,,,,,,,,,,,
|
324 |
+
schipholregio.nl,1,,,,,,,,,,,
|
325 |
+
lequipe.fr,1,,,,,,,,,,,
|
326 |
+
Politie.nl,1,,,,,,,,,,,
|
327 |
+
welingelichtekringen.nl,1,,,,,,,,,,,
|
328 |
+
destadamersfoort.nl,1,,,,,,,,,,,
|
329 |
+
curacaonieuws.nu,1,,,,,,,,,,,
|
330 |
+
Incidenten Apeldoorn e.o.,1,,,,,,,,,,,
|
331 |
+
arubanieuws.nu,1,,,,,,,,,,,
|
332 |
+
Vrij Nederland,1,,,,,,,,,,,
|
333 |
+
Omroep Brabant,1,,,,,,,,,,,
|
334 |
+
hetdeventernieuws.nl,1,,,,,,,,,,,
|
335 |
+
Krimpenerwaard,1,,,,,,,,,,,
|
336 |
+
avrotros.nl,1,,,,,,,,,,,
|
337 |
+
elpais.com.co,1,,,,,,,,,,,
|
338 |
+
112marum.nl,1,,,,,,,,,,,
|
339 |
+
https://www.denheldersdagblad.nl/,1,,,,,,,,,,,
|
340 |
+
ZHZActueel,1,,,,,,,,,,,
|
341 |
+
bashinform.ru,1,,,,,,,,,,,
|
342 |
+
FOK!,1,,,,,,,,,,,
|
343 |
+
bx1.be,1,,,,,,,,,,,
|
344 |
+
denhelderactueel.nl,1,,,,,,,,,,,
|
345 |
+
www.bbc.com,1,,,,,,,,,,,
|
346 |
+
eemskrant.nl,1,,,,,,,,,,,
|
347 |
+
Regio Leidsche Rijn,1,,,,,,,,,,,
|
348 |
+
Omroep Zeeland,1,,,,,,,,,,,
|
349 |
+
topics.nl,1,,,,,,,,,,,
|
350 |
+
HetKrantje-Online.nl,1,,,,,,,,,,,
|
351 |
+
https://www.langedijkerdagblad.nl/,1,,,,,,,,,,,
|
352 |
+
SpoorPro.nl,1,,,,,,,,,,,
|
353 |
+
radio2.be,1,,,,,,,,,,,
|
354 |
+
Metronieuws.nl,1,,,,,,,,,,,
|
355 |
+
caribischnetwerk.ntr.nl,1,,,,,,,,,,,
|
356 |
+
het-westerkwartier.nl,1,,,,,,,,,,,
|
357 |
+
rijschoolpro.nl,1,,,,,,,,,,,
|
358 |
+
rn7.nl,1,,,,,,,,,,,
|
359 |
+
Eemskrant,1,,,,,,,,,,,
|
360 |
+
HS-Krant,1,,,,,,,,,,,
|
361 |
+
grootheerenveen.nl,1,,,,,,,,,,,
|
362 |
+
RTV Zaanstreek,1,,,,,,,,,,,
|
363 |
+
joustercourant.nl,1,,,,,,,,,,,
|
364 |
+
112vlissingen-souburg.nl,1,,,,,,,,,,,
|
365 |
+
112 Groningen,1,,,,,,,,,,,
|
366 |
+
ZuidOosthoeker,1,,,,,,,,,,,
|
367 |
+
AD.nl,1,,,,,,,,,,,
|
368 |
+
Eemskrant | Nieuws uit de regio,1,,,,,,,,,,,
|
369 |
+
Steenwijkerland,1,,,,,,,,,,,
|
370 |
+
112tv.nl,1,,,,,,,,,,,
|
371 |
+
Groningen,1,,,,,,,,,,,
|
372 |
+
Reno Gazette Journal,1,,,,,,,,,,,
|
373 |
+
haspengouwsnieuws.be,1,,,,,,,,,,,
|
374 |
+
stellingwerf.nl,1,,,,,,,,,,,
|
375 |
+
globo.com,1,,,,,,,,,,,
|
376 |
+
112lansingerland.nu,1,,,,,,,,,,,
|
377 |
+
bicycling.com,1,,,,,,,,,,,
|
378 |
+
woldercourant.nl,1,,,,,,,,,,,
|
379 |
+
omroepalmere.nl,1,,,,,,,,,,,
|
380 |
+
Den Helder actueel,1,,,,,,,,,,,
|
381 |
+
rtvhattem.nl,1,,,,,,,,,,,
|
382 |
+
WNL,1,,,,,,,,,,,
|
383 |
+
Omroep Venray,1,,,,,,,,,,,
|
384 |
+
Dagblad070,1,,,,,,,,,,,
|
385 |
+
friesenieuwsflitsen.nl,1,,,,,,,,,,,
|
386 |
+
Kampen Online,1,,,,,,,,,,,
|
387 |
+
dailymail.co.uk,1,,,,,,,,,,,
|
388 |
+
https://112hm.nl/2021/07/21/ernstig-ongeval-op-de-hoogeveenseweg-hazerswoude-dorp-veroorzaakt-door-overstekende-hond/,1,,,,,,,,,,,
|
389 |
+
112insteenwijkerland.nl,1,,,,,,,,,,,
|
390 |
+
varnws.nl,1,,,,,,,,,,,
|
391 |
+
actu.fr,1,,,,,,,,,,,
|
392 |
+
hetkompashardinxveld-giessendam.nl,1,,,,,,,,,,,
|
393 |
+
uitkijkpost.nl,1,,,,,,,,,,,
|
394 |
+
RN7,1,,,,,,,,,,,
|
395 |
+
NOS,1,,,,,,,,,,,
|
396 |
+
uitzendinggemist.net,1,,,,,,,,,,,
|
397 |
+
Nachrichten aus Leipzig - Leipziger Zeitung,1,,,,,,,,,,,
|
398 |
+
twentefm.nl,1,,,,,,,,,,,
|
399 |
+
Sergevanduijnhoven's Blog,1,,,,,,,,,,,
|
400 |
+
Barneveldse Krant,1,,,,,,,,,,,
|
401 |
+
leuvenactueel.be,1,,,,,,,,,,,
|
402 |
+
https://www.schagerdagblad.nl/,1,,,,,,,,,,,
|
403 |
+
coevordenhuisaanhuis.nl,1,,,,,,,,,,,
|
404 |
+
blinker.co.il,1,,,,,,,,,,,
|
405 |
+
Genderendigitaal,1,,,,,,,,,,,
|
406 |
+
De Gelderlander,1,,,,,,,,,,,
|
407 |
+
dagblad010.nl,1,,,,,,,,,,,
|
408 |
+
traumaheli-mmt.nl,1,,,,,,,,,,,
|
409 |
+
limburger.nl,1,,,,,,,,,,,
|
410 |
+
Roosendaal-Internetbode,1,,,,,,,,,,,
|
411 |
+
bommelerwaardgids.nl,1,,,,,,,,,,,
|
412 |
+
Alkmaar Centraal,1,,,,,,,,,,,
|
413 |
+
IJsselmondeNieuws en omstreken op facebook.com,1,,,,,,,,,,,
|
414 |
+
theguardian.com,1,,,,,,,,,,,
|
415 |
+
112 Vlissingen & Souburg,1,,,,,,,,,,,
|
416 |
+
rtvpurmerend.nl,1,,,,,,,,,,,
|
417 |
+
Site-KW-NL,1,,,,,,,,,,,
|
418 |
+
10yan.com,1,,,,,,,,,,,
|
419 |
+
petershotnews.nl,1,,,,,,,,,,,
|
420 |
+
Dumbarton and Vale of Leven Reporter,1,,,,,,,,,,,
|
421 |
+
cyclingweekly.com,1,,,,,,,,,,,
|
422 |
+
hanzestad.nl,1,,,,,,,,,,,
|
423 |
+
emmen.nu,1,,,,,,,,,,,
|
424 |
+
foxreno.com,1,,,,,,,,,,,
|
425 |
+
De Krant van Midden-Drenthe,1,,,,,,,,,,,
|
426 |
+
BBC,1,,,,,,,,,,,
|
427 |
+
112drachten.nl,1,,,,,,,,,,,
|
428 |
+
brummensnieuws.nl,1,,,,,,,,,,,
|
429 |
+
Streetsblog New York City,1,,,,,,,,,,,
|
430 |
+
De Heemsteder,1,,,,,,,,,,,
|
431 |
+
indebuurt Utrecht,1,,,,,,,,,,,
|
432 |
+
westfriesweekblad.nl,1,,,,,,,,,,,
|
433 |
+
1istochnik.ru,1,,,,,,,,,,,
|
434 |
+
kipa.co.il,1,,,,,,,,,,,
|
435 |
+
veluweland.nl,1,,,,,,,,,,,
|
436 |
+
DNN - Dresdner Neueste Nachrichten,1,,,,,,,,,,,
|
437 |
+
112wijchensnieuws.nl,1,,,,,,,,,,,
|
438 |
+
delpher.nl,1,,,,,,,,,,,
|
439 |
+
indebuurt Doetinchem,1,,,,,,,,,,,
|
440 |
+
news4jax.com,1,,,,,,,,,,,
|
resources/deep_frame_cache.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
resources/dep_labels.txt
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
acl:relcl↑
|
2 |
+
acl:relcl↓
|
3 |
+
acl↑
|
4 |
+
acl↓
|
5 |
+
advcl↑
|
6 |
+
advcl↓
|
7 |
+
advmod↑
|
8 |
+
advmod↓
|
9 |
+
amod↑
|
10 |
+
amod↓
|
11 |
+
appos↑
|
12 |
+
appos↓
|
13 |
+
aux:pass↑
|
14 |
+
aux:pass↓
|
15 |
+
aux↑
|
16 |
+
aux↓
|
17 |
+
case↑
|
18 |
+
case↓
|
19 |
+
ccomp↑
|
20 |
+
ccomp↓
|
21 |
+
cc↑
|
22 |
+
cc↓
|
23 |
+
compound:prt↑
|
24 |
+
compound:prt↓
|
25 |
+
compound↑
|
26 |
+
compound↓
|
27 |
+
conj↑
|
28 |
+
conj↓
|
29 |
+
cop↑
|
30 |
+
cop↓
|
31 |
+
csubj↑
|
32 |
+
csubj↓
|
33 |
+
dep↑
|
34 |
+
dep↓
|
35 |
+
det:poss↑
|
36 |
+
det:poss↓
|
37 |
+
det:predet↑
|
38 |
+
det:predet↓
|
39 |
+
det↑
|
40 |
+
det↓
|
41 |
+
discourse↑
|
42 |
+
discourse↓
|
43 |
+
expl:impers↑
|
44 |
+
expl:impers↓
|
45 |
+
expl:pass↓
|
46 |
+
expl:pv↓
|
47 |
+
expl↑
|
48 |
+
expl↓
|
49 |
+
fixed↑
|
50 |
+
fixed↓
|
51 |
+
flat:foreign↑
|
52 |
+
flat:name↑
|
53 |
+
flat:name↓
|
54 |
+
flat↑
|
55 |
+
flat↓
|
56 |
+
iobj↑
|
57 |
+
iobj↓
|
58 |
+
mark↑
|
59 |
+
mark↓
|
60 |
+
nmod:poss↑
|
61 |
+
nmod:poss↓
|
62 |
+
nmod↑
|
63 |
+
nmod↓
|
64 |
+
nsubj:pass↑
|
65 |
+
nsubj:pass↓
|
66 |
+
nsubj↑
|
67 |
+
nsubj↓
|
68 |
+
nummod↑
|
69 |
+
nummod↓
|
70 |
+
obj↑
|
71 |
+
obj↓
|
72 |
+
obl:agent↑
|
73 |
+
obl:agent↓
|
74 |
+
obl↑
|
75 |
+
obl↓
|
76 |
+
orphan↓
|
77 |
+
parataxis↑
|
78 |
+
parataxis↓
|
79 |
+
punct↑
|
80 |
+
punct↓
|
81 |
+
vocative↑
|
82 |
+
vocative↓
|
83 |
+
xcomp↑
|
84 |
+
xcomp↓
|
85 |
+
↑--acl:relcl↓
|
86 |
+
↑--acl↓
|
87 |
+
↑--advcl↓
|
88 |
+
↑--advmod↓
|
89 |
+
↑--amod↓
|
90 |
+
↑--appos↓
|
91 |
+
↑--aux:pass↓
|
92 |
+
↑--aux↓
|
93 |
+
↑--case↓
|
94 |
+
↑--ccomp↓
|
95 |
+
↑--cc↓
|
96 |
+
↑--compound:prt↓
|
97 |
+
↑--compound↓
|
98 |
+
↑--conj↓
|
99 |
+
↑--cop↓
|
100 |
+
↑--csubj↓
|
101 |
+
↑--dep↓
|
102 |
+
↑--det:poss↓
|
103 |
+
↑--det↓
|
104 |
+
↑--discourse↓
|
105 |
+
↑--expl:impers↓
|
106 |
+
↑--expl:pass↓
|
107 |
+
↑--expl↓
|
108 |
+
↑--fixed↓
|
109 |
+
↑--flat:foreign↓
|
110 |
+
↑--flat:name↓
|
111 |
+
↑--flat↓
|
112 |
+
↑--iobj↓
|
113 |
+
↑--mark↓
|
114 |
+
↑--nmod:poss↓
|
115 |
+
↑--nmod↓
|
116 |
+
↑--nsubj:pass↓
|
117 |
+
↑--nsubj↓
|
118 |
+
↑--nummod↓
|
119 |
+
↑--obj↓
|
120 |
+
↑--obl:agent↓
|
121 |
+
↑--obl↓
|
122 |
+
↑--parataxis↓
|
123 |
+
↑--xcomp↓
|
124 |
+
↓--acl:relcl↓
|
125 |
+
↓--acl↓
|
126 |
+
↓--advcl↓
|
127 |
+
↓--advmod↓
|
128 |
+
↓--amod↓
|
129 |
+
↓--appos↓
|
130 |
+
↓--aux:pass↓
|
131 |
+
↓--aux↓
|
132 |
+
↓--case↓
|
133 |
+
↓--ccomp↓
|
134 |
+
↓--cc↓
|
135 |
+
↓--compound:prt↓
|
136 |
+
↓--compound↓
|
137 |
+
↓--conj↓
|
138 |
+
↓--cop↓
|
139 |
+
↓--dep↓
|
140 |
+
↓--det:poss↓
|
141 |
+
↓--det↓
|
142 |
+
↓--expl:impers↓
|
143 |
+
↓--expl↓
|
144 |
+
↓--fixed↓
|
145 |
+
↓--flat:name↓
|
146 |
+
↓--flat↓
|
147 |
+
↓--iobj↓
|
148 |
+
↓--mark↓
|
149 |
+
↓--nmod:poss↓
|
150 |
+
↓--nmod↓
|
151 |
+
↓--nsubj:pass↓
|
152 |
+
↓--nsubj↓
|
153 |
+
↓--nummod↓
|
154 |
+
↓--obj↓
|
155 |
+
↓--obl:agent↓
|
156 |
+
↓--obl↓
|
157 |
+
↓--parataxis↓
|
158 |
+
↓--xcomp↓
|
159 |
+
⋆
|
resources/femicide_frame_list.txt
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# 1) based on Gaetana & Marion's template file
|
2 |
+
Abusing#Violence
|
3 |
+
Attack#Violence
|
4 |
+
Cause_harm#Violence
|
5 |
+
Hit_target#Violence
|
6 |
+
Killing#Murder
|
7 |
+
Rape#Violence
|
8 |
+
Use_firearm#Violence
|
9 |
+
|
10 |
+
# 2) based on Gosse's paper/document (Table 2)
|
11 |
+
Attack#Violence
|
12 |
+
Causation
|
13 |
+
Cause_harm#Violence
|
14 |
+
Cause_motion
|
15 |
+
Emotion_directed
|
16 |
+
Event#Murder
|
17 |
+
Quarreling
|
18 |
+
|
19 |
+
Dead_or_alive#Murder
|
20 |
+
Death#Murder
|
21 |
+
Experience_bodily_harm
|
22 |
+
Killing#Murder
|
23 |
+
Catastrophe#Murder
|
resources/femicides_frame_to_roles.csv
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
frame,role:perpetrator_like,role:victim_like,role:cause_like,notes
|
2 |
+
Abusing,Abuser,Victim,-,
|
3 |
+
Attack,Assailant,Victim,-,
|
4 |
+
Causation,Causer,Affected,Cause,
|
5 |
+
Cause_harm,Agent,Victim,Cause,
|
6 |
+
Cause_motion,-,-,-,does not seem to usually refer to the main murder event
|
7 |
+
Dead_or_alive,-,Protagonist,Explanation,
|
8 |
+
Death,-,Protagonist,Cause,
|
9 |
+
Emotion_directed,-,-,-,does not seem to usually refer to the main murder event
|
10 |
+
Event,-,-,-,does not involve any participants
|
11 |
+
Experience_bodily_harm,Experiencer|Body_part,-,-,
|
12 |
+
Hit_target,Agent,Target,-,
|
13 |
+
Killing,Killer,Victim,Cause,
|
14 |
+
Quarreling,-,-,-,core roles (Arguers/Arguer1/Arguer2) could denote either Perpetrator or victim
|
15 |
+
Rape,Perpetrator,Victim,-,
|
16 |
+
Use_firearm,Agent,Goal,-,
|
resources/fn_frames_to_roles.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
resources/migration_frame_list.txt
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ARRIVING#Travel
|
2 |
+
CAUSE_MOTION#Travel
|
3 |
+
DEPARTING#Travel
|
4 |
+
FLUIDIC_MOTION#Travel
|
5 |
+
SELF_MOTION#Travel
|
6 |
+
TRAVEL#Travel
|
7 |
+
DISEMBARKING#Travel
|
8 |
+
RISKY_SITUATION#Travel
|
9 |
+
DEATH#Travel
|
10 |
+
|
11 |
+
CARDINAL_NUMBERS#Quantification
|
12 |
+
CHANGE_OF_QUANTITY_OF_POSSESSION#Quantification
|
13 |
+
CHANGE_POSITION_ON_A_SCALE#Quantification
|
14 |
+
FAMILIARITY#Quantification
|
15 |
+
INCREMENT#Quantification
|
16 |
+
PROLIFERATING_IN_NUMBER#Quantification
|
17 |
+
QUANTIFIED_MASS#Quantification
|
18 |
+
QUANTITY#Quantification
|
19 |
+
|
20 |
+
ABUSING#Crime
|
21 |
+
ARREST#Crime
|
22 |
+
COMMITTING_CRIME#Crime
|
23 |
+
INTENTIONAL_DECEPTION#Crime
|
24 |
+
KILLING#Crime
|
25 |
+
RAPE#Crime
|
26 |
+
ROBBERY#Crime
|
27 |
+
SMUGGLING#Crime
|
28 |
+
PROTEST#Crime
|
29 |
+
THEFT#Crime
|
30 |
+
CAUSE_HARM#Crime
|
31 |
+
|
32 |
+
HOSTILE_ENCOUNTER#Hostility
|
33 |
+
INVADING#Hostility
|
34 |
+
ATTACK#Hostility
|
35 |
+
WEAPON#Hostility
|
36 |
+
|
37 |
+
ARRANGING#Administration
|
38 |
+
MAKING_ARRANGEMENTS#Administration
|
39 |
+
DISCUSSION#Administration
|
40 |
+
EXECUTE_PLAN#Administration
|
41 |
+
LEADERSHIP#Administration
|
42 |
+
EXPEND_RESOURCE#Administration
|
43 |
+
GATHERING_UP#Administration
|
44 |
+
PLACING#Administration
|
45 |
+
POINT_OF_DISPUTE#Administration
|
46 |
+
INHIBIT_MOVEMENT#Administration
|
47 |
+
EXPENSIVENESS#Administration
|
48 |
+
|
49 |
+
ASSISTANCE#Humanizing
|
50 |
+
HIRING#Humanizing
|
51 |
+
INTENTIONALLY_CREATE#Humanizing
|
52 |
+
SOCIAL_EVENT#Humanizing
|
53 |
+
KINSHIP#Humanizing
|
54 |
+
COLLABORATION#Humanizing
|
55 |
+
EDUCATION_TEACHING#Humanizing
|
56 |
+
RESCUING#Humanizing
|
sociofillmore/__init__.py
ADDED
File without changes
|
sociofillmore/__init__.pyc
ADDED
Binary file (112 Bytes). View file
|
|
sociofillmore/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (187 Bytes). View file
|
|
sociofillmore/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (157 Bytes). View file
|
|
sociofillmore/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (169 Bytes). View file
|
|
sociofillmore/common/__init__.py
ADDED
File without changes
|
sociofillmore/common/__pycache__/__init__.cpython-37.pyc
ADDED
Binary file (164 Bytes). View file
|
|
sociofillmore/common/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (176 Bytes). View file
|
|
sociofillmore/common/__pycache__/analyze_text.cpython-37.pyc
ADDED
Binary file (22.8 kB). View file
|
|
sociofillmore/common/__pycache__/analyze_text.cpython-39.pyc
ADDED
Binary file (23 kB). View file
|
|
sociofillmore/common/__pycache__/split_lome_files.cpython-39.pyc
ADDED
Binary file (819 Bytes). View file
|
|
sociofillmore/common/analyze_text.py
ADDED
@@ -0,0 +1,1046 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
import sys
|
5 |
+
import argparse
|
6 |
+
import re
|
7 |
+
import tarfile
|
8 |
+
from collections import defaultdict
|
9 |
+
import dataclasses
|
10 |
+
from datetime import datetime
|
11 |
+
from typing import Any, Dict, List, Tuple, Optional
|
12 |
+
|
13 |
+
import pandas as pd
|
14 |
+
import spacy
|
15 |
+
from nltk.corpus import framenet as fn
|
16 |
+
from nltk.corpus.reader.framenet import FramenetError
|
17 |
+
from spacy.tokens import Token
|
18 |
+
|
19 |
+
from sociofillmore.crashes.utils import is_a_dutch_text
|
20 |
+
|
21 |
+
ITALIAN_ACTIVE_AUX = ["avere", "ha", "ho", "hai", "avete", "hanno", "abbiamo"]
|
22 |
+
DUTCH_ACTIVE_AUX = ["heb", "hebben", "heeft"]
|
23 |
+
|
24 |
+
active_frames_df = pd.read_csv("resources/active_frames_full.csv")
|
25 |
+
ACTIVE_FRAMES = active_frames_df[active_frames_df["active"]]["frame"].tolist()
|
26 |
+
|
27 |
+
|
28 |
+
IGNORE_DEP_LABELS = ["punct"]
|
29 |
+
|
30 |
+
|
31 |
+
|
32 |
+
DEEP_FRAMES = [
|
33 |
+
"Transitive_action",
|
34 |
+
"Causation",
|
35 |
+
"Transition_to_a_state",
|
36 |
+
"Event",
|
37 |
+
"State",
|
38 |
+
]
|
39 |
+
# SYNTAX_ANALYSIS_CACHE_FILES = {
|
40 |
+
# "femicides/rai": "resources/rai_syntax_analysis_cache.json",
|
41 |
+
# "femicides/rai_main": "resources/rai_main_syntax_analysis_cache.json",
|
42 |
+
# "femicides/olv": "resources/olv_syntax_analysis_cache.json",
|
43 |
+
# "crashes/thecrashes": "resources/thecrashes_syntax_analysis_cache.json",
|
44 |
+
# "migration/pavia": "resources/migration_pavia_syntax_analysis_cache.json"
|
45 |
+
# }
|
46 |
+
SYNTAX_ANALYSIS_CACHE_FILES = {
|
47 |
+
"femicides/rai": "output/femicides/syntax_cache/rai_ALL",
|
48 |
+
"femicides/rai_main": "output/femicides/syntax_cache/rai_main",
|
49 |
+
"femicides/rai_ALL": "output/femicides/syntax_cache/rai_ALL",
|
50 |
+
"femicides/olv": "output/femicides/syntax_cache/olv",
|
51 |
+
"crashes/thecrashes": "output/crashes/syntax_cache/thecrashes",
|
52 |
+
"migration/pavia": "output/migration/syntax_cache/pavia",
|
53 |
+
}
|
54 |
+
|
55 |
+
|
56 |
+
DEEP_FRAMES_CACHE_FILE = "resources/deep_frame_cache.json"
|
57 |
+
|
58 |
+
DEP_LABEL_CACHE_FILE = "resources/dep_labels.txt"
|
59 |
+
|
60 |
+
POSSIBLE_CONSTRUCTIONS = [
|
61 |
+
"nonverbal",
|
62 |
+
"verbal:active",
|
63 |
+
"verbal:impersonal",
|
64 |
+
"verbal:reflexive",
|
65 |
+
"verbal:passive",
|
66 |
+
"verbal:unaccusative",
|
67 |
+
"other",
|
68 |
+
]
|
69 |
+
|
70 |
+
|
71 |
+
def load_deep_frames_cache():
|
72 |
+
if os.path.isfile(DEEP_FRAMES_CACHE_FILE):
|
73 |
+
print("Loading deep frame cache...")
|
74 |
+
with open(DEEP_FRAMES_CACHE_FILE, encoding="utf-8") as f:
|
75 |
+
deep_frames_cache = json.load(f)
|
76 |
+
else:
|
77 |
+
deep_frames_cache = {}
|
78 |
+
return deep_frames_cache
|
79 |
+
|
80 |
+
|
81 |
+
# make spacy work with google app engine
|
82 |
+
# (see https://stackoverflow.com/questions/55228492/spacy-on-gae-standard-second-python-exceeds-memory-of-largest-instance)
|
83 |
+
# nlp = spacy.load("it_core_news_md")
|
84 |
+
nlp = None
|
85 |
+
|
86 |
+
|
87 |
+
@dataclasses.dataclass
|
88 |
+
class AnnotationSpan:
|
89 |
+
tokens_idx: List[int]
|
90 |
+
tokens_str: List[str]
|
91 |
+
|
92 |
+
|
93 |
+
@dataclasses.dataclass
|
94 |
+
class FrameStructure:
|
95 |
+
frame: str
|
96 |
+
deep_frame: str
|
97 |
+
target: Optional[AnnotationSpan]
|
98 |
+
roles: List[Tuple[str, AnnotationSpan]]
|
99 |
+
deep_roles: List[Tuple[str, AnnotationSpan]]
|
100 |
+
|
101 |
+
|
102 |
+
def make_syntax_cache(dataset, skip_fn=None):
    """
    Run the syntactic analysis over all LOME prediction files of a dataset and cache the results on disk

    The analyses are stored under `SYNTAX_ANALYSIS_CACHE_FILES[dataset]`, grouped into JSON files named after
    the first two characters of the document id. Each file maps document id -> list of per-sentence analyses
    (or `None` for documents rejected by `skip_fn`).

    :param dataset: dataset identifier, e.g. "femicides/rai"
    :param skip_fn: optional predicate that receives the document id (a string); documents for which it returns
        a true value are not analyzed and get `None` as their cache entry
    :raises ValueError: if `dataset` is not supported
    """
    print(f"make_syntax_cache({dataset})")

    # dataset -> (directory with tarball blocks, corpus name, spaCy model name)
    dataset_params = {
        "femicides/rai": (
            "output/femicides/lome/lome_0shot/multilabel_rai_blocks", "rai", "it_core_news_md"),
        "femicides/rai_main": (
            "output/femicides/lome/lome_0shot/multilabel_rai_main_blocks", "rai_main", "it_core_news_md"),
        "femicides/rai_ALL": (
            "output/femicides/lome/lome_0shot/multilabel_rai_ALL_blocks", "rai_ALL", "it_core_news_md"),
        "femicides/olv": (
            "output/femicides/lome/lome_0shot/multilabel_olv_blocks", "olv", "it_core_news_md"),
        "crashes/thecrashes": (
            "output/crashes/lome/lome_0shot/multilabel_thecrashes_blocks", "thecrashes", "nl_core_news_md"),
        "migration/pavia": (
            "output/migration/lome/lome_0shot/multilabel_pavia_blocks", "pavia", "it_core_news_md"),
    }
    if dataset not in dataset_params:
        raise ValueError("Unsupported dataset!")
    corpus_tarball, corpus, spacy_model = dataset_params[dataset]

    print("params:")
    print(f"\tcorpus_tarball: {corpus_tarball}")
    print(f"\tcorpus: {corpus}")
    print(f"\tspacy: {spacy_model}")

    print("processing files...")

    # make sure the output directory exists
    cache_location = SYNTAX_ANALYSIS_CACHE_FILES[dataset]
    os.makedirs(cache_location, exist_ok=True)

    # The spaCy model is loaded on first use and then shared by all sentences; without a model object
    # `syntax_analyze()` would call `spacy.load()` again for every single sentence.
    spacy_model_obj = None

    for block in os.listdir(corpus_tarball):
        print(block)

        with tarfile.open(os.path.join(corpus_tarball, block)) as tar_in:
            lome_files = sorted(
                (member for member in tar_in.getmembers() if member.name.endswith(".comm.json")),
                key=lambda member: member.name,
            )

            for file in lome_files:
                print(f"\tprocessing file {file}")
                doc_id = re.search(r"lome_(\d+)\.comm\.json", file.name).group(1)

                if skip_fn is not None and skip_fn(doc_id):
                    print(f"\t\tskip_fn: skipping file {file}")
                    syntax_analyses = None
                else:
                    if spacy_model_obj is None:
                        spacy_model_obj = spacy.load(spacy_model)

                    # decode explicitly as UTF-8, like every other file read in this module
                    file_obj = io.TextIOWrapper(tar_in.extractfile(file), encoding="utf-8")
                    annotations = json.load(file_obj)
                    syntax_analyses = [
                        syntax_analyze(sentence, spacy_model, spacy_model_obj)
                        for sentence in annotations
                    ]

                # use the first two chars of the document id as key of the cache file
                file_key = doc_id[:2]
                cache_file = f"{cache_location}/{file_key}.json"
                if os.path.isfile(cache_file):
                    with open(cache_file, encoding="utf-8") as f:
                        key_cache = json.load(f)
                else:
                    key_cache = {}
                key_cache[doc_id] = syntax_analyses
                with open(cache_file, "w", encoding="utf-8") as f:
                    json.dump(key_cache, f)
|
187 |
+
|
188 |
+
|
189 |
+
def make_syntax_cache_key(filename):
    """
    Extract the document id from the path of a prediction file

    :param filename: path containing ".../<digits>/lome_<doc id>.comm.json"
    :return: the document id, as a string of digits
    """
    id_match = re.search(r"/\d+/lome_(\d+)\.comm\.json", filename)
    return id_match.group(1)
|
192 |
+
|
193 |
+
|
194 |
+
def clean_sentence_(sentence):
    """
    Remove whitespace-only tokens from a sentence, in place

    The entry at the same position is removed from every list stored in `sentence`, so that all annotation
    layers stay aligned with the tokens.

    :param sentence: dict with a "tokens" list; every other value is expected to support `.pop(idx)`
    """
    blank_positions = [
        pos for pos, tok in enumerate(sentence["tokens"]) if not tok.strip()
    ]

    # go right-to-left so that the positions still to be removed stay valid
    for pos in reversed(blank_positions):
        for annotation_list in sentence.values():
            annotation_list.pop(pos)
|
208 |
+
|
209 |
+
|
210 |
+
def process_prediction_file(
    filename: str,
    dataset_name: str,
    syntax_cache: str,
    deep_frames_cache: dict,
    tmp_cache: Optional[dict] = None,
    file_obj: io.TextIOBase = None,
    syntax_cache_key: Optional[str] = None,
    deep_frames_list: Optional[List[str]] = None,
    spacy_model: str = "it_core_news_md",
    spacy_model_obj = None
) -> Tuple[List, ...]:
    """
    Process a predictions JSON file
    :param filename: path to the JSON file
    :param dataset_name: dataset identifier (not used by this function)
    :param syntax_cache: directory of the syntax cache, see `make_syntax_cache()`. If `None`, the syntactic
     analysis is computed on the fly
    :param deep_frames_cache: frame -> deep frame cache, see `load_deep_frames_cache()`
    :param tmp_cache: optional dict holding the most recently read cache file; it is consulted before reading
     from disk and replaced with the content of any cache file that has to be read
    :param file_obj: already opened object corresponding to `filename`. If given, `file_obj` will be used instead
     of loading it from `filename`. This is useful when reading the entire corpus from a tarball (which is what the
     SocioFillmore webapp does)
    :param syntax_cache_key: document id to look up in the syntax cache; derived from `filename` if not given
    :param deep_frames_list: passed on to `process_fn_sentence()`
    :param spacy_model: name of the spacy model to be used for syntactic analysis
    :param spacy_model_obj: already loaded spacy model, passed on to `syntax_analyze()`
    :return: tuple (sentences, frame structures, syntactic analyses, role analyses), one entry per sentence each
    """

    print("Processing", filename)

    # -- read the annotations
    if file_obj is None:
        with open(filename, encoding="utf-8") as f:
            annotations = json.load(f)
    else:
        annotations = json.load(file_obj)

    # -- get the syntactic analyses: on the fly, from the in-memory cache, or from the cache on disk
    if syntax_cache is None:
        syntax_analyses = [
            syntax_analyze(sentence, spacy_model, spacy_model_obj)
            for sentence in annotations
        ]
    else:
        if syntax_cache_key is None:
            syntax_cache_key = make_syntax_cache_key(filename)

        if tmp_cache is not None and syntax_cache_key in tmp_cache:
            syntax_analyses = tmp_cache[syntax_cache_key]
        else:
            # cache files are named after the first two characters of the document id
            with open(f"{syntax_cache}/{syntax_cache_key[:2]}.json", encoding="utf-8") as cache_file:
                grouped_analyses = json.load(cache_file)
            syntax_analyses = grouped_analyses[syntax_cache_key]
            if tmp_cache is not None:
                tmp_cache.clear()
                tmp_cache.update(grouped_analyses)

    # NOTE(review): `make_syntax_cache()` stores `None` for documents rejected by its `skip_fn`;
    # indexing such an entry in the loop below would fail -- confirm that callers never request those documents

    fn_structures: List[Dict[int, FrameStructure]] = []
    sentences: List[List[str]] = []
    role_analyses: List[Dict[int, Dict[str, str]]] = []

    for sent_idx, sentence in enumerate(annotations):

        clean_sentence_(sentence)

        try:
            sent_structures = process_fn_sentence(
                sentence, deep_frames_cache, deep_frames_list=deep_frames_list
            )
        # seems to occur for one specific file in the migration set, TODO find out what happens
        except AttributeError:
            print("Error processing FN annotations")
            sent_structures = {}

        syntax = syntax_analyses[sent_idx]

        # disambiguate syntactic constructions
        for fs in sent_structures.values():
            target_idx = str(fs.target.tokens_idx[0])
            if target_idx in syntax:
                disambiguate_cxs_(fs, syntax[target_idx][-1])
            else:
                print(
                    f"Prediction file {filename}: Cannot find syntactic information for target at idx={target_idx}")

        role_analyses.append(process_syn_sem_roles(sent_structures, syntax))
        sentences.append(sentence["tokens"])
        fn_structures.append(sent_structures)

    return sentences, fn_structures, syntax_analyses, role_analyses
|
296 |
+
|
297 |
+
|
298 |
+
def disambiguate_cxs_(struct: FrameStructure, tgt_syntax):
    """
    Replace a construction placeholder by a concrete construction label, in place

    Placeholders are the "syn_construction" values starting with "_"; which label they resolve to depends on
    the frame and deep frame of `struct`. Values without a leading "_" are left untouched.

    :param struct: frame structure evoked by the target
    :param tgt_syntax: syntactic info of the target token; its "syn_construction" entry is overwritten
    :raises ValueError: if the placeholder is not a known one
    """
    cx = tgt_syntax["syn_construction"]
    if not cx.startswith("_"):
        # no "_" at the beginning: no disambiguation needed
        return

    # NB works only for the selected relevant frames! if any other frames are added, make sure to update this
    active_deep_frames = [
        "Transitive_action", "Causation", "Emotion_directed", "Quarreling", "Impact", "Committing_crime"
    ]
    if struct.deep_frame in active_deep_frames or struct.frame in ACTIVE_FRAMES:
        frame_agentivity_type = "active"
    elif struct.frame == "Event":
        frame_agentivity_type = "impersonal"
    else:
        frame_agentivity_type = "unaccusative"

    if cx == "_verbal:ACTIVE":
        new_cx = f"verbal:{frame_agentivity_type}"
    elif cx in ["_verbal:ADPOS", "_verbal:OTH_PART"]:
        # a participle of an agentive frame counts as passive, otherwise the frame type decides
        new_cx = (
            "verbal:passive"
            if frame_agentivity_type == "active"
            else f"verbal:{frame_agentivity_type}"
        )
    else:
        raise ValueError(f"Unknown construction placeholder {cx}")

    tgt_syntax["syn_construction"] = new_cx
|
327 |
+
|
328 |
+
|
329 |
+
def find_governed_roles(
    syn_self: Dict[str, Any],
    syn_children: List[Dict[str, Any]],
    roles: List[Tuple[str, AnnotationSpan]],
) -> Dict[str, str]:
    """
    Find the roles whose span contains a given token or one of its syntactic children

    :param syn_self: syntactic info of the token itself
    :param syn_children: syntactic info of the token's children
    :param roles: (role name, span) pairs of the frame structure
    :return: dict role name -> dependency label of the child followed by "↓", or `None` if the role span contains
        the token itself. Only the first match per role is kept; nodes whose dependency label is in
        `IGNORE_DEP_LABELS` never match
    """
    roles_found = {}

    # find roles that are governed by the predicate
    for node in [syn_self] + syn_children:
        dep_label = node["dependency"]
        for role_name, role_span in roles:
            if node["lome_idx"] not in role_span.tokens_idx:
                continue
            if role_name in roles_found or dep_label in IGNORE_DEP_LABELS:
                continue
            roles_found[role_name] = None if node == syn_self else dep_label + "↓"

    return roles_found
|
348 |
+
|
349 |
+
|
350 |
+
def analyze_role_dependencies(
    fn_struct,
    syntax,
    role_analysis=None,
    tgt_idx=None,
    min_depth=-10,
    max_depth=10,
    depth=0,
    label_prefix="",
):
    """
    Recursively find, for each role of a frame structure, how it is syntactically connected to the target

    Starting from the target (depth 0) the search first goes down through the children (negative depths) and
    then up through the first ancestor of each visited token (positive depths). A role is only recorded the
    first time it is found.

    :param fn_struct: the frame structure whose roles are analyzed
    :param syntax: syntactic analysis of the sentence (token index as string -> list of token infos)
    :param role_analysis: analysis built up so far (it is copied, not modified)
    :param tgt_idx: index of the token to look at; defaults to the first token of the target
    :param min_depth: lowest depth that is still explored
    :param max_depth: highest depth that is still explored
    :param depth: depth of the current token relative to the target
    :param label_prefix: label of the step that led to the current token, prepended to the labels found here
    :return: dict role name -> (label, depth)
    """
    if role_analysis is None:
        role_analysis = {}
    if tgt_idx is None:
        tgt_idx = fn_struct.target.tokens_idx[0]

    if depth > max_depth or depth < min_depth:
        return role_analysis

    new_analysis = dict(role_analysis)
    token_syntax = syntax[str(tgt_idx)][0]

    # roles found at the current token or directly below it
    governed = find_governed_roles(
        token_syntax, token_syntax["children"], fn_struct.roles)
    for role, dep in governed.items():
        if role in new_analysis:
            continue
        if dep is None:
            # the role span contains the current token itself
            label = label_prefix or "⋆"
            depth_label = depth
        else:
            label = label_prefix + "--" + dep if label_prefix else dep
            depth_label = depth + 1 if depth > 0 else depth - 1
        new_analysis[role] = label, depth_label

    # from the initial predicate: first try the children
    if depth <= 0:
        for child in token_syntax["children"]:
            child_analysis = analyze_role_dependencies(
                fn_struct,
                syntax,
                role_analysis=new_analysis,
                tgt_idx=child["lome_idx"],
                max_depth=max_depth,
                min_depth=min_depth,
                depth=depth - 1,
                label_prefix=child["dependency"] + "↓"
            )
            new_analysis.update(child_analysis)

    # ... then try the ancestors (never when we are already below the predicate)
    if depth < 0 or not token_syntax["ancestors"]:
        return new_analysis

    first_ancestor = token_syntax["ancestors"][0]
    return analyze_role_dependencies(
        fn_struct,
        syntax,
        role_analysis=new_analysis,
        tgt_idx=first_ancestor["lome_idx"],
        max_depth=max_depth,
        min_depth=min_depth,
        depth=depth + 1,
        label_prefix=token_syntax["dependency"] + "↑",
    )
|
435 |
+
|
436 |
+
|
437 |
+
def process_syn_sem_roles(
    sent_structures: Dict[int, "FrameStructure"], syntax: Dict[str, List[Dict[str, Any]]]
) -> Dict[int, Dict[str, str]]:
    """
    Analyze the syntactic realization of the roles of every frame structure in a sentence

    Structures without a target, or whose target has no entry in `syntax`, are left out of the result:
    `process_prediction_file()` tolerates such targets (it only prints a message), so they must not crash
    the role analysis either.

    :param sent_structures: frame structures of the sentence
    :param syntax: syntactic analysis of the sentence (token index as string -> list of token infos)
    :return: dict index of the first target token -> cleaned role analysis (see `clean_role_deps()`)
    """
    analyses = defaultdict(dict)

    # go through all frame targets
    for struct in sent_structures.values():
        target = struct.target
        if target is None or not target.tokens_idx:
            continue
        tgt_idx = target.tokens_idx[0]

        # `.get()` rather than indexing: `syntax` may be a defaultdict, which would otherwise gain an empty entry
        if not syntax.get(str(tgt_idx)):
            continue

        role_deps = analyze_role_dependencies(struct, syntax, max_depth=10)
        analyses[tgt_idx] = clean_role_deps(role_deps)
    return analyses
|
448 |
+
|
449 |
+
|
450 |
+
def clean_role_deps(role_deps):
    """
    Shorten the dependency paths of a role analysis

    Paths are "--"-separated; every part except the last one is reduced to its final character, the last
    part is kept as it is.

    :param role_deps: dict role name -> (dependency path, depth)
    :return: new dict role name -> (shortened dependency path, depth)
    """
    res = {}
    for role, (dep_str, depth) in role_deps.items():
        *leading_parts, last_part = dep_str.split("--")
        shortened = [part[-1] for part in leading_parts]
        res[role] = "--".join(shortened + [last_part]), depth
    return res
|
460 |
+
|
461 |
+
|
462 |
+
def map_or_lookup_deep_frame(
    frame: str, deep_frames_cache, save_modified_cache=False, deep_frames_list=None
) -> Tuple[str, Dict[str, str]]:
    """
    Get the deep frame and role mapping of a frame, computing and caching them if necessary

    :param frame: name of the frame
    :param deep_frames_cache: dict frame -> [deep frame, role mapping]; new entries are added to it
    :param save_modified_cache: if true, write the cache to `DEEP_FRAMES_CACHE_FILE` whenever an entry is added
    :param deep_frames_list: passed on to `map_to_deep_frame()`
    :return: the cache entry of the frame
    """
    if frame not in deep_frames_cache:
        deep_frame, mapping = map_to_deep_frame(
            frame, deep_frames_list=deep_frames_list
        )
        deep_frames_cache[frame] = [deep_frame, mapping]

        if save_modified_cache:
            with open(DEEP_FRAMES_CACHE_FILE, "w", encoding="utf-8") as f:
                json.dump(deep_frames_cache, f)

    return deep_frames_cache[frame]
|
476 |
+
|
477 |
+
|
478 |
+
def map_to_deep_frame(
    frame: str,
    target: Optional[str] = None,
    mapping: Optional[Dict[str, str]] = None,
    self_mapping: Optional[Dict[str, str]] = None,
    deep_frames_list: Optional[List[str]] = None,
) -> Tuple[str, Dict[str, str]]:
    """
    Follow the FrameNet inheritance relations of a frame upwards until a deep frame is reached

    :param frame: frame currently being looked at
    :param target: frame the search started from (defaults to `frame`)
    :param mapping: roles of `target` -> roles of `frame`, built up along the inheritance path
    :param self_mapping: identity mapping of the roles of `target`, returned when no deep frame is found
    :param deep_frames_list: names of the deep frames, in order of preference (defaults to `DEEP_FRAMES`)
    :return: (deep frame, mapping from the roles of `target` to the roles of the deep frame) or, if no deep
        frame can be reached, (`target`, `self_mapping`). If `frame` cannot be looked up in FrameNet:
        (`frame`, {})
    """
    if deep_frames_list is None:
        deep_frames_list = DEEP_FRAMES

    # look up in FrameNet
    try:
        fn_entry = fn.frame(frame)
    except (FramenetError, LookupError):
        return frame, {}

    # initial call: `target` == `frame`, mapping maps to self
    if target is None:
        target = frame
    if mapping is None or self_mapping is None:
        mapping = self_mapping = {role: role for role in fn_entry.FE.keys()}

    # base case: our frame is a deep frame
    if frame in deep_frames_list:
        return frame, mapping

    # otherwise, look at parents
    inh_relations = [
        rel
        for rel in fn_entry.frameRelations
        if rel.type.name == "Inheritance" and rel.Child == fn_entry
    ]

    # no parents --> failure, return original frame
    if not inh_relations:
        return target, self_mapping

    # one parent: follow that parent
    if len(inh_relations) == 1:
        only_rel = inh_relations[0]
        return map_to_deep_frame(
            only_rel.Parent.name,
            target,
            define_fe_mapping(mapping, only_rel),
            self_mapping,
            deep_frames_list,
        )

    # more parents: check if any of them leads to a deep frame
    reachable = {}
    for parent_rel in inh_relations:
        final_frame, final_mapping = map_to_deep_frame(
            parent_rel.Parent.name,
            target,
            define_fe_mapping(mapping, parent_rel),
            self_mapping,
            deep_frames_list,
        )
        # if several parents lead to the same deep frame, the first one wins
        if final_frame in deep_frames_list and final_frame not in reachable:
            reachable[final_frame] = final_mapping

    # the order of `deep_frames_list` decides which of the reachable deep frames is chosen
    for deep_frame in deep_frames_list:
        if deep_frame in reachable:
            return deep_frame, reachable[deep_frame]

    # nothing found, return original frame
    return target, self_mapping
|
547 |
+
|
548 |
+
|
549 |
+
def define_fe_mapping(mapping, parent_rel):
    """
    Extend a role mapping by one inheritance step

    :param mapping: dict original role -> role in the child frame of `parent_rel`
    :param parent_rel: frame relation whose `feRelations` link child roles (`subFEName`) to parent roles
        (`superFEName`)
    :return: dict original role -> role in the parent frame; roles without a counterpart in the parent frame
        are dropped
    """
    child_to_parent = {}
    for fe_rel in parent_rel.feRelations:
        child_to_parent[fe_rel.subFEName] = fe_rel.superFEName

    target_to_parent = {}
    for role, child_role in mapping.items():
        if child_role in child_to_parent:
            target_to_parent[role] = child_to_parent[child_role]
    return target_to_parent
|
559 |
+
|
560 |
+
|
561 |
+
def is_at_root(syntax_info):
    """
    Check whether a token is the root of its sentence or the subject of the root

    :param syntax_info: syntactic info of the token (needs "dependency" and, for subjects, "ancestors")
    :return: `True` if the token is the root, or an "nsubj" whose first ancestor is the root
    """
    dependency = syntax_info["dependency"]
    return dependency == "ROOT" or (
        dependency == "nsubj"
        and syntax_info["ancestors"][0]["dependency"] == "ROOT"
    )
|
572 |
+
|
573 |
+
|
574 |
+
def get_tarball_blocks(dataset, lome_model="lome_0shot"):
    """
    Get the directory containing the tarball blocks with the LOME predictions of a dataset

    :param dataset: dataset identifier, e.g. "femicides/rai"
    :param lome_model: name of the LOME model whose predictions are requested
    :return: path of the directory with the tarball blocks
    :raises ValueError: if `dataset` is not supported
    """
    # dataset -> (domain, name of the blocks directory)
    # NB: "femicides/rai" is served from the "rai_ALL" blocks; "femicides/rai_ALL" itself is accepted as well,
    # like everywhere else in this module (syntax cache, command line)
    block_dirs = {
        "femicides/rai": ("femicides", "multilabel_rai_ALL_blocks"),
        "femicides/rai_ALL": ("femicides", "multilabel_rai_ALL_blocks"),
        "femicides/rai_main": ("femicides", "multilabel_rai_main_blocks"),
        "femicides/olv": ("femicides", "multilabel_olv_blocks"),
        "crashes/thecrashes": ("crashes", "multilabel_thecrashes_blocks"),
        "migration/pavia": ("migration", "multilabel_pavia_blocks"),
    }
    if dataset not in block_dirs:
        raise ValueError("Unsupported dataset!")

    domain, blocks_dir = block_dirs[dataset]
    return f"output/{domain}/lome/{lome_model}/{blocks_dir}"
|
587 |
+
|
588 |
+
|
589 |
+
def analyze_single_document(doc_id, event_id, lome_model, dataset, texts_df, deep_frames_cache):
    """
    Analyze a single document of a dataset, reading its predictions from the tarball blocks

    :param doc_id: id of the document
    :param event_id: id of the event the document belongs to
    :param lome_model: name of the LOME model; selects the tarball blocks (and, for "migration/pavia", the path
        of the prediction file inside the tarball)
    :param dataset: dataset identifier in the form "<domain>/<corpus>"
    :param texts_df: dataframe with the metadata of the texts, see `get_text_meta()`
    :param deep_frames_cache: frame -> deep frame cache, see `load_deep_frames_cache()`
    :return: list with one dict per sentence, with keys "sentence", "fn_structures", "syntax", "roles", "meta"
    """
    data_domain, data_corpus = dataset.split("/")

    syntax_cache = SYNTAX_ANALYSIS_CACHE_FILES[dataset]

    print(dataset)

    if dataset == "migration/pavia":  # this is a hack, fix it!
        pred_file_path = f"output/migration/lome/multilabel/{lome_model}/pavia/{event_id}/lome_{doc_id}.comm.json"
    elif dataset == "femicides/olv":
        pred_file_path = f"output/femicides/lome/lome_0shot/multilabel/olv/{event_id}/lome_{doc_id}.comm.json"
    else:
        pred_file_path = f"output/{data_domain}/lome/lome_0shot/multilabel/{data_corpus}/{event_id}/lome_{doc_id}.comm.json"
    print(f"Analyzing file {pred_file_path}")

    # from here on the document id is the string taken from the file name
    doc_id = os.path.basename(pred_file_path).split(".")[0].split("_")[1]
    doc_key = doc_id[:2]
    tarball = get_tarball_blocks(dataset, lome_model) + f"/block_{doc_key}.tar"
    with tarfile.open(tarball, "r") as tar_f:
        # decode explicitly as UTF-8 (like every `open()` in this module) instead of relying on the locale
        pred_file = io.TextIOWrapper(tar_f.extractfile(pred_file_path), encoding="utf-8")

        (
            sents,
            pred_structures,
            syntax_analyses,
            role_analyses,
        ) = process_prediction_file(
            filename=pred_file_path,
            dataset_name=dataset,
            file_obj=pred_file,
            syntax_cache=syntax_cache,
            deep_frames_cache=deep_frames_cache
        )

    output = []
    for sent, structs, syntax, roles in zip(
        sents, pred_structures, syntax_analyses, role_analyses
    ):
        output.append(
            {
                "sentence": sent,
                "fn_structures": [
                    dataclasses.asdict(fs) for fs in structs.values()
                ],
                "syntax": syntax,
                "roles": roles,
                "meta": {
                    "event_id": event_id,
                    "doc_id": doc_id,
                    "text_meta": get_text_meta(doc_id, texts_df),
                },
            }
        )
    return output
|
642 |
+
|
643 |
+
|
644 |
+
def get_text_meta(doc_id, texts_df):
    """
    Collect the metadata of a document from the texts dataframe

    :param doc_id: id of the document; it is converted to `int` and matched against the "text_id" column
    :param texts_df: dataframe with (at least) the columns "text_id", "provider" and "title"; "url", "pubdate",
        "pubyear" and "days_after_event" are used if present
    :return: dict with the keys "url", "pubdate", "provider", "title" and "days_after_event"
    """
    row = texts_df[texts_df["text_id"] == int(doc_id)].iloc[0]

    def value_or_none(column):
        value = row[column]
        return None if pd.isna(value) else value

    # publication date: the full date if there is such a column, else the year
    if "pubdate" in row:
        pubdate = value_or_none("pubdate")
    elif "pubyear" in row:
        pubdate = int(row["pubyear"])
    else:
        pubdate = None

    meta = {
        "url": row["url"] if "url" in row else None,
        "pubdate": pubdate,
        "provider": row["provider"],
        "title": value_or_none("title"),
    }

    # missing or unknown time delta counts as 0 days
    if "days_after_event" in row and not pd.isna(row["days_after_event"]):
        meta["days_after_event"] = int(row["days_after_event"])
    else:
        meta["days_after_event"] = 0
    return meta
|
659 |
+
|
660 |
+
|
661 |
+
def process_fn_sentence(
    sentence, deep_frames_cache, post_process=True, deep_frames_list=None
):
    """
    Build the frame structures of a sentence from its token-level frame annotations

    Each annotation has the form "<annotation>@<structure id>", optionally followed by "@@<confidence>". The
    annotation starts with "T" (token of the target), "B" (first token of a role span) or "I" (continuation
    of a role span), followed by the frame name and, for roles, the role name, separated by ":".

    :param sentence: dict with the parallel lists "tokens" and "frame_list"
    :param deep_frames_cache: frame -> deep frame cache, see `map_or_lookup_deep_frame()`
    :param post_process: if true, punctuation tokens are removed from multi-token targets
    :param deep_frames_list: passed on to `map_or_lookup_deep_frame()`
    :return: dict structure id -> frame structure
    """
    punctuation = ["``", "''", "`", "'", ".", ",", ";", ":"]

    # frame structures in the sentence
    sent_structures: Dict[int, FrameStructure] = {}

    # role spans currently being built up (per structure + role name)
    cur_spans: Dict[Tuple[int, str], AnnotationSpan] = {}

    annotated_tokens = zip(sentence["tokens"], sentence["frame_list"])
    for token_idx, (token_str, frame_annos) in enumerate(annotated_tokens):
        for fa in frame_annos:
            # remove "virtual root" nonsense token
            if "@@VIRTUAL_ROOT@@" in fa:
                continue

            fa = fa.split("@@")[0]  # remove confidence score if it's there
            anno, struct_id_str = fa.split("@")
            struct_id = int(struct_id_str)
            frame_name = anno.split(":")[1]
            deep_frame, deep_frame_mapping = map_or_lookup_deep_frame(
                frame_name, deep_frames_cache, deep_frames_list=deep_frames_list
            )

            if struct_id not in sent_structures:
                sent_structures[struct_id] = FrameStructure(
                    frame=frame_name,
                    deep_frame=deep_frame,
                    target=None,
                    roles=[],
                    deep_roles=[],
                )
            cur_struct = sent_structures[struct_id]

            # TODO: get rid of this hack
            anno = anno.replace("I::", "I:").replace("B::", "B:")

            anno_parts = anno.split(":")
            anno_type = anno_parts[0]

            if anno_type == "T":
                if cur_struct.target is None:
                    cur_struct.target = AnnotationSpan([token_idx], [token_str])
                else:
                    cur_struct.target.tokens_idx.append(token_idx)
                    cur_struct.target.tokens_str.append(token_str)

            elif anno_type == "B":
                role_name = anno_parts[2]
                role_span = AnnotationSpan([token_idx], [token_str])
                cur_struct.roles.append((role_name, role_span))
                if role_name in deep_frame_mapping:
                    # the same span object is shared between the role and its deep role
                    cur_struct.deep_roles.append(
                        (deep_frame_mapping[role_name], role_span)
                    )
                cur_spans[(struct_id, role_name)] = role_span

            elif anno_type == "I":
                role_name = anno_parts[2]
                role_span = cur_spans[(struct_id, role_name)]
                role_span.tokens_str.append(token_str)
                role_span.tokens_idx.append(token_idx)

    if not post_process:
        return sent_structures

    # post-process: remove punctuation in targets
    # NOTE(review): a structure that never received a "T" annotation still has `target=None` here, in which
    # case the `len()` below raises an AttributeError
    for fs in sent_structures.values():
        if len(fs.target.tokens_str) <= 1:
            continue
        to_remove = [
            (tok_str, tok_idx)
            for tok_str, tok_idx in zip(fs.target.tokens_str, fs.target.tokens_idx)
            if tok_str in punctuation
        ]
        for tok_str, tok_idx in to_remove:
            fs.target.tokens_str.remove(tok_str)
            fs.target.tokens_idx.remove(tok_idx)

    return sent_structures
|
736 |
+
|
737 |
+
|
738 |
+
def map_back_spacy_lome_tokens(spacy_doc, lome_tokens):
    """
    Map the position of every spaCy token to the position of the LOME token it belongs to

    :param spacy_doc: the spaCy tokens; each one needs a `whitespace_` attribute
    :param lome_tokens: the LOME tokens the spaCy input was built from
    :return: dict spaCy token position -> LOME token position
    :raises ValueError: if there are more LOME tokens than spaCy tokens
    """
    n_lome, n_spacy = len(lome_tokens), len(spacy_doc)
    if n_lome > n_spacy:
        raise ValueError(
            f"Cannot re-tokenize (#lome={n_lome} // #spacy={n_spacy})"
        )

    spacy_to_lome = {}
    lome_pos = 0
    for spacy_pos, spacy_token in enumerate(spacy_doc):
        spacy_to_lome[spacy_pos] = lome_pos
        # only a spaCy token followed by whitespace closes the current LOME token
        lome_pos += 1 if spacy_token.whitespace_ else 0
    return spacy_to_lome
|
753 |
+
|
754 |
+
|
755 |
+
def get_syn_category(spacy_token):
    """
    Get the coarse syntactic category of a spaCy token

    :param spacy_token: token with a `pos_` attribute and a `morph` object offering `.get("VerbForm")`
    :return: "n", "adj", "adv" or "p" for nouns, adjectives, adverbs and adpositions; "v:fin", "v:part", "v:ger"
        or "v:inf" for verbs, depending on their verb form; "other" for everything else
    """
    nonverbal_categories = {"NOUN": "n", "ADJ": "adj", "ADV": "adv", "ADP": "p"}
    verb_form_categories = (
        ("Fin", "v:fin"),
        ("Part", "v:part"),
        ("Ger", "v:ger"),
        ("Inf", "v:inf"),
    )

    pos = spacy_token.pos_
    if pos in nonverbal_categories:
        return nonverbal_categories[pos]

    if pos == "VERB":
        verb_form = spacy_token.morph.get("VerbForm")
        for form, category in verb_form_categories:
            # only a single, unambiguous verb form counts
            if verb_form == [form]:
                return category

    return "other"
|
774 |
+
|
775 |
+
|
776 |
+
def syntax_analyze(sentence, spacy_model_name, spacy_model_obj=None) -> Dict[str, Dict[str, Any]]:
    """
    Parse a sentence with spaCy and collect the syntactic info of each token

    :param sentence: dict with a "tokens" list (the LOME tokens); they are joined with spaces before parsing
    :param spacy_model_name: name of the spaCy model; it is loaded on every call unless `spacy_model_obj` is given
    :param spacy_model_obj: already loaded spaCy model to use instead
    :return: dict LOME token position (as a string) -> list with one info dict per spaCy token belonging to
        that LOME token
    """
    lome_tokens = sentence["tokens"]

    # load spacy model locally (so that it works in GAE)
    if spacy_model_obj is not None:
        nlp = spacy_model_obj
    else:
        nlp = spacy.load(spacy_model_name)

    spacy_doc = nlp(" ".join(lome_tokens))
    spacy_to_lome_tokens = map_back_spacy_lome_tokens(spacy_doc, lome_tokens)

    def describe_relative(tok):
        # short description used for the children and ancestors of a token
        return {
            "token": tok.text,
            "spacy_idx": tok.i,
            "lome_idx": spacy_to_lome_tokens[tok.i],
            "syn_category": get_syn_category(tok),
            "dependency": tok.dep_,
        }

    analysis = defaultdict(list)
    for spacy_idx, token in enumerate(spacy_doc):
        lome_idx = spacy_to_lome_tokens[spacy_idx]
        syn_category = get_syn_category(token)
        syn_construction = get_syn_construction(token, syn_category)

        # str key so that it doesn't change when converting to JSON
        analysis[str(lome_idx)].append(
            {
                "token": token.text,
                "dependency": token.dep_,
                "spacy_idx": spacy_idx,
                "lome_idx": lome_idx,
                "syn_category": syn_category,
                "syn_construction": syn_construction,
                "children": [describe_relative(c) for c in token.children],
                "ancestors": [describe_relative(a) for a in token.ancestors],
            }
        )
    return analysis
|
831 |
+
|
832 |
+
|
833 |
+
def get_syn_construction(token: Token, syn_category: str) -> str:
    """
    Determine the syntactic construction a token occurs in

    :param token: the spaCy token
    :param syn_category: category of the token, see `get_syn_category()`
    :return: a construction label. Labels starting with "_" are placeholders that still need to be
        disambiguated, see `disambiguate_cxs_()`
    """
    if syn_category in ["n", "adj", "adv", "p"]:
        return "nonverbal"

    if not syn_category.startswith("v:"):
        return "other"

    # find reflexives
    if any(c.lemma_.lower() in ["si", "zich", "zichzelf"] for c in token.children):
        return "verbal:reflexive"

    # find impersonal constructions
    if any(c.dep_ == "expl" for c in token.children):
        return "verbal:impersonal"

    # all other finite verbs/gerunds/infinites -> active construction
    if syn_category in ["v:fin", "v:ger", "v:inf"]:
        return "_verbal:ACTIVE"

    if syn_category == "v:part":
        if token.dep_ == "acl":
            return "_verbal:ADPOS"

        # the first child that settles the question decides
        for c in token.children:
            # passive subj or auxiliary present: it's a passive
            if c.dep_ in ["nsubj:pass", "aux:pass"]:
                return "verbal:passive"

            # auxiliary "HAVE" (avere/hebben) present: it's an active
            if (
                c.dep_ == "aux"
                and c.lemma_.lower() in ITALIAN_ACTIVE_AUX + DUTCH_ACTIVE_AUX
            ):
                return "verbal:active"

        return "_verbal:OTH_PART"

    return "other"
|
873 |
+
|
874 |
+
|
875 |
+
def get_syntax_info(struct: FrameStructure, syntax: Dict) -> Dict:
    """
    Get the syntactic info of the target of a frame structure

    :param struct: the frame structure
    :param syntax: syntactic analysis of the sentence (token index as string -> list of token infos)
    :return: the last info entry stored for the first token of the target
    """
    first_target_idx = struct.target.tokens_idx[0]
    return syntax[str(first_target_idx)][-1]
|
880 |
+
|
881 |
+
|
882 |
+
def enrich_texts_df(texts_df: pd.DataFrame, events_df: pd.DataFrame):
    """
    Add a "days_after_event" column to the texts dataframe

    :param texts_df: dataframe of the texts; needs an "event_id" column, "pubdate" ("%Y-%m-%d %H:%M:%S") is
        used if present
    :param events_df: dataframe of the events, with the columns "event:id" and "event:date" ("%Y-%m-%d")
    :return: copy of `texts_df` with the extra column "days_after_event": number of days between the event and
        the publication of the text, or `None` if it cannot be determined (unknown event, missing or unparsable
        dates)
    """
    time_delta_rows: List[Optional[int]] = []
    for idx, text_row in texts_df.iterrows():
        try:
            event_row = events_df[events_df["event:id"]
                                  == text_row["event_id"]].iloc[0]
        except IndexError:
            print(f"Skipping {idx} (IndexError)")
            time_delta_rows.append(None)
            # this row is done: without the `continue` a second value would be appended for it (computed from
            # an unbound or stale `event_row`), making the column longer than the dataframe
            continue

        if "pubdate" not in text_row or pd.isna(text_row["pubdate"]) or pd.isna(event_row["event:date"]):
            time_delta_rows.append(None)
            continue

        try:
            pub_date = datetime.strptime(
                text_row["pubdate"], "%Y-%m-%d %H:%M:%S")
            event_date = datetime.strptime(
                event_row["event:date"], "%Y-%m-%d")
        except ValueError as e:
            print(
                f"\t\terror parsing dates, see below for more info:\n\t\t{e}")
            time_delta_rows.append(None)
        else:
            time_delta_rows.append((pub_date - event_date).days)

    return texts_df.assign(days_after_event=time_delta_rows)
|
908 |
+
|
909 |
+
|
910 |
+
def read_frames_of_interest(dataset) -> List[str]:
    """
    Read the list of frames that are relevant for a dataset

    The frame list files contain one frame per line; empty lines and lines starting with "#" are ignored.

    :param dataset: dataset identifier, e.g. "femicides/rai"
    :return: sorted list of frame names without duplicates, normalized to an upper case first letter followed
        by lower case letters
    :raises ValueError: if `dataset` is not supported
    """
    # all femicide corpora share one frame list ("rai_main" and "rai_ALL" are accepted like in the rest of
    # this module)
    femicide_datasets = ["femicides/rai", "femicides/rai_main", "femicides/rai_ALL", "femicides/olv"]

    if dataset in femicide_datasets:
        file = "resources/femicide_frame_list.txt"
    elif dataset == "crashes/thecrashes":
        file = "resources/crashes_frame_list.txt"
    elif dataset == "migration/pavia":
        file = "resources/migration_frame_list.txt"
    else:
        raise ValueError("Unsupported dataset")

    frames = set()
    with open(file, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line.startswith("#") or not line:
                continue
            frames.add(line[0].upper() + line[1:].lower())
    return sorted(frames)
|
928 |
+
|
929 |
+
|
930 |
+
def make_dep_label_cache():
    """
    Collect all dependency labels occurring in the role analyses and write them to `DEP_LABEL_CACHE_FILE`

    The prediction files of the "femicides/rai", "crashes/thecrashes" and "migration/pavia" datasets are read
    from their tarballs; the labels are written in sorted order, one per line.
    """
    tarballs = {
        "femicides/rai": "output/femicides/lome/lome_0shot/multilabel_rai.tar.gz",
        "crashes/thecrashes": "output/crashes/lome/lome_0shot/multilabel_thecrashes.tar.gz",
        "migration/pavia": "output/migration/lome/lome_0shot/multilabel_pavia.tar.gz",
    }
    italian_datasets = ["femicides/rai", "migration/pavia"]

    labels = set()

    # the cache does not depend on the dataset (and the loader takes no arguments), so it is loaded once
    deep_frames_cache = load_deep_frames_cache()

    for dataset, tarball in tarballs.items():
        # membership test: indexing the dataset name with the tuple of names raised a TypeError
        spacy_model = "it_core_news_md" if dataset in italian_datasets else "nl_core_news_md"
        syntax_cache = SYNTAX_ANALYSIS_CACHE_FILES[dataset]

        with tarfile.open(tarball, "r:gz") as tar_f:
            member_names = [
                m.name for m in tar_f.getmembers() if m.name.endswith(".comm.json")
            ]
            for mem in member_names:
                print(mem)
                mem_obj = io.TextIOWrapper(tar_f.extractfile(mem), encoding="utf-8")
                (_, _, _, role_analyses,) = process_prediction_file(
                    filename=mem,
                    dataset_name=dataset,
                    file_obj=mem_obj,
                    syntax_cache=syntax_cache,
                    deep_frames_cache=deep_frames_cache,
                    spacy_model=spacy_model,
                )
                if role_analyses is None:
                    print(f"\tSkipping file {mem}, no role analyses found")
                    continue
                for sent_ra in role_analyses:
                    for ra in sent_ra.values():
                        for dep, _ in ra.values():
                            labels.add(dep)

    with open(DEP_LABEL_CACHE_FILE, "w", encoding="utf-8") as f_out:
        for label in sorted(labels):
            # plain "\n": files opened in text mode translate it to the platform's line separator themselves
            f_out.write(label + "\n")
|
979 |
+
|
980 |
+
|
981 |
+
def analyze_external_file(file_in, file_out, spacy_model=None):
    """Analyze one prediction file and write the per-sentence results as JSON.

    Args:
        file_in: path of the prediction file, passed on to
            ``process_prediction_file``.
        file_out: path of the JSON file to write (UTF-8, indent 4).
        spacy_model: object forwarded as ``spacy_model_obj`` to
            ``process_prediction_file``. Defaults to ``None`` so that the
            two-argument call in this module's command-line entry point no
            longer fails with a TypeError.
            NOTE(review): assumes ``process_prediction_file`` accepts
            ``spacy_model_obj=None`` and falls back to its own default
            model -- confirm against its definition.

    The output is a list with one object per sentence, holding the keys
    ``sentence``, ``fn_structures``, ``syntax`` and ``roles``.
    """
    deep_frames_cache = load_deep_frames_cache()
    sents, pred_structures, syntax_analyses, role_analyses = process_prediction_file(
        file_in, "", None, deep_frames_cache, spacy_model_obj=spacy_model
    )

    output = [
        {
            "sentence": sent,
            # frame structures are dataclass instances; make them JSON-serialisable
            "fn_structures": [dataclasses.asdict(fs) for fs in structs.values()],
            "syntax": syntax,
            "roles": roles,
        }
        for sent, structs, syntax, roles in zip(
            sents, pred_structures, syntax_analyses, role_analyses
        )
    ]

    with open(file_out, "w", encoding="utf-8") as f_out:
        json.dump(output, f_out, indent=4)
|
1005 |
+
|
1006 |
+
|
1007 |
+
if __name__ == "__main__":
    # Command-line entry point: `<command> <dataset> [--input_file F] [--output_file F]`.
    ap = argparse.ArgumentParser()
    ap.add_argument("command", choices=[
        "make_syntax_cache", "make_dep_label_cache", "analyze_file"
    ])
    ap.add_argument("dataset", choices=["femicides/rai", "femicides/rai_main", "femicides/rai_ALL",
                                        "femicides/olv", "crashes/thecrashes", "migration/pavia", "*"])
    ap.add_argument("--input_file", type=str, default="")
    ap.add_argument("--output_file", type=str, default="")
    args = ap.parse_args()

    if args.command == "make_syntax_cache":

        # "*" is only meaningful for commands that loop over datasets themselves.
        if args.dataset == "*":
            raise ValueError(
                "Please specify a dataset for `make_syntax_cache`")

        if args.dataset == "crashes/thecrashes":
            # Only this dataset is built with a skip function (non-Dutch texts).
            make_syntax_cache(
                "crashes/thecrashes", skip_fn=lambda f: not is_a_dutch_text(f)
            )
        else:
            # Every other accepted dataset name is passed through unchanged;
            # this replaces a chain of branches that each repeated the name.
            make_syntax_cache(args.dataset)

    elif args.command == "make_dep_label_cache":
        make_dep_label_cache()

    elif args.command == "analyze_file":
        # NOTE(review): analyze_external_file is declared with a third
        # `spacy_model` parameter, while this call passes only two arguments.
        analyze_external_file(args.input_file, args.output_file)
|
1044 |
+
|
1045 |
+
|
1046 |
+
|
sociofillmore/common/convert_comms.py
ADDED
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Adapted from comm2multilabel.py from the Bert-for-FrameNet project (https://gitlab.com/gosseminnema/bert-for-framenet)
|
3 |
+
"""
|
4 |
+
|
5 |
+
import dataclasses
|
6 |
+
import json
|
7 |
+
import os
|
8 |
+
import glob
|
9 |
+
import sys
|
10 |
+
from collections import defaultdict
|
11 |
+
from typing import List, Optional
|
12 |
+
|
13 |
+
import nltk
|
14 |
+
from concrete import Communication
|
15 |
+
from concrete.util import read_communication_from_file, lun, get_tokens
|
16 |
+
|
17 |
+
|
18 |
+
@dataclasses.dataclass
class FrameAnnotation:
    """Token-level annotation of a single sentence."""

    # Sentence tokens, in order.
    tokens: List[str] = dataclasses.field(default_factory=list)
    # One tag per token (second column of the text format).
    pos: List[str] = dataclasses.field(default_factory=list)


@dataclasses.dataclass
class MultiLabelAnnotation(FrameAnnotation):
    """Sentence annotation in which every token carries a list of frame labels.

    The text format produced by ``to_txt`` and parsed by ``from_txt`` has one
    line per token with four whitespace-separated columns: token, pos,
    ``|``-joined frame labels and lexical unit. ``_`` marks an empty frame
    list or a missing lexical unit.
    """

    # For every token, the labels attached to it (possibly empty).
    frame_list: List[List[str]] = dataclasses.field(default_factory=list)
    # For every token, its lexical unit, or None.
    lu_list: List[Optional[str]] = dataclasses.field(default_factory=list)

    def to_txt(self):
        """Yield one four-column text line per token."""
        for i, tok in enumerate(self.tokens):
            frames = "|".join(self.frame_list[i]) or "_"
            lexical_unit = self.lu_list[i] or "_"
            yield f"{tok} {self.pos[i]} {frames} {lexical_unit}"

    @staticmethod
    def from_txt(sentence_lines):
        """Parse the lines of one sentence back into a MultiLabelAnnotation.

        Lines that start with a space are ignored, as are blank or
        whitespace-only lines (which previously raised IndexError).
        """
        tokens = []
        pos = []
        frame_list = []
        lu_list = []
        for line in sentence_lines:

            # ignore any spaces, and lines without content
            if line.startswith(" ") or not line.strip():
                continue

            columns = line.split()
            tokens.append(columns[0])
            pos.append(columns[1])

            # read frame list, handle empty lists
            frame_list.append([] if columns[2] == "_" else columns[2].split("|"))

            # read lu list, handle nulls
            lu_list.append(None if columns[3] == "_" else columns[3])
        return MultiLabelAnnotation(tokens, pos, frame_list, lu_list)

    def get_label_set(self):
        """Return the set of all labels used on any token of the sentence."""
        return {label for tok_labels in self.frame_list for label in tok_labels}
|
69 |
+
|
70 |
+
|
71 |
+
def convert_file(file, language="english", confidence_filter=0.0):
    """Yield a MultiLabelAnnotation for every sentence of a communication file.

    Args:
        file: path handed to ``read_communication_from_file``.
        language: language name forwarded to ``prepare_ml_lists``.
        confidence_filter: threshold forwarded to ``map_tokens_to_annotations``.

    The yielded annotations carry ``"_"`` for every pos entry and ``None``
    for every lexical unit.
    """
    print("Reading input file...")
    comm = read_communication_from_file(file)

    print("Mapping sentences to situations...")
    situations_by_tokenization = map_sent_to_situation(comm)

    print("# sentences with situations:", len(situations_by_tokenization))

    for section in lun(comm.sectionList):
        for sentence in lun(section.sentenceList):
            tokenization = sentence.tokenization
            sentence_tokens = get_tokens(tokenization)
            sentence_situations = situations_by_tokenization[tokenization.uuid.uuidString]
            annos_by_token = map_tokens_to_annotations(
                comm, sentence_situations, confidence_filter
            )

            frame_list, tok_list = prepare_ml_lists(language, annos_by_token, sentence_tokens)

            n_tokens = len(tok_list)
            yield MultiLabelAnnotation(
                tok_list, ["_"] * n_tokens, frame_list, [None] * n_tokens
            )
|
91 |
+
|
92 |
+
|
93 |
+
def prepare_ml_lists(language, tok_to_annos, tokens):
    """Build parallel token and label lists for one sentence.

    Args:
        language: language name passed to ``nltk.word_tokenize``.
        tok_to_annos: mapping from original token index to its list of labels.
        tokens: original tokens; each must expose a ``text`` attribute.

    Returns:
        ``(frame_list, tok_list)``: ``tok_list`` holds the tokens after
        re-tokenisation, ``frame_list`` the labels of each of those tokens.

    Each original token is re-tokenised, and every resulting piece starts
    with a copy of the original token's labels. Labels are then removed from
    punctuation pieces that do not continue into the next piece, and a ``B:``
    label that repeats the previous piece's label is rewritten to ``I:``.
    """
    tok_list = []
    frame_list = []
    for tok_idx, tok in enumerate(tokens):
        # split tokens that include punctuation
        split_tok = nltk.word_tokenize(tok.text, language=language)
        tok_list.extend(split_tok)
        tok_anno = list(tok_to_annos.get(tok_idx, []))
        # every piece gets its own copy so later in-place edits stay local
        frame_list.extend(list(tok_anno) for _ in split_tok)

    # remove annotations from final punctuation & solve BIO weird stuff
    punctuation = ",.:;\"'`«»"
    last_idx = len(tok_list) - 1
    # Labels of the previous piece as they were BEFORE its own B->I rewrite.
    prev_labels = []
    for idx, (tok, frame_annos) in enumerate(zip(tok_list, frame_list)):
        if tok in punctuation:
            to_delete = []
            for fa in frame_annos:
                # a span continues on the next piece as "I:..."; targets keep "T:"
                compare_fa = fa if fa.startswith("T:") else "I" + fa[1:]

                if idx == last_idx or compare_fa not in frame_list[idx + 1]:
                    to_delete.append(fa)

            for fa in to_delete:
                frame_annos.remove(fa)

        labels_before_rewrite = list(frame_annos)
        for fa_idx, fa in enumerate(frame_annos):
            # FIX: compare with the previous piece's labels from before they
            # were rewritten. Comparing with the rewritten list (as before)
            # missed the third and later pieces of a split token, which kept
            # their "B:" and produced B, I, B sequences.
            if fa.startswith("B:") and fa in prev_labels:
                frame_annos[fa_idx] = "I" + fa[1:]
        prev_labels = labels_before_rewrite

    return frame_list, tok_list
|
131 |
+
|
132 |
+
|
133 |
+
def map_tokens_to_annotations(comm: Communication, situations: List[str], confidence_filter: float):
    """Map token indices to the frame labels that cover them.

    Args:
        comm: object whose ``situationMentionForUUID`` maps UUID strings to
            situation mentions.
        situations: UUID strings of the situation mentions to process.
        confidence_filter: mentions and arguments whose ``confidence`` is
            below this value are left out.

    Returns:
        A ``defaultdict(list)`` from token index to labels. Target tokens get
        ``T:<frame>@<nn>@@<confidence>``; argument tokens get
        ``B:``/``I:<frame>:<role>@<nn>@@<confidence>``, where ``<nn>`` is the
        two-digit position of the mention in ``situations``.
    """
    labels_by_token = defaultdict(list)
    for sit_idx, sit_uuid in enumerate(situations):
        mention = comm.situationMentionForUUID[sit_uuid]
        if mention.confidence < confidence_filter:
            continue

        frame_type = mention.situationKind
        target_indices = mention.tokens.tokenIndexList

        # the artificial root mention carries no frame information
        if frame_type == "@@VIRTUAL_ROOT@@":
            continue

        mention_suffix = f"@{sit_idx:02}@@"

        target_label = f"T:{frame_type}{mention_suffix}{mention.confidence}"
        for tok_id in target_indices:
            labels_by_token[tok_id].append(target_label)

        for arg in mention.argumentList:
            if arg.confidence < confidence_filter:
                continue

            role_label = f":{frame_type}:{arg.role}{mention_suffix}{arg.confidence}"
            for position, tok_id in enumerate(arg.entityMention.tokens.tokenIndexList):
                # first token of the argument span opens it, the rest continue it
                prefix = "B" if position == 0 else "I"
                labels_by_token[tok_id].append(prefix + role_label)
    return labels_by_token
|
161 |
+
|
162 |
+
|
163 |
+
def map_sent_to_situation(comm):
    """Group situation-mention UUIDs by the tokenization they belong to.

    Args:
        comm: object exposing ``situationMentionSetList``; each set exposes
            ``mentionList``. Either list may be ``None``, which is treated as
            empty (previously a ``None`` list raised TypeError).

    Returns:
        A ``defaultdict(list)`` mapping the UUID string of a tokenization to
        the UUID strings of the mentions whose tokens refer to it, in
        encounter order.
    """
    tok_uuid_to_situation = defaultdict(list)
    for mention_set in comm.situationMentionSetList or ():
        for mention in mention_set.mentionList or ():
            tokenization_uuid = mention.tokens.tokenizationId.uuidString
            tok_uuid_to_situation[tokenization_uuid].append(mention.uuid.uuidString)
    return tok_uuid_to_situation
|
169 |
+
|
170 |
+
|
171 |
+
def _strip_orig_suffix(path):
    """Return the base name of *path* without a trailing ``.orig.txt``."""
    suffix = ".orig.txt"
    name = os.path.basename(path)
    return name[:-len(suffix)] if name.endswith(suffix) else name


def main():
    """Convert one communication file to multi-label JSON and text output.

    Command-line arguments (positional, read from ``sys.argv``):
        1. input file
        2. language name for tokenisation
        3. output directory
        4. confidence filter (float)

    Writes ``<output_directory>/lome_<input base name>.json`` and ``.txt``.
    The alternative per-file output is switched off by the hard-coded
    ``split_by_migration_files`` flag.
    """
    file_in = sys.argv[1]
    language = sys.argv[2]
    output_directory = sys.argv[3]
    confidence_filter = float(sys.argv[4])
    split_by_migration_files = False

    file_in_base = os.path.basename(file_in)
    file_out = f"{output_directory}/lome_{file_in_base}"
    multi_label_annos = list(convert_file(file_in, language=language, confidence_filter=confidence_filter))
    multi_label_json = [dataclasses.asdict(anno) for anno in multi_label_annos]

    if split_by_migration_files:
        files = glob.glob("output/migration/split_data/split_dev10_sep_txt_files/*.orig.txt")
        # FIX: str.rstrip(".orig.txt") strips a *set of characters*, not the
        # suffix; remove the suffix explicitly before the numeric sort.
        files.sort(key=lambda f: int(_strip_orig_suffix(f)))

        # pairs the i-th annotation with the i-th file in numeric order
        for anno, file in zip(multi_label_annos, files):
            basename = _strip_orig_suffix(file)
            spl_file_out = f"{output_directory}/{basename}"
            with open(f"{spl_file_out}.txt", "w", encoding="utf-8") as f_txt:
                for line in anno.to_txt():
                    # "\n", not os.linesep: text mode already translates newlines
                    f_txt.write(line + "\n")
                f_txt.write("\n")

    else:
        print(file_out)
        with open(f"{file_out}.json", "w", encoding="utf-8") as f_json:
            json.dump(multi_label_json, f_json, indent=4)

        with open(f"{file_out}.txt", "w", encoding="utf-8") as f_txt:
            for anno in multi_label_annos:
                for line in anno.to_txt():
                    f_txt.write(line + "\n")
                # blank line separates sentences
                f_txt.write("\n")


if __name__ == '__main__':
    main()
|
sociofillmore/common/filter_lang.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import langdetect
|
2 |
+
|
3 |
+
import json
|
4 |
+
|
5 |
+
|
6 |
+
# Path of the JSON file that get_texts() reads.
DATA_FILE = "data/thecrashes_data.json"
|
7 |
+
|
8 |
+
|
9 |
+
def main():
    """Print every text that langdetect classifies as English ("en")."""
    for text in get_texts():
        if langdetect.detect(text) != "en":
            continue
        print("\n<-------------------------------")
        print(text)
        print("------------------------------>\n")
|
16 |
+
|
17 |
+
|
18 |
+
def get_texts():
    """Return one text per article in DATA_FILE: title, blank line, summary.

    The file is read as a list of events, each with an ``"articles"`` list
    whose items have ``"title"`` and ``"summary"`` strings.
    """
    with open(DATA_FILE, encoding="utf-8") as f:
        events = json.load(f)

    return [
        article["title"] + "\n\n" + article["summary"]
        for event in events
        for article in event["articles"]
    ]
|
29 |
+
|
30 |
+
|
31 |
+
# Run main() only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|
sociofillmore/common/get_nltk_fn_roles.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from nltk.corpus import framenet as fn
|
2 |
+
import json
|
3 |
+
|
4 |
+
# Map each FrameNet frame name to the list of its frame-element names.
frames_to_roles = {
    frame.name: list(frame.FE.keys())
    for frame in fn.frames()
}

# Store the mapping as a JSON resource.
with open("resources/fn_frames_to_roles.json", "w", encoding="utf-8") as f:
    json.dump(frames_to_roles, f)
|
sociofillmore/common/pos_based_targetid.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
|
4 |
+
import spacy
|
5 |
+
|
6 |
+
|
7 |
+
# Part-of-speech tags; do_frameid() records the index of every token whose
# `pos_` is one of these.
TARGET_POS = [
    "NOUN",
    "VERB",
    "ADJ",
    "ADV"
]
|
13 |
+
|
14 |
+
|
15 |
+
def do_frameid():
    """Write, for every input line, its tokens and the indices of target tokens.

    Each line of ``data/migration/corpus_titoli_all_raw.txt`` is stripped and
    parsed with the spaCy model ``it_core_news_md``. One JSON object per line
    is written to the output ``.jsonl`` file, with the keys ``tokens`` (token
    texts) and ``predicates`` (indices of tokens whose ``pos_`` is in
    ``TARGET_POS``).
    """
    nlp = spacy.load("it_core_news_md")

    with open("data/migration/corpus_titoli_all_raw.txt", encoding="utf-8") as f_in, \
            open("output/migration/pos_based_targetid/corpus_titoli_all_raw.jsonl", "w", encoding="utf-8") as f_out:

        for line in f_in:
            doc = nlp(line.strip())
            out = {
                "tokens": [t.text for t in doc],
                "predicates": [i for i, t in enumerate(doc) if t.pos_ in TARGET_POS]
            }
            # "\n", not os.linesep: the file is opened in text mode, which
            # already translates newlines for the platform.
            f_out.write(json.dumps(out) + "\n")


if __name__ == "__main__":
    do_frameid()
|
sociofillmore/common/split_lome_files.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import glob
|
3 |
+
import tarfile
|
4 |
+
|
5 |
+
|
6 |
+
def split_lome_files(lome_folder, output_folder):
    """Append each matching file to a tar archive chosen by its document id.

    Files matching ``<lome_folder>/**/*.comm.*`` are processed. The document
    id is the second ``_``-separated part of the file name up to its first
    dot; the first two characters of that id select the archive
    ``<output_folder>/block_<key>.tar``, which is opened in append mode.
    """
    pattern = f"{lome_folder}/**/*.comm.*"
    for comm_path in glob.glob(pattern):
        name_before_dot = os.path.basename(comm_path).split(".")[0]
        doc_id = name_before_dot.split("_")[1]
        doc_key = doc_id[:2]

        print(comm_path, "->", doc_key)

        archive_path = f"{output_folder}/block_{doc_key}.tar"
        with tarfile.open(archive_path, "a") as archive:
            archive.add(comm_path)


if __name__ == "__main__":
    #split_lome_files("output/migration/lome/multilabel/lome_0shot/pavia/", "output/migration/lome/lome_0shot/multilabel_pavia_blocks")
    # split_lome_files("output/femicides/lome/lome_0shot/multilabel/rai/", "output/femicides/lome/lome_0shot/multilabel_rai_blocks")
    split_lome_files("output/femicides/lome/lome_0shot/multilabel/rai_ALL/", "output/femicides/lome/lome_0shot/multilabel_rai_ALL_blocks")
    # split_lome_files("output/femicides/lome/lome_0shot/multilabel/olv/", "output/femicides/lome/lome_0shot/multilabel_olv_blocks")
    # split_lome_files("output/crashes/lome/lome_0shot/multilabel/thecrashes/", "output/crashes/lome/lome_0shot/multilabel_thecrashes_blocks")
|
sociofillmore/crashes/__pycache__/utils.cpython-37.pyc
ADDED
Binary file (629 Bytes). View file
|
|
sociofillmore/crashes/__pycache__/utils.cpython-39.pyc
ADDED
Binary file (645 Bytes). View file
|
|
sociofillmore/crashes/generate_templates.py
ADDED
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
|
5 |
+
# Fixed seed so that the random choices below are reproducible.
random.seed(2021)


# main() stops once this many distinct sentences have been generated ...
NUM_SENTENCES = 100_000
# ... or after this many consecutive duplicate sentences.
NUM_FAILS = 25


# Sentence types and subtypes that main() chooses between.
SENT_TYPES = ("0_PTY", "1_PTY", "2_PTY")
SENT_1_PTY_TYPES = ("VICTIM", "OUTCOME", "DRIVE")
SENT_ACTIVE_TYPES = ("ACTIVE", "NON_ACTIVE")

# Sentence templates; each [[FIELD]] placeholder is filled in by make_sentence().
SENTS_0_PTY_OUTCOME = ("[[OUTCOME]] [[CIRCUMSTANCE]] [[PLACE]]",
                       "[[OUTCOME]] [[CIRCUMSTANCE]] [[TIME]]", "[[OUTCOME]] [[CIRCUMSTANCE]]")
SENTS_1_PTY_VICTIM = ("[[SUBJECT]] [[VERB_V2]] [[PLACE]]",
                      "[[SUBJECT]] [[TIME]] [[VERB_V2]]", "[[SUBJECT]] [[VERB_V2]]")
SENTS_1_PTY_OUTCOME = ("[[SUBJECT]] [[OUTCOME]] [[PLACE]] [[CIRCUMSTANCE]]",
                       "[[SUBJECT]] [[OUTCOME]] [[CIRCUMSTANCE]]")
SENTS_1_PTY_DRIVE = ("[[SUBJECT]] [[VP_DRIVE]] [[PLACE]]",
                     "[[SUBJECT]] [[VP_DRIVE]]")
SENTS_2_PTYS = ("[[SUBJECT]] [[VERB_V2]] [[VERB_P]] [[OTHER]] [[VERB_REST]] [[PLACE]]",
                "[[SUBJECT]] [[VERB_V2]] [[TIME]] [[VERB_P]] [[OTHER]] [[VERB_REST]]", "[[SUBJECT]] [[VERB_V2]] [[VERB_P]] [[OTHER]] [[VERB_REST]]")

# Fillers for the PLACE, TIME and CIRCUMSTANCE fields.
PLACES = ("op stationsplein", "in stadscentrum", "op kruispunt Westerhaven", "op A27", "op A10", "in Lelystad",
          "in Assen", "in Amsterdam", "bij Renkum", "in Schilderswijk", "bij knooppunt Lunetten", "op zuidelijke ringweg",
          "in de buurt van de Erasmusbrug", "op schoolplein Stedelijk Gymnasium", "bij afrit Rotterdam-Noord", "op Kanaleneiland")
TIMES = ("tijdens avondspits", "vrijdagavond",
         "dinsdagochtend", "donderdagnacht", "rond middaguur")
CIRCUMSTANCES = ("na ongeluk", "na aanrijding", "na botsing", "na crash")
# main() counts one extra (human) party in the label when it picks one of these.
CIRCUMSTANCES_AGT = (", dader ervandoor", ", dader ervandoor", ", dader rijdt door", ", bestuurder rijdt door")

# Fillers for the OUTCOME field; main() counts a human party for the HUMAN ones.
OUTCOME_0_TYPES = ("TRAFFIC", "HUMAN")
OUTCOMES_0_TRAFFIC = ("verkeersopstopping", "file", "veel vertraging")
OUTCOMES_0_HUMAN = ("dode", "zwaargewonde", "gewonde", "drie gewonden")
OUTCOMES_1 = ("dood", "overleden", "zwaargewond", "lichtgewond", "ongedeerd")

# NOTE(review): "VERHICLE" looks like a misspelling of "VEHICLE"; main() uses
# both spellings in its own literal lists -- confirm before renaming.
SUBJECT_TYPES = ("WEAK_PTY", "DRIVER", "VERHICLE")

# Verb phrases. The two-party verbs are "VERB_V2|VERB_P|VERB_REST" triples that
# main() splits on "|"; make_sentence() removes the "_" placeholders.
VPS_DRIVE_ACTIVE = ("rijdt tegen boom", "veroorzaakt ongeluk")
VPS_DRIVE_NON_ACTIVE = ("verongelukt", "gecrasht", "uit de bocht gevlogen", "raakt gewond", "raakt gewond door klap")
EVENT_VERBS_1_VICTIM = ("aangereden", "geschept", "raakt gewond", "raakt gewond door klap")
EVENT_VERBS_2_ACTIVE_ANY = ("raakt|_|_", "botst|op|_", "botst|tegen|_")
EVENT_VERBS_2_ACTIVE_DRIVE = ("rijdt|_|aan", "rijdt|_|dood", "schept|_|_")
EVENT_VERBS_2_NON_ACTIVE_DRIVER = (
    "aangereden|door|_", "geschept|door|_")
EVENT_VERBS_2_NON_ACTIVE_VEHICLE = (
    "aangereden|door|_", "geschept|door|_", "komt|onder|_")
EVENT_VERBS_2_NON_ACTIVE_ANY = (
    "geraakt|door|_",)



# Noun phrases for the parties; "[[PERSON]]" is replaced by a person noun in
# generate_weak_pty() / generate_driver().
WEAK_PTY_NPS = ("fietser", "skateboarder", "wielrenner", "rolschaatser", "jogger", "voetganger", "motorrijder",
                "fietskoerier", "[[PERSON]] op fiets", "[[PERSON]] op e-bike")
ANY_PERSON_NPS = ("vrouw", "man", "meisje", "jongen",
                  "bejaarde vrouw", "bejaarde man", "Duitser", "toerist")
CYCLIST_PERSON_NPS = ("postbode", "maaltijdbezorger", "politieagent")
DRIVER_NPS = ("automobilist", "automobiliste", "bestuurder", "dronken automobilist", "dronken bestuurder", "motorrijder",
              "minderjarige bestuurder", "trucker", "taxichauffeur", "[[PERSON]] in auto", "dronken [[PERSON]] in auto")
VEHICLE_NPS = ("auto", "personenauto", "vrachtwagen", "tractor", "auto met caravan", "scooter", "motor",
               "tram", "stadsbus", "lijn 10", "touringcar", "camper", "vorkheftruck")
|
65 |
+
|
66 |
+
|
67 |
+
def generate_weak_pty():
    """Return a random noun phrase from WEAK_PTY_NPS.

    A ``[[PERSON]]`` placeholder in the chosen phrase is replaced by a random
    entry of ``ANY_PERSON_NPS + CYCLIST_PERSON_NPS``.
    """
    phrase = random.choice(WEAK_PTY_NPS)
    if "[[PERSON]]" not in phrase:
        return phrase
    filler = random.choice(ANY_PERSON_NPS + CYCLIST_PERSON_NPS)
    return phrase.replace("[[PERSON]]", filler)
|
74 |
+
|
75 |
+
|
76 |
+
def generate_driver():
    """Return a random noun phrase from DRIVER_NPS.

    A ``[[PERSON]]`` placeholder in the chosen phrase is replaced by a random
    entry of ``ANY_PERSON_NPS``.
    """
    phrase = random.choice(DRIVER_NPS)
    if "[[PERSON]]" not in phrase:
        return phrase
    filler = random.choice(ANY_PERSON_NPS)
    return phrase.replace("[[PERSON]]", filler)
|
83 |
+
|
84 |
+
|
85 |
+
def make_sentence(template, fields):
    """Fill a sentence template and tidy up the result.

    Args:
        template: string with ``[[FIELD]]`` placeholders.
        fields: mapping from field name to the replacement string.

    Returns:
        The filled-in template with all ``_`` characters removed, runs of
        whitespace collapsed to a single space, surrounding whitespace
        stripped and the first character upper-cased. An empty result is
        returned as ``""`` instead of raising IndexError.
    """
    sentence = template
    for field, value in fields.items():
        sentence = sentence.replace(f"[[{field}]]", value)

    # FIX: the previous `.replace(" ", " ")` was a no-op, so the gaps left by
    # removed "_" placeholders stayed in the sentence as double spaces.
    sentence = " ".join(sentence.replace("_", "").split())
    if not sentence:
        return sentence
    return sentence[0].upper() + sentence[1:]
|
92 |
+
|
93 |
+
|
94 |
+
def main():
    """Generate labelled template sentences and write them as JSON lines.

    Sentences are drawn at random until ``NUM_SENTENCES`` distinct ones exist
    or ``NUM_FAILS`` duplicates occur in a row. Each sentence gets a label
    with ``party_mentioned`` and ``party_human`` counters and an ``active``
    flag. Output goes to ``output/crashes/generate_templates/sentences.jsonl``.
    """
    sentences = {}

    dup_fails = 0
    while len(sentences) < NUM_SENTENCES and dup_fails < NUM_FAILS:
        fields = {}

        label = {"party_mentioned": 0, "party_human": 0, "active": False}

        fields["TIME"] = random.choice(TIMES)
        fields["PLACE"] = random.choice(PLACES)

        sent_type = random.choice(SENT_TYPES)
        if sent_type == "0_PTY":
            if random.random() < 0.5:
                fields["CIRCUMSTANCE"] = random.choice(CIRCUMSTANCES)
            else:
                fields["CIRCUMSTANCE"] = random.choice(CIRCUMSTANCES_AGT)
                label["party_mentioned"] += 1
                label["party_human"] += 1

            outcome_type = random.choice(OUTCOME_0_TYPES)
            if outcome_type == "TRAFFIC":
                fields["OUTCOME"] = random.choice(OUTCOMES_0_TRAFFIC)
            else:
                fields["OUTCOME"] = random.choice(OUTCOMES_0_HUMAN)
                label["party_mentioned"] += 1
                label["party_human"] += 1
            sentence = make_sentence(
                random.choice(SENTS_0_PTY_OUTCOME), fields)

        elif sent_type == "1_PTY":
            # NOTE(review): only the OUTCOME templates contain
            # [[CIRCUMSTANCE]], yet the agent circumstance is counted in the
            # label for the VICTIM and DRIVE subtypes as well -- confirm.
            if random.random() < 0.5:
                fields["CIRCUMSTANCE"] = random.choice(CIRCUMSTANCES)
            else:
                fields["CIRCUMSTANCE"] = random.choice(CIRCUMSTANCES_AGT)
                label["party_mentioned"] += 1
                label["party_human"] += 1

            sent_subtype = random.choice(SENT_1_PTY_TYPES)
            if sent_subtype == "VICTIM":
                label["party_mentioned"] += 1
                label["party_human"] += 1
                fields["SUBJECT"] = generate_weak_pty()
                fields["VERB_V2"] = random.choice(EVENT_VERBS_1_VICTIM)
                sentence = make_sentence(
                    random.choice(SENTS_1_PTY_VICTIM), fields)
            elif sent_subtype == "OUTCOME":
                subject_type = random.choice(["WEAK_PTY", "DRIVER"])
                fields["OUTCOME"] = random.choice(OUTCOMES_1)
                if subject_type == "WEAK_PTY":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["SUBJECT"] = generate_weak_pty()
                else:  # driver
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["SUBJECT"] = generate_driver()
                sentence = make_sentence(
                    random.choice(SENTS_1_PTY_OUTCOME), fields)
            else:  # drive
                subject_type = random.choice(["DRIVER", "VERHICLE"])
                active_type = random.choice(SENT_ACTIVE_TYPES)
                if active_type == "ACTIVE":
                    fields["VP_DRIVE"] = random.choice(VPS_DRIVE_ACTIVE)
                    label["active"] = True
                else:
                    fields["VP_DRIVE"] = random.choice(VPS_DRIVE_NON_ACTIVE)
                if subject_type == "DRIVER":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["SUBJECT"] = generate_driver()
                else:  # vehicle
                    label["party_mentioned"] += 1
                    fields["SUBJECT"] = random.choice(VEHICLE_NPS)
                sentence = make_sentence(
                    random.choice(SENTS_1_PTY_DRIVE), fields)
        else:  # 2 pty
            active_type = random.choice(SENT_ACTIVE_TYPES)
            if active_type == "ACTIVE":
                subject_type = random.choice(["WEAK_PTY", "DRIVER", "VERHICLE"])
                label["active"] = True

                if subject_type == "WEAK_PTY":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = random.choice(
                        EVENT_VERBS_2_ACTIVE_ANY).split("|")
                    fields["SUBJECT"] = generate_weak_pty()
                    other_type = random.choice(["WEAK_PTY", "VEHICLE"])
                elif subject_type == "DRIVER":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["SUBJECT"] = generate_driver()
                    if random.random() < 0.5:
                        fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = random.choice(
                            EVENT_VERBS_2_ACTIVE_ANY).split("|")
                        other_type = random.choice(["WEAK_PTY", "VEHICLE"])
                    else:
                        fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = random.choice(
                            EVENT_VERBS_2_ACTIVE_DRIVE).split("|")
                        other_type = "WEAK_PTY"

                else:  # vehicle
                    label["party_mentioned"] += 1
                    verb = random.choice(
                        EVENT_VERBS_2_ACTIVE_ANY + EVENT_VERBS_2_ACTIVE_DRIVE)
                    fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = verb.split("|")
                    fields["SUBJECT"] = random.choice(VEHICLE_NPS)
                    # FIX: `other_type` was never assigned on this path, so the
                    # check below raised UnboundLocalError on a first iteration
                    # or silently reused the value of an earlier iteration.
                    # The choice mirrors the DRIVER branch above.
                    # NOTE(review): confirm this is the intended distribution.
                    if verb in EVENT_VERBS_2_ACTIVE_DRIVE:
                        other_type = "WEAK_PTY"
                    else:
                        other_type = random.choice(["WEAK_PTY", "VEHICLE"])

                if other_type == "WEAK_PTY":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["OTHER"] = generate_weak_pty()
                elif other_type == "DRIVER":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["OTHER"] = generate_driver()
                else:  # vehicle
                    label["party_mentioned"] += 1
                    fields["OTHER"] = random.choice(VEHICLE_NPS)

            else:  # non-active
                other_type = random.choice(["WEAK_PTY", "DRIVER", "VEHICLE"])
                if other_type == "WEAK_PTY":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["OTHER"] = generate_weak_pty()
                    fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = random.choice(
                        EVENT_VERBS_2_NON_ACTIVE_ANY).split("|")
                    subject_type = random.choice(["WEAK_PTY", "DRIVER", "VEHICLE"])

                elif other_type == "DRIVER":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["OTHER"] = generate_driver()
                    if random.random() < 0.5:
                        fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = random.choice(
                            EVENT_VERBS_2_NON_ACTIVE_ANY).split("|")
                        subject_type = random.choice(["WEAK_PTY", "DRIVER", "VEHICLE"])
                    else:
                        fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = random.choice(
                            EVENT_VERBS_2_NON_ACTIVE_DRIVER).split("|")
                        # single-option choice kept: it still advances the seeded RNG
                        subject_type = random.choice(["WEAK_PTY"])

                else:  # "vehicle"
                    label["party_mentioned"] += 1
                    fields["OTHER"] = random.choice(VEHICLE_NPS)
                    if random.random() < 0.5:
                        fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = random.choice(
                            EVENT_VERBS_2_NON_ACTIVE_ANY).split("|")
                        subject_type = random.choice(["WEAK_PTY", "DRIVER", "VEHICLE"])
                    else:
                        fields["VERB_V2"], fields["VERB_P"], fields["VERB_REST"] = random.choice(
                            EVENT_VERBS_2_NON_ACTIVE_VEHICLE).split("|")
                        subject_type = random.choice(["WEAK_PTY"])

                if subject_type == "WEAK_PTY":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["SUBJECT"] = generate_weak_pty()
                elif subject_type == "DRIVER":
                    label["party_mentioned"] += 1
                    label["party_human"] += 1
                    fields["SUBJECT"] = generate_driver()
                else:  # vehicle
                    label["party_mentioned"] += 1
                    fields["SUBJECT"] = random.choice(VEHICLE_NPS)

            sentence = make_sentence(random.choice(SENTS_2_PTYS), fields)

        # only distinct sentences are kept; a new one resets the failure counter
        if sentence not in sentences:
            sentences[sentence] = label
            dup_fails = 0
        else:
            dup_fails += 1

    with open("output/crashes/generate_templates/sentences.jsonl", "w", encoding="utf-8") as f_out:
        for sentence, label in sentences.items():
            # "\n", not os.linesep: text mode already translates newlines
            f_out.write(json.dumps({"sentence": sentence, "label": label}) + "\n")
            # NOTE(review): writes a blank line after each record -- confirm
            # that this write belongs inside the loop rather than after it.
            f_out.write("\n")


if __name__ == "__main__":
    main()
|
sociofillmore/crashes/make_bechdel_dicts.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import os
|
3 |
+
import json
|
4 |
+
|
5 |
+
sys.path.append("./libs")
|
6 |
+
from OpenDutchWordnet import Wn_grid_parser
|
7 |
+
|
8 |
+
|
9 |
+
def find_all_le_hyponyms(instance, le_id):
    """Return the ids of a lexical entry's synset mates and of all its hyponyms.

    Args:
        instance: wordnet parser object used for all lookups.
        le_id: id of the lexical entry to start from.

    Returns:
        A set of lexical-entry ids: those sharing the synset of ``le_id``
        plus everything found by ``find_all_synset_hyponyms`` for that synset.
    """
    print(f"Starting from `{le_id}`...")
    entry = instance.les_find_le(le_id)
    synset_id = entry.get_synset_id()

    siblings = set()
    for le in instance.les_all_les_of_one_synset(synset_id):
        siblings.add(le.get_id())
    print(f"Siblings: {siblings}")

    top_synset = instance.synsets_find_synset(synset_id)
    print(f"Top-level synset: `{entry.get_synset_id()}`...")
    return siblings | find_all_synset_hyponyms(instance, top_synset)
|
19 |
+
|
20 |
+
|
21 |
+
def find_all_synset_hyponyms(instance, synset_el):
    """Recursively collect the lexical-entry ids of all hyponym synsets.

    Args:
        instance: wordnet parser object used for all lookups.
        synset_el: synset element whose ``has_hyponym`` relations are followed.

    Returns:
        The set of ids of every lexical entry in any direct or indirect
        hyponym synset. Progress is printed along the way.
    """
    print(f"Finding hyponyms of synset with gloss: `{synset_el.get_glosses()[:1]}`...")
    collected = set()
    for relation in synset_el.get_relations("has_hyponym"):
        target_id = relation.get_target()
        print(target_id)

        direct_ids = {le.get_id() for le in instance.les_all_les_of_one_synset(target_id)}
        for found_id in direct_ids:
            print(f"\tfound LE: {found_id}")

        # descend into the hyponym synset itself
        child_synset = instance.synsets_find_synset(target_id)
        collected |= direct_ids | find_all_synset_hyponyms(instance, child_synset)
    return collected
|
34 |
+
|
35 |
+
|
36 |
+
def find_siblings_and_hyperonym(instance, le_id):
    """Print a lexical entry's synset mates and its first hyperonym synset.

    Prints, in order: the ids of the entries sharing the synset of ``le_id``,
    the glosses of the first ``has_hyperonym`` target, and the ids of that
    target's entries. Returns nothing.
    """
    synset_id = instance.les_find_le(le_id).get_synset_id()

    sibling_ids = {le.get_id() for le in instance.les_all_les_of_one_synset(synset_id)}
    print(sibling_ids)

    first_hyperonym = instance.synsets_find_synset(synset_id).get_relations("has_hyperonym")[0]
    hyperonym_synset = instance.synsets_find_synset(first_hyperonym.get_target())
    print(hyperonym_synset.get_glosses())

    hyperonym_ids = {
        le.get_id()
        for le in instance.les_all_les_of_one_synset(first_hyperonym.get_target())
    }
    print(hyperonym_ids)
|
46 |
+
|
47 |
+
|
48 |
+
def main():
    """Build the vehicle/person lexical dictionaries and write them to JSON.

    For each seed lexical entry the sorted result of ``find_all_le_hyponyms``
    is stored under a ``WN:``-prefixed sub-category; hand-picked extra words
    go under ``extra``.  If ``lexical_dicts_ignore.json`` exists next to the
    output file, every word listed there for a (category, sub-category) pair
    is removed before writing ``lexical_dicts.json``.
    """
    instance = Wn_grid_parser(Wn_grid_parser.odwn)

    def hyponym_ids(le_id):
        # sorted so that the JSON output is stable between runs
        return sorted(find_all_le_hyponyms(instance, le_id))

    dicts = {
        "vehicles": {
            "WN:cars": hyponym_ids("automobiel-n-1"),
            "WN:motorbikes": hyponym_ids("motorfiets-n-1"),
            "WN:bikes": hyponym_ids("fiets-n-1"),
            "WN:buses": hyponym_ids("autobus-n-1"),
            "extra": sorted(["scootmobiel", "e-bike"]),
        },
        "persons": {
            "WN:driver": hyponym_ids("bestuurder-n-2"),
            "WN:cyclist": hyponym_ids("fietser-n-1"),
            "WN:walker": hyponym_ids("loper-n-4"),
            "WN:pedestrian": hyponym_ids("voetganger-n-1"),
            "WN:victim": hyponym_ids("slachtoffer-n-4"),
            "extra": sorted(
                ["man", "vrouw", "jongen", "meisje", "persoon", "bejaarde", "maaltijdbezorger"]
            ),
        },
    }

    out_dir = "output/crashes/predict_bechdel"
    ignore_file = f"{out_dir}/lexical_dicts_ignore.json"
    if os.path.isfile(ignore_file):
        with open(ignore_file, encoding="utf-8") as f_ign:
            ignore = json.load(f_ign)

        cleaned_dicts = {}
        for category, subcats in dicts.items():
            cleaned_dicts[category] = {}
            for subcat, words in subcats.items():
                ignore_subcat = ignore.get(category, {}).get(subcat, [])
                cleaned_dicts[category][subcat] = [w for w in words if w not in ignore_subcat]
    else:
        cleaned_dicts = dicts

    # The output directory may not exist yet on a fresh checkout.
    os.makedirs(out_dir, exist_ok=True)
    with open(f"{out_dir}/lexical_dicts.json", "w", encoding="utf-8") as f_out:
        json.dump(cleaned_dicts, f_out, indent=4)


if __name__ == "__main__":
    main()
|
sociofillmore/crashes/predict_bechdel.py
ADDED
@@ -0,0 +1,500 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
from typing import Dict, Iterable, List, Optional, Tuple
|
4 |
+
import json
|
5 |
+
import random
|
6 |
+
import argparse
|
7 |
+
from allennlp.data.fields.field import Field
|
8 |
+
from allennlp.data.fields.sequence_field import SequenceField
|
9 |
+
from allennlp.models.model import Model
|
10 |
+
from allennlp.nn.util import get_text_field_mask
|
11 |
+
from allennlp.predictors.predictor import Predictor
|
12 |
+
|
13 |
+
import pandas as pd
|
14 |
+
import spacy
|
15 |
+
import torch
|
16 |
+
from sklearn.preprocessing import MultiLabelBinarizer
|
17 |
+
|
18 |
+
from allennlp.common.util import pad_sequence_to_length
|
19 |
+
from allennlp.data import TextFieldTensors
|
20 |
+
from allennlp.data.vocabulary import Vocabulary
|
21 |
+
from allennlp.data import DatasetReader, TokenIndexer, Instance, Token
|
22 |
+
from allennlp.data.fields import TextField, LabelField
|
23 |
+
from allennlp.data.token_indexers.pretrained_transformer_indexer import (
|
24 |
+
PretrainedTransformerIndexer,
|
25 |
+
)
|
26 |
+
from allennlp.data.tokenizers.pretrained_transformer_tokenizer import (
|
27 |
+
PretrainedTransformerTokenizer,
|
28 |
+
)
|
29 |
+
from allennlp.models import BasicClassifier
|
30 |
+
from allennlp.modules.text_field_embedders.basic_text_field_embedder import (
|
31 |
+
BasicTextFieldEmbedder,
|
32 |
+
)
|
33 |
+
from allennlp.modules.token_embedders.pretrained_transformer_embedder import (
|
34 |
+
PretrainedTransformerEmbedder,
|
35 |
+
)
|
36 |
+
from allennlp.modules.seq2vec_encoders.bert_pooler import BertPooler
|
37 |
+
from allennlp.modules.seq2vec_encoders.cls_pooler import ClsPooler
|
38 |
+
from allennlp.training.checkpointer import Checkpointer
|
39 |
+
from allennlp.training.gradient_descent_trainer import GradientDescentTrainer
|
40 |
+
from allennlp.data.data_loaders.simple_data_loader import SimpleDataLoader
|
41 |
+
from allennlp.training.optimizers import AdamOptimizer
|
42 |
+
from allennlp.predictors.text_classifier import TextClassifierPredictor
|
43 |
+
from allennlp.training.callbacks.tensorboard import TensorBoardCallback
|
44 |
+
from torch import nn
|
45 |
+
from torch.nn.functional import binary_cross_entropy_with_logits
|
46 |
+
|
47 |
+
|
48 |
+
random.seed(1986)
|
49 |
+
|
50 |
+
|
51 |
+
SEQ_LABELS = ["humansMentioned", "vehiclesMentioned", "eventVerb", "activeEventVerb"]
|
52 |
+
|
53 |
+
|
54 |
+
# adapted from bert-for-framenet project
|
55 |
+
class SequenceMultiLabelField(Field):
    """Field holding a list of string labels for every position of a sequence field.

    ``labels`` has one list of label strings per position.  ``index`` maps the
    strings to vocabulary indices in ``label_namespace``; ``as_tensor`` turns
    them into one multi-hot row per position, padded to the requested length.
    The width of each row is the number of classes the ``binarizer`` was
    fitted on.
    """

    def __init__(self,
                 labels: List[List[str]],
                 sequence_field: SequenceField,
                 binarizer: MultiLabelBinarizer,
                 label_namespace: str
                 ):
        self.labels = labels
        self._indexed_labels = None  # filled in by index()
        self._label_namespace = label_namespace
        self.sequence_field = sequence_field
        self.binarizer = binarizer

    @staticmethod
    def retokenize_tags(tags: List[List[str]],
                        offsets: List[Optional[Tuple[int, int]]],
                        wp_primary_token: str = "last",
                        wp_secondary_tokens: str = "empty",
                        empty_value=lambda: []
                        ) -> List[List[str]]:
        """Spread word-level tags over wordpieces.

        Args:
            tags: one tag (list of labels) per word.
            offsets: per word, the ``(start, end)`` wordpiece span; a word
                whose offset is ``None`` contributes no entries.
            wp_primary_token: ``"first"`` or ``"last"`` -- which wordpiece of
                a word carries the word's tag.
            wp_secondary_tokens: ``"repeat"`` copies the tag onto the other
                wordpieces of the word, anything else gives them
                ``empty_value()``.
            empty_value: factory for the tag of untagged positions.

        Returns:
            One tag per wordpiece, with one extra empty entry at the start
            and one at the end (for [CLS] and [SEP]).
        """
        tags_per_wordpiece = [
            empty_value()  # [CLS]
        ]

        for i, offset in enumerate(offsets):
            if offset is None:
                # no wordpieces for this word, so nothing to tag
                continue
            off_start, off_end = offset
            tag = tags[i]

            # e.g. "hello" --> "he" + "##ll" + "##o" --> 2 extra tokens;
            # the tag goes on the first or last piece (wp_primary_token),
            # the remaining pieces get the secondary treatment
            num_extra_tokens = off_end - off_start
            if wp_secondary_tokens == "repeat":
                extra = num_extra_tokens * [tag]
            else:
                # a fresh value per wordpiece, so later in-place edits of one
                # entry cannot leak into its neighbours
                extra = [empty_value() for _ in range(num_extra_tokens)]

            if wp_primary_token == "first":
                tags_per_wordpiece.append(tag)
            tags_per_wordpiece.extend(extra)
            if wp_primary_token == "last":
                tags_per_wordpiece.append(tag)

        tags_per_wordpiece.append(empty_value())  # [SEP]

        return tags_per_wordpiece

    def count_vocab_items(self, counter: Dict[str, Dict[str, int]]):
        """Count every label occurrence under this field's label namespace."""
        for label_list in self.labels:
            for label in label_list:
                counter[self._label_namespace][label] += 1

    def get_padding_lengths(self) -> Dict[str, int]:
        """Report the length of the underlying sequence field as ``num_tokens``."""
        return {"num_tokens": self.sequence_field.sequence_length()}

    def index(self, vocab: Vocabulary):
        """Convert label strings to vocabulary indices; unknown labels are dropped with a warning."""
        indexed_labels: List[List[int]] = []
        for sentence_labels in self.labels:
            sentence_indexed_labels = []
            for label in sentence_labels:
                try:
                    sentence_indexed_labels.append(
                        vocab.get_token_index(label, self._label_namespace))
                except KeyError:
                    print(f"[WARNING] Ignore unknown label {label}")
            indexed_labels.append(sentence_indexed_labels)
        self._indexed_labels = indexed_labels

    def as_tensor(self, padding_lengths: Dict[str, int]) -> torch.Tensor:
        """Return a float tensor with one multi-hot row per (padded) position.

        Column ``j`` is set when the position carries the label with
        vocabulary index ``j``.  The rows are built directly from the indices:
        feeding the integer indices to ``binarizer.transform`` would only work
        if the binarizer had been fitted on those same integers -- when it is
        fitted on label strings every index is an unknown class and is
        dropped, giving all-zero targets.
        """
        num_labels = len(self.binarizer.classes_)

        multi_hot_seq = []
        for position_label_ids in self._indexed_labels:
            row = [0] * num_labels
            for label_id in position_label_ids:
                if 0 <= label_id < num_labels:
                    row[label_id] = 1
                else:
                    print(f"[WARNING] Ignore out-of-range label index {label_id}")
            multi_hot_seq.append(row)

        # padding positions get an all-zero row
        desired_num_tokens = padding_lengths["num_tokens"]
        padded_tags = pad_sequence_to_length(multi_hot_seq, desired_num_tokens,
                                             default_value=lambda: [0] * num_labels)

        tensor = torch.tensor(padded_tags, dtype=torch.float)
        return tensor

    def empty_field(self) -> 'Field':
        """Return a field with no labels over an empty sequence field."""
        field = SequenceMultiLabelField(
            [], self.sequence_field.empty_field(), self.binarizer, self._label_namespace)
        field._indexed_labels = []
        return field
|
141 |
+
|
142 |
+
|
143 |
+
# adapted from bert-for-framenet project
|
144 |
+
class MultiSequenceLabelModel(Model):
    """Per-token multi-label classifier: embedder followed by a two-layer MLP.

    Every position's embedding goes through ``Linear -> ReLU -> Linear`` to
    give ``decoder_output_size`` logits; training uses binary cross-entropy
    with logits, weighted by the token mask.
    """

    def __init__(self, embedder: BasicTextFieldEmbedder, decoder_output_size: int, hidden_size: int, vocab: Vocabulary, embedding_size: int = 768):
        super().__init__(vocab)
        self.embedder = embedder
        self.out_features = decoder_output_size
        self.hidden_size = hidden_size
        self.layers = nn.Sequential(
            nn.Linear(in_features=embedding_size,
                      out_features=self.hidden_size),
            nn.ReLU(),
            nn.Linear(in_features=self.hidden_size,
                      out_features=self.out_features)
        )

    def forward(self, tokens: TextFieldTensors, label: Optional[torch.FloatTensor] = None):
        """Compute per-token logits and, when ``label`` is given, the masked loss.

        Returns:
            A dict with ``"tag_logits"`` and, if ``label`` was passed, ``"loss"``.
        """
        # The text-field embedder takes the whole TextFieldTensors structure,
        # the same nested structure get_text_field_mask() reads below;
        # indexing it with "token_ids" at the top level does not match it.
        # NOTE(review): assumes `embedder` is a text-field embedder, as wired
        # up in train_and_eval -- confirm if another embedder type is passed.
        embeddings = self.embedder(tokens)
        mask = get_text_field_mask(tokens).float()
        tag_logits = self.layers(embeddings)
        # expand the (batch, tokens) mask over the label dimension so it can
        # act as an element-wise weight on the loss
        mask = mask.reshape(mask.shape[0], mask.shape[1], 1).repeat(1, 1, self.out_features)
        output = {"tag_logits": tag_logits}
        if label is not None:
            loss = binary_cross_entropy_with_logits(tag_logits, label, mask)
            output["loss"] = loss
        # the original fell off the end here and returned None
        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """No metrics are tracked; ``reset`` is accepted positionally or by keyword and ignored."""
        return {}

    def make_human_readable(self,
                            prediction,
                            label_namespace,
                            threshold=0.2,
                            sigmoid=True
                            ) -> List[str]:
        """Render a (positions x labels) score matrix as one string per position.

        Args:
            prediction: 2-D tensor (or nested list) of logits, or of
                probabilities when ``sigmoid`` is False.
            label_namespace: vocabulary namespace used to name the columns.
            threshold: a label is reported when its score exceeds this value.
            sigmoid: apply a sigmoid to ``prediction`` first.

        Returns:
            For every position ``"label:score|label:score"`` or ``"_"`` when
            no label passes the threshold.
        """
        if not isinstance(prediction, torch.Tensor):
            prediction = torch.tensor(prediction, dtype=torch.float)
        if sigmoid:
            prediction = torch.sigmoid(prediction)

        predicted_labels: List[List[str]] = [[] for _ in range(len(prediction))]

        # every (position, label) pair whose score exceeds the threshold
        for coord in torch.nonzero(prediction > threshold):
            position, label_idx = int(coord[0]), int(coord[1])
            label = self.vocab.get_token_from_index(label_idx, label_namespace)
            score = prediction[position, label_idx].item()
            predicted_labels[position].append(f"{label}:{score:.3f}")

        str_predictions: List[str] = []
        for label_list in predicted_labels:
            str_predictions.append("|".join(label_list) or "_")

        return str_predictions
|
193 |
+
|
194 |
+
|
195 |
+
class TrafficBechdelReader(DatasetReader):
    """Reads labelled headlines, one per line, in the form ``[text:[lbl|lbl],text:[],...]``.

    The original (word-level) sentences and labels of the last ``_read`` call
    are kept in ``orig_data`` in the same order as the yielded instances.
    """

    # WordNet lexical-entry IDs such as "fietser-n-1": only the word form is kept
    _WN_ID = re.compile(r"^(.+)-n-\d+$")

    def __init__(self, token_indexers, tokenizer, binarizer):
        self.token_indexers = token_indexers
        self.tokenizer: PretrainedTransformerTokenizer = tokenizer
        self.binarizer = binarizer
        self.orig_data = []
        super().__init__()

    def _read(self, file_path) -> Iterable[Instance]:
        """Yield one instance per non-empty line of ``file_path``."""
        self.orig_data.clear()

        with open(file_path, encoding="utf-8") as f:
            for line in f:
                # Drop the trailing newline first: with it still attached,
                # stripping the closing bracket is a no-op and the last label
                # of every line ends up with "]" and "\n" glued to it.
                line = line.strip()
                if not line:
                    continue

                # remove exactly one layer of enclosing brackets, so the
                # brackets of the first/last label list stay intact
                if line.startswith("["):
                    line = line[1:]
                if line.endswith("]"):
                    line = line[:-1]
                if not line:
                    continue

                token_txts = []
                token_mlabels = []

                for sp in line.split(","):
                    # split on the LAST colon so text containing ":" survives
                    sp_txt, sp_lbl_str = sp.rsplit(":", 1)
                    inner = sp_lbl_str.strip().lstrip("[").rstrip("]")
                    sp_lbls = inner.split("|") if inner else []

                    # if the text is a WordNet ID, reduce it to the word form
                    wn_match = self._WN_ID.match(sp_txt)
                    if wn_match:
                        sp_txt = wn_match.group(1)

                    # multi-token text: every token gets the part's labels
                    for tok in sp_txt.split():
                        token_txts.append(tok)
                        token_mlabels.append(sp_lbls)

                self.orig_data.append({
                    "sentence": token_txts,
                    "labels": token_mlabels,
                })
                yield self.text_to_instance(token_txts, token_mlabels)

    def text_to_instance(self, sentence: List[str], labels: Optional[List[List[str]]] = None) -> Instance:
        """Build an instance with a ``tokens`` field and, if ``labels`` is given, a ``label`` field.

        ``sentence`` is a list of words; it is wordpiece-tokenized and the
        word-level ``labels`` are re-aligned to the wordpieces.
        """
        tokens, offsets = self.tokenizer.intra_word_tokenize(sentence)

        text_field = TextField(tokens, self.token_indexers)
        fields = {"tokens": text_field}
        if labels is not None:
            labels_ = SequenceMultiLabelField.retokenize_tags(labels, offsets)
            label_field = SequenceMultiLabelField(labels_, text_field, self.binarizer, "labels")
            fields["label"] = label_field
        return Instance(fields)
|
251 |
+
|
252 |
+
|
253 |
+
def count_parties(sentence, lexical_dicts, nlp):
    """Count the tokens of ``sentence`` whose lemma names a person or a vehicle.

    Args:
        sentence: raw sentence; it is lower-cased before being passed to ``nlp``.
        lexical_dicts: ``{"persons": {subcat: [words]}, "vehicles": {...}}``.
            Words in sub-categories whose name starts with ``"WN:"`` are
            WordNet IDs of the form ``<word>-n-<k>`` and are reduced to
            ``<word>``; a word without that suffix is used as it is.
        nlp: callable turning a string into an iterable of tokens that have a
            ``lemma_`` attribute.

    Returns:
        ``(num_humans, num_vehicles)``.
    """
    wn_id = re.compile(r"^(.+)-n-\d+$")

    def category_words(category):
        # Build the lookup set once per call instead of re-stripping every
        # word list for every token.
        collected = set()
        for subcategory, words in lexical_dicts[category].items():
            if subcategory.startswith("WN:"):
                for w in words:
                    m = wn_id.match(w)
                    collected.add(m.group(1) if m else w)
            else:
                collected.update(words)
        return collected

    person_words = category_words("persons")
    vehicle_words = category_words("vehicles")

    num_humans = 0
    num_vehicles = 0
    for token in nlp(sentence.lower()):
        lemma = token.lemma_
        if lemma in person_words:
            num_humans += 1
        if lemma in vehicle_words:
            num_vehicles += 1

    return num_humans, num_vehicles
|
275 |
+
|
276 |
+
|
277 |
+
def predict_rule_based(annotations="data/crashes/bechdel_annotations_dev_first_25.csv"):
    """Print lexicon-based person/vehicle counts for every annotated sentence.

    Loads the annotation CSV, the lexical dictionaries JSON and the Dutch
    spaCy model, then prints each sentence followed by its counts as
    computed by ``count_parties``.
    """
    annotated = pd.read_csv(annotations)

    with open("output/crashes/predict_bechdel/lexical_dicts.json", encoding="utf-8") as dict_file:
        lexicon = json.load(dict_file)

    dutch_nlp = spacy.load("nl_core_news_md")

    for text in annotated["sentence"]:
        humans, vehicles = count_parties(text, lexicon, dutch_nlp)
        print(text)
        print(f"\thumans={humans}, vehicles={vehicles}")
|
289 |
+
|
290 |
+
|
291 |
+
def evaluate_crashes(predictor, attrib, annotations="data/crashes/bechdel_annotations_dev_first_25.csv", out_file="output/crashes/predict_bechdel/predictions_crashes25.csv"):
    """Score ``predictor`` against annotated crash sentences and dump the predictions.

    Args:
        predictor: object whose ``predict(sentence)`` returns a dict with a
            ``"label"`` string.
        attrib: ``"all"`` to compare the combined
            ``party_mentioned=..|party_human=..|active=..`` string, or one of
            ``"party_mentioned"``, ``"party_human"``, ``"active"`` to compare
            that single attribute.
        annotations: CSV with ``sentence``, ``mentioned``, ``as_human`` and
            ``active`` columns.
        out_file: CSV the (sentence, gold, prediction) rows are written to.

    Prints the strict accuracy and, for ``attrib == "all"``, the partial and
    per-attribute accuracies (those stay at 0 otherwise).
    """
    data_crashes = pd.read_csv(annotations)
    labels_crashes = [
        {
            "party_mentioned": str(row["mentioned"]),
            "party_human": str(row["as_human"]),
            "active": str(str(row["active"]).lower() == "true")
        }
        for _, row in data_crashes.iterrows()
    ]
    predictions_crashes = [predictor.predict(sentence) for sentence in data_crashes["sentence"]]

    crashes_out = []
    correct = 0
    partial_2_attrs = 0
    partial_1_attr = 0
    correct_mentions = 0
    correct_humans = 0
    correct_active = 0

    for sentence, label, prediction in zip(data_crashes["sentence"], labels_crashes, predictions_crashes):
        predicted = prediction["label"]
        if attrib == "all":
            gold = "|".join([f"{k}={v}" for k, v in label.items()])
        else:
            # look up the requested attribute (not the literal key "attrib")
            gold = label[attrib]

        if gold == predicted:
            correct += 1
            if attrib == "all":
                # a full match also counts for the looser measures
                partial_2_attrs += 1
                partial_1_attr += 1

        if attrib == "all":
            gold_parts = gold.split("|")
            pred_parts = predicted.split("|")
            overlap = len(set(gold_parts) & set(pred_parts))
            if overlap == 2:
                partial_2_attrs += 1
                partial_1_attr += 1
            elif overlap == 1:
                partial_1_attr += 1

            # position-wise comparison; a prediction with fewer parts simply
            # does not score on the missing positions
            matches = [g == p for g, p in zip(gold_parts, pred_parts)]
            matches += [False] * (3 - len(matches))
            correct_mentions += int(matches[0])
            correct_humans += int(matches[1])
            correct_active += int(matches[2])

        crashes_out.append(
            {"sentence": sentence, "gold": gold, "prediction": predicted})

    total = len(data_crashes)

    def accuracy(hits):
        # an empty annotation file yields 0.0 instead of ZeroDivisionError
        return hits / total if total else 0.0

    print("ACC_crashes (strict) = ", accuracy(correct))
    print("ACC_crashes (partial:2) = ", accuracy(partial_2_attrs))
    print("ACC_crashes (partial:1) = ", accuracy(partial_1_attr))
    print("ACC_crashes (mentions) = ", accuracy(correct_mentions))
    print("ACC_crashes (humans) = ", accuracy(correct_humans))
    print("ACC_crashes (active) = ", accuracy(correct_active))

    pd.DataFrame(crashes_out).to_csv(out_file)
|
350 |
+
|
351 |
+
|
352 |
+
def filter_events_for_bechdel():
    """Write the headlines of all events involving at most two transport modes to CSV.

    Reads the crash events JSON, keeps every article of an event whose
    persons use no more than two distinct ``transportationmode`` values, and
    writes one row per kept article (event id, article id, headline, number
    of persons, number of transport modes).  The total and the filtered
    article counts are printed.
    """
    with open("data/crashes/thecrashes_data_all_text.json", encoding="utf-8") as events_file:
        events = json.load(events_file)

    article_count = 0
    rows = []
    for event in events:
        articles = event["articles"]
        article_count += len(articles)

        persons = event["persons"]
        modes = set()
        for person in persons:
            modes.add(person["transportationmode"])

        # events with three or more distinct modes are left out
        if len(modes) > 2:
            continue

        for article in articles:
            rows.append({
                "event_id": event["id"],
                "article_id": article["id"],
                "headline": article["title"],
                "num_persons": len(persons),
                "num_transport_modes": len(modes),
            })

    print("Total articles = ", article_count)

    print("Filtered articles: ", len(rows))
    pd.DataFrame(rows).to_csv("output/crashes/predict_bechdel/filtered_headlines.csv")
|
376 |
+
|
377 |
+
|
378 |
+
def train_and_eval(train=True):
    """Train the sequence multi-label model (or load saved weights) and predict on the dev split.

    The headlines file is read, shuffled with the module-level random seed
    and split 70/5/25 into train/dev/test (the test part is not used here).
    With ``train=True`` a BERT-based ``MultiSequenceLabelModel`` is trained;
    otherwise a saved state dict is loaded.  In both cases the dev
    predictions are written to ``predictions_dev.csv``.
    """
    use_gpu = True
    # Use CUDA only when it is actually available: -1 is the CPU device id
    # passed to the trainer, None is what was passed for the GPU case.
    gpu_available = use_gpu and torch.cuda.is_available()
    cuda_device = None if gpu_available else -1

    transformer = "GroNLP/bert-base-dutch-cased"
    token_indexers = {"tokens": PretrainedTransformerIndexer(transformer)}
    tokenizer = PretrainedTransformerTokenizer(transformer)

    binarizer = MultiLabelBinarizer()
    binarizer.fit([SEQ_LABELS])
    reader = TrafficBechdelReader(token_indexers, tokenizer, binarizer)
    instances = list(reader.read("output/prolog/bechdel_headlines.txt"))
    orig_data = reader.orig_data

    # shuffle instances and their original sentences together
    zipped = list(zip(instances, orig_data))
    random.shuffle(zipped)
    instances_ = [i[0] for i in zipped]
    orig_data_ = [i[1] for i in zipped]

    num_dev = round(0.05 * len(instances_))
    num_test = round(0.25 * len(instances_))
    num_train = len(instances_) - num_dev - num_test
    print("LEN(train/dev/test)=", num_train, num_dev, num_test)

    instances_train = instances_[:num_train]
    instances_dev = instances_[num_train:num_train + num_dev]
    orig_dev = orig_data_[num_train:num_train + num_dev]

    vocab = Vocabulary.from_instances(instances_train + instances_dev)

    embedder = BasicTextFieldEmbedder(
        {"tokens": PretrainedTransformerEmbedder(transformer)})
    model = MultiSequenceLabelModel(embedder, len(SEQ_LABELS), 1000, vocab)
    if gpu_available:
        # moving to CUDA unconditionally fails on CPU-only machines
        model = model.cuda(cuda_device)

    checkpoint_dir = "/scratch/p289731/predict_bechdel/model_seqlabel/"
    serialization_dir = "/scratch/p289731/predict_bechdel/serialization_seqlabel/"

    if train:
        # raises FileExistsError when a previous run left these directories
        os.makedirs(checkpoint_dir)
        os.makedirs(serialization_dir)
        tensorboard = TensorBoardCallback(
            serialization_dir, should_log_learning_rate=True)
        checkpointer = Checkpointer(serialization_dir=checkpoint_dir)
        optimizer = AdamOptimizer(
            [(n, p) for n, p in model.named_parameters() if p.requires_grad],
            lr=1e-5
        )
        train_loader = SimpleDataLoader(
            instances_train, batch_size=8, shuffle=True)
        dev_loader = SimpleDataLoader(
            instances_dev, batch_size=8, shuffle=False)
        train_loader.index_with(vocab)
        dev_loader.index_with(vocab)

        print("\t\tTraining BERT model")
        trainer = GradientDescentTrainer(
            model,
            optimizer,
            train_loader,
            validation_data_loader=dev_loader,
            patience=2,
            checkpointer=checkpointer,
            cuda_device=cuda_device,
            serialization_dir=serialization_dir,
            callbacks=[tensorboard]
        )
        trainer.train()
    else:
        # NOTE(review): this loads from ".../serialization_all/" while training
        # writes to ".../serialization_seqlabel/" -- confirm which is intended.
        state_dict = torch.load(
            "/scratch/p289731/predict_bechdel/serialization_all/best.th",
            # an integer device id is not a valid map_location
            map_location=None if gpu_available else "cpu")
        model.load_state_dict(state_dict)

    print("\t\tProducing predictions...")

    predictor = Predictor(model, reader)
    predictions_dev = [predictor.predict_instance(i) for i in instances_dev]

    data_out = []
    for sentence, prediction in zip(orig_dev, predictions_dev):
        # the predictor returns the model's output dict; the scores to render
        # are stored under "tag_logits"
        logits = torch.tensor(prediction["tag_logits"], dtype=torch.float)
        readable = model.make_human_readable(logits, "labels")
        data_out.append(
            {"sentence": sentence["sentence"], "gold": sentence["labels"], "predicted": readable})
    df_out = pd.DataFrame(data_out)
    df_out.to_csv("output/crashes/predict_bechdel/predictions_dev.csv")
|
475 |
+
|
476 |
+
# print()
|
477 |
+
|
478 |
+
# print("First 25 crashes:")
|
479 |
+
# evaluate_crashes(predictor, attrib, annotations="data/crashes/bechdel_annotations_dev_first_25.csv",
|
480 |
+
# out_file="output/crashes/predict_bechdel/predictions_first_25.csv")
|
481 |
+
# print()
|
482 |
+
# print("Next 75 crashes:")
|
483 |
+
# evaluate_crashes(predictor, attrib, annotations="data/crashes/bechdel_annotations_dev_next_75.csv",
|
484 |
+
# out_file="output/crashes/predict_bechdel/predictions_next_75.csv")
|
485 |
+
|
486 |
+
|
487 |
+
if __name__ == "__main__":
    # command-line entry point: one positional action selects what to run
    parser = argparse.ArgumentParser()
    parser.add_argument("action", choices=["train", "predict", "rules", "filter"])
    cli_args = parser.parse_args()

    actions = {
        "train": lambda: train_and_eval(train=True),
        "predict": lambda: train_and_eval(train=False),
        "rules": predict_rule_based,
    }
    # anything else (i.e. "filter") runs the headline filter
    actions.get(cli_args.action, filter_events_for_bechdel)()
|
sociofillmore/crashes/split_data.py
ADDED
@@ -0,0 +1,240 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import random
|
4 |
+
import shutil
|
5 |
+
from datetime import datetime
|
6 |
+
|
7 |
+
import langdetect
|
8 |
+
import nltk
|
9 |
+
import pandas as pd
|
10 |
+
from langdetect import DetectorFactory, LangDetectException
|
11 |
+
|
12 |
+
DATA_FILE = "data/crashes/thecrashes_data_all_text.json"
|
13 |
+
|
14 |
+
DEV_PORTION = .10
|
15 |
+
|
16 |
+
random.seed(2001)
|
17 |
+
DetectorFactory.seed = 0
|
18 |
+
|
19 |
+
|
20 |
+
def is_a_real_time(timestamp):
    """Tell whether ``timestamp`` carries a time of day other than exact midnight."""
    # 00:00:00 stands for "no time given", so midnight does not count as a real time
    at_midnight = (timestamp.hour, timestamp.minute, timestamp.second) == (0, 0, 0)
    return not at_midnight
|
27 |
+
|
28 |
+
|
29 |
+
def main():
    """Script entry point: run the event processing."""
    process_events()
|
31 |
+
|
32 |
+
|
33 |
+
def detect_language(article):
    """Guess the language code of ``article``, or ``"UNK_LANG"`` when detection fails.

    The first non-empty of ``alltext``, ``summary`` and ``title`` is used as
    the sample handed to ``langdetect``.
    """
    # prefer the longest source of text that is available
    sample = article["alltext"] or article["summary"] or article["title"]

    try:
        detected = langdetect.detect(sample)
    except LangDetectException:
        print(f"\tCould not detect language for text_id={article['id']}")
        print(f"\tSample={sample})")
        print()
        detected = "UNK_LANG"
    return detected
|
48 |
+
|
49 |
+
|
50 |
+
def extract_text_info(event):
    """Split every article of ``event`` into lines and collect per-article metadata.

    Returns:
        A triple ``(ev_text_lines, ev_id_lines, ev_meta_rows)`` with one
        element per article, in article order:

        * the segmented lines of title, summary and body (in that order),
        * a parallel list of tab-separated ID strings naming the event, the
          text and the section each line came from,
        * a metadata dict (event id, text id, publication date, detected
          language, url, provider, title).
    """
    ev_text_lines = []
    ev_id_lines = []
    ev_meta_rows = []

    for article in event["articles"]:
        text_id = article["id"]
        try:
            pubdate = datetime.fromisoformat(article["publishedtime"]).strftime("%Y-%m-%d %H:%M:%S")
        except (ValueError, TypeError):
            # ValueError: not an ISO date string; TypeError: not a string at
            # all (e.g. a missing value)
            print(f"\t\tcould not parse date {article['publishedtime']}")
            pubdate = None

        title = article["title"]
        language = detect_language(article)
        ev_meta_rows.append({
            "event_id": event["id"],
            "text_id": text_id,
            "pubdate": pubdate,
            "language": language,
            "url": article["url"],
            "provider": article["sitename"],
            "title": title
        })

        text_lines = []
        id_lines = []
        # same treatment for the three sections, in their fixed order
        sections = (
            ("title", title),
            ("summary", article["summary"]),
            ("body", article["alltext"]),
        )
        for section_name, section_text in sections:
            for line in segment(section_text, language):
                text_lines.append(line)
                id_lines.append(f"event {event['id']}\ttext {text_id}\t{section_name}")

        ev_text_lines.append(text_lines)
        ev_id_lines.append(id_lines)

    return ev_text_lines, ev_id_lines, ev_meta_rows
|
98 |
+
|
99 |
+
|
100 |
+
def segment(text, language):
    """Split ``text`` into a list of sentences using the NLTK tokenizer for ``language``.

    Args:
        text: the text to split; empty or ``None`` gives an empty list.
        language: two-letter language code as produced by language detection.

    Returns:
        A list of sentence strings.  Hebrew and Vietnamese text is returned
        as a single-element list because no segmenter is available for them;
        returning the bare string would make callers that iterate over the
        result walk over single characters.
    """
    if not text:
        return []

    # don't split Hebrew and Vietnamese (because we don't have a segmenter for it)
    if language in ("he", "vi"):
        return [text]

    lang_map = {
        "nl": "dutch",
        "en": "english",
        "es": "spanish",
        "de": "german",
        "fr": "french",
        "ru": "russian",
        "pt": "portuguese",
        # no Afrikaans tokenizer in NLTK: treat Afrikaans as Dutch
        "af": "dutch",
    }

    nltk_lang = lang_map.get(language)
    if not nltk_lang:
        print(f"Found an article with unsupported language={language}, falling back to English NLTK")
        nltk_lang = "english"

    return nltk.sent_tokenize(text, nltk_lang)
|
127 |
+
|
128 |
+
|
129 |
+
def write_to_text_by_event(text_lines, text_meta_lines, event_id, split_to_dir, split):
    """Write every article of an event to ``<split dir>/<event_id>/<text_id>.txt``.

    Args:
        text_lines: per article, the list of lines to write.
        text_meta_lines: per article, a dict with at least ``text_id``;
            paired with ``text_lines`` by position.
        event_id: used as the name of the event's sub-directory.
        split_to_dir: maps a split name to its base directory.
        split: the split to write to.
    """
    event_dir = f"{split_to_dir[split]}/{event_id}"
    os.makedirs(event_dir, exist_ok=True)
    for art_lines, row in zip(text_lines, text_meta_lines):
        text_file = f"{event_dir}/{row['text_id']}.txt"
        with open(text_file, "w", encoding="utf-8") as f:
            # Text mode already translates "\n" to the platform line ending;
            # writing os.linesep would give "\r\r\n" on Windows.
            f.writelines(line + "\n" for line in art_lines)
|
137 |
+
|
138 |
+
|
139 |
+
def process_events():
    """Split the events in ``DATA_FILE`` into "dev" and "main" and write all outputs.

    Every event is always added to the "all" split and, depending on a random
    draw against ``DEV_PORTION``, to either the "dev" or the "main" split.
    For each split the function writes (below ``output/crashes/split_data``):

    * ``<basename>.text.txt`` / ``<basename>.ids.txt``: concatenated lines,
    * ``<prefix>_texts_by_event/<event_id>/<text_id>.txt``: one file per text,
    * ``<name>.events.csv`` and ``<name>.texts.meta.csv``: metadata tables.

    Existing per-event text directories are removed before writing.
    """
    print("Loading data file...")
    with open(DATA_FILE, encoding="utf-8") as f:
        data = json.load(f)

    split_names = ("all", "dev", "main")
    event_rows = {name: [] for name in split_names}
    text_rows = {name: [] for name in split_names}

    # make empty text files (they are appended to event by event below)
    text_file_basenames = {
        "all": "output/crashes/split_data/all.texts",
        "dev": "output/crashes/split_data/split_dev10.texts",
        "main": "output/crashes/split_data/split_main.texts"
    }
    for bn in text_file_basenames.values():
        for ext in (".text.txt", ".ids.txt"):
            with open(f"{bn}{ext}", "w", encoding="utf-8"):
                pass

    # clear & make text file directories
    text_files_by_event_dir = {}
    for split in split_names:
        prefix = "split_dev10" if split == "dev" else "split_main" if split == "main" else "all"
        text_dir = f"output/crashes/split_data/{prefix}_texts_by_event"
        text_files_by_event_dir[split] = text_dir
        if os.path.exists(text_dir):
            shutil.rmtree(text_dir)
        os.mkdir(text_dir)

    # helper function for writing text files
    def append_to_txt(txt_file, lines):
        with open(txt_file, "a", encoding="utf-8") as f_out:
            for art_lines in lines:
                for line in art_lines:
                    # "\n" rather than os.linesep: text mode translates the
                    # newline itself, os.linesep would double the "\r" on Windows
                    f_out.write(line + "\n")

    # helper function registering one event (rows + text files) in one split
    def add_to_split(split, event_id, event_row, text_lines, text_id_lines, text_meta_rows):
        event_rows[split].append(event_row)
        text_rows[split].extend(text_meta_rows)
        append_to_txt(text_file_basenames[split] + ".text.txt", text_lines)
        append_to_txt(text_file_basenames[split] + ".ids.txt", text_id_lines)
        write_to_text_by_event(text_lines, text_meta_rows, event_id, text_files_by_event_dir, split)

    # transportation mode codes counted as "vehicle" (5 up to and including 13)
    vehicle_codes = range(5, 14)

    print("Processing events...")
    for event in data:
        event_id = event["id"]
        print(f"\tevent_id={event_id}")
        try:
            timestamp = datetime.fromisoformat(event["date"])
        except ValueError:
            # unparseable dates leave the date/time columns empty
            timestamp = None

        event_row = {
            "event:id": event_id,
            "event:date": timestamp.strftime("%Y-%m-%d") if timestamp else None,
            "event:time": timestamp.strftime("%H-%M-%S") if timestamp and is_a_real_time(timestamp) else None,
            "event:coordinates": f"{event['latitude'], event['longitude']}",
            "vehicle_involved": 1 if any(p["transportationmode"] in vehicle_codes for p in event["persons"]) else 0
        }

        for health, health_code in (("dead", 3), ("injured", 2)):
            all_with_health = [p for p in event["persons"] if p["health"] == health_code]
            event_row[f"outcomes:{health}:total"] = len(all_with_health)
            event_row[f"outcomes:{health}:child"] = len([p for p in all_with_health if p["child"] == 1])
            for mode, mode_codes in (("pedestrian", [1]), ("cyclist", [2]), ("vehicle", vehicle_codes)):
                event_row[f"outcomes:{health}:{mode}"] = len([p for p in all_with_health
                                                              if p["transportationmode"] in mode_codes])

        text_lines, text_id_lines, text_meta_rows = extract_text_info(event)

        add_to_split("all", event_id, event_row, text_lines, text_id_lines, text_meta_rows)

        # one random draw per event decides between the two disjoint splits
        if random.random() < DEV_PORTION:
            add_to_split("dev", event_id, event_row, text_lines, text_id_lines, text_meta_rows)
        else:
            add_to_split("main", event_id, event_row, text_lines, text_id_lines, text_meta_rows)

    for name, file in (("all", "all.events"), ("main", "split_main.events"), ("dev", "split_dev10.events")):
        pd.DataFrame(event_rows[name]).to_csv(f"output/crashes/split_data/{file}.csv")

    for name, file in (("all", "all.texts"), ("main", "split_main.texts"), ("dev", "split_dev10.texts")):
        pd.DataFrame(text_rows[name]).to_csv(f"output/crashes/split_data/{file}.meta.csv")
|
237 |
+
|
238 |
+
|
239 |
+
if __name__ == '__main__':
|
240 |
+
main()
|
sociofillmore/crashes/utils.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
texts_meta = pd.read_csv("output/crashes/split_data/split_dev10.texts.meta.csv", index_col=0, dtype={"text_id": str})
|
4 |
+
|
5 |
+
|
6 |
+
def is_a_dutch_text(doc_id, exclude_frisian=True):
    """Tell whether the text with id ``doc_id`` counts as Dutch.

    The first row of ``texts_meta`` whose ``text_id`` equals ``doc_id`` is
    consulted; unknown ids are never Dutch. With ``exclude_frisian`` set,
    texts from the provider "omropfryslan.nl" are rejected regardless of
    their detected language.
    """
    matching_rows = texts_meta[texts_meta["text_id"] == doc_id]
    if len(matching_rows) < 1:
        return False

    first_match = matching_rows.iloc[0]
    # exclude newsproviders publishing mainly in Frisian
    # (NB these texts are recognized as Dutch by langdetect, hence the need for a provider filter)
    if exclude_frisian and first_match["provider"] == "omropfryslan.nl":
        return False
    return bool(first_match["language"] == "nl")
|
sociofillmore/femicides/compare_lome_models.py
ADDED
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from collections import defaultdict
|
2 |
+
import glob
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
import random
|
7 |
+
import sys
|
8 |
+
from typing import List, Dict, Tuple
|
9 |
+
|
10 |
+
import pandas as pd
|
11 |
+
import numpy as np
|
12 |
+
|
13 |
+
from sociofillmore.common.analyze_text import load_caches, process_fn_sentence, FrameStructure, read_frames_of_interest
|
14 |
+
|
15 |
+
RANDOM_SEED = 9718
|
16 |
+
NUM_EVALUATION_SENTENCES = 150
|
17 |
+
|
18 |
+
EVALITA_MODEL = "lome_evalita_plus_fn"
|
19 |
+
# EVALITA_MODEL = "lome_evalita_plus_fn_0conf"
|
20 |
+
OUT_FOLDER = f"0shot__vs__{EVALITA_MODEL.split('_', maxsplit=1)[1]}"
|
21 |
+
print(OUT_FOLDER)
|
22 |
+
|
23 |
+
|
24 |
+
random.seed(RANDOM_SEED)
|
25 |
+
|
26 |
+
|
27 |
+
def map_predicates_to_frames(structures: List[FrameStructure]) -> Dict[str, str]:
    """Map each predicate to the frame predicted for it.

    The key is the structure's target tokens joined with underscores. When
    several structures share the same key, the last one wins.
    """
    return {
        "_".join(structure.target.tokens_str): structure.frame
        for structure in structures
    }
|
33 |
+
|
34 |
+
|
35 |
+
def make_evaluation_sample(diffs_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Draw the sentences for the annotation experiment and blind the models.

    ``NUM_EVALUATION_SENTENCES`` rows are sampled from ``diffs_df`` (seeded
    with ``RANDOM_SEED``). For every row the two model predictions are put in
    random order into ``prediction_1`` / ``prediction_2``.

    :param diffs_df: table with at least the columns ``sentence``,
        ``predicate``, ``predicted_frame_0shot`` and ``predicted_frame_evalita``
    :return: ``(annotator_sheet, with_exp_info)``: the sheet shown to the
        annotator (predictions plus empty ``answer::*`` columns, no model
        names) and the full table that also records which model produced
        which prediction (``model_1`` / ``model_2``)
    """

    def make_experimental_columns(row: pd.Series):
        # coin flip decides which model is presented first
        if random.choice((True, False)):
            left_col = "predicted_frame_0shot"
            right_col = "predicted_frame_evalita"
        else:
            left_col = "predicted_frame_evalita"
            right_col = "predicted_frame_0shot"

        exp_info = pd.Series({
            "prediction_1": row[left_col],
            "prediction_2": row[right_col],
            "model_1": left_col,
            "model_2": right_col
        })

        # Series.append() was removed in pandas 2.0; concat is the replacement
        return pd.concat([row, exp_info])

    sample = diffs_df.sample(n=NUM_EVALUATION_SENTENCES,
                             random_state=RANDOM_SEED).reset_index(drop=True)
    with_exp_info = sample.apply(make_experimental_columns, axis=1)
    # explicit copy: the answer columns must not be added to a slice of with_exp_info
    annotator_sheet = with_exp_info[[
        "sentence", "predicate", "prediction_1", "prediction_2"]].copy()
    # add answer columns
    for answer_field in ["1_is_best", "2_is_best", "both_are_good", "both_are_bad", "missing_frame"]:
        annotator_sheet.insert(len(annotator_sheet.columns),
                               f"answer::{answer_field}", np.nan)

    return annotator_sheet, with_exp_info
|
66 |
+
|
67 |
+
|
68 |
+
def make_annotation_experiment():
    """Collect the predicates on which the 0-shot and EVALITA LOME models disagree.

    Walks over all ``*.comm.json`` documents below
    ``output/femicides/lome/lome_0shot/multilabel/rai/<event>/`` and loads the
    corresponding file of ``EVALITA_MODEL`` by swapping the model directory in
    the path. Sentences are compared pairwise; for every predicate found by
    at least one model the predicted frames are compared, and disagreements
    are recorded (all of them, and separately those involving a frame of
    interest). Counts and disagreement ratios are printed at the end.
    """
    _, deep_frame_cache = load_caches("femicides/rai")
    frames_of_interest = read_frames_of_interest("femicides/rai")

    def ratio(numerator, denominator):
        # no predictions at all (e.g. empty input directory) must not crash
        # the final summary with a ZeroDivisionError
        return numerator / denominator if denominator else float("nan")

    all_differences = []
    foi_differences = []  # foi='frame of interest'

    # number of predicates that have been annotated by at least one model
    num_all_predictions = 0
    num_foi_predictions = 0

    num_z_shot_all_predictions = 0
    num_z_shot_foi_predictions = 0

    num_evalita_all_predictions = 0
    num_evalita_foi_predictions = 0

    for ev_dir in sorted(glob.glob("output/femicides/lome/lome_0shot/multilabel/rai/*")):
        ev_id = os.path.basename(ev_dir).rstrip("/")
        print(f"event={ev_id}")
        for doc_file in sorted(glob.glob(f"{ev_dir}/*.comm.json")):
            doc_id = re.search(r'/lome_(\d+)\.comm\.json', doc_file).group(1)
            print(f"\tdoc={doc_id}")

            with open(doc_file, encoding="utf-8") as f:
                z_shot_annotations = json.load(f)

            with open(doc_file.replace("/lome_0shot/", f"/{EVALITA_MODEL}/"), encoding="utf-8") as f:
                evalita_annotations = json.load(f)

            # NOTE: zip() stops at the shorter list, so surplus sentences of
            # one model are ignored silently
            for sent_idx, (z_shot_sent, evalita_sent) in enumerate(zip(z_shot_annotations, evalita_annotations)):
                z_shot_structs = process_fn_sentence(
                    z_shot_sent, deep_frame_cache)
                evalita_structs = process_fn_sentence(
                    evalita_sent, deep_frame_cache)

                z_shot_frames = {s.frame for s in z_shot_structs.values()}
                evalita_frames = {s.frame for s in evalita_structs.values()}
                overlapping_frames = z_shot_frames.intersection(evalita_frames)

                print(f"\t\tsent #{sent_idx}: {len(z_shot_frames)}x lome_0shot frames, "
                      f"{len(evalita_frames)}x evalita frames, {len(overlapping_frames)}x overlapping")

                z_shot_preds_to_frames = map_predicates_to_frames(
                    z_shot_structs.values())
                evalita_preds_to_frames = map_predicates_to_frames(
                    evalita_structs.values())
                all_predicates = sorted(set(z_shot_preds_to_frames.keys()).union(
                    evalita_preds_to_frames.keys()))

                for predicate in all_predicates:
                    print(f"\t\t\tpredicate={predicate}")
                    # None means: this model did not annotate the predicate
                    z_shot_frame = z_shot_preds_to_frames.get(predicate)
                    evalita_frame = evalita_preds_to_frames.get(predicate)
                    has_relevant_frame = z_shot_frame in frames_of_interest or evalita_frame in frames_of_interest

                    if z_shot_frame is not None:
                        num_z_shot_all_predictions += 1
                        if z_shot_frame in frames_of_interest:
                            num_z_shot_foi_predictions += 1

                    if evalita_frame is not None:
                        num_evalita_all_predictions += 1
                        if evalita_frame in frames_of_interest:
                            num_evalita_foi_predictions += 1

                    num_all_predictions += 1
                    if has_relevant_frame:
                        num_foi_predictions += 1

                    if z_shot_frame != evalita_frame:
                        diff = {
                            "ev_id": ev_id,
                            "doc_id": doc_id,
                            "sent_idx": sent_idx,
                            "sentence": " ".join(z_shot_sent["tokens"]),
                            "predicate": predicate,
                            "predicted_frame_0shot": z_shot_frame or "_",
                            "predicted_frame_evalita": evalita_frame or "_"
                        }
                        all_differences.append(diff)
                        if has_relevant_frame:
                            foi_differences.append(diff)

            # NOTE(review): blank separator lines in the log; the exact loop
            # level of these two print() calls should be confirmed
            print()

        print()

    print(f"num_z_shot_all_predictions = {num_z_shot_all_predictions}")
    print(f"num_z_shot_foi_predictions = {num_z_shot_foi_predictions}")
    print(f"num_evalita_all_predictions = {num_evalita_all_predictions}")
    print(f"num_evalita_foi_predictions = {num_evalita_foi_predictions}")

    print(
        f"all_differences: {len(all_differences)}/{num_all_predictions}={ratio(len(all_differences), num_all_predictions)}")
    print(
        f"foi_differences: {len(foi_differences)}/{num_foi_predictions}={ratio(len(foi_differences), num_foi_predictions)}")

    # NOTE: writing the difference tables and the annotation sheets is
    # currently disabled; the statements are kept for reference.
    # all_diffs_df = pd.DataFrame(all_differences)
    # foi_diffs_df = pd.DataFrame(foi_differences)

    # all_diffs_df.to_csv("output/femicides/compare_lome_models/all_differences.csv")
    # foi_diffs_df.to_csv("output/femicides/compare_lome_models/foi_differences.csv")

    # annotator_sheet, experiment_sheet = make_evaluation_sample(foi_diffs_df)
    # annotator_sheet.to_csv("output/femicides/compare_lome_models/annotator_sheet.csv")
    # experiment_sheet.to_csv("output/femicides/compare_lome_models/experiment_sheet.csv")
|
175 |
+
|
176 |
+
|
177 |
+
def analyze_annotations():
    """Merge the manual annotations with the model assignment and store the result.

    The annotation spreadsheet is joined (on the index) with the
    ``model_1`` / ``model_2`` columns of the experiment sheet, every row is
    collapsed with ``combine_labels`` and the table is written to
    ``annotator_sheet_processed.csv`` in the ``OUT_FOLDER`` output directory.
    """
    annotations = pd.read_excel("resources/sara_lome_annotations.xlsx", index_col=0)
    experiment = pd.read_csv(
        f"output/femicides/compare_lome_models/{OUT_FOLDER}/experiment_sheet.csv", index_col=0)

    model_columns = experiment[["model_1", "model_2"]]
    processed = annotations.join(model_columns).apply(combine_labels, axis=1)

    print(processed.head())
    processed.to_csv(
        f"output/femicides/compare_lome_models/{OUT_FOLDER}/annotator_sheet_processed.csv")
|
186 |
+
|
187 |
+
|
188 |
+
def combine_labels(row: pd.Series) -> pd.Series:
    """Collapse the ``answer::*`` columns of one annotated row into one ``answer``.

    The position-based answers ("1 is best" / "2 is best") are translated to
    the model that was shown in that position, using the last
    underscore-separated part of ``model_1`` / ``model_2`` as model name.
    The first answer column marked with "X" (in the order below) wins.

    :param row: annotated row including ``model_1``, ``model_2`` and the
        ``answer::*`` columns
    :return: the row without the ``answer::*`` columns, extended with ``answer``
    :raises ValueError: if none of the answer columns is marked with "X"
    """
    model_1 = row["model_1"].split("_")[-1]
    model_2 = row["model_2"].split("_")[-1]

    outcomes = (
        ("answer::1_is_best", f"{model_1}_is_best"),
        ("answer::2_is_best", f"{model_2}_is_best"),
        ("answer::both_are_good", "both_are_good"),
        ("answer::both_are_bad", "both_are_bad"),
        ("answer::missing_frame", "missing_frame"),
    )
    for column, label in outcomes:
        if row[column] == "X":
            answer = label
            break
    else:
        raise ValueError(f"Missing annotation in row {row}")

    row_ = row.drop([k for k in row.keys() if k.startswith("answer::")])
    # Series.append() was removed in pandas 2.0; concat is the replacement
    return pd.concat([row_, pd.Series({"answer": answer})])
|
208 |
+
|
209 |
+
|
210 |
+
def prep_svm_challenge():
    """Turn the processed annotations into a small frame-identification test set.

    Steps:

    1. count the frame labels in the EVALITA training data and store the
       counts in ``evalita_trainset_counts.csv``;
    2. for every annotated row with a usable verdict (one model is best, or
       both are good) take the preferred prediction as gold label, skipping
       labels that do not occur in the EVALITA training data;
    3. print how often each model predicted the gold label;
    4. write the resulting examples to ``svm_challenge.jsonl``.

    :raises ValueError: if the first token of a predicate does not occur in
        its sentence
    """
    annotated_df = pd.read_csv(
        "output/femicides/compare_lome_models/0shot__vs__evalita_plus_fn/annotator_sheet_processed.csv", index_col=0)

    evalita_frame_labels = defaultdict(int)
    with open("../stupid-svm-frameid/data/evalita_jsonl/evalita_train.jsonl", encoding="utf-8") as f_in:
        for line in f_in:
            for annotation in json.loads(line)["annotations"]:
                evalita_frame_labels[annotation["label"]] += 1
    evalita_train_counts = pd.DataFrame(
        list(evalita_frame_labels.items()), columns=["label", "count"]).sort_values(by="count")
    evalita_train_counts.to_csv("output/femicides/compare_lome_models/evalita_trainset_counts.csv")

    print("Evalita frame labels:", sorted(evalita_frame_labels.keys()))

    out = []
    zshot_score = 0
    evalita_score = 0

    for _, row in annotated_df.iterrows():
        answer = row["answer"]
        if answer not in ["0shot_is_best", "evalita_is_best", "both_are_good"]:
            continue

        tokens = row["sentence"].split()
        predicate = row["predicate"].split("_")[0]  # to keep things simple, only look at first token of predicate
        predicate_idx = tokens.index(predicate)

        # undo the random left/right assignment of the annotation experiment
        # (model_1 is expected to name one of the two prediction columns)
        if row["model_1"] == "predicted_frame_evalita":
            evalita_label, zshot_label = row["prediction_1"], row["prediction_2"]
        else:
            zshot_label, evalita_label = row["prediction_1"], row["prediction_2"]

        if answer == "0shot_is_best":
            label = zshot_label
        elif answer == "evalita_is_best":
            label = evalita_label
        else:
            # both are good: the prediction shown first serves as gold label
            label = row["prediction_1"]

        if label not in evalita_frame_labels:
            print("\tskipping gold frame label not present in EVALITA: ", label)
            continue

        if zshot_label == label:
            zshot_score += 1
        if evalita_label == label:
            evalita_score += 1

        out.append({"tokens": tokens, "annotations": [{"label": label, "span": [predicate_idx, predicate_idx], "lu": None, "children": []}]})

    print(f"Found {len(out)} relevant annotations")
    if out:
        print("0-shot score: ", zshot_score / len(out))
        print("evalita score: ", evalita_score / len(out))
    else:
        # nothing to score; avoid a ZeroDivisionError
        print("0-shot score: ", "n/a")
        print("evalita score: ", "n/a")

    with open("output/femicides/compare_lome_models/svm_challenge.jsonl", "w", encoding="utf-8") as f_out:
        for line in out:
            # "\n" rather than os.linesep: text mode translates newlines itself
            f_out.write(json.dumps(line) + "\n")
        # NOTE(review): the file ends with one empty line; confirm whether the
        # consumer of this file needs it (or expects one after every record)
        f_out.write("\n")
|
284 |
+
|
285 |
+
|
286 |
+
|
287 |
+
if __name__ == '__main__':
    # Command line entry point: the first argument selects the step to run.
    actions = {
        "make": make_annotation_experiment,
        "analyze": analyze_annotations,
        "prep_svm_challenge": prep_svm_challenge,
    }
    # Explicit validation instead of `assert`: asserts are stripped under
    # `python -O`, which let any unknown action fall through to the last step.
    if len(sys.argv) < 2 or sys.argv[1] not in actions:
        choices = "|".join(actions)
        sys.exit(f"usage: {sys.argv[0]} <{choices}>")

    actions[sys.argv[1]]()
|
sociofillmore/femicides/evalita_err_analysis.py
ADDED
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from itertools import product
|
2 |
+
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
from scipy.spatial.distance import cosine
|
6 |
+
|
7 |
+
from nltk.corpus import framenet as fn
|
8 |
+
|
9 |
+
from sociofillmore.common.analyze_text import read_frames_of_interest
|
10 |
+
|
11 |
+
|
12 |
+
COSINE_THRESH = [0.1, 0.2, 0.3, 0.4, 0.5]
|
13 |
+
|
14 |
+
|
15 |
+
PREDICTION_FILES = {
|
16 |
+
"evalita-dev": {
|
17 |
+
"stupid-svm": "../stupid-svm-frameid/evalita_predictions.csv",
|
18 |
+
"lome-en": "misc/frame_prediction_output_lome-en_dev.csv",
|
19 |
+
"lome-it": "misc/frame_prediction_output_lome-it-best_dev.csv",
|
20 |
+
},
|
21 |
+
"evalita-test": {
|
22 |
+
"stupid-svm": "../stupid-svm-frameid/evalita_predictions_test.csv",
|
23 |
+
"lome-en": "misc/frame_prediction_output_lome-en_test.csv",
|
24 |
+
"lome-it": "misc/frame_prediction_output_lome-it-best_test.csv",
|
25 |
+
},
|
26 |
+
"rai_femicides": {
|
27 |
+
"stupid-svm": "../stupid-svm-frameid/rai_predictions.csv",
|
28 |
+
"lome-en": "misc/frame_prediction_output_lome-en_rai.csv",
|
29 |
+
"lome-it": "misc/frame_prediction_output_lome-it-best_rai.csv",
|
30 |
+
},
|
31 |
+
}
|
32 |
+
|
33 |
+
|
34 |
+
def load_embeddings(embedding_file):
    """Read frame embeddings from a whitespace-separated text file.

    Every non-empty line consists of a frame name, a second column (words
    joined with "+", not used here) and the vector components.

    :param embedding_file: path of the embeddings file (UTF-8)
    :return: ``(vectors, frames_to_idxs)``: a float64 array with one row per
        line and a mapping from frame name to row index; if a frame occurs
        on several lines, the last one is used
    """
    frame_vocab = []
    vectors = []

    with open(embedding_file, encoding="utf-8") as f:
        for line in f:
            columns = line.split()
            if not columns:
                # tolerate empty lines (e.g. at the end of the file)
                continue
            frame_vocab.append(columns[0])
            vectors.append(np.array([float(i) for i in columns[2:]]))

    frames_to_idxs = {frame: i for i, frame in enumerate(frame_vocab)}

    return np.array(vectors, dtype=np.float64), frames_to_idxs
|
55 |
+
|
56 |
+
|
57 |
+
def femicide_frame_distances(embeddings, frame_to_idx):
    """Print the cosine distance for every ordered pair of femicide frames.

    :param embeddings: indexable collection of frame vectors
    :param frame_to_idx: mapping from frame name to its position in ``embeddings``
    """
    frames = read_frames_of_interest("femicides/rai")
    print("Cosines: ")
    for first_frame, second_frame in product(frames, frames):
        first_vec = embeddings[frame_to_idx[first_frame]]
        second_vec = embeddings[frame_to_idx[second_frame]]
        distance = cosine(first_vec, second_vec)
        print(f"\t{first_frame}-{second_frame}: {distance:.4f}")
|
63 |
+
|
64 |
+
|
65 |
+
def embedding_scores(predictions, embeddings, frame_to_idx):
    """Print accuracy and embedding-based "close call" rates for predictions.

    A wrong prediction is a close call for a threshold in ``COSINE_THRESH``
    when the cosine distance between predicted and gold frame is below that
    threshold. All figures are relative to the number of predictions.

    :param predictions: table with the columns ``frame_pred`` and ``frame_gold``
    :param embeddings: indexable collection of frame vectors
    :param frame_to_idx: mapping from frame name to its position in ``embeddings``
    """
    num_predictions = len(predictions)
    num_correct = 0
    distance_sum = 0.0
    num_close = dict.fromkeys(COSINE_THRESH, 0)

    for _, prediction in predictions.iterrows():
        predicted_frame = prediction["frame_pred"]
        gold_frame = prediction["frame_gold"]
        distance = cosine(
            embeddings[frame_to_idx[predicted_frame]], embeddings[frame_to_idx[gold_frame]]
        )
        # NOTE(review): the distance is summed for every row (correct ones
        # included) - confirm this matches the intended average
        distance_sum += distance

        if predicted_frame == gold_frame:
            num_correct += 1
            continue
        for threshold in COSINE_THRESH:
            if distance < threshold:
                num_close[threshold] += 1

    print("#correct: ", num_correct / num_predictions)
    print("#close calls: ")
    for threshold in COSINE_THRESH:
        print("\t", threshold, (num_close[threshold]) / num_predictions)
    print("#correct or close: ")
    for threshold in COSINE_THRESH:
        print("\t", threshold, (num_correct + num_close[threshold]) / num_predictions)
    print("avg cosine dist: ", distance_sum / num_predictions)
|
92 |
+
|
93 |
+
|
94 |
+
def generalization_exp(predictions, evalita_train_counts, fn_frames, femicide_frames):
    """Print size and accuracy of the predictions for four gold-frame subsets.

    The subsets are, in this order: all predictions (ALL), gold frame listed
    in ``evalita_train_counts["label"]`` (IFN), gold frame in ``fn_frames``
    (BFN) and gold frame in ``femicide_frames`` (RAI). The accuracy of an
    empty subset is reported as ``nan``.

    :param predictions: table with the columns ``frame_gold`` and ``frame_pred``
    :param evalita_train_counts: table with a ``label`` column
    :param fn_frames: collection of frame names
    :param femicide_frames: collection of frame names
    """
    gold = predictions["frame_gold"]
    subsets = [
        predictions,
        predictions[gold.isin(evalita_train_counts["label"])],
        predictions[gold.isin(fn_frames)],
        predictions[gold.isin(femicide_frames)],
    ]

    def accuracy(preds):
        if len(preds) == 0:
            # an empty subset has no accuracy; avoid a ZeroDivisionError
            return float("nan")
        return len(preds[preds["frame_gold"] == preds["frame_pred"]]) / len(preds)

    print("LEN (ALL/IFN/BFN/RAI:)")
    print("\t".join([str(len(preds)) for preds in subsets]))

    print("ACC (ALL/IFN/BFN/RAI:)")
    print("\t".join([str(accuracy(preds)) for preds in subsets]))
|
123 |
+
|
124 |
+
|
125 |
+
def main():
    """Run the generalization experiment for every dataset/model in ``PREDICTION_FILES``.

    For the "rai_femicides" dataset only predictions whose gold frame is one
    of the femicide frames of interest are evaluated.
    """
    # inputs shared by all runs are loaded once, before the loops
    evalita_train_counts = pd.read_csv(
        "output/femicides/compare_lome_models/evalita_trainset_counts.csv"
    )
    fn_frames = {fr.name for fr in fn.frames()}
    femicide_frames = read_frames_of_interest("femicides/rai")

    for dataset, model_files in PREDICTION_FILES.items():
        print(f"==={dataset}===")
        for model, predictions_file in model_files.items():

            print(f"---{model}---")

            predictions = pd.read_csv(predictions_file, index_col=0)
            print("Total predictions:", len(predictions))

            # NOTE: the embedding-based error analysis below is currently
            # disabled; the statements are kept for reference.
            # predictions_with_fn_frames = predictions[
            #     predictions["frame_gold"].isin(fn_frames)
            #     & predictions["frame_pred"].isin(fn_frames)
            # ]
            # print("Predictions with FN frames: ", len(predictions_with_fn_frames))

            # errors = predictions[predictions["frame_gold"] != predictions["frame_pred"]]
            # print("Total errors: ", len(errors))

            # errors_with_fn_frames = errors[
            #     errors["frame_gold"].isin(fn_frames) & errors["frame_pred"].isin(fn_frames)
            # ]
            # print("Errors with FN frames: ", len(errors_with_fn_frames))

            # print("Loading embeddings...")
            # embeddings, frame_to_idx = load_embeddings(
            #     "../bert-for-framenet/data/embeddings/bag_of_lu_embeddings.txt"
            # )
            # # femicide_frame_distances(embeddings, frame_to_idx)
            # embedding_scores(predictions_with_fn_frames, embeddings, frame_to_idx)

            if dataset == "rai_femicides":
                predictions = predictions[predictions["frame_gold"].isin(femicide_frames)]

            generalization_exp(
                predictions, evalita_train_counts, fn_frames, femicide_frames
            )

            # NOTE(review): blank separator lines in the output; the exact
            # loop level of these two print() calls should be confirmed
            print()
        print()
|
179 |
+
|
180 |
+
|
181 |
+
if __name__ == "__main__":
|
182 |
+
main()
|
sociofillmore/femicides/extract_texts.py
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
|
4 |
+
def extract_texts():
    """Dump every source of the RAI femicide spreadsheet to its own text file.

    Each file ``output/femicides/extract_text/source_<ID>.txt`` holds the
    title, an empty line and the text of one spreadsheet row.
    """
    sources = pd.read_excel("data/femicides/rai/EventiFemminicidio_from2015to2017_fonti.xlsx")
    print(sources)
    for _, source in sources.iterrows():
        source_id = source["ID"]
        content = f"{source['title']}\n\n{source['text']}"
        out_path = f"output/femicides/extract_text/source_{source_id}.txt"
        with open(out_path, "w", encoding="utf-8") as out_file:
            out_file.write(content)
|
12 |
+
|
13 |
+
|
14 |
+
if __name__ == '__main__':
|
15 |
+
extract_texts()
|
sociofillmore/femicides/split_data.py
ADDED
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Split the dataset into a "DEV10" portion (10% of events) for initial experimentation; and "MAIN", the rest of the
|
3 |
+
dataset, to be used later
|
4 |
+
"""
|
5 |
+
import os
|
6 |
+
import json
|
7 |
+
import random
|
8 |
+
import argparse
|
9 |
+
from collections import defaultdict
|
10 |
+
from typing import List, Tuple, Dict, Any
|
11 |
+
|
12 |
+
import pandas as pd
|
13 |
+
import nltk
|
14 |
+
|
15 |
+
random.seed(1996)
|
16 |
+
|
17 |
+
|
18 |
+
def split_rai_femicides():
    """Split the RAI femicide events (and their texts) into DEV10 and MAIN.

    Reads the events spreadsheet, stores the victim/event correspondences as
    JSON, shuffles the event ids, assigns the events among the first 78
    shuffled ids to DEV10 and all other events to MAIN, and writes the split
    event tables, the split victim dictionaries and the texts per split.
    """
    # process the excel file
    print("Processing excel file...")
    femicide_events = pd.read_excel("data/femicides/rai/EventiFemminicidio_from2015to2017.edited_colnames.xlsx",
                                    sheet_name="dati", header=0)
    event_ids, victim_to_event_id, event_id_to_victims, victim_duplicate_counts = read_events(femicide_events)

    # save information about correspondences between victims and events
    # (we will need this later to retrieve the correct texts for each event, because the XLSX with texts uses victim
    # names as keys)
    dicts_to_save = (
        (victim_to_event_id, "victim_to_event_id"),
        (event_id_to_victims, "event_id_to_victims"),
        (victim_duplicate_counts, "victim_duplicate_counts")
    )
    write_dict_to_json(dicts_to_save)

    # shuffle and split
    print("Shuffling and splitting...")
    shuffled_event_ids = list(event_ids)
    random.shuffle(shuffled_event_ids)
    # NOTE(review): 78 is hard-coded, presumably ~10% of the events - confirm
    dev10_idx = shuffled_event_ids[:78]
    dev10_id_set = set(dev10_idx)
    dev10_df, main_df = create_split_df(dev10_idx, femicide_events)

    # write split dataframes
    for df, df_name in ((dev10_df, "dev10"), (main_df, "main")):
        df.to_csv(f"output/femicides/split_data/rai/split_{df_name}.events.csv")
        df.to_excel(f"output/femicides/split_data/rai/split_{df_name}.events.xlsx")

    # write filtered victim data
    # The shuffled list has one entry per victim row, so the same event id can
    # occur on both sides of the slice. MAIN is therefore defined as "not in
    # DEV10" (exactly what create_split_df does), which keeps the two victim
    # dictionaries disjoint.
    dev10_victims = {e: victims for e, victims in event_id_to_victims.items() if e in dev10_id_set}
    main_victims = {e: victims for e, victims in event_id_to_victims.items() if e not in dev10_id_set}
    filtered_dicts_to_save = (
        (dev10_victims, "event_id_to_victims.dev10"),
        (main_victims, "event_id_to_victims.main"),
    )
    write_dict_to_json(filtered_dicts_to_save)

    # retrieve texts for filtered data
    print("Filtering & writing texts...")
    texts_df = pd.read_excel("data/femicides/rai/EventiFemminicidio_from2015to2017_fonti.xlsx")
    filter_texts("dev10", texts_df, dev10_victims, victim_duplicate_counts)
    filter_texts("main", texts_df, main_victims, victim_duplicate_counts)
|
62 |
+
|
63 |
+
|
64 |
+
def split_olv_femicides():
    """Split the OLV femicide events and texts into DEV10 (10% of events) and MAIN.

    For both splits the event and text tables are written as CSV and XLSX.
    In addition every text is written, sentence-split, to
    ``split_<name>_texts_by_event/<event_id>/<text_id>.txt``: the title
    sentences, an empty line, then the non-empty sentences of the full text.
    """
    texts_df = pd.read_csv("data/femicides/olv/texts_scrape_match_scrape_2021-10-28.csv")
    events_df = pd.read_csv("data/femicides/olv/events_scrape_match_scrape_2021-10-28.csv")

    event_ids = events_df["event:id"].tolist()
    random.shuffle(event_ids)

    num_dev_events = round(len(event_ids) * 0.10)
    dev10_ids = event_ids[:num_dev_events]
    dev10_df, main_df = create_split_df(dev10_ids, events_df)

    # split texts
    in_dev10 = texts_df["event_id"].isin(dev10_ids)
    dev10_texts_df = texts_df[in_dev10]
    main_texts_df = texts_df[~in_dev10]

    # write to files
    # (loop variables get their own names so the full input tables are not shadowed)
    for split_events_df, split_texts_df, split_name in (
            (dev10_df, dev10_texts_df, "dev10"), (main_df, main_texts_df, "main")):
        split_events_df.to_csv(f"output/femicides/split_data/olv/split_{split_name}.events.csv")
        split_texts_df.to_csv(f"output/femicides/split_data/olv/split_{split_name}.texts.csv")
        split_events_df.to_excel(f"output/femicides/split_data/olv/split_{split_name}.events.xlsx")
        split_texts_df.to_excel(f"output/femicides/split_data/olv/split_{split_name}.texts.xlsx")

        for _, row in split_texts_df.iterrows():
            event_id = row["event_id"]
            text_id = row["text_id"]
            event_dir = f"output/femicides/split_data/olv/split_{split_name}_texts_by_event/{event_id}"
            os.makedirs(event_dir, exist_ok=True)
            with open(f"{event_dir}/{text_id}.txt", "w", encoding="utf-8") as f_by_event:
                # "\n" rather than os.linesep: text mode translates newlines itself
                title = row["title"] if not pd.isna(row["title"]) else ""
                for line in nltk.sent_tokenize(title, language="italian"):
                    f_by_event.write(line + "\n")
                # empty line between title and body
                f_by_event.write("\n")
                fulltext = row["fulltext"] if not pd.isna(row["fulltext"]) else ""
                if not fulltext:
                    print(f"WARNING: empty fulltext in text_id={text_id}")
                for line in nltk.sent_tokenize(fulltext, language="italian"):
                    line = line.strip()
                    if not line:
                        continue
                    f_by_event.write(line + "\n")
|
104 |
+
|
105 |
+
|
106 |
+
def write_dict_to_json(filtered_dicts_to_save):
    """Dump each ``(dict_data, dict_name)`` pair to its own JSON file.

    Files are written to ``output/femicides/split_data/rai/<dict_name>.json``
    with sorted keys and 4-space indentation. The target directory is created
    when missing so that a fresh checkout does not fail on the first write.
    """
    out_dir = "output/femicides/split_data/rai"
    os.makedirs(out_dir, exist_ok=True)
    for dict_data, dict_name in filtered_dicts_to_save:
        with open(f"{out_dir}/{dict_name}.json", "w", encoding="utf-8") as f:
            json.dump(dict_data, f, indent=4, sort_keys=True)
|
110 |
+
|
111 |
+
|
112 |
+
def create_split_df(dev10: List[int], femicide_events: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Partition the event table into a dev10 frame and a main frame.

    Rows whose ``event:id`` is missing are dropped. Every other row goes to
    the dev10 frame when its integer event id is listed in ``dev10`` and to
    the main frame otherwise.

    Returns:
        ``(dev10_df, main_df)``; both keep the columns of ``femicide_events``
        even when a split ends up empty.
    """
    # Set membership is O(1); the original scanned the id list once per row.
    dev_ids = set(dev10)
    dev10_rows = []
    main_rows = []
    for _, row in femicide_events.iterrows():
        event_id = row["event:id"]
        if pd.isna(event_id):
            continue
        target = dev10_rows if int(event_id) in dev_ids else main_rows
        target.append(row)

    # Passing the columns explicitly keeps the header for an empty split.
    columns = femicide_events.columns
    dev10_df = pd.DataFrame(dev10_rows, columns=columns)
    main_df = pd.DataFrame(main_rows, columns=columns)
    return dev10_df, main_df
|
127 |
+
|
128 |
+
|
129 |
+
def read_events(events_df):
    """Index the events table by event id and by (disambiguated) victim name.

    Rows without an ``event:id`` are skipped. A victim name that is missing or
    equal to ``"non rilevato"`` is replaced by ``UNKNOWN_<event_id>``. Each
    occurrence of a name gets a running 1-based duplicate id.

    Returns:
        A 4-tuple of
        * ``event_ids``: integer event id of every kept row, in row order;
        * ``victim_to_event_id``: ``"<victim>/<duplicate_id>"`` -> event id;
        * ``event_id_to_victims``: event id -> list of ``(victim, duplicate_id)``;
        * ``victim_duplicate_counts``: victim name -> number of occurrences.
    """
    event_ids: List[int] = []
    victim_to_event_id: Dict[str, int] = {}
    event_id_to_victims: Dict[int, List[Tuple[str, int]]] = defaultdict(list)
    victim_duplicate_counts: Dict[str, int] = defaultdict(int)

    for _, row in events_df.iterrows():
        event_id = row["event:id"]
        if pd.isna(event_id):
            continue
        event_id = int(event_id)

        # unspecified name --> "UNKNOWN_X"
        victim = row["victim:name"]
        if victim == "non rilevato" or pd.isna(victim):
            victim = f"UNKNOWN_{event_id}"

        # disambiguate victims with duplicate names
        victim_duplicate_counts[victim] += 1
        duplicate_id = victim_duplicate_counts[victim]

        event_ids.append(event_id)
        victim_to_event_id[f"{victim}/{duplicate_id}"] = event_id
        event_id_to_victims[event_id].append((victim, duplicate_id))

    return event_ids, victim_to_event_id, event_id_to_victims, victim_duplicate_counts
|
154 |
+
|
155 |
+
|
156 |
+
def filter_texts(split_name: str,
                 texts_df: pd.DataFrame,
                 event_idx_to_victims: Dict[int, List[Tuple[str, int]]],
                 victim_duplicate_counts: Dict[str, int]):
    """Write the texts of one split that can be linked to an unambiguous victim.

    A text is kept when its ``vittima`` value (stripped) equals a victim name
    that occurs exactly once according to ``victim_duplicate_counts``. For the
    kept texts this writes, under ``output/femicides/split_data/rai/``:

    * ``split_<name>.texts.text.txt``: one sentence per line (title, then body);
    * ``split_<name>.texts.ids.txt``: a parallel line per sentence with event
      id, text id and whether the sentence is from the title or the body;
    * ``split_<name>_texts_by_event/<event_id>/<text_id>.txt``: one file per text;
    * ``split_<name>.texts.meta.csv`` / ``.xlsx``: per-text metadata.

    Missing (NaN) victim, title or body values are tolerated: the row is
    skipped or the field is treated as empty, like the OLV splitting code in
    this file does for ``title`` and ``fulltext``.
    """
    print(f"\tfilter_texts: filtering split {split_name}")

    # first filter victims
    victim_to_event_idx = {}
    for e_id, victims in event_idx_to_victims.items():
        for victim_name, _victim_dup_id in victims:
            if victim_duplicate_counts[victim_name] > 1:
                print(f"\tfilter_texts: removing ambiguous victim name '{victim_name}'")
                continue
            victim_to_event_idx[victim_name] = e_id

    out_dir = "output/femicides/split_data/rai"
    # The two index files below are opened before any per-event directory is
    # created, so the base directory has to exist up front.
    os.makedirs(out_dir, exist_ok=True)

    meta_rows: List[Dict[str, Any]] = []
    with open(f"{out_dir}/split_{split_name}.texts.text.txt", "w", encoding="utf-8") as f_txt, \
            open(f"{out_dir}/split_{split_name}.texts.ids.txt", "w", encoding="utf-8") as f_id:
        for _, row in texts_df.iterrows():
            text_victim = row["vittima"]
            if not isinstance(text_victim, str):
                # NaN / non-text victim cell: cannot be matched to any event
                continue
            text_victim = text_victim.strip()
            if text_victim not in victim_to_event_idx:
                continue

            e_id = victim_to_event_idx[text_victim]
            text_id = int(row["ID"])
            title = row["title"]

            meta_rows.append({
                "event_id": e_id,
                "text_id": text_id,
                "url": row["link"],
                "pubdate": row["pubdate"],
                "provider": row["provider"],
                "title": title
            })

            # sent_tokenize needs a string; treat missing cells as empty text
            title_text = "" if pd.isna(title) else title
            body_text = "" if pd.isna(row["text"]) else row["text"]
            body_text_lines = nltk.sent_tokenize(body_text, language="italian")
            title_lines = nltk.sent_tokenize(title_text, language="italian")

            for line in title_lines:
                f_txt.write(line + os.linesep)
                f_id.write(f"event {e_id}\ttext {text_id}\ttitle" + os.linesep)

            event_dir = f"{out_dir}/split_{split_name}_texts_by_event/{e_id}"
            os.makedirs(event_dir, exist_ok=True)
            with open(os.path.join(event_dir, f"{text_id}.txt"), "w", encoding="utf-8") as f_by_event:
                for line in title_lines:
                    f_by_event.write(line + os.linesep)
                # blank line separates title from body in the per-text file
                f_by_event.write(os.linesep)
                for line in body_text_lines:
                    line = line.strip()
                    if not line:
                        continue
                    f_txt.write(line + os.linesep)
                    f_by_event.write(line + os.linesep)
                    f_id.write(f"event {e_id}\ttext {text_id}\tbody" + os.linesep)

    meta_df = pd.DataFrame(meta_rows)
    meta_df.to_csv(f"{out_dir}/split_{split_name}.texts.meta.csv")
    meta_df.to_excel(f"{out_dir}/split_{split_name}.texts.meta.xlsx")
    print()
|
224 |
+
|
225 |
+
|
226 |
+
if __name__ == '__main__':
    # Command-line entry point: the single positional argument selects which
    # dataset gets split.
    ap = argparse.ArgumentParser()
    ap.add_argument("dataset", choices=["rai", "olv"])
    args = ap.parse_args()

    if args.dataset == "rai":
        split_rai_femicides()
    else:
        split_olv_femicides()
|
sociofillmore/migration/cda_classify.py
ADDED
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Learn to classify the manually annotated CDA attributes (frames, 'riferimento', orientation)
|
3 |
+
"""
|
4 |
+
|
5 |
+
import sys
|
6 |
+
|
7 |
+
import torch
|
8 |
+
|
9 |
+
from allennlp.data.vocabulary import Vocabulary
|
10 |
+
from allennlp.data import DatasetReader, TokenIndexer, Instance, Token
|
11 |
+
from allennlp.data.fields import TextField, LabelField
|
12 |
+
from allennlp.data.token_indexers.pretrained_transformer_indexer import (
|
13 |
+
PretrainedTransformerIndexer,
|
14 |
+
)
|
15 |
+
from allennlp.data.tokenizers.pretrained_transformer_tokenizer import (
|
16 |
+
PretrainedTransformerTokenizer,
|
17 |
+
)
|
18 |
+
from allennlp.models import BasicClassifier
|
19 |
+
from allennlp.modules.text_field_embedders.basic_text_field_embedder import (
|
20 |
+
BasicTextFieldEmbedder,
|
21 |
+
)
|
22 |
+
from allennlp.modules.token_embedders.pretrained_transformer_embedder import (
|
23 |
+
PretrainedTransformerEmbedder,
|
24 |
+
)
|
25 |
+
from allennlp.modules.seq2vec_encoders.bert_pooler import BertPooler
|
26 |
+
from allennlp.training.checkpointer import Checkpointer
|
27 |
+
from allennlp.training.gradient_descent_trainer import GradientDescentTrainer
|
28 |
+
from allennlp.data.data_loaders.simple_data_loader import SimpleDataLoader
|
29 |
+
from allennlp.training.optimizers import AdamOptimizer
|
30 |
+
from allennlp.predictors.text_classifier import TextClassifierPredictor
|
31 |
+
|
32 |
+
from sklearn.svm import SVC
|
33 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
34 |
+
from sklearn.metrics import precision_recall_fscore_support
|
35 |
+
from sklearn.tree import DecisionTreeClassifier
|
36 |
+
from sklearn.dummy import DummyClassifier
|
37 |
+
|
38 |
+
import pandas as pd
|
39 |
+
import numpy as np
|
40 |
+
import spacy
|
41 |
+
|
42 |
+
import json
|
43 |
+
import os
|
44 |
+
from typing import Dict, Iterable
|
45 |
+
|
46 |
+
|
47 |
+
class MigrationReader(DatasetReader):
    """Dataset reader that turns headline strings (and labels) into instances."""

    def __init__(self, token_indexers, tokenizer):
        # Initialise the DatasetReader base class before adding our own state;
        # the original skipped this call.
        super().__init__()
        self.token_indexers = token_indexers
        self.tokenizer = tokenizer

    def text_to_instance(self, sentence, label=None) -> Instance:
        """Build an instance with a ``tokens`` field and, if given, a ``label`` field."""
        text_field = TextField(self.tokenizer.tokenize(sentence), self.token_indexers)
        fields = {"tokens": text_field}
        if label is not None:
            fields["label"] = LabelField(label)
        return Instance(fields)

    def read_instances(
        self, text: pd.Series, labels: pd.Series
    ) -> Iterable[Instance]:
        """Yield one instance per ``(sentence, label)`` pair, pairing items by position."""
        for sentence, label in zip(text, labels):
            yield self.text_to_instance(sentence, label)
|
67 |
+
|
68 |
+
|
69 |
+
def train(attrib, use_gpu=False):
    """Train and evaluate every configured setup for one annotated attribute.

    For each setup the model is fit on the train split and evaluated on the
    dev split. Dev predictions are written to
    ``output/migration/cda_classify/predictions_<attrib>_<setup idx>.csv`` and
    micro-averaged plus per-class precision/recall/F-score of all setups to
    ``output/migration/cda_classify/scores_<attrib>.json``.

    Args:
        attrib: one of ``"cda_frame"``, ``"riferimento"``, ``"orientation"``
            or ``"fake"`` (see ``load_data``).
        use_gpu: request GPU training for the transformer setup; ignored when
            CUDA is not available.

    Raises:
        ValueError: if a setup names an unknown model type.
    """
    assert attrib in ["cda_frame", "riferimento", "orientation", "fake"]

    # load data
    print("Loading data...")
    x_train, y_train, x_dev, y_dev = load_data(attrib)
    print(f"\t\ttrain size: {len(x_train)}")
    print(f"\t\tdev size: {len(x_dev)}")

    # try different setups
    print("Running training setups...")
    scores = []
    # Each setup is (text_options, vect_options, model_info).
    # text_options defaults: remove_punct=True, lowercase=True, lemmatize=False, remove_stop=False
    setups = [
        # ({}, {}, {"type": "svm", "options": {"kernel": "linear", "C": 1.0}}),
        (
            {},
            {},
            {
                "type": "bert",
                "options": {"transformer": "Musixmatch/umberto-commoncrawl-cased-v1"},
            },
        ),
        # ({"lemmatize": True, "remove_stop": True}, {}, {"type": "svm", "options": {"kernel": "linear", "C": 0.8}}),
        # ({"lemmatize": True, "remove_stop": True}, {"embed": False}, {"type": "svm", "options": {"kernel": "linear", "C": 0.8}}),
        # ({"lemmatize": True, "remove_stop": True}, {"embed": False}, {"type": "dummy", "options": {}}),
        # ({"lemmatize": True, "remove_stop": True}, {"embed": False}, {"type": "tree", "options": {}}),
    ]

    nlp = spacy.load("it_core_news_md")

    output_dir = "output/migration/cda_classify"
    os.makedirs(output_dir, exist_ok=True)

    for s_idx, (text_options, vect_options, model_info) in enumerate(setups):

        if model_info["type"] == "bert":
            print("\t\tPreparing BERT model...")

            # Only touch CUDA when it was requested AND is actually present;
            # the original called model.cuda(-1) when it was requested but missing.
            on_gpu = use_gpu and torch.cuda.is_available()
            cuda_device = None if on_gpu else -1

            transformer = model_info["options"]["transformer"]
            token_indexers = {"tokens": PretrainedTransformerIndexer(transformer)}
            tokenizer = PretrainedTransformerTokenizer(transformer)

            reader = MigrationReader(token_indexers, tokenizer)
            train_instances = list(reader.read_instances(x_train, y_train))
            dev_instances = list(reader.read_instances(x_dev, y_dev))
            vocab = Vocabulary.from_instances(train_instances + dev_instances)
            print(vocab.get_vocab_size("tags"))

            embedder = BasicTextFieldEmbedder(
                {"tokens": PretrainedTransformerEmbedder(transformer)}
            )
            seq2vec = BertPooler(transformer)
            model = BasicClassifier(vocab, embedder, seq2vec, namespace="tags")
            if on_gpu:
                model = model.cuda(cuda_device)

            checkpoint_dir = f"/scratch/p289731/cda_classify/model_{attrib}/checkpoints/"
            serialization_dir = f"/scratch/p289731/cda_classify/model_{attrib}/serialize/"
            # NOTE(review): no exist_ok here, so a re-run raises FileExistsError;
            # presumably this protects earlier checkpoints -- confirm.
            os.makedirs(checkpoint_dir)
            os.makedirs(serialization_dir)
            checkpointer = Checkpointer(checkpoint_dir)
            optimizer = AdamOptimizer(
                [(n, p) for n, p in model.named_parameters() if p.requires_grad],
                lr=1e-6
            )
            train_loader = SimpleDataLoader(train_instances, batch_size=8, shuffle=True)
            dev_loader = SimpleDataLoader(dev_instances, batch_size=8, shuffle=False)
            train_loader.index_with(vocab)
            dev_loader.index_with(vocab)

            print("\t\tTraining BERT model")
            trainer = GradientDescentTrainer(
                model,
                optimizer,
                train_loader,
                validation_data_loader=dev_loader,
                patience=32,
                checkpointer=checkpointer,
                cuda_device=cuda_device,
                serialization_dir=serialization_dir
            )
            trainer.train()

            print("\t\tProducing predictions...")
            predictor = TextClassifierPredictor(model, reader)
            predictions = [predictor.predict(sentence) for sentence in x_dev]
            y_dev_pred = [p["label"] for p in predictions]
            class_labels = list(vocab.get_token_to_index_vocabulary("labels").keys())

        elif model_info["type"] in ["svm", "tree", "dummy"]:
            # extract features
            print("\t\tExtracting features...")
            x_train_fts, vectorizer = extract_features(
                x_train, nlp, text_options, **vect_options
            )
            x_dev_fts, _ = extract_features(
                x_dev, nlp, text_options, **vect_options, vectorizer=vectorizer
            )

            # "embed" is optional in vect_options (extract_features defaults it
            # to False), so it must not be indexed directly.
            if not vect_options.get("embed", False):
                print(f"\t\t\tnum features: {len(vectorizer.vocabulary_)}")
            else:
                assert model_info["type"] != "tree", "Decision tree does not support embedding input"

            print("\t\tTraining the model...")
            if model_info["type"] == "svm":
                model = SVC(**model_info["options"])
            elif model_info["type"] == "tree":
                model = DecisionTreeClassifier()
            else:
                model = DummyClassifier()
            model.fit(x_train_fts, y_train)

            # evaluate on dev
            print("\t\tValidating the model...")
            y_dev_pred = model.predict(x_dev_fts)
            class_labels = model.classes_

        else:
            # Without this branch an unknown type surfaced later as a NameError.
            raise ValueError(f"Unknown model type: {model_info['type']!r}")

        p_micro, r_micro, f_micro, _ = precision_recall_fscore_support(
            y_dev, y_dev_pred, average="micro"
        )
        p_classes, r_classes, f_classes, _ = precision_recall_fscore_support(
            y_dev, y_dev_pred, average=None, labels=class_labels, zero_division=0
        )
        print(
            f"\t\t\tOverall scores (micro-averaged):\tP={p_micro}\tR={r_micro}\tF={f_micro}"
        )

        scores.append(
            {
                "micro": {"p": p_micro, "r": r_micro, "f": f_micro},
                "classes": {
                    "p": list(zip(class_labels, p_classes)),
                    "r": list(zip(class_labels, r_classes)),
                    "f": list(zip(class_labels, f_classes)),
                },
            }
        )

        prediction_df = pd.DataFrame(
            zip(x_dev, y_dev, y_dev_pred), columns=["headline", "gold", "prediction"]
        )
        prediction_df.to_csv(f"{output_dir}/predictions_{attrib}_{s_idx:02}.csv")

        # Rewritten after every setup: the final content is the full score
        # list, and partial results survive a crash in a later setup.
        with open(f"{output_dir}/scores_{attrib}.json", "w", encoding="utf-8") as f_scores:
            json.dump(scores, f_scores, indent=4)
|
232 |
+
|
233 |
+
|
234 |
+
def load_data(attrib):
    """Load headlines and labels of the train and dev splits.

    Headlines come from the ``Titolo`` column of the preprocessed annotation
    CSVs. Labels come from the column that belongs to ``attrib``
    (``cda_frame`` -> ``frame``, ``riferimento``, ``orientation``). Any other
    value selects a synthetic task that labels a headline ``"true"`` when it
    contains ``"rifugiato"`` and ``"false"`` otherwise.

    Returns:
        ``(x_train, y_train, x_dev, y_dev)``.
    """
    train_data = pd.read_csv("output/migration/preprocess/annotations_train.csv")
    dev_data = pd.read_csv("output/migration/preprocess/annotations_dev.csv")

    x_train = train_data["Titolo"]
    x_dev = dev_data["Titolo"]

    label_columns = {
        "cda_frame": "frame",
        "riferimento": "riferimento",
        "orientation": "orientation",
    }
    column = label_columns.get(attrib)

    if column is not None:
        y_train = train_data[column]
        y_dev = dev_data[column]
    else:
        # fake task to test setup
        y_train = pd.Series(["true" if "rifugiato" in exa else "false" for exa in x_train])
        y_dev = pd.Series(["true" if "rifugiato" in exa else "false" for exa in x_dev])

    return x_train, y_train, x_dev, y_dev
|
257 |
+
|
258 |
+
|
259 |
+
def extract_features(
    headlines,
    nlp,
    text_options,
    embed=False,
    min_freq=1,
    max_freq=1.0,
    ngram_range=(1, 1),
    vectorizer=None,
):
    """Turn headlines into a feature matrix.

    With ``embed=True`` each headline becomes the vector yielded by
    ``process_text(..., embed=True)`` and ``vectorizer`` is returned
    unchanged. Otherwise headlines are tokenised by ``process_text`` and
    counted with a ``CountVectorizer``: a new one is fitted when
    ``vectorizer`` is None, an existing one is only applied.

    Returns:
        ``(vectorized, vectorizer)``.
    """
    if embed:
        vectorized = np.array(
            list(process_text(headlines, nlp, embed=True, **text_options))
        )
        return vectorized, vectorizer

    # Tokens are re-joined with spaces so CountVectorizer can split them again.
    tokenized = [
        " ".join(sent) for sent in process_text(headlines, nlp, **text_options)
    ]
    if vectorizer is None:
        vectorizer = CountVectorizer(
            lowercase=False,
            analyzer="word",
            min_df=min_freq,
            max_df=max_freq,
            ngram_range=ngram_range,
        )
        vectorized = vectorizer.fit_transform(tokenized)
    else:
        vectorized = vectorizer.transform(tokenized)
    return vectorized, vectorizer
|
290 |
+
|
291 |
+
|
292 |
+
def process_text(
    headlines,
    nlp,
    embed=False,
    remove_punct=True,
    lowercase=True,
    lemmatize=False,
    remove_stop=False,
):
    """Yield one processed item per headline.

    Each headline is run through ``nlp``. Tokens are dropped when
    ``remove_stop`` is set and the token is a stop word, or when
    ``remove_punct`` is set and its ``pos_`` is ``PUNCT``, ``SYM`` or ``X``.

    * ``embed=False``: yields a list of token strings (lemmas when
      ``lemmatize`` is set, lower-cased when ``lowercase`` is set).
    * ``embed=True``: yields the mean of the token vectors (vectors of the
      lemmas when ``lemmatize`` is set, otherwise only tokens that have a
      vector). ``lowercase`` has no effect in this mode. A headline without
      any vector yields a random vector of length 300.
    """
    excluded_pos = ["PUNCT", "SYM", "X"]

    for sent in headlines:
        doc = nlp(sent)
        kept = (
            t
            for t in doc
            if (not remove_stop or not t.is_stop)
            and (not remove_punct or t.pos_ not in excluded_pos)
        )

        if embed:
            if lemmatize:
                vectors = [t.vocab[t.lemma].vector for t in kept]
            else:
                vectors = [t.vector for t in kept if t.has_vector]

            token_arr = np.array(vectors)
            if len(token_arr) == 0:
                # NOTE(review): 300 presumably matches the vector width of the
                # spaCy model in use -- confirm before switching models.
                yield np.random.rand(300)
            else:
                yield np.mean(token_arr, axis=0)
        else:
            if lemmatize:
                words = (t.lemma_ for t in kept)
            else:
                words = (t.text for t in kept)

            # Lower-casing applies to strings only, never to embedding vectors.
            if lowercase:
                words = (w.lower() for w in words)

            yield list(words)
|
331 |
+
|
332 |
+
|
333 |
+
if __name__ == "__main__":
    # Usage: pass "gpu" as the first argument to request GPU training; any
    # other value, or no argument at all, trains on CPU. The original indexed
    # sys.argv[1] unconditionally and crashed when no argument was given.
    use_gpu = len(sys.argv) > 1 and sys.argv[1] == "gpu"
    # train(attrib="fake", use_gpu=use_gpu)
    train(attrib="cda_frame", use_gpu=use_gpu)
    # train(attrib="riferimento")
    # train(attrib="orientation")
|
sociofillmore/migration/cda_classify_.py
ADDED
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Learn to classify the manually annotated CDA attributes (frames, 'riferimento', orientation)
|
3 |
+
"""
|
4 |
+
|
5 |
+
GLOVE_MODEL = "/net/aistaff/gminnema/thesis_data/data/glove-it/glove_WIKI"
|
6 |
+
|
7 |
+
|
8 |
+
from sklearn.svm import SVC
|
9 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
10 |
+
from sklearn.metrics import precision_recall_fscore_support
|
11 |
+
import gensim
|
12 |
+
import pandas as pd
|
13 |
+
import spacy
|
14 |
+
|
15 |
+
import json
|
16 |
+
|
17 |
+
|
18 |
+
def train(attrib):
    """Train and evaluate every configured setup for one annotated attribute.

    Each setup is ``(text_options, vect_options, model)``. The model is fit on
    features of the train split and evaluated on the dev split. Dev
    predictions go to ``output/migration/cda_classify/predictions_<idx>.csv``
    and the scores of all setups to ``output/migration/cda_classify/scores.json``.
    """
    assert attrib in ["cda_frame", "riferimento", "orientation"]

    # load data
    print("Loading data...")
    x_train, y_train, x_dev, y_dev = load_data(attrib)
    print(f"\t\ttrain size: {len(x_train)}")
    print(f"\t\tdev size: {len(x_dev)}")

    # try different setups
    print("Running training setups...")
    scores = []
    setups = [
        # defaults: remove_punct=True, lowercase=True, lemmatize=False, remove_stop=False
        # ({}, {}, SVC(kernel='linear')),
        # ({"lemmatize": True, "remove_stop": True}, {}, SVC(kernel='linear')),
        # ({"lemmatize": True, "remove_stop": True}, {"min_freq": 5}, SVC(kernel='linear')),
        # ({"lemmatize": True, "remove_stop": True}, {"min_freq": 5, "max_freq": .70}, SVC(kernel='linear')),
        # ({"lemmatize": True, "remove_stop": True}, {}, SVC(kernel='linear', C=0.6)),
        # ({"lemmatize": True, "remove_stop": True}, {}, SVC(kernel='linear', C=0.7)),
        # ({"lemmatize": True, "remove_stop": True}, {}, SVC(kernel='linear', C=0.8)),
        ({"lemmatize": True, "remove_stop": True}, {"embed": "glove"}, SVC(kernel='linear', C=0.8)),
        # ({"lemmatize": True, "remove_stop": True}, {}, SVC(kernel="rbf")),
    ]

    nlp = spacy.load("it_core_news_md")

    for s_idx, (text_options, vect_options, model) in enumerate(setups):

        print(f"\tSetup #{s_idx}")

        # extract features
        print("\t\tExtracting features...")
        x_train_fts, vectorizer = extract_features(x_train, nlp, text_options, **vect_options)
        x_dev_fts, _ = extract_features(x_dev, nlp, text_options, **vect_options, vectorizer=vectorizer)
        # Only a fitted count vectorizer exposes a vocabulary; skip the report
        # for any other kind of feature extractor.
        if hasattr(vectorizer, "vocabulary_"):
            print(f"\t\t\tnum features: {len(vectorizer.vocabulary_)}")

        print("\t\tTraining the model...")
        model.fit(x_train_fts, y_train)

        # evaluate on dev
        print("\t\tValidating the model...")
        y_dev_pred = model.predict(x_dev_fts)
        p_micro, r_micro, f_micro, _ = precision_recall_fscore_support(
            y_dev, y_dev_pred, average="micro")
        p_classes, r_classes, f_classes, _ = precision_recall_fscore_support(
            y_dev, y_dev_pred, average=None, labels=model.classes_, zero_division=0)
        print(
            f"\t\t\tOverall scores (micro-averaged):\tP={p_micro}\tR={r_micro}\tF={f_micro}"
        )

        scores.append({
            "micro": {
                "p": p_micro,
                "r": r_micro,
                "f": f_micro
            },
            "classes": {
                "p": list(zip(model.classes_, p_classes)),
                "r": list(zip(model.classes_, r_classes)),
                "f": list(zip(model.classes_, f_classes)),
            }
        })

        prediction_df = pd.DataFrame(zip(x_dev, y_dev, y_dev_pred), columns=["headline", "gold", "prediction"])
        prediction_df.to_csv(f"output/migration/cda_classify/predictions_{s_idx:02}.csv")

    with open("output/migration/cda_classify/scores.json", "w", encoding="utf-8") as f_scores:
        json.dump(scores, f_scores, indent=4)
|
89 |
+
|
90 |
+
|
91 |
+
def load_data(attrib):
    """Load headlines and labels of the train and dev splits.

    Headlines come from the ``Titolo`` column of the preprocessed annotation
    CSVs. Labels come from ``frame`` for ``"cda_frame"``, from ``riferimento``
    for ``"riferimento"`` and from ``orientation`` for any other value.

    Returns:
        ``(x_train, y_train, x_dev, y_dev)``.
    """
    train_data = pd.read_csv(
        "output/migration/preprocess/annotations_train.csv")
    dev_data = pd.read_csv("output/migration/preprocess/annotations_dev.csv")

    x_train = train_data["Titolo"]
    x_dev = dev_data["Titolo"]

    if attrib == "cda_frame":
        label_column = "frame"
    elif attrib == "riferimento":
        label_column = "riferimento"
    else:
        label_column = "orientation"

    # The original assigned the orientation labels to x_train, which both
    # overwrote the headlines and left y_train undefined.
    y_train = train_data[label_column]
    y_dev = dev_data[label_column]
    return x_train, y_train, x_dev, y_dev
|
109 |
+
|
110 |
+
|
111 |
+
def extract_features(headlines, nlp, text_options, min_freq=1, max_freq=1.0, embed=None, vectorizer=None):
    """Turn headlines into a bag-of-words count matrix.

    A new ``CountVectorizer`` is fitted when ``vectorizer`` is None; an
    existing one is only applied.

    Returns:
        ``(vectorized, vectorizer)``.

    Raises:
        NotImplementedError: if an embedding model is requested via ``embed``
            while no vectorizer is supplied. That code path was left as an
            unfinished statement (``vectorizer = gensim.models.``) which made
            the whole module a SyntaxError.
    """
    if vectorizer is None and embed is not None:
        # TODO: load the embedding model (see GLOVE_MODEL) and build features from it
        raise NotImplementedError(f"embedding features ({embed!r}) are not implemented")

    tokenized = [" ".join(sent) for sent in tokenize(headlines, nlp, **text_options)]
    if vectorizer is None:
        vectorizer = CountVectorizer(lowercase=False, analyzer="word", min_df=min_freq, max_df=max_freq)
        vectorized = vectorizer.fit_transform(tokenized)
    else:
        vectorized = vectorizer.transform(tokenized)
    return vectorized, vectorizer
|
122 |
+
|
123 |
+
|
124 |
+
def tokenize(headlines, nlp, remove_punct=True, lowercase=True, lemmatize=False, remove_stop=False):
    """Yield a list of token strings for every headline.

    Each headline is run through ``nlp``. Tokens are dropped when
    ``remove_stop`` is set and the token is a stop word, or when
    ``remove_punct`` is set and its ``pos_`` is ``PUNCT``, ``SYM`` or ``X``.
    The remaining tokens are emitted as lemmas (``lemmatize``) or surface
    forms, lower-cased when ``lowercase`` is set.
    """
    excluded_pos = ["PUNCT", "SYM", "X"]
    for sent in headlines:
        words = []
        for t in nlp(sent):
            if remove_stop and t.is_stop:
                continue
            if remove_punct and t.pos_ in excluded_pos:
                continue
            word = t.lemma_ if lemmatize else t.text
            words.append(word.lower() if lowercase else word)
        yield words
|
137 |
+
|
138 |
+
|
139 |
+
if __name__ == '__main__':
|
140 |
+
train(attrib="cda_frame")
|
sociofillmore/migration/extract_political_ratings.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
|
3 |
+
|
4 |
+
def main():
    """Extract one political/religious rating per news provider.

    Takes the first row of every ``Testata`` group in the headline corpus and
    writes a CSV indexed by ``provider`` with two columns:
    ``political_stance`` (the ``Orientamento politico`` value) and
    ``religious`` (True when ``Religiosa?`` equals ``"religiosa"``).
    """
    corpus_df = pd.read_excel("data/migration/corpus_titoli_2013_2021_with_most_recent_years.xlsx")
    first_per_provider = corpus_df.groupby("Testata").first()

    # Build the result as a fresh frame instead of mutating a column slice in place.
    ratings = pd.DataFrame(
        {
            "political_stance": first_per_provider["Orientamento politico"],
            "religious": first_per_provider["Religiosa?"] == "religiosa",
        }
    )
    ratings.index.name = "provider"

    ratings.to_csv("data/migration/provider_pol_rel_ratings.csv")
|
14 |
+
|
15 |
+
|
16 |
+
if __name__ == "__main__":
|
17 |
+
main()
|
sociofillmore/migration/preprocess.py
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
import random
|
5 |
+
|
6 |
+
random.seed(1996)
|
7 |
+
|
8 |
+
|
9 |
+
CORPUS_ANNOTATED = "data/migration/corpus_with_frames_and_orientation.csv"
|
10 |
+
CORPUS_ALL = "data/migration/corpus_all.csv"
|
11 |
+
|
12 |
+
RATIO_DEV = 0.05
|
13 |
+
RATIO_TEST = 0.25
|
14 |
+
|
15 |
+
|
16 |
+
def preprocess_annotated():
    """Randomly split the annotated corpus into train, dev and test CSV files.

    Every row draws one number from the module-level seeded ``random``
    generator: below ``RATIO_DEV`` it goes to dev, below
    ``RATIO_DEV + RATIO_TEST`` to test, otherwise to train. The three parts
    are written to ``output/migration/preprocess/annotations_<split>.csv``.
    """
    import os  # local import: this module does not import os at file level

    print("Loading corpus...")
    df = pd.read_csv(CORPUS_ANNOTATED, encoding="latin1")
    print(f"\tfound {len(df)} annotated headlines")

    train_idx = []
    dev_idx = []
    test_idx = []

    print("Making random train/dev/test split...")
    # Exactly one draw per row, in row order, keeps the split reproducible
    # for a fixed seed.
    for i in range(len(df)):
        rnd = random.random()
        if rnd < RATIO_DEV:
            dev_idx.append(i)
        elif rnd < (RATIO_DEV + RATIO_TEST):
            test_idx.append(i)
        else:
            train_idx.append(i)

    print(f"\tassigned {len(train_idx)} samples to train")
    print(f"\tassigned {len(dev_idx)} samples to dev")
    print(f"\tassigned {len(test_idx)} samples to test")

    out_dir = "output/migration/preprocess"
    os.makedirs(out_dir, exist_ok=True)
    df.iloc[train_idx].to_csv(f"{out_dir}/annotations_train.csv")
    df.iloc[dev_idx].to_csv(f"{out_dir}/annotations_dev.csv")
    df.iloc[test_idx].to_csv(f"{out_dir}/annotations_test.csv")
|
46 |
+
|
47 |
+
|
48 |
+
def preprocess_all():
    """Placeholder: load the corpus and iterate over its rows without doing anything yet."""
    # NOTE(review): this reads CORPUS_ANNOTATED although the function name and
    # the otherwise unused CORPUS_ALL constant suggest the full corpus was
    # meant -- confirm before filling in the loop body.
    df = pd.read_csv(CORPUS_ANNOTATED, encoding="latin1")
    for _, row in df.iterrows():
        pass
|
52 |
+
|
53 |
+
|
54 |
+
|
55 |
+
if __name__ == "__main__":
|
56 |
+
# preprocess_annotated()
|
57 |
+
preprocess_all()
|
sociofillmore/migration/split_data.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import random
|
2 |
+
import datetime
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
random.seed(1996)
|
7 |
+
|
8 |
+
|
9 |
+
DEV_RATIO = 0.10
|
10 |
+
|
11 |
+
|
12 |
+
def choose_best_casing(orig, predicted):
    """Pick the better-cased variant of a headline.

    Returns ``predicted`` (the truecased text) when more than half of the
    letters of ``orig`` are upper case, i.e. the original casing carries
    little information, and ``orig`` otherwise. When ``orig`` is not a string
    or contains no letters, ``predicted`` is returned.

    The original implementation counted every character of ``orig.upper()``
    (so the count always equalled ``len(orig)``) and returned ``predicted``
    from both branches, which made the choice a no-op.
    """
    if not isinstance(orig, str):
        return predicted

    letters = [c for c in orig if c.isalpha()]
    if not letters:
        return predicted

    num_upper = sum(1 for c in letters if c.isupper())
    if num_upper > 0.5 * len(letters):
        return predicted
    return orig
|
17 |
+
|
18 |
+
|
19 |
+
def split_data():
    """Split the full migration corpus into a main and a dev10 part.

    Every row of ``data/migration/corpus_all.csv`` becomes one event and one
    text, both identified by the row position. Each row draws one number from
    the module-level seeded ``random`` generator and goes to dev10 when it is
    below ``DEV_RATIO``, otherwise to main. For every row three title variants
    (best / original / truecased) are written to separate text files, and per
    split an events CSV and a texts-metadata CSV are written under
    ``output/migration/split_data/``.
    """
    import os  # local import: this module does not import os at file level

    out_root = "output/migration/split_data"
    split_dirs = {
        "dev10": f"{out_root}/split_dev10_sep_txt_files",
        "main": f"{out_root}/split_main_sep_txt_files",
    }
    for directory in split_dirs.values():
        os.makedirs(directory, exist_ok=True)

    events = {"main": [], "dev10": []}
    texts = {"main": [], "dev10": []}

    with open("data/migration/corpus_titoli_all_raw.truecase_bilstm.txt", encoding="utf-8") as f:
        titles_tc = [line.strip() for line in f]

    df_all = pd.read_csv("data/migration/corpus_all.csv", encoding="latin-1")
    for idx, (_, row) in enumerate(df_all.iterrows()):

        if idx % 1000 == 0:
            print("Processing line:", idx)

        year = int(row["Anno"])

        event_data = {
            "event:id": idx,
            "event:year": year,
        }
        text_data = {
            "event_id": idx,
            "text_id": idx,
            "pubyear": year,
            "language": "Italian",
            "provider": _strip_provider_prefix(row["Testata"]),
            "title": choose_best_casing(row["Titolo"], titles_tc[idx]),
            "title_truecased": titles_tc[idx],
            "title_orig": row["Titolo"]
        }

        split_name = "dev10" if random.random() < DEV_RATIO else "main"
        events[split_name].append(event_data)
        texts[split_name].append(text_data)
        _write_title_variants(split_dirs[split_name], idx, text_data)

    pd.DataFrame(events["main"]).to_csv(f"{out_root}/split_main.events.csv")
    pd.DataFrame(texts["main"]).to_csv(f"{out_root}/split_main.texts.meta.csv")
    pd.DataFrame(events["dev10"]).to_csv(f"{out_root}/split_dev10.events.csv")
    pd.DataFrame(texts["dev10"]).to_csv(f"{out_root}/split_dev10.texts.meta.csv")


def _strip_provider_prefix(raw_name):
    """Remove a leading ``*T_`` / ``T_`` marker from a provider name.

    ``str.lstrip("*T_")`` strips any run of the characters ``*``, ``T`` and
    ``_``, so it also removed real leading letters (``"*T_Tempo"`` became
    ``"empo"``). Only the marker itself is removed here.
    """
    name = raw_name.lstrip("*")
    if name.startswith("T_"):
        name = name[len("T_"):]
    return name


def _write_title_variants(directory, idx, text_data):
    """Write the best, original and truecased title of one text to separate files."""
    variants = (
        ("best", "title"),
        ("orig", "title_orig"),
        ("truecase", "title_truecased"),
    )
    for suffix, key in variants:
        with open(f"{directory}/{idx}.{suffix}.txt", "w", encoding="utf-8") as f_out:
            f_out.write(text_data[key])
|
82 |
+
|
83 |
+
|
84 |
+
if __name__ == "__main__":
|
85 |
+
split_data()
|
sociofillmore/migration/split_lome_predictions.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import json
|
3 |
+
|
4 |
+
import pandas as pd
|
5 |
+
|
6 |
+
|
7 |
+
def main(input_json, input_txt, output_dir,
         meta_csv="output/migration/split_data/split_dev10.texts.meta.csv"):
    """Split concatenated prediction files into one directory per text.

    The i-th entry of ``input_json`` and the i-th blank-line-separated chunk
    of ``input_txt`` are paired with the i-th ``text_id`` of the metadata
    CSV, and written to ``<output_dir>/<text_id>/lome_<text_id>.comm.json``
    and ``<output_dir>/<text_id>/lome_<text_id>.comm.txt``.

    Args:
        input_json: Path to a JSON file holding the predictions; assumed to
            be a JSON array with one entry per text, in metadata order.
        input_txt: Path to a text file in which the per-text predictions are
            separated by an empty line (``"\\n\\n"``).
        output_dir: Directory under which the per-text directories are
            created (missing directories are created on demand).
        meta_csv: CSV file with a ``text_id`` column that defines the order
            of the predictions. Defaults to the dev10 split metadata, which
            was the only file the function could previously be used with.
    """
    meta_df = pd.read_csv(meta_csv)
    text_ids = meta_df["text_id"].to_list()

    with open(input_json, encoding="utf-8") as f:
        json_predictions = json.load(f)

    with open(input_txt, encoding="utf-8") as f:
        txt_predictions = f.read().split("\n\n")

    # A file that ends with the separator produces trailing empty chunks;
    # they are not real predictions, so leave them out of the sanity check.
    num_txt = len(txt_predictions)
    while num_txt > 0 and not txt_predictions[num_txt - 1].strip():
        num_txt -= 1

    # zip() below stops at the shortest input, which would otherwise hide a
    # misaligned or incomplete prediction file without any indication.
    if not (len(text_ids) == len(json_predictions) == num_txt):
        print(
            f"WARNING: {len(text_ids)} text ids, "
            f"{len(json_predictions)} json predictions and "
            f"{num_txt} txt predictions; extra items are ignored"
        )

    for t_id, json_p, txt_p in zip(text_ids, json_predictions, txt_predictions):

        # Lightweight progress indicator.
        if int(t_id) % 100 == 0:
            print(t_id)

        prediction_dir = f"{output_dir}/{t_id}"
        # exist_ok avoids the check-then-create race and makes reruns safe.
        os.makedirs(prediction_dir, exist_ok=True)
        prediction_file_json = f"{prediction_dir}/lome_{t_id}.comm.json"
        prediction_file_txt = f"{prediction_dir}/lome_{t_id}.comm.txt"

        # Each per-text JSON file keeps the list wrapper of the input file.
        with open(prediction_file_json, "w", encoding="utf-8") as f_out:
            json.dump([json_p], f_out)

        # Restore the blank-line terminator that split() removed.
        with open(prediction_file_txt, "w", encoding="utf-8") as f_out:
            f_out.write(txt_p + "\n\n")
|
32 |
+
|
33 |
+
|
34 |
+
if __name__ == "__main__":
    # Imported here because it is only needed when run as a script.
    import argparse

    # Earlier runs, kept for reference (input .json/.txt pair -> output dir):
    #   output/migration/lome/lome_0shot/lome_lome_0shot_migration_all_tc.comm.{json,txt}
    #       -> output/migration/lome/multilabel/lome_0shot/pavia
    #   output/migration/lome/lome_0shot/lome_lome_0shot_migration_all_best-truecase.comm.{json,txt}
    #       -> output/migration/lome/multilabel/lome_0shot/pavia
    #   output/migration/lome/lome_zs-tgt_ev-frm/data-in.concat.combined_zs_ev.tc_bilstm.{json,txt}
    #       -> output/migration/lome/multilabel/lome_zs-tgt_ev_frm/pavia
    #
    # The defaults below reproduce the most recent hard-coded invocation, so
    # running the script without arguments behaves as before; the options
    # allow other machines/runs to be used without editing the source.
    parser = argparse.ArgumentParser(
        description="Split concatenated LOME predictions into per-text files."
    )
    parser.add_argument(
        "--input-json",
        default="/home/gossminn/WorkSyncs/Code/fn-for-social-frames/output/migration/lome/lome_migration_concat.comm.json",
        help="concatenated predictions in JSON format",
    )
    parser.add_argument(
        "--input-txt",
        default="/home/gossminn/WorkSyncs/Code/fn-for-social-frames/output/migration/lome/lome_migration_concat.comm.txt",
        help="concatenated predictions in text format",
    )
    parser.add_argument(
        "--output-dir",
        default="output/migration/lome/multilabel/lome_0shot/pavia",
        help="directory in which the per-text directories are created",
    )
    cli_args = parser.parse_args()

    main(
        input_json=cli_args.input_json,
        input_txt=cli_args.input_txt,
        output_dir=cli_args.output_dir,
    )
|
sociofillmore/scoring/eval/__pycache__/analyze_final_questionnaire.cpython-37.pyc
ADDED
Binary file (2.64 kB). View file
|
|