AlzbetaStrompova committed
Commit • 2a9fe7e
1 Parent(s): 7e6964a

requirements

Files changed:
- data_manipulation/create_gazetteers.py +0 -218
- requirements.txt +7 -0
data_manipulation/create_gazetteers.py
DELETED
@@ -1,218 +0,0 @@
-import os
-import pickle
-import itertools
-import pandas as pd
-
-from names_dataset import NameDataset
-
-
-def load_gazetteers(path):
-    """
-    Load gazetteers from a file
-    :param path: path to the gazetteer file
-    :return: a dict of gazetteers
-    """
-    with open(path, 'rb') as f:
-        gazetteers = pickle.load(f)
-    return gazetteers
-
-def save_gazetteers(gazetteers, path):
-    """
-    Save gazetteers to a file
-    :param path: path to the gazetteer file
-    :param gazetteers: a dict of gazetteers
-    """
-    with open(path, 'wb') as f:
-        pickle.dump(gazetteers, f)
-
-def load_gazetteers_from_paper(path="/home/xstromp/dp/data/gazetteers_data/paper/Locations.Cities.Europe"):
-    """
-    Load gazetteers from the paper
-    :param path: path to the gazetteer file
-    :return: a set of gazetteer entries
-    """
-    with open(path, 'r') as f:
-        gazetteers = f.readlines()
-    gazetteers = {gazetteer.strip() for gazetteer in gazetteers}
-    return gazetteers
-
-def merge_gazetteers(*gazetteers):
-    # Initialize a new dictionary to store merged results
-    merged_gazetteers = {}
-    # Iterate over each dictionary provided
-    for gaz in gazetteers:
-        # Iterate over each key and set in the current dictionary
-        for key, value_set in gaz.items():
-            if key in merged_gazetteers:
-                # If the key already exists in the result, union the sets
-                merged_gazetteers[key] |= value_set
-            else:
-                # Otherwise, initialize the key with the set from the current dictionary
-                merged_gazetteers[key] = value_set.copy()  # Use copy to avoid mutating the original sets
-    return merged_gazetteers
-
-
-####################################################################################################
-### GENERATED LISTS ################################################################################
-####################################################################################################
-
-nationalities = [
-    "Čech", "Češka", "Češi",
-    "Slovák", "Slovenka", "Slováci",
-    "Němec", "Němka", "Němci",
-    "Polák", "Polka", "Poláci",
-    "Maďar", "Maďarka", "Maďaři",
-    "Rakušan", "Rakušanka", "Rakušané",
-    "Ukrajinec", "Ukrajinka", "Ukrajinci",
-    "Rus", "Ruska", "Rusové",
-    "Angličan", "Angličanka", "Angličané",
-    "Američan", "Američanka", "Američané",
-    "Francouz", "Francouzka", "Francouzi",
-    "Ital", "Italka", "Italové",
-    "Španěl", "Španělka", "Španělé",
-    "Portugalec", "Portugalka", "Portugalci",
-    "Řek", "Řekyně", "Řekové",
-    "Bulhar", "Bulharka", "Bulhaři",
-    "Rumun", "Rumunka", "Rumuni",
-    "Belgičan", "Belgičanka", "Belgičané",
-    "Holanďan", "Holanďanka", "Holandci",
-    "Švýcar", "Švýcarka", "Švýcaři",
-    "Slovinec", "Slovinka", "Slovinci",
-    "Chorvat", "Chorvatka", "Chorvaté",
-    "Srb", "Srbka", "Srbové",
-    "Bosňák", "Bosňačka", "Bosňáci",
-    "Černohorec", "Černohorka", "Černohorci",
-    "Makedonec", "Makedonka", "Makedonci",
-    "Albánec", "Albánka", "Albánci",
-    "Turek", "Turkyně", "Turci",
-    "Kanaďan", "Kanaďanka", "Kanaďané",
-    "Mexičan", "Mexičanka", "Mexičané",
-    "Brazilec", "Brazilka", "Brazilci",
-    "Argentinec", "Argentinka", "Argentinci",
-    "Chilan", "Chilanka", "Chilané",
-    "Australan", "Australanka", "Australané",
-    "Novozélanďan", "Novozélanďanka", "Novozélanďané",
-    "Číňan", "Číňanka", "Číňané",
-    "Japonec", "Japonka", "Japonci",
-    "Korejec", "Korejka", "Korejci",
-    "Vietnamec", "Vietnamka", "Vietnamci",
-    "Ind", "Indka", "Indové",
-    "Pákistánec", "Pákistánka", "Pákistánci",
-    "Iráčan", "Iráčanka", "Iráčané",
-    "Íránec", "Íránka", "Íránci",
-    "Syřan", "Syřanka", "Syrští",
-    "Izraelan", "Izraelanka", "Izraelci",
-    "Egyptan", "Egyptanka", "Egyptané",
-    "Súdánec", "Súdánka", "Súdánci",
-    "Maročan", "Maročanka", "Maročané",
-    "Alžířan", "Alžírka", "Alžířané",
-    "Libanonec", "Libanonka", "Libanonci",
-    "Jordánec", "Jordánka", "Jordánci",
-    "Kuvajťan", "Kuvajťanka", "Kuvajťané"
-]
-
-titles = "Bc., BcA., Ing., Ing. arch., MgA., Mgr., MBA, Ph.D., JuDr., PhDr., Th.D., MuDr., RNDr., MVDr., PharmDr., DrSc., MVDR., MDDr., CSc, DRSc., doc., RNDr., prof., PhMr., Akad. Mal., Bc. et Bc., Mgr. et Mgr.".split(", ")
-
-relig_myth = ["Bůh", "Ježíš Kristus", "Mojžíš", "Muhammad", "Buddha", "Krishna", "Thor", "Zeus",
-              "Odin", "Héraklés", "Anubis", "Osiris", "Izida", "Shiva", "Vishnu", "Ganesha",
-              "Athena", "Apolón", "Héra", "Artemis", "Dionýsos", "Quetzalcoatl", "Tezcatlipoca",
-              "Amaterasu", "Izanagi", "Izanami", "Freya", "Loki", "Baldur", "Saraswati", "Lakshmi",
-              "Hanuman", "Rama", "Sita", "Parvati", "Durga", "Kali", "Tara", "Vajrapani",
-              "Maitreya", "Avalokiteśvara"]
-
-####################################################################################################
-### WIKIANN GAZETTEERS #############################################################################
-####################################################################################################
-def determine_category(line):
-    categories = ["PER", "LOC", "ORG"]
-    for category in categories:
-        if category in line:
-            return category
-    return ""
-
-def load_document(file_name):
-    with open(file_name, 'r') as file:
-        lines = file.readlines()
-
-    categories = {"LOC": set(), "PER": set(), "ORG": set()}
-    current_text, current_category = "", ""
-
-    for line in lines:
-        category = determine_category(line)
-        if not category:
-            continue
-
-        parts = line.strip().split("\t")
-        tag, word = parts[1], parts[0].split(":")[1]
-
-        if tag.startswith("B-"):
-            if current_category:
-                categories[current_category].add(current_text.strip())
-            current_category = category
-            current_text = word
-        elif tag.startswith("I-") and current_category == category:
-            current_text += " " + word
-        else:
-            if current_category:
-                categories[current_category].add(current_text.strip())
-            current_category, current_text = "", ""
-
-    if current_category:
-        categories[current_category].add(current_text.strip())
-
-    return categories
-
-def load_gazetteers_from_wikiann(path="/home/xstromp/dp/data/wikiann/cs"):
-    gazetteers = {"LOC": set(), "PER": set(), "ORG": set()}
-    for data_split in ['train', 'extra', 'dev']:
-        additional_data = load_document(os.path.join(path, data_split))
-        for key, values in additional_data.items():
-            gazetteers[key].update(values)
-    return gazetteers
-
-####################################################################################################
-### GENERATION OF GAZETTEERS TO EXPAND TRAIN DATASET ###############################################
-####################################################################################################
-
-def get_complex_person():
-    pass
-
-####################################################################################################
-### GENERATION OF GAZETTEERS TO FIND MATCH FOR EXTENDED EMBEDDINGS #################################
-####################################################################################################
-
-def get_persons():
-    nd = NameDataset()
-    per = set()
-    # first names
-    first = nd.get_top_names(n=10000, country_alpha2='CZ')
-    per.update(first["CZ"]["M"])
-    per.update(first["CZ"]["F"])
-    # surnames
-    surnames = nd.get_top_names(n=10000, use_first_names=False, country_alpha2='CZ')
-    per.update(surnames["CZ"])
-    # titles
-    per.update(titles)
-    # nationalities
-    per.update(nationalities)
-    return per
-
-def get_locations():
-    df = pd.read_csv("/home/xstromp/dp/data/gazetteers_data/LOC/world-data-2023.csv")
-    loc = {country for country in df['Country'].tolist()}
-    loc.update(["Asie", "Afrika", "Severní Amerika", "Jižní Amerika", "Antarktida", "Evropa", "Austrálie"])
-    with open("/home/xstromp/dp/data/gazetteers_data/LOC/data.json", 'rb') as handle:
-        loaded_dict = pickle.load(handle)
-    loc.update(list(itertools.chain.from_iterable([v for _, v in loaded_dict.items()])))
-    loc.update(load_gazetteers_from_paper())
-    return loc
-
-
-def get_organizations():
-
-    df = pd.read_csv("/home/xstromp/dp/data/gazetteers_data/ORG/Inc5000Eu-full.csv")
-    org = set(df['Company'].tolist())
-    df = pd.read_csv("/home/xstromp/dp/data/gazetteers_data/ORG/FirmyBrno.csv")
-    org.update(df['name'].tolist())
-    org.update(load_gazetteers_from_paper("/home/xstromp/dp/data/gazetteers_data/paper/Organizations"))
-    return org
requirements.txt
ADDED
@@ -0,0 +1,7 @@
+jupyter
+transformers
+datasets
+torch
+simplemma
+gradio
+pandas
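
The added requirements.txt pins no versions; the environment installs with the usual pip install -r requirements.txt.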