AlzbetaStrompova committed on
Commit
2a9fe7e
1 Parent(s): 7e6964a

requirements

Browse files
data_manipulation/create_gazetteers.py DELETED
@@ -1,218 +0,0 @@
1
- import os
2
- import pickle
3
- import itertools
4
- import pandas as pd
5
-
6
- from names_dataset import NameDataset
7
-
8
-
9
def load_gazetteers(path):
    """
    Read a pickled gazetteer dictionary from disk.

    :param path: path to the gazetteer file
    :return: a dict of gazetteers
    """
    # NOTE(review): pickle.load executes arbitrary code from the file —
    # only load gazetteer files from trusted locations.
    with open(path, 'rb') as source:
        return pickle.load(source)
18
-
19
def save_gazetteers(gazetteers, path):
    """
    Serialize a gazetteer dictionary to disk with pickle.

    :param path: path to the gazetteer file
    :param gazetteers: a dict of gazetteers
    """
    with open(path, 'wb') as sink:
        pickle.dump(gazetteers, sink)
27
-
28
def load_gazetteers_from_paper(path="/home/xstromp/dp/data/gazetteers_data/paper/Locations.Cities.Europe"):
    """
    Load a plain-text gazetteer (one entry per line) published with the paper.

    :param path: path to the gazetteer file
    :return: a set of stripped entries
    """
    # encoding is given explicitly: the lists contain Czech diacritics and the
    # platform default encoding is not guaranteed to decode them correctly.
    with open(path, 'r', encoding='utf-8') as f:
        return {line.strip() for line in f}
38
-
39
def merge_gazetteers(*gazetteers):
    """
    Union any number of gazetteer dicts ({label: set_of_names}) into one.

    Input sets are copied on first use, so the caller's sets are never mutated.

    :return: a new dict whose value sets are the per-label unions
    """
    merged = {}
    for mapping in gazetteers:
        for label, names in mapping.items():
            existing = merged.get(label)
            if existing is None:
                # First time we see this label: copy, don't alias.
                merged[label] = names.copy()
            else:
                existing |= names
    return merged
53
-
54
-
55
- ####################################################################################################
56
- ### GENERATED LISTS ################################################################################
57
- ####################################################################################################
58
-
59
# Hand-written Czech word lists merged into the PER gazetteer (see get_persons).

# Czech demonyms, three surface forms per country:
# masculine singular, feminine singular, masculine plural.
nationalities = [
    "Čech", "Češka", "Češi",
    "Slovák", "Slovenka", "Slováci",
    "Němec", "Němka", "Němci",
    "Polák", "Polka", "Poláci",
    "Maďar", "Maďarka", "Maďaři",
    "Rakušan", "Rakušanka", "Rakušané",
    "Ukrajinec", "Ukrajinka", "Ukrajinci",
    "Rus", "Ruska", "Rusové",
    "Angličan", "Angličanka", "Angličané",
    "Američan", "Američanka", "Američané",
    "Francouz", "Francouzka", "Francouzi",
    "Ital", "Italka", "Italové",
    "Španěl", "Španělka", "Španělé",
    "Portugalec", "Portugalka", "Portugalci",
    "Řek", "Řekyně", "Řekové",
    "Bulhar", "Bulharka", "Bulhaři",
    "Rumun", "Rumunka", "Rumuni",
    "Belgičan", "Belgičanka", "Belgičané",
    "Holanďan", "Holanďanka", "Holandci",
    "Švýcar", "Švýcarka", "Švýcaři",
    "Slovinec", "Slovinka", "Slovinci",
    "Chorvat", "Chorvatka", "Chorvaté",
    "Srb", "Srbka", "Srbové",
    "Bosňák", "Bosňačka", "Bosňáci",
    "Černohorec", "Černohorka", "Černohorci",
    "Makedonec", "Makedonka", "Makedonci",
    "Albánec", "Albánka", "Albánci",
    "Turek", "Turkyně", "Turci",
    "Kanaďan", "Kanaďanka", "Kanaďané",
    "Mexičan", "Mexičanka", "Mexičané",
    "Brazilec", "Brazilka", "Brazilci",
    # NOTE(review): "Argentinc" looks like a typo for "Argentinec" — confirm
    # before fixing, since these strings are matched literally.
    "Argentinc", "Argentinka", "Argentinci",
    "Chilan", "Chilanka", "Chilané",
    "Australan", "Australanka", "Australané",
    "Novozélanďan", "Novozélanďanka", "Novozélanďané",
    "Číňan", "Číňanka", "Číňané",
    "Japonec", "Japonka", "Japonci",
    "Korejec", "Korejka", "Korejci",
    "Vietnamec", "Vietnamka", "Vietnamci",
    "Ind", "Indka", "Indové",
    "Pákistánec", "Pákistánka", "Pákistánci",
    "Iráčan", "Iráčanka", "Iráčané",
    "Íránec", "Íránka", "Íránci",
    "Syřan", "Syřanka", "Syrští",
    "Izraelan", "Izraelanka", "Izraelci",
    "Egyptan", "Egyptanka", "Egyptané",
    "Súdánec", "Súdánka", "Súdánci",
    "Maročan", "Maročanka", "Maročané",
    "Alžířan", "Alžírka", "Alžířané",
    "Libanonec", "Libanonka", "Libanonci",
    "Jordánec", "Jordánka", "Jordánci",
    "Kuvajťan", "Kuvajťanka", "Kuvajťané"
]

# Czech academic/professional titles; the single comma-separated literal is
# split into a list of individual title strings.
titles = "Bc., BcA., Ing., Ing. arch., MgA., Mgr., MBA, Ph.D., JuDr., PhDr., Th.D., MuDr., RNDr., MVDr., PharmDr., DrSc., MVDR., MDDr., CSc, DRSc., doc., RNDr., prof., PhMr., Akad. Mal., Bc. et Bc., Mgr. et Mgr.".split(", ")

# Religious and mythological figure names (mixed traditions), also added to
# the PER-like lists.
relig_myth = ["Bůh", "Ježíš Kristus", "Mojžíš", "Muhammad", "Buddha", "Krishna", "Thor", "Zeus",
              "Odin", "Héraklés", "Anubis", "Osiris", "Izida", "Shiva", "Vishnu", "Ganesha",
              "Athena", "Apolón", "Héra", "Artemis", "Dionýsos", "Quetzalcoatl", "Tezcatlipoca",
              "Amaterasu", "Izanagi", "Izanami", "Freya", "Loki", "Baldur", "Saraswati", "Lakshmi",
              "Hanuman", "Rama", "Sita", "Parvati", "Durga", "Kali", "Tara", "Vajrapani",
              "Maitreya", "Avalokiteśvara"]
122
-
123
- ####################################################################################################
124
- ### WIKIANN GAZETTEERS #############################################################################
125
- ####################################################################################################
126
def determine_category(line):
    """
    Return the first NER label ("PER", "LOC" or "ORG") found anywhere in
    *line*, or an empty string when none occurs.
    """
    # NOTE(review): plain substring search over the whole line, so a token
    # that happens to contain e.g. "ORG" would also match — confirm intended.
    return next((label for label in ("PER", "LOC", "ORG") if label in line), "")
132
-
133
def load_document(file_name):
    """
    Parse one WikiANN split file into LOC/PER/ORG gazetteer sets.

    Each data line is expected to look like "cs:token\t<tag>" where <tag> is
    a BIO label (B-PER, I-PER, ..., or O).  Consecutive B-/I- tokens of the
    same category are joined into one multi-word entry.

    :param file_name: path to a WikiANN data split
    :return: dict {"LOC": set, "PER": set, "ORG": set} of surface forms
    """
    # encoding given explicitly: Czech WikiANN data contains diacritics and
    # the platform default encoding is not guaranteed to decode them.
    with open(file_name, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    categories = {"LOC": set(), "PER": set(), "ORG": set()}
    current_text, current_category = "", ""

    for line in lines:
        category = determine_category(line)
        if not category:
            # NOTE(review): lines without a PER/LOC/ORG substring (e.g. "O"
            # tokens) are skipped WITHOUT flushing the open entity, so an
            # I- tag appearing after such a gap is appended to the previous
            # entity of the same category — confirm this is intended.
            continue

        parts = line.strip().split("\t")
        # parts[0] is "<lang>:<token>"; keep only the token.
        tag, word = parts[1], parts[0].split(":")[1]

        if tag.startswith("B-"):
            # A new entity starts: flush the one being accumulated, if any.
            if current_category:
                categories[current_category].add(current_text.strip())
            current_category = category
            current_text = word
        elif tag.startswith("I-") and current_category == category:
            current_text += " " + word
        else:
            # Tag breaks the current entity: flush and reset.
            if current_category:
                categories[current_category].add(current_text.strip())
            current_category, current_text = "", ""

    # Flush the entity still open at end of file.
    if current_category:
        categories[current_category].add(current_text.strip())

    return categories
164
-
165
def load_gazetteers_from_wikiann(path="/home/xstromp/dp/data/wikiann/cs"):
    """
    Build LOC/PER/ORG gazetteers from the Czech WikiANN data splits.

    :param path: directory containing the 'train', 'extra' and 'dev' files
    :return: dict {"LOC": set, "PER": set, "ORG": set}
    """
    merged = {"LOC": set(), "PER": set(), "ORG": set()}
    for split_name in ('train', 'extra', 'dev'):
        split_entities = load_document(os.path.join(path, split_name))
        for label in merged:
            merged[label].update(split_entities[label])
    return merged
172
-
173
- ####################################################################################################
174
- ### GENERATION OF GAZETTEERS TO EXPAND TRAIN DATASET ###############################################
175
- ####################################################################################################
176
-
177
def get_complex_person():
    # TODO: not implemented — presumably meant to generate multi-token person
    # entries (per the section banner, for train-dataset expansion); confirm
    # intent with the author before implementing.
    pass
179
-
180
- ####################################################################################################
181
- ### GENERATION OF GAZETTEERS TO FIND MATCH FOR EXTENDED EMBEDDINGS #################################
182
- ####################################################################################################
183
-
184
def get_persons():
    """
    Collect a PER gazetteer: top Czech first names and surnames from the
    names-dataset package, plus academic titles and nationality words.

    :return: set of person-related surface forms
    """
    name_db = NameDataset()
    persons = set()
    # Top 10k Czech first names, both genders.
    top_first = name_db.get_top_names(n=10000, country_alpha2='CZ')
    for gender in ("M", "F"):
        persons.update(top_first["CZ"][gender])
    # Top 10k Czech surnames.
    top_surnames = name_db.get_top_names(n=10000, use_first_names=False, country_alpha2='CZ')
    persons.update(top_surnames["CZ"])
    # Hand-written module-level lists.
    persons.update(titles)
    persons.update(nationalities)
    return persons
199
-
200
def get_locations():
    """
    Collect a LOC gazetteer: country names from a world-data CSV, continent
    names, a pickled location dictionary and the paper's European city list.

    :return: set of location surface forms
    """
    df = pd.read_csv("/home/xstromp/dp/data/gazetteers_data/LOC/world-data-2023.csv")
    loc = set(df['Country'].tolist())
    # Continent names (Czech).
    loc.update(["Asie", "Afrika", "Severní Amerika", "Jižní Amerika", "Antarktida", "Evropa", "Austrálie"])
    # NOTE(review): despite the .json extension this file is read with
    # pickle.load — confirm the file really is a pickle, and that it comes
    # from a trusted source (pickle can execute arbitrary code).
    with open("/home/xstromp/dp/data/gazetteers_data/LOC/data.json", 'rb') as handle:
        loaded_dict = pickle.load(handle)
    # Flatten the dict's value collections directly; no intermediate lists.
    loc.update(itertools.chain.from_iterable(loaded_dict.values()))
    loc.update(load_gazetteers_from_paper())
    return loc
209
-
210
-
211
def get_organizations():
    """
    Collect an ORG gazetteer from two company CSVs (Inc5000 Europe, Brno
    firms) and the paper's organization list.

    :return: set of organization names
    """
    inc5000 = pd.read_csv("/home/xstromp/dp/data/gazetteers_data/ORG/Inc5000Eu-full.csv")
    org = set(inc5000['Company'].tolist())
    brno_firms = pd.read_csv("/home/xstromp/dp/data/gazetteers_data/ORG/FirmyBrno.csv")
    org.update(brno_firms['name'].tolist())
    org.update(load_gazetteers_from_paper("/home/xstromp/dp/data/gazetteers_data/paper/Organizations"))
    return org
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ jupyter
2
+ transformers
3
+ datasets
4
+ torch
5
+ simplemma
6
+ gradio
7
+ pandas