import re

from simplemma import lemmatize

# An opening bracket, the shortest possible run of characters, and the next
# closing bracket.  The opening and closing kinds are not required to match,
# and "." does not cross newlines.
_BRACKETED_SPAN = re.compile(r'[\(\{\[].*?[\)\}\]]')

# Entries/tokens shorter than this are dropped by the filtering steps below
# (the original checks were all written as ``len(x) > 2``).
_MIN_ENTRY_LENGTH = 3


def flatten(xss):
    """Return one flat list with the items of every iterable in *xss*."""
    return [x for xs in xss for x in xs]


def remove_all_brackets(text):
    """Return *text* with every bracketed span removed.

    A span starts at ``(``, ``{`` or ``[`` and ends at the first following
    ``)``, ``}`` or ``]``; the brackets themselves are removed too.  Nested
    brackets are not balanced: matching stops at the first closing bracket.
    Surrounding whitespace is left untouched.
    """
    return _BRACKETED_SPAN.sub('', text)


def lemmatizing(x):
    """Return the simplemma lemma of *x* for ``lang="cs"`` (Czech).

    The empty string is returned unchanged without calling the lemmatizer.
    """
    if x == "":
        return ""
    return lemmatize(x, lang="cs")


def build_reverse_dictionary(dictionary, apply_lemmatizing=False):
    """Invert a ``key -> iterable of values`` mapping into ``value -> key``.

    Args:
        dictionary: Mapping whose values are iterables of hashable items.
        apply_lemmatizing: When true, the lemma of each value (if it differs
            from the value itself) is also mapped to the same key.

    Returns:
        A new dict mapping each value (and optionally its lemma) to its key.
        When the same value or lemma is produced more than once, the entry
        written last wins.
    """
    reverse_dictionary = {}
    for key, values in dictionary.items():
        for value in values:
            reverse_dictionary[value] = key
            if not apply_lemmatizing:
                continue
            lemma = lemmatizing(value)
            if lemma != value:
                reverse_dictionary[lemma] = key
    return reverse_dictionary


def split_gazetteers_for_single_token_match(gazetteers):
    """Break every gazetteer entry into its space-separated tokens.

    Args:
        gazetteers: Mapping of label -> iterable of entry strings.

    Returns:
        A new dict mapping each label to the set of tokens (split on a single
        space) that are at least ``_MIN_ENTRY_LENGTH`` characters long.
    """
    return {
        label: {
            token
            for entry in entries
            for token in entry.split(" ")
            if len(token) >= _MIN_ENTRY_LENGTH
        }
        for label, entries in gazetteers.items()
    }


def preprocess_gazetteers(gazetteers, config):
    """Apply the preprocessing steps enabled in *config* to *gazetteers*.

    The steps run in this fixed order, each only when its flag is truthy:

    1. ``split_person``: add the individual tokens of every ``"PER"`` entry
       to the ``"PER"`` set.
    2. ``lemmatize``: keep entries of sufficient length and add their lemmas.
    3. ``remove_brackets``: strip bracketed spans and surrounding whitespace,
       dropping entries that become too short.
    4. ``remove_numeric``: drop entries for which ``str.isnumeric()`` is true.

    Finally, unless ``config["techniq_for_matching"] == "single"``, every
    entry is split into single tokens.

    Args:
        gazetteers: Mapping of label -> set of entry strings.  It is modified
            in place by steps 1-4 (step 1 requires a ``"PER"`` key whose value
            supports ``update``).
        config: Mapping that must contain the keys ``split_person``,
            ``lemmatize``, ``remove_brackets``, ``remove_numeric`` and
            ``techniq_for_matching``.

    Returns:
        The same (mutated) dict when the technique is ``"single"``, otherwise
        a new dict produced by ``split_gazetteers_for_single_token_match``.
    """
    if config["split_person"]:
        # The token set is fully built before ``update`` runs, so the "PER"
        # set is never modified while it is being iterated.
        gazetteers["PER"].update(
            {
                token
                for name in gazetteers["PER"]
                for token in name.split(" ")
                if len(token) >= _MIN_ENTRY_LENGTH
            }
        )

    if config["lemmatize"]:
        # Only values of existing keys are rebound, which is safe while
        # iterating over ``items()``.
        for label, entries in gazetteers.items():
            expanded = set()
            for entry in entries:
                if len(entry) >= _MIN_ENTRY_LENGTH:
                    expanded.add(entry)
                    expanded.add(lemmatizing(entry))
            gazetteers[label] = expanded

    if config["remove_brackets"]:
        for label, entries in gazetteers.items():
            # Clean each entry once, then filter on the cleaned length.
            cleaned = (remove_all_brackets(entry).strip() for entry in entries)
            gazetteers[label] = {
                entry for entry in cleaned if len(entry) >= _MIN_ENTRY_LENGTH
            }

    if config["remove_numeric"]:
        for label, entries in gazetteers.items():
            gazetteers[label] = {
                entry for entry in entries if not entry.isnumeric()
            }

    # NOTE(review): the single-token split runs when the technique is NOT
    # "single", which reads as inverted given the helper's name -- behaviour
    # kept as-is; confirm the intended meaning of "single" against callers.
    if config["techniq_for_matching"] != "single":
        gazetteers = split_gazetteers_for_single_token_match(gazetteers)
    return gazetteers