AlzbetaStrompova committed on
Commit
2a9fe7e
1 Parent(s): 7e6964a

requirements

Browse files
data_manipulation/create_gazetteers.py DELETED
@@ -1,218 +0,0 @@
1
- import os
2
- import pickle
3
- import itertools
4
- import pandas as pd
5
-
6
- from names_dataset import NameDataset
7
-
8
-
9
def load_gazetteers(path):
    """
    Read a pickled gazetteer dictionary from disk.

    :param path: path to the gazetteer file
    :return: a dict of gazetteers
    """
    # NOTE(review): pickle.load executes arbitrary code from the file —
    # only load gazetteer files from trusted locations.
    with open(path, 'rb') as source:
        return pickle.load(source)
18
-
19
def save_gazetteers(gazetteers, path):
    """
    Serialize a gazetteer dictionary to disk with pickle.

    :param path: path to the gazetteer file
    :param gazetteers: a dict of gazetteers
    """
    with open(path, 'wb') as sink:
        pickle.dump(gazetteers, sink)
27
-
28
def load_gazetteers_from_paper(path="/home/xstromp/dp/data/gazetteers_data/paper/Locations.Cities.Europe"):
    """
    Load a plain-text gazetteer (one entry per line) published with the paper.

    :param path: path to the gazetteer file
    :return: a set of stripped entries
    """
    # encoding is given explicitly: the lists contain Czech diacritics and the
    # platform default encoding is not guaranteed to decode them correctly.
    with open(path, 'r', encoding='utf-8') as f:
        return {line.strip() for line in f}
38
-
39
def merge_gazetteers(*gazetteers):
    """
    Union any number of gazetteer dicts ({label: set_of_names}) into one.

    Input sets are copied on first use, so the caller's sets are never mutated.

    :return: a new dict whose value sets are the per-label unions
    """
    merged = {}
    for mapping in gazetteers:
        for label, names in mapping.items():
            existing = merged.get(label)
            if existing is None:
                # First time we see this label: copy, don't alias.
                merged[label] = names.copy()
            else:
                existing |= names
    return merged
53
-
54
-
55
- ####################################################################################################
56
- ### GENERATED LISTS ################################################################################
57
- ####################################################################################################
58
-
59
# Hand-written Czech word lists merged into the PER gazetteer (see get_persons).

# Czech demonyms, three surface forms per country:
# masculine singular, feminine singular, masculine plural.
nationalities = [
    "Čech", "Češka", "Češi",
    "Slovák", "Slovenka", "Slováci",
    "Němec", "Němka", "Němci",
    "Polák", "Polka", "Poláci",
    "Maďar", "Maďarka", "Maďaři",
    "Rakušan", "Rakušanka", "Rakušané",
    "Ukrajinec", "Ukrajinka", "Ukrajinci",
    "Rus", "Ruska", "Rusové",
    "Angličan", "Angličanka", "Angličané",
    "Američan", "Američanka", "Američané",
    "Francouz", "Francouzka", "Francouzi",
    "Ital", "Italka", "Italové",
    "Španěl", "Španělka", "Španělé",
    "Portugalec", "Portugalka", "Portugalci",
    "Řek", "Řekyně", "Řekové",
    "Bulhar", "Bulharka", "Bulhaři",
    "Rumun", "Rumunka", "Rumuni",
    "Belgičan", "Belgičanka", "Belgičané",
    "Holanďan", "Holanďanka", "Holandci",
    "Švýcar", "Švýcarka", "Švýcaři",
    "Slovinec", "Slovinka", "Slovinci",
    "Chorvat", "Chorvatka", "Chorvaté",
    "Srb", "Srbka", "Srbové",
    "Bosňák", "Bosňačka", "Bosňáci",
    "Černohorec", "Černohorka", "Černohorci",
    "Makedonec", "Makedonka", "Makedonci",
    "Albánec", "Albánka", "Albánci",
    "Turek", "Turkyně", "Turci",
    "Kanaďan", "Kanaďanka", "Kanaďané",
    "Mexičan", "Mexičanka", "Mexičané",
    "Brazilec", "Brazilka", "Brazilci",
    # NOTE(review): "Argentinc" looks like a typo for "Argentinec" — confirm
    # before fixing, since these strings are matched literally.
    "Argentinc", "Argentinka", "Argentinci",
    "Chilan", "Chilanka", "Chilané",
    "Australan", "Australanka", "Australané",
    "Novozélanďan", "Novozélanďanka", "Novozélanďané",
    "Číňan", "Číňanka", "Číňané",
    "Japonec", "Japonka", "Japonci",
    "Korejec", "Korejka", "Korejci",
    "Vietnamec", "Vietnamka", "Vietnamci",
    "Ind", "Indka", "Indové",
    "Pákistánec", "Pákistánka", "Pákistánci",
    "Iráčan", "Iráčanka", "Iráčané",
    "Íránec", "Íránka", "Íránci",
    "Syřan", "Syřanka", "Syrští",
    "Izraelan", "Izraelanka", "Izraelci",
    "Egyptan", "Egyptanka", "Egyptané",
    "Súdánec", "Súdánka", "Súdánci",
    "Maročan", "Maročanka", "Maročané",
    "Alžířan", "Alžírka", "Alžířané",
    "Libanonec", "Libanonka", "Libanonci",
    "Jordánec", "Jordánka", "Jordánci",
    "Kuvajťan", "Kuvajťanka", "Kuvajťané"
]

# Czech academic/professional titles; the single comma-separated literal is
# split into a list of individual title strings.
titles = "Bc., BcA., Ing., Ing. arch., MgA., Mgr., MBA, Ph.D., JuDr., PhDr., Th.D., MuDr., RNDr., MVDr., PharmDr., DrSc., MVDR., MDDr., CSc, DRSc., doc., RNDr., prof., PhMr., Akad. Mal., Bc. et Bc., Mgr. et Mgr.".split(", ")

# Religious and mythological figure names (mixed traditions), also added to
# the PER-like lists.
relig_myth = ["Bůh", "Ježíš Kristus", "Mojžíš", "Muhammad", "Buddha", "Krishna", "Thor", "Zeus",
              "Odin", "Héraklés", "Anubis", "Osiris", "Izida", "Shiva", "Vishnu", "Ganesha",
              "Athena", "Apolón", "Héra", "Artemis", "Dionýsos", "Quetzalcoatl", "Tezcatlipoca",
              "Amaterasu", "Izanagi", "Izanami", "Freya", "Loki", "Baldur", "Saraswati", "Lakshmi",
              "Hanuman", "Rama", "Sita", "Parvati", "Durga", "Kali", "Tara", "Vajrapani",
              "Maitreya", "Avalokiteśvara"]
122
-
123
- ####################################################################################################
124
- ### WIKIANN GAZETTEERS #############################################################################
125
- ####################################################################################################
126
def determine_category(line):
    """
    Return the first NER label ("PER", "LOC" or "ORG") found anywhere in
    *line*, or an empty string when none occurs.
    """
    # NOTE(review): plain substring search over the whole line, so a token
    # that happens to contain e.g. "ORG" would also match — confirm intended.
    return next((label for label in ("PER", "LOC", "ORG") if label in line), "")
132
-
133
def load_document(file_name):
    """
    Parse one WikiANN split file into LOC/PER/ORG gazetteer sets.

    Each data line is expected to look like "cs:token\t<tag>" where <tag> is
    a BIO label (B-PER, I-PER, ..., or O).  Consecutive B-/I- tokens of the
    same category are joined into one multi-word entry.

    :param file_name: path to a WikiANN data split
    :return: dict {"LOC": set, "PER": set, "ORG": set} of surface forms
    """
    # encoding given explicitly: Czech WikiANN data contains diacritics and
    # the platform default encoding is not guaranteed to decode them.
    with open(file_name, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    categories = {"LOC": set(), "PER": set(), "ORG": set()}
    current_text, current_category = "", ""

    for line in lines:
        category = determine_category(line)
        if not category:
            # NOTE(review): lines without a PER/LOC/ORG substring (e.g. "O"
            # tokens) are skipped WITHOUT flushing the open entity, so an
            # I- tag appearing after such a gap is appended to the previous
            # entity of the same category — confirm this is intended.
            continue

        parts = line.strip().split("\t")
        # parts[0] is "<lang>:<token>"; keep only the token.
        tag, word = parts[1], parts[0].split(":")[1]

        if tag.startswith("B-"):
            # A new entity starts: flush the one being accumulated, if any.
            if current_category:
                categories[current_category].add(current_text.strip())
            current_category = category
            current_text = word
        elif tag.startswith("I-") and current_category == category:
            current_text += " " + word
        else:
            # Tag breaks the current entity: flush and reset.
            if current_category:
                categories[current_category].add(current_text.strip())
            current_category, current_text = "", ""

    # Flush the entity still open at end of file.
    if current_category:
        categories[current_category].add(current_text.strip())

    return categories
164
-
165
def load_gazetteers_from_wikiann(path="/home/xstromp/dp/data/wikiann/cs"):
    """
    Build LOC/PER/ORG gazetteers from the Czech WikiANN data splits.

    :param path: directory containing the 'train', 'extra' and 'dev' files
    :return: dict {"LOC": set, "PER": set, "ORG": set}
    """
    merged = {"LOC": set(), "PER": set(), "ORG": set()}
    for split_name in ('train', 'extra', 'dev'):
        split_entities = load_document(os.path.join(path, split_name))
        for label in merged:
            merged[label].update(split_entities[label])
    return merged
172
-
173
- ####################################################################################################
174
- ### GENERATION OF GAZETTEERS TO EXPAND TRAIN DATASET ###############################################
175
- ####################################################################################################
176
-
177
def get_complex_person():
    # TODO: not implemented — presumably meant to generate multi-token person
    # entries (per the section banner, for train-dataset expansion); confirm
    # intent with the author before implementing.
    pass
179
-
180
- ####################################################################################################
181
- ### GENERATION OF GAZETTEERS TO FIND MATCH FOR EXTENDED EMBEDDINGS #################################
182
- ####################################################################################################
183
-
184
def get_persons():
    """
    Collect a PER gazetteer: top Czech first names and surnames from the
    names-dataset package, plus academic titles and nationality words.

    :return: set of person-related surface forms
    """
    name_db = NameDataset()
    persons = set()
    # Top 10k Czech first names, both genders.
    top_first = name_db.get_top_names(n=10000, country_alpha2='CZ')
    for gender in ("M", "F"):
        persons.update(top_first["CZ"][gender])
    # Top 10k Czech surnames.
    top_surnames = name_db.get_top_names(n=10000, use_first_names=False, country_alpha2='CZ')
    persons.update(top_surnames["CZ"])
    # Hand-written module-level lists.
    persons.update(titles)
    persons.update(nationalities)
    return persons
199
-
200
def get_locations():
    """
    Collect a LOC gazetteer: country names from a world-data CSV, continent
    names, a pickled location dictionary and the paper's European city list.

    :return: set of location surface forms
    """
    df = pd.read_csv("/home/xstromp/dp/data/gazetteers_data/LOC/world-data-2023.csv")
    loc = set(df['Country'].tolist())
    # Continent names (Czech).
    loc.update(["Asie", "Afrika", "Severní Amerika", "Jižní Amerika", "Antarktida", "Evropa", "Austrálie"])
    # NOTE(review): despite the .json extension this file is read with
    # pickle.load — confirm the file really is a pickle, and that it comes
    # from a trusted source (pickle can execute arbitrary code).
    with open("/home/xstromp/dp/data/gazetteers_data/LOC/data.json", 'rb') as handle:
        loaded_dict = pickle.load(handle)
    # Flatten the dict's value collections directly; no intermediate lists.
    loc.update(itertools.chain.from_iterable(loaded_dict.values()))
    loc.update(load_gazetteers_from_paper())
    return loc
209
-
210
-
211
def get_organizations():
    """
    Collect an ORG gazetteer from two company CSVs (Inc5000 Europe, Brno
    firms) and the paper's organization list.

    :return: set of organization names
    """
    inc5000 = pd.read_csv("/home/xstromp/dp/data/gazetteers_data/ORG/Inc5000Eu-full.csv")
    org = set(inc5000['Company'].tolist())
    brno_firms = pd.read_csv("/home/xstromp/dp/data/gazetteers_data/ORG/FirmyBrno.csv")
    org.update(brno_firms['name'].tolist())
    org.update(load_gazetteers_from_paper("/home/xstromp/dp/data/gazetteers_data/paper/Organizations"))
    return org
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ jupyter
2
+ transformers
3
+ datasets
4
+ torch
5
+ simplemma
6
+ gradio
7
+ pandas