Gosse Minnema
Add sociofillmore code, load dataset via private dataset repo
b11ac48
raw
history blame
No virus
1.1 kB
import os
import glob
import tarfile
def split_lome_files(lome_folder, output_folder):
    """Bundle LOME ``*.comm.*`` files into tar archives keyed by document id.

    Every file matching ``{lome_folder}/**/*.comm.*`` is assigned a key made
    of the first two characters of its document id, where the document id is
    the second ``_``-separated field of the part of the file name before the
    first ``.``. Each file is appended to
    ``{output_folder}/block_{key}.tar`` under the path returned by the glob.

    Archives are opened in append mode, so running this twice on the same
    output folder adds the files a second time instead of overwriting.

    Args:
        lome_folder: Directory whose sub-directories hold the ``*.comm.*``
            files.
        output_folder: Directory receiving the ``block_*.tar`` archives. It
            is created (with parents) if it does not exist yet.

    Raises:
        IndexError: If a matching file name has no ``_`` before its first
            ``.``, so no document id can be extracted.
    """
    # Group the paths first so that every archive is opened only once.
    # Re-opening a tar file in append mode for each member forces tarfile to
    # scan all existing members again, which gets slow for large blocks.
    files_by_key = {}
    # NOTE: glob is called without recursive=True, so "**" behaves like "*"
    # and only files one directory level below lome_folder are matched.
    for file in glob.glob(f"{lome_folder}/**/*.comm.*"):
        doc_id = os.path.basename(file).split(".")[0].split("_")[1]
        doc_key = doc_id[:2]
        print(file, "->", doc_key)
        files_by_key.setdefault(doc_key, []).append(file)

    # Only touch the file system when there is something to write.
    if files_by_key and output_folder:
        os.makedirs(output_folder, exist_ok=True)

    for doc_key, files in files_by_key.items():
        with tarfile.open(f"{output_folder}/block_{doc_key}.tar", "a") as tar_f:
            for file in files:
                tar_f.add(file)
if __name__ == "__main__":
    # Script entry point: split one LOME output folder into tar blocks.
    #
    # Other (input folder -> output folder) pairs this script has been used
    # with; substitute them below to re-run on a different corpus:
    #   "output/migration/lome/multilabel/lome_0shot/pavia/"
    #       -> "output/migration/lome/lome_0shot/multilabel_pavia_blocks"
    #   "output/femicides/lome/lome_0shot/multilabel/rai/"
    #       -> "output/femicides/lome/lome_0shot/multilabel_rai_blocks"
    #   "output/femicides/lome/lome_0shot/multilabel/olv/"
    #       -> "output/femicides/lome/lome_0shot/multilabel_olv_blocks"
    #   "output/crashes/lome/lome_0shot/multilabel/thecrashes/"
    #       -> "output/crashes/lome/lome_0shot/multilabel_thecrashes_blocks"
    lome_dir = "output/femicides/lome/lome_0shot/multilabel/rai_ALL/"
    blocks_dir = "output/femicides/lome/lome_0shot/multilabel_rai_ALL_blocks"
    split_lome_files(lome_dir, blocks_dir)