"""Filter a tab-separated caption/url file by caption quality.

Usage: python <this_script> <input.tsv> <output.tsv>

The input must contain ``caption`` and ``url`` columns. Rows whose caption is
rejected by :func:`drop_no` are removed, an ``index_row`` column holding each
surviving row's index in the input frame is added, and the result is written
to the output path as a tab-separated file.
"""

import logging
import os
import sys
from datetime import datetime
from multiprocessing import Pool
from urllib.request import urlopen

import contexttimer
import numpy as np
import pandas as pd
import requests
import torch
from nltk.tag import CRFTagger
from PIL import Image
from torchvision.transforms import functional as TF
from tqdm import tqdm

# Captions longer than this many characters are dropped.
MAX_CAPTION_CHARS = 96
# Captions whose share of "NNP"-tagged tokens reaches this ratio are dropped.
NNP_RATIO_THRESHOLD = 0.8

# The tagger is loaded once at start-up; the model file is resolved relative
# to the current working directory.
ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

# NOTE(review): `headers` is not referenced in the visible part of this
# script -- presumably kept for the image-download step; confirm before removing.
headers = {
    "User-Agent": "Googlebot-Image/1.0",  # Pretend to be googlebot
    "X-Forwarded-For": "64.18.15.200",
}

# Setup
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)
requests.packages.urllib3.disable_warnings(
    requests.packages.urllib3.exceptions.InsecureRequestWarning
)

# Both positional arguments are read below, so fail with a usage message
# instead of an IndexError when they are missing.
if len(sys.argv) < 3:
    sys.exit(
        "Provide input .tsv file name & output .tsv file name. "
        "e.g. python " + os.path.basename(sys.argv[0]) + " input.tsv output.tsv"
    )


def drop_no(text: object) -> bool:
    """Return True when a caption should be dropped.

    A caption is dropped when any of the following holds:

    * it is not a string (pandas represents missing captions as NaN);
    * it is empty or contains only whitespace;
    * it is longer than ``MAX_CAPTION_CHARS`` characters;
    * the tagger fails on it (the failure is logged, not raised);
    * at least ``NNP_RATIO_THRESHOLD`` of its whitespace-separated tokens are
      tagged ``"NNP"`` by the module-level CRF tagger ``ct``.

    Args:
        text: One cell of the ``caption`` column.

    Returns:
        True to drop the caption, False to keep it.
    """
    # Missing values reach us as float NaN; treat anything non-string as unusable.
    if not isinstance(text, str):
        return True
    if not text or len(text) > MAX_CAPTION_CHARS:
        return True

    tokens = text.split()
    # Whitespace-only captions yield no tokens; drop them like empty captions
    # rather than dividing by zero further down.
    if not tokens:
        return True

    # Only the tagger call is guarded: it is the one step whose failures are
    # outside this script's control, and one bad caption must not abort the run.
    try:
        tagged = ct.tag_sents([tokens])[0]
    except Exception:
        logging.exception("Tagging failed; dropping caption %r", text)
        return True

    if not tagged:
        return True

    # Each tagged item is a (token, tag) pair.
    nnp_count = sum(1 for _, tag in tagged if tag == "NNP")
    return nnp_count / len(tagged) >= NNP_RATIO_THRESHOLD


# Load data
print(f'Starting to load at {datetime.now().isoformat(timespec="minutes")}')
with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(sys.argv[1], delimiter='\t')

df = df[["caption", "url"]]

# astype(bool) keeps the mask boolean even when the frame is empty (apply on
# an empty column yields an object-dtype Series).
drop_mask = df["caption"].apply(drop_no).astype(bool)
# copy() so that adding a column below does not write into a filtered view.
df = df[~drop_mask].copy()

# The index is deliberately not reset: index_row records each surviving row's
# position in the unfiltered input frame.
df["index_row"] = df.index
df.to_csv(sys.argv[2], sep='\t', index=False)