Galuh Sahid commited on
Commit
ba7a003
1 Parent(s): bb13925

Add download_logs and scripts

Browse files
.gitattributes CHANGED
@@ -16,3 +16,4 @@
16
  *.pth filter=lfs diff=lfs merge=lfs -text
17
  *tfevents* filter=lfs diff=lfs merge=lfs -text
18
  flax_model.msgpack filter=lfs diff=lfs merge=lfs -text
 
 
16
  *.pth filter=lfs diff=lfs merge=lfs -text
17
  *tfevents* filter=lfs diff=lfs merge=lfs -text
18
  flax_model.msgpack filter=lfs diff=lfs merge=lfs -text
19
+ *.log filter=lfs diff=lfs merge=lfs -text
data/download_logs/cc12m_2m_download.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e00288c04e8d226862a37c1ca4d4953a40770d30f7f7f696aafcba2ed57212d
3
+ size 145262942
data/download_logs/cc12m_download.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c25de5a6a1bffc69309fdf30d53a27978bb0de8b49a80429b14f880c6470495b
3
+ size 262929928
data/download_logs/cc3m_download.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f5aeaaf47c1370a5da33fc6a97303a1ef5d020670e06fbc9e8474b41a5eb3ba
3
+ size 126513213
data/download_logs/wit_download.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:361c1a8fd3704ec101be204690949ff20b02eac54c39c9fb4ed934d0497ff6ba
3
+ size 233568
data/scripts/.DS_Store ADDED
Binary file (8.2 kB). View file
 
data/scripts/cc12m.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build a JSON-lines caption file for the CC12M dataset.

Reads a CC12M .tsv of (caption, url) rows, matches each row against a
previously-downloaded image file on disk, and writes one JSON object per
line: {"image_path": ..., "captions": [...]}. Missing images are logged
to download.log instead of aborting the run.
"""
import pandas as pd
import os.path
import sys
import json
import logging
import contexttimer

# Setup: route INFO-and-above records (including missing-image paths) to download.log.
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)

if len(sys.argv) != 4:
    # Fixed usage text: the old message was copy-pasted from coco.py and
    # described .json inputs; this script consumes a .tsv.
    print("Provide .tsv file name, images dir, output file name. e.g. python cc12m.py cc12m.tsv /mnt/disks/data-1/cc12m cc12m_dataset.json")
    exit(1)

annotation_file = sys.argv[1]
images_dir = sys.argv[2]
output_file = sys.argv[3]

logging.info("Processing cc12m dataset")

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(annotation_file, delimiter='\t')

lines = []

df = df[["caption", "url"]]

print(f"Loaded {len(df)} images.")

for index, caption_reference_description, image_url in df.itertuples():
    # Downloaded files are named "<1-based row number, zero-padded>---<url stem>.jpg".
    index += 1
    base_url = os.path.basename(image_url)  # extract base url
    stem, ext = os.path.splitext(base_url)  # split into stem and extension
    filename = f'{index:08d}---{stem}.jpg'

    full_image_path = images_dir + "/" + filename

    if os.path.isfile(full_image_path):
        lines.append(json.dumps({"image_path": full_image_path, "captions": [caption_reference_description]}))
    else:
        # Image was never downloaded (or failed); record the path and continue.
        logging.error(full_image_path)

with open(output_file, "w") as f:
    f.write("\n".join(lines))

logging.info(f"Processing cc12m dataset done. {len(lines)} images processed.")
data/scripts/cc12m_disk1.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build train/val JSON-lines caption files for the CC12M dataset (disk-1 shard).

Matches each (caption, url) row of the input .tsv against downloaded image
files and writes JSON-lines records, holding out the trailing 150,001
records as the validation split.
"""
import pandas as pd
import os.path
import sys
import json
import logging
import contexttimer
import numpy as np

# Setup
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)

if len(sys.argv) != 4:
    print("Provide .tsv file name, images dir, output file name. e.g. python coco.py coco_captions_train2017.json /mnt/disks/data-1/flickr8k/coco_train.json coco_dataset_train.json")
    exit(1)

annotation_file, images_dir, output_file = sys.argv[1], sys.argv[2], sys.argv[3]

logging.info("Processing cc12m dataset")

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(annotation_file, delimiter='\t')

lines = []

# Keep only the needed columns, then drop rows with empty/missing values.
df = df[["caption", "url"]].replace('', np.nan).dropna()

print(f"Loaded {len(df)} images.")

for row_index, caption_text, url in df.itertuples():
    # Downloaded files are named "<1-based row number, zero-padded>---<url stem>.jpg".
    row_index += 1
    stem, _ext = os.path.splitext(os.path.basename(url))
    full_image_path = f"{images_dir}/{row_index:08d}---{stem}.jpg"

    if not os.path.isfile(full_image_path):
        # Image missing on disk; log it and move on.
        logging.error(full_image_path)
        continue
    lines.append(json.dumps({"image_path": full_image_path, "captions": [caption_text]}))

# Trailing 150,001 records become the validation split.
train_lines, valid_lines = lines[:-150_001], lines[-150_001:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing cc12m dataset done. {len(lines)} images processed.")
data/scripts/cc12m_disk2.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build train/val JSON-lines caption files for the CC12M dataset (disk-2 shard).

Matches each (caption, url) row of the input .tsv against downloaded image
files and writes JSON-lines records, holding out the trailing 500,001
records as the validation split.
"""
import pandas as pd
import os.path
import sys
import json
import logging
import contexttimer
import numpy as np

# Setup
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)

if len(sys.argv) != 4:
    print("Provide .tsv file name, images dir, output file name. e.g. python coco.py coco_captions_train2017.json /mnt/disks/data-1/flickr8k/coco_train.json coco_dataset_train.json")
    exit(1)

annotation_file, images_dir, output_file = sys.argv[1], sys.argv[2], sys.argv[3]

logging.info("Processing cc12m dataset")

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(annotation_file, delimiter='\t')

lines = []

# Keep only the needed columns, then drop rows with empty/missing values.
df = df[["caption", "url"]].replace('', np.nan).dropna()

print(f"Loaded {len(df)} images.")

for row_index, caption_text, url in df.itertuples():
    # Downloaded files are named "<1-based row number, zero-padded>---<url stem>.jpg".
    row_index += 1
    stem, _ext = os.path.splitext(os.path.basename(url))
    full_image_path = f"{images_dir}/{row_index:08d}---{stem}.jpg"

    if not os.path.isfile(full_image_path):
        # Image missing on disk; log it and move on.
        logging.error(full_image_path)
        continue
    lines.append(json.dumps({"image_path": full_image_path, "captions": [caption_text]}))

# Trailing 500,001 records become the validation split.
train_lines, valid_lines = lines[:-500_001], lines[-500_001:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing cc12m dataset done. {len(lines)} images processed.")
data/scripts/cc3m.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build train/val JSON-lines caption files for the CC3M dataset.

Matches each (caption, url) row of the input .tsv against downloaded image
files and writes JSON-lines records, holding out the trailing 300,001
records as the validation split.
"""
import pandas as pd
import os.path
import sys
import json
import logging
import contexttimer
import numpy as np

# Setup
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)

if len(sys.argv) != 4:
    print("Provide .tsv file name, images dir, output file name. e.g. python coco.py coco_captions_train2017.json /mnt/disks/data-1/flickr8k/coco_train.json coco_dataset_train.json")
    exit(1)

annotation_file, images_dir, output_file = sys.argv[1], sys.argv[2], sys.argv[3]

logging.info("Processing cc3m dataset")

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(annotation_file, delimiter='\t')

lines = []

# Keep only the needed columns, then drop rows with empty/missing values.
df = df[["caption", "url"]].replace('', np.nan).dropna()

print(f"Loaded {len(df)} images.")

for row_index, caption_text, url in df.itertuples():
    # Downloaded files are named "<1-based row number, zero-padded>---<url stem>.jpg".
    row_index += 1
    stem, _ext = os.path.splitext(os.path.basename(url))
    full_image_path = f"{images_dir}/{row_index:08d}---{stem}.jpg"

    if not os.path.isfile(full_image_path):
        # Image missing on disk; log it and move on.
        logging.error(full_image_path)
        continue
    lines.append(json.dumps({"image_path": full_image_path, "captions": [caption_text]}))

# Trailing 300,001 records become the validation split.
train_lines, valid_lines = lines[:-300_001], lines[-300_001:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing cc3m dataset done. {len(lines)} images processed.")
data/scripts/cc3m_modified.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build train/val JSON-lines caption files for a filtered CC3M .tsv.

Unlike cc3m.py, the input carries an explicit "index_row" column (the row
number in the ORIGINAL tsv before filtering), which is what the downloader
used when naming image files — so filenames are derived from it rather
than from the DataFrame index.
"""
import pandas as pd
import os.path
import sys
import json
import logging
import contexttimer
import numpy as np

# Setup
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)

if len(sys.argv) != 4:
    print("Provide .tsv file name, images dir, output file name. e.g. python coco.py coco_captions_train2017.json /mnt/disks/data-1/flickr8k/coco_train.json coco_dataset_train.json")
    exit(1)

annotation_file, images_dir, output_file = sys.argv[1], sys.argv[2], sys.argv[3]

logging.info("Processing cc3m dataset")

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(annotation_file, delimiter='\t')

lines = []

# Keep only the needed columns, then drop rows with empty/missing values.
df = df[["caption", "url", "index_row"]].replace('', np.nan).dropna()

print(f"Loaded {len(df)} images.")

for _df_index, caption_text, url, index_row in df.itertuples():
    # Downloaded files are named "<1-based original row number, zero-padded>---<url stem>.jpg".
    index_row += 1
    stem, _ext = os.path.splitext(os.path.basename(url))
    full_image_path = f"{images_dir}/{index_row:08d}---{stem}.jpg"

    if not os.path.isfile(full_image_path):
        # Image missing on disk; log it and move on.
        logging.error(full_image_path)
        continue
    lines.append(json.dumps({"image_path": full_image_path, "captions": [caption_text]}))

# Trailing 300,001 records become the validation split.
train_lines, valid_lines = lines[:-300_001], lines[-300_001:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing cc3m dataset done. {len(lines)} images processed.")
data/scripts/cc_propn.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Filter a CC caption .tsv with an Indonesian CRF POS tagger.

Drops captions that are empty, longer than 96 characters, at least 80%
proper nouns (NNP), or that fail tagging; writes the surviving rows
(plus an "index_row" column preserving the original row numbers) back
out as a .tsv.
"""
import sys
import os
from datetime import datetime
import pandas as pd
import contexttimer
from urllib.request import urlopen
import requests
from PIL import Image
import torch
from torchvision.transforms import functional as TF
from multiprocessing import Pool
from tqdm import tqdm
import logging
import sys
import numpy as np


from nltk.tag import CRFTagger

ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

headers = {
    "User-Agent": "Googlebot-Image/1.0",  # Pretend to be googlebot
    "X-Forwarded-For": "64.18.15.200",
}

# Setup
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

'''if len(sys.argv) != 3:
    print("Provide .tsv file name & output directory. e.g. python downloader.py Train-GCC-training.tsv training")
    exit(1)'''

# Load data
print(f'Starting to load at {datetime.now().isoformat(timespec="minutes")}')

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(sys.argv[1], delimiter='\t')
    df = df[["caption", "url"]]


def drop_no(text):
    """Return True when *text* should be dropped: empty, longer than 96
    characters, >= 80% NNP-tagged tokens, or untaggable."""
    try:
        if len(text) == 0:
            return True
        if len(text) > 96:
            return True
        tagged = ct.tag_sents([text.split()])[0]
        nnp_cnt = sum(1 for _token, tag in tagged if tag == "NNP")
        # Mostly-proper-noun captions are named entities, not descriptions.
        return (nnp_cnt / len(tagged)) >= 0.8
    except Exception as e:
        # Anything untaggable (non-string, tagger error) is dropped.
        print(e)
        return True


df["to_drop"] = df["caption"].apply(drop_no)
df = df[df["to_drop"] == False]
df = df.drop("to_drop", axis=1)

# Remember each surviving row's original position for downstream filename matching.
df["index_row"] = df.index

df.to_csv(sys.argv[2], sep='\t', index=False)
data/scripts/coco.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Convert COCO caption annotations into train/val JSON-lines files.

Reads the COCO captions JSON, groups captions by image, and writes one
JSON object per line: {"image_path": ..., "captions": [...]}, holding out
the last 10,001 images for validation.
"""
import json
import collections
import logging
import sys

# FIX: configure logging like the sibling scripts. Without basicConfig the
# root logger stays at WARNING and every logging.info() below is silently
# discarded.
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)

if len(sys.argv) != 4:
    print("Provide .tsv file name, images dir, output file name. e.g. python coco.py coco_captions_train2017.json /mnt/disks/data-1/flickr8k/coco_train.json coco_dataset_train.json")
    exit(1)

annotation_file = sys.argv[1]
images_dir = sys.argv[2]
output_file = sys.argv[3]

logging.info("Processing COCO dataset")

with open(annotation_file, "r") as f:
    annotations = json.load(f)["annotations"]

# Group every caption under its zero-padded image path.
image_path_to_caption = collections.defaultdict(list)
for element in annotations:
    # Normalize: lowercase, drop trailing period. (The original wrapped this
    # in a no-op f-string; removed.)
    caption = element["caption"].lower().rstrip(".")
    image_path = images_dir + "/%012d.jpg" % (element["image_id"])
    image_path_to_caption[image_path].append(caption)

lines = []
for image_path, captions in image_path_to_caption.items():
    lines.append(json.dumps({"image_path": image_path, "captions": captions}))

# Hold out the trailing 10,001 images as the validation split.
train_lines = lines[:-10_001]
valid_lines = lines[-10_001:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing COCO dataset done. {len(lines)} images processed.")

# python scripts/coco.py annotations/coco_captions_train2017.json coco_dataset_train.json
data/scripts/flicker8k.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Convert Flickr8k caption annotations into train/val JSON-lines files.

Reads a JSON mapping of image filename -> captions, strips the
"<start>"/"<end>" markers, keeps only images that exist on disk, and
writes JSON-lines records, holding out the last 801 images for validation.
"""
import json
import logging
import sys
import os.path

# FIX: configure logging like the sibling scripts. Without basicConfig the
# root logger stays at WARNING and every logging.info() below is silently
# discarded.
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)

if len(sys.argv) != 4:
    print("Provide .tsv file name, images dir, output file name. e.g. python coco.py coco_captions_train2017.json /mnt/disks/data-1/flickr8k/coco_train.json coco_dataset_train.json")
    exit(1)

annotation_file = sys.argv[1]
images_dir = sys.argv[2]
output_file = sys.argv[3]

logging.info("Processing Flicker 8k dataset")

with open(annotation_file, "r") as f:
    annotations = json.load(f)

lines = []
for image_path, captions in annotations.items():
    # Strip the tokenizer's sentence markers; skip empty captions.
    edited_captions = []
    for caption in captions:
        if len(caption) > 0:
            edited_captions.append(caption.replace("<start> ", "").replace(" <end>", ""))
    full_image_path = images_dir + "/" + image_path
    if os.path.isfile(full_image_path):
        if len(edited_captions) > 0:
            lines.append(json.dumps({"image_path": full_image_path, "captions": edited_captions}))
    else:
        print(f"{full_image_path} doesn't exist")

# Hold out the trailing 801 images as the validation split.
train_lines = lines[:-801]
valid_lines = lines[-801:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing Flicker 8k dataset done. {len(lines)} images processed.")
data/scripts/flickr30k.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Convert Flickr30k caption annotations into train/val JSON-lines files.

Reads a .tsv of (caption, image_name) rows, groups captions per image,
keeps only images that exist on disk, and writes JSON-lines records,
holding out the last 3,001 images for validation.
"""
import pandas as pd
import os.path
import sys
import json
import logging
import contexttimer

# FIX: configure logging like the sibling scripts. Without basicConfig the
# root logger stays at WARNING and every logging.info() below is silently
# discarded.
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)

if len(sys.argv) != 4:
    print("Provide .tsv file name, images dir, output file name. e.g. python coco.py coco_captions_train2017.json /mnt/disks/data-1/flickr8k/coco_train.json coco_dataset_train.json")
    exit(1)

annotation_file = sys.argv[1]
images_dir = sys.argv[2]
output_file = sys.argv[3]

logging.info("Processing Flicker 30k dataset")

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(annotation_file, delimiter='\t')

# Group all captions belonging to the same image.
images_dict = {}
for index, caption, image_name in df.itertuples():
    if image_name in images_dict:
        images_dict[image_name] += [caption]
    else:
        images_dict[image_name] = [caption]

lines = []

for image_path, captions in images_dict.items():
    # BUG FIX: the original built this path from `image_name`, the stale loop
    # variable left over from the grouping loop above — so every record
    # checked (and emitted) the path of the LAST row in the tsv. Use the
    # dict key, which is this image's own filename.
    full_image_path = images_dir + "/" + image_path
    if os.path.isfile(full_image_path):
        lines.append(json.dumps({"image_path": full_image_path, "captions": captions}))
    else:
        print(f"{full_image_path} doesn't exist")

# Hold out the trailing 3,001 images as the validation split.
train_lines = lines[:-3_001]
valid_lines = lines[-3_001:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing Flicker 30k dataset done. {len(lines)} images processed.")
data/scripts/subcaption.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build train/val JSON-lines caption files for the subcaption dataset.

Matches each (caption, url) row of the input .tsv against downloaded image
files — here the filename number is the DataFrame index as-is (no +1
offset, unlike the cc12m/cc3m scripts) — and writes JSON-lines records,
holding out the trailing 100,001 records as the validation split.
"""
import pandas as pd
import os.path
import sys
import json
import logging
import contexttimer
import numpy as np

# Setup
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)

if len(sys.argv) != 4:
    print("Provide .tsv file name, images dir, output file name. e.g. python coco.py coco_captions_train2017.json /mnt/disks/data-1/flickr8k/coco_train.json coco_dataset_train.json")
    exit(1)

annotation_file, images_dir, output_file = sys.argv[1], sys.argv[2], sys.argv[3]

logging.info("Processing subcaption dataset")

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(annotation_file, delimiter='\t')

lines = []

# Keep only the needed columns, then drop rows with empty/missing values.
df = df[["caption", "url"]].replace('', np.nan).dropna()

print(f"Loaded {len(df)} images.")

for row_index, caption_text, url in df.itertuples():
    # Downloaded files are named "<row index, zero-padded>---<url stem>.jpg".
    stem, _ext = os.path.splitext(os.path.basename(url))
    full_image_path = f"{images_dir}/{row_index:08d}---{stem}.jpg"

    if not os.path.isfile(full_image_path):
        # Image missing on disk; log it and move on.
        logging.error(full_image_path)
        continue
    lines.append(json.dumps({"image_path": full_image_path, "captions": [caption_text]}))

# Trailing 100,001 records become the validation split.
train_lines, valid_lines = lines[:-100_001], lines[-100_001:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

logging.info(f"Processing subcaption dataset done. {len(lines)} images processed.")
data/scripts/wit.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Build train/val JSON-lines caption files for the WIT dataset.

Reads a WIT .tsv, keeps (caption_reference_description, image_url) pairs,
matches each row to a downloaded image (named after the URL stem, as .jpg),
and writes JSON-lines records, holding out the last 9,001 images for
validation.
"""
import pandas as pd
import os.path
import sys
import json
import logging
import contexttimer
import numpy as np

# FIX: configure logging like the sibling scripts. Without basicConfig the
# root logger stays at WARNING and every logging.info() below is silently
# discarded.
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)

if len(sys.argv) != 4:
    print("Provide .tsv file name, images dir, output file name. e.g. python coco.py coco_captions_train2017.json /mnt/disks/data-1/flickr8k/coco_train.json coco_dataset_train.json")
    exit(1)

annotation_file = sys.argv[1]
images_dir = sys.argv[2]
output_file = sys.argv[3]

logging.info("Processing WIT dataset")

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(annotation_file, delimiter='\t')

lines = []

# Keep only the needed columns, then drop rows with empty/missing values.
# (Removed the unused `images_dict` the original declared and never read.)
df = df[["caption_reference_description", "image_url"]]
df = df.replace('', np.nan)
df = df.dropna()

for index, caption_reference_description, image_url in df.itertuples():
    # WIT images were saved under the URL's basename stem with a .jpg suffix.
    base_url = os.path.basename(image_url)  # extract base url
    stem, ext = os.path.splitext(base_url)  # split into stem and extension
    filename = f'{stem}.jpg'

    full_image_path = images_dir + "/" + filename

    if os.path.isfile(full_image_path):
        lines.append(json.dumps({"image_path": full_image_path, "captions": [caption_reference_description]}))
    else:
        print(f"{full_image_path} doesn't exist")

# Hold out the trailing 9,001 images as the validation split.
train_lines = lines[:-9_001]
valid_lines = lines[-9_001:]

with open(output_file + "_train.json", "w") as f:
    f.write("\n".join(train_lines))

with open(output_file + "_val.json", "w") as f:
    f.write("\n".join(valid_lines))

# FIX: the final message said "Flicker WIT" — a copy-paste from the Flickr scripts.
logging.info(f"Processing WIT dataset done. {len(lines)} images processed.")
data/scripts/wit_propn.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Filter a WIT caption .tsv with an Indonesian CRF POS tagger.

Drops captions that are empty, at least 80% proper nouns (NNP), or that
fail tagging, and writes the surviving rows back out as a .tsv.
"""
import sys
import os
from datetime import datetime
import pandas as pd
import contexttimer
from urllib.request import urlopen
import requests
from PIL import Image
import torch
from torchvision.transforms import functional as TF
from multiprocessing import Pool
from tqdm import tqdm
import logging
import sys
import numpy as np


from nltk.tag import CRFTagger

ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

headers = {
    "User-Agent": "Googlebot-Image/1.0",  # Pretend to be googlebot
    "X-Forwarded-For": "64.18.15.200",
}

# Setup
logging.basicConfig(filename='download.log', filemode='w', level=logging.INFO)
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)

'''if len(sys.argv) != 3:
    print("Provide .tsv file name & output directory. e.g. python downloader.py Train-GCC-training.tsv training")
    exit(1)'''

# Load data
print(f'Starting to load at {datetime.now().isoformat(timespec="minutes")}')

with contexttimer.Timer(prefix="Loading from tsv"):
    df = pd.read_csv(sys.argv[1], delimiter='\t')
    df = df[["caption_reference_description", "image_url"]]


def drop_no(text):
    """Return True when *text* should be dropped: empty, >= 80% NNP-tagged
    tokens, or untaggable."""
    try:
        if len(text) == 0:
            return True
        tagged = ct.tag_sents([text.split()])[0]
        nnp_cnt = sum(1 for _token, tag in tagged if tag == "NNP")
        # Mostly-proper-noun captions are named entities, not descriptions.
        return (nnp_cnt / len(tagged)) >= 0.8
    except Exception as e:
        # Anything untaggable (non-string, tagger error) is dropped.
        print(e)
        return True


df["to_drop"] = df["caption_reference_description"].apply(drop_no)
df = df[df["to_drop"] == False]
df = df.drop("to_drop", axis=1)

df.to_csv(sys.argv[2], sep='\t')