thomaspaniagua
QuadAttack release
71f183c
raw
history blame
No virus
4.24 kB
import os
import numpy as np
import requests
import zipfile
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable
from torchvision import datasets, transforms
import torchvision
from PIL import Image
from .glove import GloVe
from tqdm import tqdm
AD_DIRECTORY = os.path.dirname(__file__)
def obtain_vector(inputs, glove):
vector_im = glove.embedding.get(inputs)
if vector_im is None:
vector_im = glove.embedding.get(inputs.lower())
if vector_im is None:
vector_im = glove.embedding.get(inputs.title())
if vector_im is None:
vector_im = glove.embedding.get(inputs.upper())
return vector_im
def generate_glove():
print("Generating glove similarity...")
#download glove file
os.makedirs("./knowledge", exist_ok=True)
glove_file = './knowledge/glove.840B.300d.txt'
if not os.path.exists(glove_file):
print("Downloading glove files...")
print("")
print("Gonna take a while")
print("")
url_path = "http://nlp.stanford.edu/data/glove.840B.300d.zip"
r = requests.get(url_path)
with open("./knowledge/glove.840B.300d.zip","wb") as f:
f.write(r.content)
filename = './knowledge/glove.840B.300d.zip'
fz = zipfile.ZipFile(filename, 'r')
for file in fz.namelist():
fz.extract(file, './knowledge/.')
if os.path.exists(filename):
os.remove(filename)
glove = GloVe('./knowledge/glove.840B.300d.txt')
filepath = os.path.join(AD_DIRECTORY, "label_name.txt")
vec_list = []
vec_list_np = []
cos_similarity = np.zeros((1000,1000))
index = 0
#the labels could be a word or a phrase with multi words
#we also tested on average of every words
#But we assume the last word should be more important, so in our final version
#we assign a higher weight to last word in a phrase and average the fornt words
#w2v for last word of multi-words
for line in tqdm(open(filepath)):
a = line.strip('\n')
b = a.split(',')
cnt = 0
vector = torch.zeros(300)
vec_front = torch.zeros(300)
vec_b_average = torch.zeros(300)
cnt_b = 0
for i in range(len(b)):
b[i] = b[i].lstrip()
c = b[i].split(' ')
if obtain_vector(c[-1], glove) is not None:
vec_b_average += obtain_vector(c[-1], glove)
cnt_b += 1
if cnt_b == 0:
print('index ', index,' generatint word_vector failure')
continue
vec_b_average = vec_b_average / cnt_b
for i in range(len(b)):
b[i] = b[i].lstrip()
c = b[i].split(' ')
cnt_f = 0
for j in range(len(c) - 1):
if obtain_vector(c[j], glove) is not None:
vec_front += obtain_vector(c[j], glove)
cnt_f += 1
if obtain_vector(c[-1], glove) is not None:
vec_back =obtain_vector(c[-1], glove)
else:
vec_back = vec_b_average
if cnt_f == 0:
vector += vec_back
else:
vector += (vec_front / cnt_f )* 0.1 + vec_back * 0.9
cnt += 1
vector = torch.div(vector,cnt)
vec_list_np.append(np.array(vector))
vec_list.append(vector)
index += 1
vec_list_np_stacked = np.stack(vec_list_np)
vec_list_torch = torch.from_numpy(vec_list_np_stacked)
cos_similarity = F.cosine_similarity(vec_list_torch[None, :], vec_list_torch[:, None], dim=-1)
cos_similarity = cos_similarity.numpy()
# np.save('./knowledge/golve_vec_list', np.array(vec_list_np))
# for i in range(len(vec_list)):
# for j in range(len(vec_list)):
# cos_similarity[i,j] = F.cosine_similarity(vec_list[i], vec_list[j],dim=0).type(torch.half)
# if i != j:
# cos_similarity[i,j] = cos_similarity[i,j]
# cos_similarity = np.array(cos_similarity)
np.save(os.path.join(AD_DIRECTORY, "imagenet_cos_similarity_glove"), cos_similarity)
print("Glove cos_similarity finished")
return cos_similarity