import json
import math
import os
import pickle

import h5py
import jieba
import jieba.analyse as analyse
import numpy as np
import pandas as pd
import torch
from scipy.spatial import distance
from sklearn import preprocessing
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import Dataset
from transformers import BertTokenizer
def str2num(str_x):
    """Convert a like/view count string with unit suffixes to a number."""
    if isinstance(str_x, float):
        return str_x
    elif str_x.isdigit():
        return int(str_x)
    elif 'w' in str_x:
        return float(str_x[:-1]) * 10000
    elif 'δΊΏ' in str_x:
        return float(str_x[:-1]) * 100000000
    else:
        raise ValueError(f"Unrecognized count format: {str_x}")
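# Illustrative conversions: str2num('328') -> 328, str2num('3.2w') -> 32000.0,
# str2num('1.5δΊΏ') -> 150000000.0 ('w' = ten thousand, 'δΊΏ' = hundred million).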
class SVFENDDataset(Dataset):
def __init__(self, data, data_paths):
self.data = data
self.text = []
with open(data_paths['ASR_text'], 'r', encoding='utf-8') as file:
for line in file:
self.text.append(json.loads(line))
# self.text = pd.read_json(data_paths['ASR_text'], lines=True)
with open(data_paths['VGGish_audio'], "rb") as fr:
self.dict_vid_convfea = pickle.load(fr)
self.framefeapath = data_paths['VGG19_frame']
self.c3dfeapath = data_paths['C3D_video']
self.tokenizer = BertTokenizer.from_pretrained('./FakeVD/Models/bert-base-chinese/')
def __len__(self):
# return self.data.shape[0]
return len(self.data)
def __getitem__(self, idx):
vid = self.data[idx]
# text
# ιεŽ†εˆ—θ‘¨οΌŒζŸ₯ζ‰ΎεŒΉι…ηš„ video_id
for itm in self.text:
if itm['video_id'] == vid:
text = itm['text']
break
title_tokens = self.tokenizer(text, max_length=512, padding='max_length', truncation=True)
title_inputid = torch.LongTensor(title_tokens['input_ids'])
title_mask = torch.LongTensor(title_tokens['attention_mask'])
# comments
# audio
audioframes = self.dict_vid_convfea[vid]
audioframes = torch.FloatTensor(audioframes)
# frames
        with open(os.path.join(self.framefeapath, vid + '.pkl'), 'rb') as f:
            frames = torch.FloatTensor(pickle.load(f))
# video
        with h5py.File(self.c3dfeapath + vid + ".hdf5", "r") as hf:
            c3d = torch.FloatTensor(hf[vid]['c3d_features'][:])
        # user
        return {
            'label': torch.tensor(-1),  # placeholder label (no annotation available here)
'title_inputid': title_inputid,
'title_mask': title_mask,
'audioframes': audioframes,
            'frames': frames,
'c3d': c3d,
# 'comments_inputid': comments_inputid,
# 'comments_mask': comments_mask,
# 'comments_like': comments_like,
# 'intro_inputid': intro_inputid,
# 'intro_mask': intro_mask,
}
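# Minimal usage sketch for SVFENDDataset (hypothetical paths; the dict keys
# mirror what the constructor reads above):
#
#   paths = {'ASR_text': 'asr.jsonl', 'VGGish_audio': 'vggish.pkl',
#            'VGG19_frame': './frames/', 'C3D_video': './c3d/'}
#   dataset = SVFENDDataset(vid_list, paths)
#   sample = dataset[0]  # dict of tensors: title_inputid, audioframes, frames, c3d, ...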
def split_word(df):
    """Build a keyword corpus: one entry per title (description), followed by one per joined comment string."""
    title = df['description'].values
    comments = df['comments'].apply(lambda x: ' '.join(x)).values
    text = np.concatenate([title, comments], axis=0)
    analyse.set_stop_words('./data/stopwords.txt')
    all_word = [analyse.extract_tags(txt) for txt in text.tolist()]
    corpus = [' '.join(word) for word in all_word]
    return corpus
class FANVMDataset_train(Dataset):
def __init__(self, path_vid_train):
self.data_complete = pd.read_json('./data/data.json',orient='records',dtype=False,lines=True)
self.framefeapath='./data/ptvgg19_frames/'
self.thumbframefeapath='./data/ptvgg19_frame_thumb/'
self.vid_train = []
with open('./data/vids/'+path_vid_train, "r") as fr:
for line in fr.readlines():
self.vid_train.append(line.strip())
        self.data_train = self.data_complete[self.data_complete.video_id.isin(self.vid_train)].copy()
        # Reorder rows to follow the id order in the split file.
        self.data_train['video_id'] = self.data_train['video_id'].astype('category').cat.set_categories(self.vid_train)
self.data_train.sort_values('video_id', ascending=True, inplace=True)
self.data_train.reset_index(inplace=True)
self.tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
corpus = split_word(self.data_train)
tfidf = TfidfVectorizer().fit_transform(corpus)
lda = LatentDirichletAllocation(n_components=15,random_state=2022)
docres = lda.fit_transform(tfidf)
self.topic_title = []
s = []
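        # docres stacks all title rows first and all comment rows after them, so
        # row idx is a title's topic vector and row idx + len(data_train) its comments'.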
for idx in range(self.data_train.shape[0]):
theta_title = docres[idx]
self.topic_title.append(theta_title)
theta_comments = docres[idx+self.data_train.shape[0]]
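            # scipy's jensenshannon returns the JS distance (the square root of
            # the divergence), so squaring recovers the divergence itself.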
s.append(distance.jensenshannon(theta_title, theta_comments) ** 2)
min_max_scaler = preprocessing.MinMaxScaler()
s_minMax = min_max_scaler.fit_transform(np.array(s).reshape(-1, 1))
self.s_minMax = s_minMax.reshape(s_minMax.shape[0])
def __len__(self):
return self.data_train.shape[0]
def __getitem__(self, idx):
item = self.data_train.iloc[idx]
vid = item['video_id']
        label = 1 if item['annotation']=='假' else 0  # '假' marks a fake video
label = torch.tensor(label)
title_tokens = self.tokenizer(item['description'], max_length=512, padding='max_length', truncation=True)
title_inputid = torch.LongTensor(title_tokens['input_ids'])
title_mask = torch.LongTensor(title_tokens['attention_mask'])
comments_inputid = []
comments_mask = []
for comment in item['comments']:
comment_tokens = self.tokenizer(comment, max_length=250, padding='max_length', truncation=True)
comments_inputid.append(comment_tokens['input_ids'])
comments_mask.append(comment_tokens['attention_mask'])
comments_inputid = torch.LongTensor(comments_inputid)
comments_mask = torch.LongTensor(comments_mask)
comments_like = []
for num in item['comments_like']:
num_like = num.split(" ")[0]
comments_like.append(str2num(num_like))
comments_like = torch.tensor(comments_like)
        with open(os.path.join(self.framefeapath, vid + '.pkl'), 'rb') as f:
            frames = torch.FloatTensor(pickle.load(f))
        with open(os.path.join(self.thumbframefeapath, vid + '.pkl'), 'rb') as f:
            frame_thumb = torch.FloatTensor(pickle.load(f))
s = self.s_minMax[idx]
s = torch.tensor(s)
topic_title = self.topic_title[idx]
topic_title = torch.FloatTensor(topic_title)
return {
'label': label,
'title_inputid': title_inputid,
'title_mask': title_mask,
'comments_inputid': comments_inputid,
'comments_mask': comments_mask,
'comments_like': comments_like,
            'frames': frames,
            'frame_thumb': frame_thumb,
            's': s,
            'label_event': topic_title,
}
class FANVMDataset_test(Dataset):
def __init__(self, path_vid_train, path_vid_test):
self.data_complete = pd.read_json('./data/data.json',orient='records',dtype=False,lines=True)
self.framefeapath='./data/ptvgg19_frames/'
self.thumbframefeapath='./data/ptvgg19_frame_thumb/'
self.vid_train = []
with open('./data/vids/'+path_vid_train, "r") as fr:
for line in fr.readlines():
self.vid_train.append(line.strip())
        self.data_train = self.data_complete[self.data_complete.video_id.isin(self.vid_train)].copy()
        self.data_train['video_id'] = self.data_train['video_id'].astype('category').cat.set_categories(self.vid_train)
self.data_train.sort_values('video_id', ascending=True, inplace=True)
self.data_train.reset_index(inplace=True)
self.vid_test = []
with open('./data/vids/'+path_vid_test, "r") as fr:
for line in fr.readlines():
self.vid_test.append(line.strip())
        self.data_test = self.data_complete[self.data_complete.video_id.isin(self.vid_test)].copy()
        self.data_test['video_id'] = self.data_test['video_id'].astype('category').cat.set_categories(self.vid_test)
self.data_test.sort_values('video_id', ascending=True, inplace=True)
self.data_test.reset_index(inplace=True)
self.tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
        # Fit the TF-IDF vocabulary and LDA topics on the training split only.
corpus_train = split_word(self.data_train)
tfidf = TfidfVectorizer()
tfidf_matrix_train = tfidf.fit_transform(corpus_train)
lda = LatentDirichletAllocation(n_components=15,random_state=2022)
lda.fit(tfidf_matrix_train)
        # Apply the fitted models to the test split.
corpus_test = split_word(self.data_test)
tfidf_matrix_test = tfidf.transform(corpus_test)
docres = lda.transform(tfidf_matrix_test)
s = []
self.topic_title = []
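        # Same row layout as in training: all titles first, then all comments.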
for idx in range(self.data_test.shape[0]):
theta_title = docres[idx]
self.topic_title.append(theta_title)
theta_comments = docres[idx+self.data_test.shape[0]]
s.append(distance.jensenshannon(theta_title, theta_comments) ** 2)
min_max_scaler = preprocessing.MinMaxScaler()
s_minMax = min_max_scaler.fit_transform(np.array(s).reshape(-1, 1))
self.s_minMax = s_minMax.reshape(s_minMax.shape[0])
def __len__(self):
return self.data_test.shape[0]
def __getitem__(self, idx):
item = self.data_test.iloc[idx]
vid = item['video_id']
label = 1 if item['annotation']=='假' else 0
label = torch.tensor(label)
title_tokens = self.tokenizer(item['description'], max_length=512, padding='max_length', truncation=True)
title_inputid = torch.LongTensor(title_tokens['input_ids'])
title_mask = torch.LongTensor(title_tokens['attention_mask'])
comments_inputid = []
comments_mask = []
for comment in item['comments']:
comment_tokens = self.tokenizer(comment, max_length=250, padding='max_length', truncation=True)
comments_inputid.append(comment_tokens['input_ids'])
comments_mask.append(comment_tokens['attention_mask'])
comments_inputid = torch.LongTensor(comments_inputid)
comments_mask = torch.LongTensor(comments_mask)
comments_like = []
for num in item['comments_like']:
num_like = num.split(" ")[0]
comments_like.append(str2num(num_like))
comments_like = torch.tensor(comments_like)
        with open(os.path.join(self.framefeapath, vid + '.pkl'), 'rb') as f:
            frames = torch.FloatTensor(pickle.load(f))
        with open(os.path.join(self.thumbframefeapath, vid + '.pkl'), 'rb') as f:
            frame_thumb = torch.FloatTensor(pickle.load(f))
s = self.s_minMax[idx]
s = torch.tensor(s)
topic_title = self.topic_title[idx]
topic_title = torch.FloatTensor(topic_title)
return {
'label': label,
'title_inputid': title_inputid,
'title_mask': title_mask,
'comments_inputid': comments_inputid,
'comments_mask': comments_mask,
'comments_like': comments_like,
            'frames': frames,
            'frame_thumb': frame_thumb,
            's': s,
            'label_event': topic_title,
}
class TikTecDataset(Dataset):
def __init__(self, path_vid):
self.data_complete = pd.read_json('./data/data.json',orient='records',dtype=False,lines=True)
self.vid = []
with open(f'./data/vids/{path_vid}', "r") as fr:
for line in fr.readlines():
self.vid.append(line.strip())
self.data = self.data_complete[self.data_complete['video_id'].isin(self.vid)]
def __len__(self):
return self.data.shape[0]
def __getitem__(self, idx):
item = self.data.iloc[idx]
vid = item['video_id']
        label = 1 if item['annotation']=='假' else 0  # 'annotation' matches the field the other datasets read from data.json
label = torch.tensor(label)
max_K = 200 # max num of frames
max_N = 500 # max num of ASR words
# get caption feature
with open('./data/caption_w2v_pad%s.pkl' % vid, 'rb') as f:
caption_feature = pickle.load(f) # (num_frame, 100, 300)
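        # Resample the frame axis toward max_K: tile clips shorter than max_K/2,
        # stride-subsample clips longer than max_K, then zero-pad to exactly max_K.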
if max_K / caption_feature.shape[0] >= 2:
times = math.floor(max_K / caption_feature.shape[0])
caption_feature = caption_feature.repeat_interleave(times, dim=0)
elif caption_feature.shape[0] > max_K:
times = math.ceil(caption_feature.shape[0] / max_K)
caption_feature = caption_feature[::times][:max_K]
actual_K = caption_feature.shape[0]
caption_feature = torch.cat([caption_feature, torch.zeros((max_K - caption_feature.shape[0], 100, 300))], dim=0)
# get visual feature
        with open('./data/vgg19_result%s.pkl' % vid, 'rb') as f:
visual_feature = pickle.load(f) # (num_frame, 45, 1000)
if max_K / visual_feature.shape[0] >= 2:
times = math.floor(max_K / visual_feature.shape[0])
visual_feature = visual_feature.repeat_interleave(times, dim=0)
elif visual_feature.shape[0] > max_K:
times = math.ceil(visual_feature.shape[0] / max_K)
visual_feature = visual_feature[::times][:max_K]
visual_feature = torch.cat([visual_feature, torch.zeros((max_K - visual_feature.shape[0], 45, 1000))], dim=0)
# get ASR feature
with open('./data/asr_w2v+mfcc%s.pkl' % vid, 'rb') as f:
asr_feature = pickle.load(f) # (num_word, 300+650)
asr_feature = asr_feature[:max_N]
actual_N = asr_feature.shape[0]
asr_feature = torch.cat([asr_feature, torch.zeros((max_N - asr_feature.shape[0], 300+650))], dim=0)
# get frames mask & ASR words mask
mask_K = torch.zeros(max_K, dtype=torch.int)
mask_K[:actual_K] = 1
mask_N = torch.zeros(max_N, dtype=torch.int)
mask_N[:actual_N] = 1
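        # With no ASR words, an all-zero mask would make any masked attention over
        # this sequence degenerate, so fall back to an all-ones mask.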
if actual_N == 0:
mask_N[:] = 1
return {
'label': label,
'caption_feature': caption_feature,
'visual_feature': visual_feature,
'asr_feature': asr_feature,
'mask_K': mask_K,
'mask_N': mask_N,
}
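# Shape summary for TikTecDataset items: caption_feature (max_K, 100, 300),
# visual_feature (max_K, 45, 1000), asr_feature (max_N, 950); mask_K and mask_N
# flag the real (unpadded) positions along each axis.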
class C3DDataset(Dataset):
def __init__(self, path_vid):
self.data_complete = pd.read_json('./data/data.json',orient='records',dtype=False,lines=True)
self.vid = []
with open('./data/vids/'+path_vid, "r") as fr:
for line in fr.readlines():
self.vid.append(line.strip())
        self.data = self.data_complete[self.data_complete.video_id.isin(self.vid)].copy()
        self.data['video_id'] = self.data['video_id'].astype('category').cat.set_categories(self.vid)
self.data.sort_values('video_id', ascending=True, inplace=True)
self.data.reset_index(inplace=True)
self.c3dfeapath='./data/c3d/'
def __len__(self):
return self.data.shape[0]
def __getitem__(self, idx):
item = self.data.iloc[idx]
vid = item['video_id']
label = 1 if item['annotation']=='假' else 0
label = torch.tensor(label)
        with h5py.File(self.c3dfeapath + vid + ".hdf5", "r") as hf:
            c3d = torch.FloatTensor(hf[vid]['c3d_features'][:])
return {
'label': label,
'c3d': c3d,
}
class VGGDataset(Dataset):
def __init__(self, path_vid):
self.data_complete = pd.read_json('./data/data.json',orient='records',dtype=False,lines=True)
self.vid = []
with open('./data/vids/'+path_vid, "r") as fr:
for line in fr.readlines():
self.vid.append(line.strip())
        self.data = self.data_complete[self.data_complete.video_id.isin(self.vid)].copy()
        self.data['video_id'] = self.data['video_id'].astype('category').cat.set_categories(self.vid)
self.data.sort_values('video_id', ascending=True, inplace=True)
self.data.reset_index(inplace=True)
self.framefeapath='./data/ptvgg19_frames/'
def __len__(self):
return self.data.shape[0]
def __getitem__(self, idx):
item = self.data.iloc[idx]
vid = item['video_id']
label = 1 if item['annotation']=='假' else 0
label = torch.tensor(label)
        with open(os.path.join(self.framefeapath, vid + '.pkl'), 'rb') as f:
            frames = torch.FloatTensor(pickle.load(f))
return {
'label': label,
'frames': frames,
}
class BboxDataset(Dataset):
def __init__(self, path_vid):
self.data_complete = pd.read_json('./data/data_5500_revised.json',orient='records',dtype=False,lines=True)
self.vid = []
with open('./data/vids/'+path_vid, "r") as fr:
for line in fr.readlines():
self.vid.append(line.strip())
        self.data = self.data_complete[self.data_complete.video_id.isin(self.vid)].copy()
        self.data['video_id'] = self.data['video_id'].astype('category').cat.set_categories(self.vid)
self.data.sort_values('video_id', ascending=True, inplace=True)
self.data.reset_index(inplace=True)
self.bboxfeapath = './data/bbox_vgg19/'
def __len__(self):
return self.data.shape[0]
def __getitem__(self, idx):
item = self.data.iloc[idx]
vid = item['video_id']
label = 1 if item['annotation']=='假' else 0
label = torch.tensor(label)
        with open(os.path.join(self.bboxfeapath, vid + '.pkl'), 'rb') as f:
            bbox_vgg = torch.FloatTensor(pickle.load(f))
return {
'label': label,
'bbox_vgg': bbox_vgg
}
class Title_W2V_Dataset(Dataset):
def __init__(self, path_vid, wv_from_text):
self.data_complete = pd.read_json('./data/data.json',orient='records',dtype=False,lines=True)
self.vid = []
with open('./data/vids/'+path_vid, "r") as fr:
for line in fr.readlines():
self.vid.append(line.strip())
        self.data = self.data_complete[self.data_complete.video_id.isin(self.vid)].copy()
        self.data['video_id'] = self.data['video_id'].astype('category').cat.set_categories(self.vid)
self.data.sort_values('video_id', ascending=True, inplace=True)
self.data.reset_index(inplace=True)
self.wv_from_text = wv_from_text
def __len__(self):
return self.data.shape[0]
def __getitem__(self, idx):
item = self.data.iloc[idx]
label = 1 if item['annotation']=='假' else 0
label = torch.tensor(label)
text = item['description']+' '+item['ocr']
title_w2v = []
        for word in jieba.cut(text, cut_all=False):
            # Keep only words covered by the pretrained word-vector vocabulary.
            if word in self.wv_from_text:
                title_w2v.append(self.wv_from_text[word])
title_w2v = torch.FloatTensor(title_w2v)
return {
'label': label,
'title_w2v': title_w2v,
}
class CommentsDataset(Dataset):
def __init__(self, path_vid):
self.data_complete = pd.read_json('./data/data.json',orient='records',dtype=False,lines=True)
self.vid = []
with open('./data/vids/'+path_vid, "r") as fr:
for line in fr.readlines():
self.vid.append(line.strip())
        self.data = self.data_complete[self.data_complete.video_id.isin(self.vid)].copy()
        self.data['video_id'] = self.data['video_id'].astype('category').cat.set_categories(self.vid)
self.data.sort_values('video_id', ascending=True, inplace=True)
self.data.reset_index(inplace=True)
self.tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
        # Keep only videos that have at least one comment.
        hascomments = self.data['comments'].apply(lambda x: len(x) > 0)
        self.data = self.data[hascomments]
        print(self.data.shape)
def __len__(self):
return self.data.shape[0]
def __getitem__(self, idx):
item = self.data.iloc[idx]
vid = item['video_id']
label = 1 if item['annotation']=='假' else 0
label = torch.tensor(label)
comments_inputid = []
comments_mask = []
for comment in item['comments']:
comment_tokens = self.tokenizer(comment, max_length=250, padding='max_length', truncation=True)
comments_inputid.append(comment_tokens['input_ids'])
comments_mask.append(comment_tokens['attention_mask'])
comments_inputid = torch.LongTensor(comments_inputid)
comments_mask = torch.LongTensor(comments_mask)
comments_like = []
for num in item['comments_like']:
num_like = num.split(" ")[0]
comments_like.append(str2num(num_like))
comments_like = torch.tensor(comments_like)
return {
'label': label,
'comments_inputid': comments_inputid,
'comments_mask': comments_mask,
'comments_like': comments_like,
}
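# Batching note (a sketch, not part of the original pipeline): several datasets
# above return variable-length tensors (comments_inputid, frames, title_w2v, ...),
# so the default DataLoader collate fails for batch_size > 1. One minimal
# pad-based collate_fn, assuming every value in a sample dict is a tensor:
#
#   from torch.nn.utils.rnn import pad_sequence
#
#   def pad_collate(batch):
#       out = {}
#       for key in batch[0]:
#           vals = [sample[key] for sample in batch]
#           if vals[0].dim() == 0:  # scalars such as 'label' or 's'
#               out[key] = torch.stack(vals)
#           else:                   # pad along the first (sequence) dimension
#               out[key] = pad_sequence(vals, batch_first=True)
#       return out
#
#   loader = torch.utils.data.DataLoader(dataset, batch_size=16, collate_fn=pad_collate)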