Spaces:
Running
Running
# Copyright 2018 The TensorFlow Authors. All Rights Reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# ============================================================================== | |
"""Test NCF data pipeline.""" | |
from __future__ import absolute_import | |
from __future__ import division | |
from __future__ import print_function | |
from collections import defaultdict | |
import hashlib | |
import os | |
import mock | |
import numpy as np | |
import scipy.stats | |
import tensorflow as tf | |
from official.recommendation import constants as rconst | |
from official.recommendation import data_preprocessing | |
from official.recommendation import movielens | |
from official.recommendation import popen_helper | |
DATASET = "ml-test" | |
NUM_USERS = 1000 | |
NUM_ITEMS = 2000 | |
NUM_PTS = 50000 | |
BATCH_SIZE = 2048 | |
EVAL_BATCH_SIZE = 4000 | |
NUM_NEG = 4 | |
END_TO_END_TRAIN_MD5 = "b218738e915e825d03939c5e305a2698" | |
END_TO_END_EVAL_MD5 = "d753d0f3186831466d6e218163a9501e" | |
FRESH_RANDOMNESS_MD5 = "63d0dff73c0e5f1048fbdc8c65021e22" | |
def mock_download(*args, **kwargs): | |
return | |
# The forkpool used by data producers interacts badly with the threading | |
# used by TestCase. Without this patch tests will hang, and no amount | |
# of diligent closing and joining within the producer will prevent it. | |
class BaseTest(tf.test.TestCase): | |
def setUp(self): | |
tf.compat.v1.disable_eager_execution() | |
self.temp_data_dir = self.get_temp_dir() | |
ratings_folder = os.path.join(self.temp_data_dir, DATASET) | |
tf.io.gfile.makedirs(ratings_folder) | |
np.random.seed(0) | |
raw_user_ids = np.arange(NUM_USERS * 3) | |
np.random.shuffle(raw_user_ids) | |
raw_user_ids = raw_user_ids[:NUM_USERS] | |
raw_item_ids = np.arange(NUM_ITEMS * 3) | |
np.random.shuffle(raw_item_ids) | |
raw_item_ids = raw_item_ids[:NUM_ITEMS] | |
users = np.random.choice(raw_user_ids, NUM_PTS) | |
items = np.random.choice(raw_item_ids, NUM_PTS) | |
scores = np.random.randint(low=0, high=5, size=NUM_PTS) | |
times = np.random.randint(low=1000000000, high=1200000000, size=NUM_PTS) | |
self.rating_file = os.path.join(ratings_folder, movielens.RATINGS_FILE) | |
self.seen_pairs = set() | |
self.holdout = {} | |
with tf.io.gfile.GFile(self.rating_file, "w") as f: | |
f.write("user_id,item_id,rating,timestamp\n") | |
for usr, itm, scr, ts in zip(users, items, scores, times): | |
pair = (usr, itm) | |
if pair in self.seen_pairs: | |
continue | |
self.seen_pairs.add(pair) | |
if usr not in self.holdout or (ts, itm) > self.holdout[usr]: | |
self.holdout[usr] = (ts, itm) | |
f.write("{},{},{},{}\n".format(usr, itm, scr, ts)) | |
movielens.download = mock_download | |
movielens.NUM_RATINGS[DATASET] = NUM_PTS | |
movielens.DATASET_TO_NUM_USERS_AND_ITEMS[DATASET] = (NUM_USERS, NUM_ITEMS) | |
def make_params(self, train_epochs=1): | |
return { | |
"train_epochs": train_epochs, | |
"batches_per_step": 1, | |
"use_seed": False, | |
"batch_size": BATCH_SIZE, | |
"eval_batch_size": EVAL_BATCH_SIZE, | |
"num_neg": NUM_NEG, | |
"match_mlperf": True, | |
"use_tpu": False, | |
"use_xla_for_gpu": False, | |
"stream_files": False, | |
} | |
def test_preprocessing(self): | |
# For the most part the necessary checks are performed within | |
# _filter_index_sort() | |
cache_path = os.path.join(self.temp_data_dir, "test_cache.pickle") | |
data, valid_cache = data_preprocessing._filter_index_sort( | |
self.rating_file, cache_path=cache_path) | |
assert len(data[rconst.USER_MAP]) == NUM_USERS | |
assert len(data[rconst.ITEM_MAP]) == NUM_ITEMS | |
def drain_dataset(self, dataset, g): | |
# type: (tf.data.Dataset, tf.Graph) -> list | |
with self.session(graph=g) as sess: | |
with g.as_default(): | |
batch = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next() | |
output = [] | |
while True: | |
try: | |
output.append(sess.run(batch)) | |
except tf.errors.OutOfRangeError: | |
break | |
return output | |
def _test_end_to_end(self, constructor_type): | |
params = self.make_params(train_epochs=1) | |
_, _, producer = data_preprocessing.instantiate_pipeline( | |
dataset=DATASET, data_dir=self.temp_data_dir, params=params, | |
constructor_type=constructor_type, deterministic=True) | |
producer.start() | |
producer.join() | |
assert producer._fatal_exception is None | |
user_inv_map = {v: k for k, v in producer.user_map.items()} | |
item_inv_map = {v: k for k, v in producer.item_map.items()} | |
# ========================================================================== | |
# == Training Data ========================================================= | |
# ========================================================================== | |
g = tf.Graph() | |
with g.as_default(): | |
input_fn = producer.make_input_fn(is_training=True) | |
dataset = input_fn(params) | |
first_epoch = self.drain_dataset(dataset=dataset, g=g) | |
counts = defaultdict(int) | |
train_examples = { | |
True: set(), | |
False: set(), | |
} | |
md5 = hashlib.md5() | |
for features, labels in first_epoch: | |
data_list = [ | |
features[movielens.USER_COLUMN].flatten(), | |
features[movielens.ITEM_COLUMN].flatten(), | |
features[rconst.VALID_POINT_MASK].flatten(), | |
labels.flatten() | |
] | |
for i in data_list: | |
md5.update(i.tobytes()) | |
for u, i, v, l in zip(*data_list): | |
if not v: | |
continue # ignore padding | |
u_raw = user_inv_map[u] | |
i_raw = item_inv_map[i] | |
if ((u_raw, i_raw) in self.seen_pairs) != l: | |
# The evaluation item is not considered during false negative | |
# generation, so it will occasionally appear as a negative example | |
# during training. | |
assert not l | |
self.assertEqual(i_raw, self.holdout[u_raw][1]) | |
train_examples[l].add((u_raw, i_raw)) | |
counts[(u_raw, i_raw)] += 1 | |
self.assertRegexpMatches(md5.hexdigest(), END_TO_END_TRAIN_MD5) | |
num_positives_seen = len(train_examples[True]) | |
self.assertEqual(producer._train_pos_users.shape[0], num_positives_seen) | |
# This check is more heuristic because negatives are sampled with | |
# replacement. It only checks that negative generation is reasonably random. | |
self.assertGreater( | |
len(train_examples[False]) / NUM_NEG / num_positives_seen, 0.9) | |
# This checks that the samples produced are independent by checking the | |
# number of duplicate entries. If workers are not properly independent there | |
# will be lots of repeated pairs. | |
self.assertLess(np.mean(list(counts.values())), 1.1) | |
# ========================================================================== | |
# == Eval Data ============================================================= | |
# ========================================================================== | |
with g.as_default(): | |
input_fn = producer.make_input_fn(is_training=False) | |
dataset = input_fn(params) | |
eval_data = self.drain_dataset(dataset=dataset, g=g) | |
current_user = None | |
md5 = hashlib.md5() | |
for features in eval_data: | |
data_list = [ | |
features[movielens.USER_COLUMN].flatten(), | |
features[movielens.ITEM_COLUMN].flatten(), | |
features[rconst.DUPLICATE_MASK].flatten() | |
] | |
for i in data_list: | |
md5.update(i.tobytes()) | |
for idx, (u, i, d) in enumerate(zip(*data_list)): | |
u_raw = user_inv_map[u] | |
i_raw = item_inv_map[i] | |
if current_user is None: | |
current_user = u | |
# Ensure that users appear in blocks, as the evaluation logic expects | |
# this structure. | |
self.assertEqual(u, current_user) | |
# The structure of evaluation data is 999 negative examples followed | |
# by the holdout positive. | |
if not (idx + 1) % (rconst.NUM_EVAL_NEGATIVES + 1): | |
# Check that the last element in each chunk is the holdout item. | |
self.assertEqual(i_raw, self.holdout[u_raw][1]) | |
current_user = None | |
elif i_raw == self.holdout[u_raw][1]: | |
# Because the holdout item is not given to the negative generation | |
# process, it can appear as a negative. In that case, it should be | |
# masked out as a duplicate. (Since the true positive is placed at | |
# the end and would therefore lose the tie.) | |
assert d | |
else: | |
# Otherwise check that the other 999 points for a user are selected | |
# from the negatives. | |
assert (u_raw, i_raw) not in self.seen_pairs | |
self.assertRegexpMatches(md5.hexdigest(), END_TO_END_EVAL_MD5) | |
def _test_fresh_randomness(self, constructor_type): | |
train_epochs = 5 | |
params = self.make_params(train_epochs=train_epochs) | |
_, _, producer = data_preprocessing.instantiate_pipeline( | |
dataset=DATASET, data_dir=self.temp_data_dir, params=params, | |
constructor_type=constructor_type, deterministic=True) | |
producer.start() | |
results = [] | |
g = tf.Graph() | |
with g.as_default(): | |
for _ in range(train_epochs): | |
input_fn = producer.make_input_fn(is_training=True) | |
dataset = input_fn(params) | |
results.extend(self.drain_dataset(dataset=dataset, g=g)) | |
producer.join() | |
assert producer._fatal_exception is None | |
positive_counts, negative_counts = defaultdict(int), defaultdict(int) | |
md5 = hashlib.md5() | |
for features, labels in results: | |
data_list = [ | |
features[movielens.USER_COLUMN].flatten(), | |
features[movielens.ITEM_COLUMN].flatten(), | |
features[rconst.VALID_POINT_MASK].flatten(), | |
labels.flatten() | |
] | |
for i in data_list: | |
md5.update(i.tobytes()) | |
for u, i, v, l in zip(*data_list): | |
if not v: | |
continue # ignore padding | |
if l: | |
positive_counts[(u, i)] += 1 | |
else: | |
negative_counts[(u, i)] += 1 | |
self.assertRegexpMatches(md5.hexdigest(), FRESH_RANDOMNESS_MD5) | |
# The positive examples should appear exactly once each epoch | |
self.assertAllEqual(list(positive_counts.values()), | |
[train_epochs for _ in positive_counts]) | |
# The threshold for the negatives is heuristic, but in general repeats are | |
# expected, but should not appear too frequently. | |
pair_cardinality = NUM_USERS * NUM_ITEMS | |
neg_pair_cardinality = pair_cardinality - len(self.seen_pairs) | |
# Approximation for the expectation number of times that a particular | |
# negative will appear in a given epoch. Implicit in this calculation is the | |
# treatment of all negative pairs as equally likely. Normally is not | |
# necessarily reasonable; however the generation in self.setUp() will | |
# approximate this behavior sufficiently for heuristic testing. | |
e_sample = len(self.seen_pairs) * NUM_NEG / neg_pair_cardinality | |
# The frequency of occurance of a given negative pair should follow an | |
# approximately binomial distribution in the limit that the cardinality of | |
# the negative pair set >> number of samples per epoch. | |
approx_pdf = scipy.stats.binom.pmf(k=np.arange(train_epochs+1), | |
n=train_epochs, p=e_sample) | |
# Tally the actual observed counts. | |
count_distribution = [0 for _ in range(train_epochs + 1)] | |
for i in negative_counts.values(): | |
i = min([i, train_epochs]) # round down tail for simplicity. | |
count_distribution[i] += 1 | |
count_distribution[0] = neg_pair_cardinality - sum(count_distribution[1:]) | |
# Check that the frequency of negative pairs is approximately binomial. | |
for i in range(train_epochs + 1): | |
if approx_pdf[i] < 0.05: | |
continue # Variance will be high at the tails. | |
observed_fraction = count_distribution[i] / neg_pair_cardinality | |
deviation = (2 * abs(observed_fraction - approx_pdf[i]) / | |
(observed_fraction + approx_pdf[i])) | |
self.assertLess(deviation, 0.2) | |
def test_end_to_end_materialized(self): | |
self._test_end_to_end("materialized") | |
def test_end_to_end_bisection(self): | |
self._test_end_to_end("bisection") | |
def test_fresh_randomness_materialized(self): | |
self._test_fresh_randomness("materialized") | |
def test_fresh_randomness_bisection(self): | |
self._test_fresh_randomness("bisection") | |
if __name__ == "__main__": | |
tf.test.main() | |