import os

import torch
from transformers import GPT2Tokenizer

from audiocaptioner import AudioCaptioner
from data_module import AudiostockDataset
from utils import *


def infer(input_filename):
    device = get_device(0)

    # connect to GCS
    gcs = CheckpointManager()

    # create and/or load model
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', local_files_only=False)
    prefix_dim = 512
    prefix_length = 10
    prefix_length_clip = 10
    num_layers = 8
    checkpoint = 'checkpoints/ZRIUE-BEST.pt'
    model = AudioCaptioner(
        prefix_length,
        clip_length=prefix_length_clip,
        prefix_size=prefix_dim,
        num_layers=num_layers
    ).to(device)
    model.load_state_dict(gcs.get_checkpoint(checkpoint))
    print(f'Loaded from {checkpoint}')
    model.eval()

    # read in the wav file and precompute neighbors
    dataset_path = ''
    train_dataset = AudiostockDataset(
        dataset_path=dataset_path,
        train=False,
        split='audiostock-train-240k.txt',
        factor=1.0,
        verbose=False,
        file_list=open('audiostock-train-240k.txt', 'r').read().split()
    )
    print('Reading in file', input_filename)
    dataset = AudiostockDataset(
        dataset_path=dataset_path,
        train=False,
        split=None,
        factor=1.0,
        verbose=False,
        file_list=[input_filename]  # manually override file list
    )
    dataset.precompute_neighbors(model, candidate_set=train_dataset)
    waveform = dataset.read_wav(input_filename).unsqueeze(0).to(device, dtype=torch.float32)

    # predict
    with torch.no_grad():
        # audio prefix embedding for the input waveform
        prefix_embed = model.create_prefix(waveform, 1)
        # tokens of the precomputed nearest-neighbor caption, truncated to 150 tokens
        tweet_tokens = torch.tensor(
            preproc(dataset.id2neighbor[os.path.basename(input_filename).split('.')[0]], tokenizer, stop=False),
            dtype=torch.int64
        ).to(device)[:150]
        # embed the neighbor tokens and append them to the audio prefix
        tweet_embed = model.gpt.transformer.wte(tweet_tokens)
        prefix_embed = torch.cat([prefix_embed, tweet_embed.unsqueeze(0)], dim=1)
        # beam-search decode a caption conditioned on the combined prefix
        candidates = generate_beam(model, tokenizer, embed=prefix_embed, beam_size=5)
    generated_text = candidates[0]
    generated_text = postproc(generated_text)

    print('=======================================')
    print(generated_text)
    return generated_text


if __name__ == '__main__':
    infer('../MusicCaptioning/sample_inputs/sisters.mp3')