Spaces:

pragmatic-programs
/

pragmatic-synthesizer

Runtime error

App Files Files Community

pragmatic-synthesizer / listener.py

saujasv

make demo gpu compatible

1a8e5ac 11 months ago

raw

history blame contribute delete

4.54 kB

	from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig
	from dataclasses import dataclass
	from typing import List, Optional
	from utils import (
	get_preprocess_function,
	get_utterance_processing_functions,
	byt5_decode_batch,
	consistent,
	)
	from utils import (
	PROGRAM_SPECIAL_TOKEN,
	UTTERANCES_SPECIAL_TOKEN,
	GT_PROGRAM_SPECIAL_TOKEN,
	)
	from greenery import parse
	from greenery.parse import NoMatch
	import numpy as np
	import torch


	class Agent:
	    """Hold a pretrained seq2seq model, its tokenizer and generation settings.

	    Attributes:
	        model: ``AutoModelForSeq2SeqLM`` loaded from ``model_path`` and moved
	            to ``device``.
	        tokenizer: ``AutoTokenizer`` loaded from the same ``model_path``.
	        gen_config: ``GenerationConfig`` built from the ``gen_config`` dict.
	        inference_batch_size: Batch size value stored for use by subclasses
	            or callers; this class itself does not read it.
	    """

	    def __init__(
	        self,
	        model_path: str,
	        gen_config: dict,
	        inference_batch_size: int = 1,
	        device=None,
	    ):
	        """Load the model and tokenizer and record the generation settings.

	        Args:
	            model_path: Identifier or path handed to ``from_pretrained`` for
	                both the model and the tokenizer.
	            gen_config: Keyword arguments expanded into ``GenerationConfig``.
	            inference_batch_size: Stored verbatim on the instance.
	            device: Target passed to ``.to(...)`` on the loaded model.
	        """
	        # Load first, then move: same effect as chaining ``.to(device)``
	        # directly onto the ``from_pretrained`` call.
	        loaded_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
	        self.model = loaded_model.to(device)

	        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

	        generation_settings = GenerationConfig(**gen_config)
	        self.gen_config = generation_settings

	        self.inference_batch_size = inference_batch_size


	@dataclass
	class ListenerOutput:
	    """Container for the result of a listener synthesis call.

	    Every field is a list with one entry per input context. Only
	    ``programs`` is required; the remaining fields default to ``None``.
	    """

	    # Programs kept for each context.
	    programs: List[List[str]]

	    # For each context, positions (within ``decoded``) of the kept programs.
	    idx: Optional[List[List[int]]] = None

	    # All decoded candidate programs for each context.
	    decoded: Optional[List[List[str]]] = None

	    # One score per decoded candidate, aligned with ``decoded``.
	    decoded_scores: Optional[List[List[float]]] = None

	    # Optional per-context list of strings; not populated in this file's
	    # visible code -- NOTE(review): confirm intended meaning with callers.
	    pruned: Optional[List[List[str]]] = None


	class Listener(Agent):
	    """Agent that generates candidate programs for a context of utterances.

	    The context is rendered to a string, prefixed with the utterances
	    special token, and fed to the seq2seq model. Decoding is seeded with the
	    program special token. Decoded candidates can optionally be filtered
	    with ``consistent`` against the context they were generated for.
	    """

	    def __init__(
	        self,
	        model_path,
	        gen_config,
	        inference_batch_size=4,
	        label_pos="suffix",
	        idx: bool = True,
	        program_special_token=PROGRAM_SPECIAL_TOKEN,
	        utterances_special_token=UTTERANCES_SPECIAL_TOKEN,
	        device=None,
	    ):
	        """Load the model and set up utterance <-> string conversion.

	        Args:
	            model_path: Passed through to ``Agent.__init__``.
	            gen_config: Passed through to ``Agent.__init__``.
	            inference_batch_size: Passed through to ``Agent.__init__``.
	                ``synthesize`` does not chunk by it; the whole context is
	                sent to the model as a single batch.
	            label_pos: Forwarded to ``get_utterance_processing_functions``.
	            idx: Forwarded to ``get_utterance_processing_functions``.
	            program_special_token: Text used as the decoder prompt.
	            utterances_special_token: Text prepended to every encoder input
	                and used as the ``separator`` for utterance processing.
	            device: Passed through to ``Agent.__init__``.
	        """
	        super().__init__(model_path, gen_config, inference_batch_size, device)
	        self.label_pos = label_pos
	        self.idx = idx
	        self.program_special_token = program_special_token
	        self.utterances_special_token = utterances_special_token
	        self.utterances_to_string, self.string_to_utterances = (
	            get_utterance_processing_functions(
	                label_pos, idx, separator=utterances_special_token
	            )
	        )
	        # Remember where the model actually lives so inputs can follow it.
	        self.device = self.model.device

	    def synthesize(self, context, return_scores=False, enforce_consistency=True):
	        """Generate programs for every element of ``context``.

	        Args:
	            context: Sequence of contexts. Each element is either a list of
	                utterances (converted with ``self.utterances_to_string``) or
	                an already rendered string. The first element decides which
	                form is assumed for all of them.
	            return_scores: When true, the result also carries the kept
	                indices, all decoded candidates and their summed
	                log-probabilities.
	            enforce_consistency: When true, only candidates for which
	                ``consistent(program, ctx)`` holds are kept.

	        Returns:
	            ``ListenerOutput`` with one entry per context. An empty
	            ``context`` yields empty lists instead of raising.
	        """
	        n_contexts = len(context)
	        if n_contexts == 0:
	            # Nothing to encode; the original code failed on ``context[0]``.
	            if return_scores:
	                return ListenerOutput([], [], [], [])
	            return ListenerOutput([])

	        # If context is a list of utterances, convert to string.
	        if isinstance(context[0], list):
	            context_str = [self.utterances_to_string(c) for c in context]
	        else:
	            context_str = context

	        utt_token = self.utterances_special_token
	        encoder_texts = [
	            c if c.startswith(utt_token) else f"{utt_token}{c}"
	            for c in context_str
	        ]
	        context_tokens = self.tokenizer(
	            encoder_texts,
	            return_tensors="pt",
	            padding=True,
	        ).to(self.device)

	        # Seed every decoder sequence with the program special token.
	        decoder_inputs = self.tokenizer(
	            [self.program_special_token] * n_contexts,
	            return_tensors="pt",
	            add_special_tokens=False,
	        ).to(self.device)

	        outputs = self.model.generate(
	            **context_tokens,
	            decoder_input_ids=decoder_inputs.input_ids,
	            generation_config=self.gen_config,
	            return_dict_in_generate=True,
	            # Per-step scores are only needed when scores are returned.
	            output_scores=return_scores,
	        )

	        # Regroup the flat batch of sequences as (context, candidate, token).
	        decoded_batch = byt5_decode_batch(
	            outputs.sequences.reshape(
	                (n_contexts, -1, outputs.sequences.shape[-1])
	            ).tolist(),
	            skip_position_token=True,
	            skip_special_tokens=True,
	        )

	        # NOTE(review): ``ctx`` is the caller's original element, so when
	        # contexts are given as strings ``consistent`` receives a string --
	        # confirm that ``consistent`` accepts that form.
	        idxs = [
	            [
	                i
	                for i, program in enumerate(decoded)
	                if not enforce_consistency or consistent(program, ctx)
	            ]
	            for decoded, ctx in zip(decoded_batch, context)
	        ]
	        consistent_programs = [
	            [decoded[i] for i in kept]
	            for decoded, kept in zip(decoded_batch, idxs)
	        ]

	        if not return_scores:
	            return ListenerOutput(consistent_programs)

	        scores_list = self._sequence_scores(outputs, n_contexts)
	        return ListenerOutput(consistent_programs, idxs, decoded_batch, scores_list)

	    @staticmethod
	    def _sequence_scores(outputs, n_contexts):
	        """Sum per-token log-probabilities of each generated sequence.

	        Args:
	            outputs: Result of ``generate`` with ``output_scores=True``;
	                ``outputs.scores`` holds one score tensor per generated step
	                and ``outputs.sequences`` the token ids.
	            n_contexts: Number of input contexts, used to regroup the flat
	                list of sequence scores.

	        Returns:
	            Nested list of floats shaped (n_contexts, candidates per context).
	        """
	        # NOTE(review): rows of ``scores`` are assumed to line up with rows
	        # of ``sequences``; confirm this holds for the decoding strategy in
	        # ``gen_config`` (beam search may reorder rows).
	        step_log_probs = torch.stack(outputs.scores, dim=1).log_softmax(dim=-1)

	        # Take exactly the generated positions. Counting from the end keeps
	        # this aligned with ``scores`` however many prompt/start tokens
	        # precede them (equivalent to ``[:, 1:]`` for a one-token prefix).
	        n_steps = step_log_probs.shape[1]
	        generated_ids = outputs.sequences[:, -n_steps:]

	        token_log_probs = torch.gather(
	            step_log_probs, 2, generated_ids[:, :, None]
	        ).squeeze(-1)
	        # Zero out infinite entries so they do not swamp the sum.
	        token_log_probs.masked_fill_(token_log_probs.isinf(), 0)

	        sequence_scores = token_log_probs.sum(-1)
	        n_seq = sequence_scores.shape[0] // n_contexts
	        return sequence_scores.reshape((n_contexts, n_seq)).tolist()