from typing import Any, Dict

import torch
import transformers
from langchain.llms import HuggingFacePipeline
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
)

# Prefer bfloat16 on Ampere-or-newer GPUs (compute capability >= 8) and fall
# back to float16 otherwise; guard the capability query so importing this
# module on a CPU-only machine does not crash.
if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
    dtype = torch.bfloat16
else:
    dtype = torch.float16


class StopGenerationCriteria(StoppingCriteria):
    """Stops generation when the model starts repeating itself."""

    def __init__(self, max_duplicate_sequences=3, max_repeated_words=2):
        # Load the tokenizer once here; loading it inside __call__ (as the
        # original code did) would re-fetch it at every generation step.
        self.tokenizer = AutoTokenizer.from_pretrained(
            "ClaudiaIoana550/try2_deploy_falcon", trust_remote_code=True
        )
        self.generated_sequences = set()
        self.max_duplicate_sequences = max_duplicate_sequences
        self.max_repeated_words = max_repeated_words

    def __call__(
        self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
    ) -> bool:
        sequence = input_ids[0].tolist()

        # Stop if the trailing 30-token window occurs at least twice in the
        # full sequence: the model is looping over the same span.
        if len(sequence) >= 50:
            window = sequence[-30:]
            occurrences = sum(
                1
                for i in range(len(sequence) - len(window) + 1)
                if sequence[i : i + len(window)] == window
            )
            if occurrences >= 2:
                return True

        # Stop if one decoded token repeats more than max_repeated_words
        # times in a row.
        generated_tokens = [self.tokenizer.decode(token_id) for token_id in input_ids[0]]
        count = 1
        prev_token = None
        for token in generated_tokens:
            if token == prev_token:
                count += 1
                if count > self.max_repeated_words:
                    return True
            else:
                count = 1
                prev_token = token

        # Stop once too many duplicate sequences have been recorded. Note
        # that nothing populates generated_sequences yet, so this check only
        # fires if callers add entries to the set themselves.
        if len(self.generated_sequences) >= self.max_duplicate_sequences:
            return True

        return False


# Example usage:
# Define the maximum number of duplicate sequences and repeated words.
max_duplicate_sequences = 1
max_repeated_words = 2

# Create an instance of StopGenerationCriteria and wrap it in the
# StoppingCriteriaList consumed by the pipeline below.
stop_criteria = StopGenerationCriteria(max_duplicate_sequences, max_repeated_words)
stopping_criteria = StoppingCriteriaList([stop_criteria])


class EndpointHandler:
    def __init__(self, model_path=""):
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            return_dict=True,
            device_map="auto",
            torch_dtype=dtype,
            trust_remote_code=True,
        )

        generation_config = model.generation_config
        generation_config.max_new_tokens = 1700
        generation_config.min_length = 20
        generation_config.temperature = 1
        generation_config.top_p = 0.7
        generation_config.num_return_sequences = 1
        generation_config.pad_token_id = tokenizer.eos_token_id
        generation_config.eos_token_id = tokenizer.eos_token_id
        generation_config.repetition_penalty = 1.1

        gpipeline = transformers.pipeline(
            model=model,
            tokenizer=tokenizer,
            return_full_text=True,
            task="text-generation",
            stopping_criteria=stopping_criteria,
            generation_config=generation_config,
        )
        self.llm = HuggingFacePipeline(pipeline=gpipeline)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        prompt = data.pop("inputs", data)
        result = self.llm(prompt)
        # Wrap the raw string so the return value matches the declared
        # Dict[str, Any] contract.
        return {"generated_text": result}
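
# --- Local smoke test --------------------------------------------------------
# A minimal sketch for exercising the handler outside the endpoint, not part
# of the deployed code path. It assumes the "ClaudiaIoana550/try2_deploy_falcon"
# repo is reachable and that a GPU with enough memory is available; the prompt
# string is illustrative. The {"inputs": ...} payload shape follows the
# Hugging Face custom-handler convention used by __call__ above.
if __name__ == "__main__":
    handler = EndpointHandler(model_path="ClaudiaIoana550/try2_deploy_falcon")
    output = handler({"inputs": "Explain what a stopping criterion does."})
    print(output)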