#using pipeline to predict the input text from transformers import pipeline, AutoTokenizer import torch label_mapping = { 'delete': [0, 'LABEL_0'], 'keep': [1, 'LABEL_1'], 'merge': [2, 'LABEL_2'], 'no consensus': [3, 'LABEL_3'], 'speedy keep': [4, 'LABEL_4'], 'speedy delete': [5, 'LABEL_5'], 'redirect': [6, 'LABEL_6'], 'withdrawn': [7, 'LABEL_7'] } def predict_text(text, model_name): tokenizer = AutoTokenizer.from_pretrained(model_name) model = pipeline("text-classification", model=model_name, return_all_scores=True) # Tokenize and truncate the text tokens = tokenizer(text, truncation=True, max_length=512) truncated_text = tokenizer.decode(tokens['input_ids'], skip_special_tokens=True) results = model(truncated_text) final_scores = {key: 0.0 for key in label_mapping} for result in results[0]: for key, value in label_mapping.items(): if result['label'] == value[1]: final_scores[key] = result['score'] break return final_scores # from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline # import torch # label_mapping = { # 'delete': [0, 'LABEL_0'], # 'keep': [1, 'LABEL_1'], # 'merge': [2, 'LABEL_2'], # 'no consensus': [3, 'LABEL_3'], # 'speedy keep': [4, 'LABEL_4'], # 'speedy delete': [5, 'LABEL_5'], # 'redirect': [6, 'LABEL_6'], # 'withdrawn': [7, 'LABEL_7'] # } # def predict_text(text, model_name): # tokenizer = AutoTokenizer.from_pretrained(model_name) # model = AutoModelForSequenceClassification.from_pretrained(model_name, output_attentions=True) # inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True) # outputs = model(**inputs) # predictions = torch.nn.functional.softmax(outputs.logits, dim=-1) # final_scores = {key: 0.0 for key in label_mapping} # for i, score in enumerate(predictions[0]): # for key, value in label_mapping.items(): # if i == value[0]: # final_scores[key] = score.item() # break # # Calculate average attention # attentions = outputs.attentions # avg_attentions = torch.mean(torch.stack(attentions), dim=1) # Average over all layers # avg_attentions = avg_attentions.mean(dim=1)[0] # Average over heads # token_importance = avg_attentions.mean(dim=0) # # Decode tokens and highlight important ones # tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0]) # highlighted_text = [] # for token, importance in zip(tokens, token_importance): # if importance > token_importance.mean(): # highlighted_text.append(f"{token}") # # else: # highlighted_text.append(token) # highlighted_text = " ".join(highlighted_text) # highlighted_text = highlighted_text.replace("##", "") # return final_scores, highlighted_text