Spaces:

valurank
/

spam_comment_detection

Running

App Files Files Community

spam_comment_detection / app.py

abdulmatinomotoso

Create app.py

2ad849f verified 4 months ago

raw history blame contribute delete

No virus

1.75 kB

	import gradio as gr
	import numpy as np
	import pandas as pd
	import re
	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	import torch


	# Hub checkpoint id shared by the tokenizer and the classifier; both are
	# loaded once at import time and reused for every request.
	model_name = "valurank/distilroberta-spam-comments-detection"
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModelForSequenceClassification.from_pretrained(model_name)



	def clean_text(raw_text):
	    """Normalize a raw comment before it is tokenized.

	    Steps, in order:
	      1. drop every non-ASCII character;
	      2. turn newlines and tabs into spaces and collapse runs of spaces;
	      3. remove ``Date d/m/yyyy`` stamps and ``hh:mm XX XX`` time stamps;
	      4. collapse spaces again and trim, so a removed stamp does not leave
	         a double, leading or trailing space behind.

	    Args:
	        raw_text: The comment text as entered by the user.

	    Returns:
	        The cleaned, single-spaced ASCII string (possibly empty).
	    """
	    # Encoding with errors="ignore" silently discards anything outside ASCII.
	    text = raw_text.encode("ascii", errors="ignore").decode("ascii")

	    # Whitespace is normalized *before* the stamp removal because the date
	    # and time patterns below expect single separators between tokens.
	    text = re.sub(r"[\n\t]", " ", text)
	    text = re.sub(r" +", " ", text).strip()

	    text = re.sub(r"Date\s\d{1,2}\/\d{1,2}\/\d{4}", "", text)  # remove date
	    text = re.sub(r"\d{1,2}:\d{2}\s[A-Z]+\s[A-Z]+", "", text)  # remove time

	    # Cutting a stamp out of the middle of a sentence leaves the spaces on
	    # both sides of it adjacent, so collapse and trim once more.
	    return re.sub(r" +", " ", text).strip()


	# Classify a single comment with the spam-detection model.
	def get_category(text):
	    """Return the label the model assigns to a comment.

	    The text is cleaned with ``clean_text``, tokenized (truncated to the
	    tokenizer's maximum length), run through the module-level ``model``,
	    and the label with the highest probability is returned.

	    Args:
	        text: The raw comment text.

	    Returns:
	        The winning label name taken from ``model.config.id2label``.
	    """
	    text = clean_text(text)

	    input_tensor = tokenizer.encode(text, return_tensors="pt", truncation=True)
	    # Send the input to wherever the model's weights live; the file defines
	    # no standalone ``device`` variable, so ask the model itself.
	    input_tensor = input_tensor.to(model.device)

	    # Inference only: skip autograd bookkeeping.
	    with torch.no_grad():
	        logits = model(input_tensor).logits

	    probs = torch.softmax(logits, dim=1)[0]
	    best_index = int(torch.argmax(probs))

	    # Normalize the ids to int so the lookup works whether the config
	    # stores them as ints or as strings.
	    labels = {int(idx): label for idx, label in model.config.id2label.items()}
	    return labels[best_index]

	# Gradio UI: one text box in, the predicted label out as plain text.
	demo = gr.Interface(
	    fn=get_category,
	    inputs=gr.Textbox(label="Drop your comment here"),
	    outputs="text",
	    title="Spam comments detection",
	)


	# Start the Gradio server only when this file is run as a script,
	# not when it is imported.
	if __name__ == "__main__":
	    demo.launch(debug=True)