CekHoaks / app.py
nug1209's picture
Upload 2 files
a68d2df
raw
history blame
2.29 kB
import pandas as pd
import numpy as np
import re
import string
from torch import clamp
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st
class TokenSimilarity:
    """Score how similar two texts are using a pretrained transformer encoder.

    Both texts are cleaned, tokenized together, and encoded. The token
    embeddings of each text are mean-pooled (padding positions excluded) into
    a single vector, and the two vectors are compared with cosine similarity.

    ``load_pretrained`` must be called before ``predict``.
    """

    def load_pretrained(self, from_pretrained: str = 'indobenchmark/indobert-base-p1'):
        """Load the tokenizer and encoder model.

        Args:
            from_pretrained: Model identifier or path handed to both
                ``AutoTokenizer.from_pretrained`` and
                ``AutoModel.from_pretrained``.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(from_pretrained)
        self.model = AutoModel.from_pretrained(from_pretrained)

    def __cleaning(self, text: str) -> str:
        """Remove ASCII punctuation and normalize whitespace in *text*.

        Returns:
            The text without any ``string.punctuation`` characters, with every
            run of whitespace collapsed to one space and no leading or
            trailing whitespace.
        """
        text = text.translate(str.maketrans('', '', string.punctuation))
        # ``\s+`` matches runs of spaces, tabs and newlines; stripping
        # punctuation above can leave such runs behind.
        return re.sub(r'\s+', ' ', text).strip()

    def __process(self, first_token: str, second_token: str) -> np.ndarray:
        """Encode both texts and mean-pool their token embeddings.

        Reads ``self.max_length``, ``self.truncation`` and ``self.padding``,
        which ``predict`` sets before calling this method.

        Returns:
            A NumPy array with one pooled embedding per input text
            (row 0 for *first_token*, row 1 for *second_token*).
        """
        # Imported here because the file-level import only brings in ``clamp``.
        from torch import no_grad

        inputs = self.tokenizer(
            [first_token, second_token],
            max_length=self.max_length,
            truncation=self.truncation,
            padding=self.padding,
            return_tensors='pt',
        )
        attention = inputs['attention_mask']

        # Inference only: no gradients are needed, so skip autograd tracking.
        with no_grad():
            outputs = self.model(**inputs)
        embeddings = outputs.last_hidden_state

        # Broadcast the attention mask over the embedding dimension so that
        # padding positions contribute nothing to the sum.
        mask = attention.unsqueeze(-1).expand(embeddings.shape).float()
        summed = (embeddings * mask).sum(1)
        # Clamp the token counts so an all-padding row cannot divide by zero.
        counts = clamp(mask.sum(1), min=1e-9)
        mean_pooled = summed / counts
        return mean_pooled.detach().numpy()

    def predict(self, first_token: str, second_token: str, return_as_embeddings: bool = False,
                max_length: int = 16, truncation: bool = True, padding: str = 'max_length'):
        """Compare two texts and return their cosine similarity.

        Args:
            first_token: First text to compare.
            second_token: Second text to compare.
            return_as_embeddings: When true, return the pooled embeddings of
                both texts instead of the similarity score.
            max_length: ``max_length`` passed to the tokenizer.
            truncation: ``truncation`` passed to the tokenizer.
            padding: ``padding`` passed to the tokenizer.

        Returns:
            The array of two pooled embeddings when *return_as_embeddings* is
            true; otherwise the result of ``cosine_similarity`` for the two
            embeddings (a 1x1 array).
        """
        # The tokenizer settings are kept on the instance; __process reads them.
        self.max_length = max_length
        self.truncation = truncation
        self.padding = padding

        first_token = self.__cleaning(first_token)
        second_token = self.__cleaning(second_token)
        mean_pooled_arr = self.__process(first_token, second_token)

        if return_as_embeddings:
            return mean_pooled_arr

        return cosine_similarity([mean_pooled_arr[0]], [mean_pooled_arr[1]])
# Streamlit entry point: compare the user's text against every known hoax text
# in hoax.csv and display the similarity score for each one.
st.title('CEK HOAKS')

model = TokenSimilarity()
model.load_pretrained('indobenchmark/indobert-base-p2')

# hoax.csv is semicolon-separated; the 'text' column holds the reference texts.
df = pd.read_csv('hoax.csv', sep=';')

to_check = st.text_area('Teks yang mau dicek...')

if to_check:
    # Iterate over the values directly instead of looking them up by index
    # label. Rows with a missing text are skipped and the rest coerced to str,
    # because predict() calls string methods on its inputs.
    for reference_text in df['text'].dropna().astype(str):
        result = model.predict(to_check, reference_text)
        st.write(result)