|
from fastapi import FastAPI, Request |
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline |
|
import torch |
|
from pydantic import BaseModel |
|
from typing import Optional |
|
from sklearn.feature_extraction.text import CountVectorizer |
|
import yake |
|
|
|
|
|
app = FastAPI() |
|
|
|
|
|
class InputText(BaseModel):
    # Request body accepted by the POST endpoints of this service.
    # Documented with "#" comments so the generated schema is unchanged.

    # Raw text to analyse.
    text: str

    # Score cut-off applied by the endpoints that filter their results
    # (key_phrase_extraction and language_detection); defaults to zero.
    threshold: float = 0.0
|
|
|
|
|
# --- Sentiment classifier -------------------------------------------------
# Tokenizer and sequence-classification head are loaded from the same
# checkpoint name.
model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Register a label for index 3: sentiment_score appends one extra score to
# the model's logits and looks its name up through id2label.
sentiment_model.config.id2label.update({3: "mixed"})

# --- Language classifier --------------------------------------------------
# NOTE: model_name is deliberately rebound here, so after this section it
# holds the language checkpoint name.
model_name = "qanastek/51-languages-classifier"
language_tokenizer = AutoTokenizer.from_pretrained(model_name)
language_model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
|
|
|
|
# YAKE keyword-extractor settings. The extractor is built once at import
# time; none of the endpoints visible in this file call it.
language = "id"  # presumably the ISO code for Indonesian -- confirm
max_ngram_size = 3
deduplication_threshold = 0.6
deduplication_algo = "seqm"
windowSize = 3
numOfKeywords = 20

kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    dedupFunc=deduplication_algo,
    windowsSize=windowSize,
    top=numOfKeywords,
    features=None,
)
|
|
|
|
|
# Named-entity-recognition pipeline used by key_phrase_extraction.
# aggregation_strategy="simple" is what makes each result carry the "word",
# "score" and "entity_group" keys that the endpoint reads.
ner_model = "syafiqfaray/indobert-model-ner"
ner = pipeline(task="ner", model=ner_model, aggregation_strategy="simple")
|
|
|
|
|
|
|
@app.get("/")
def greet_json():
    # Root endpoint: replies with a fixed greeting payload.
    greeting = {"Hello": "World!"}
    return greeting
|
|
|
|
|
@app.post("/key_phrase_extraction")
async def key_phrase_extraction(inp: InputText):
    # Runs the NER pipeline over inp.text and reports each aggregated entity
    # as {"label": <entity text>, "score": <float>}.
    # An entity is kept only when its score is strictly greater than
    # inp.threshold and its entity group is not "CRD".
    # NOTE(review): the pipeline call is synchronous, so it occupies the
    # event loop for the duration of inference.
    phrases = []
    for entity in ner(inp.text):
        confident = entity["score"] > inp.threshold
        if confident and entity["entity_group"] != "CRD":
            phrases.append(
                {"label": entity["word"], "score": float(entity["score"])}
            )
    return phrases
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.post("/language_detection")
async def language_detection(inp: InputText):
    """Rank the language classifier's labels for ``inp.text``.

    Each logit is squashed independently with a sigmoid, so the returned
    scores are per-label values in (0, 1) and do not sum to one.

    Args:
        inp: Request body. ``inp.text`` is the text to classify and
            ``inp.threshold`` is the minimum score (inclusive) a label
            needs in order to be reported.

    Returns:
        A list of ``{"label": str, "score": float}`` dicts, ordered from
        highest to lowest score, containing only labels whose score is
        ``>= inp.threshold``.
    """
    # truncation=True clips over-long input to the tokenizer's configured
    # maximum length; without it, text longer than the model's position
    # limit makes the forward pass fail.
    inputs = language_tokenizer(inp.text, return_tensors="pt", truncation=True)

    # NOTE(review): inference is synchronous and therefore runs on the
    # event loop for its whole duration.
    with torch.no_grad():
        logits = language_model(**inputs).logits

    # Single input text, so only row 0 of the batch dimension is used.
    scores = torch.sigmoid(logits[0])
    keep = scores >= inp.threshold
    ranking = torch.argsort(scores, descending=True).tolist()

    id2label = language_model.config.id2label
    return [
        {"label": id2label[class_id], "score": scores[class_id].item()}
        for class_id in ranking
        if keep[class_id]
    ]
|
|
|
|
|
@app.post("/sentiment_score")
async def sentiment_score(inp: InputText):
    """Score the sentiment of ``inp.text``, including a derived "mixed" class.

    The sentiment model's logits are shifted, one extra heuristic logit is
    appended as class index 3 (labelled "mixed" through the model config's
    ``id2label``), and a softmax over all of them yields the reported scores.

    Args:
        inp: Request body. Only ``inp.text`` is read; ``inp.threshold`` is
            not used by this endpoint.

    Returns:
        A list of ``{"label": str, "score": float}`` dicts, one per class
        (the model's own classes plus "mixed"), ordered from highest to
        lowest score.
    """
    # The 2500-character cap is kept from the original behaviour, but a
    # character count does not bound the token count, so the tokenizer is
    # also asked to truncate to its configured maximum length.
    inputs = sentiment_tokenizer(
        inp.text[:2500], return_tensors="pt", truncation=True
    )

    # NOTE(review): inference is synchronous and therefore runs on the
    # event loop for its whole duration.
    with torch.no_grad():
        logits = sentiment_model(**inputs).logits

    # Shift every logit up by the magnitude of the logit at index 1.
    logits = logits + logits[0, 1].abs()

    # Heuristic "mixed" logit, built from the shifted logits:
    #   1 - |first - last| * (2 + floor(middle / max(|every 2nd logit|)))
    # It is largest when the first and last logits are close together.
    # Presumably index 0 / 1 / -1 are negative / neutral / positive --
    # confirm against the checkpoint's id2label.
    first, middle, last = logits[0, 0], logits[0, 1], logits[0, -1]
    outer_peak = torch.max(torch.abs(logits[0, ::2]))
    mixed_logit = 1 - torch.abs(first - last) * (2 + (middle // outer_peak))

    # Append the scalar as one extra column so softmax sees it as a class.
    logits = torch.cat((logits, mixed_logit.reshape(1, 1)), dim=-1)

    # Single input text, so only row 0 of the batch dimension is used.
    probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]
    ranking = probabilities.argsort(descending=True).tolist()

    id2label = sentiment_model.config.id2label
    return [
        {"label": id2label[class_id], "score": probabilities[class_id].item()}
        for class_id in ranking
    ]
|
|
|
|
|
|