File size: 5,075 Bytes
ea8b4a5 b01c113 ea8b4a5 6a128a7 2a52076 d812ab5 b01c113 ea8b4a5 46b8bbd ea8b4a5 b01c113 6a128a7 2a52076 6a128a7 08fd334 6a128a7 d812ab5 b01c113 2a52076 08fd334 fd1af44 08fd334 2a52076 08fd334 2a52076 08fd334 2a52076 6a128a7 28c31d6 2cb17e6 6a128a7 28c31d6 6a128a7 79fd2ad ea8b4a5 b01c113 cc6b847 b01c113 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
from fastapi import FastAPI, Request
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import torch
from pydantic import BaseModel
from typing import Optional
from sklearn.feature_extraction.text import CountVectorizer
import yake
# Application object; the @app.get / @app.post decorators in this module
# register their endpoints on it.
app = FastAPI()
class InputText(BaseModel):
    """Request body accepted by the POST endpoints of this module."""

    # Raw text to analyse.
    text: str
    # Score cut-off used by the endpoints that filter their results;
    # defaults to zero.
    threshold: float = 0.0
# --- Sentiment classifier ---------------------------------------------------
model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_tokenizer = AutoTokenizer.from_pretrained(model_name)
# Register a name for class index 3: sentiment_score appends one extra logit
# to the model output and resolves its label through this mapping.
sentiment_model.config.id2label[3] = "mixed"

# --- Language identifier ----------------------------------------------------
# NOTE: model_name is rebound here; the sentiment objects above were already
# created from the previous value.
model_name = 'qanastek/51-languages-classifier'
language_model = AutoModelForSequenceClassification.from_pretrained(model_name)
language_tokenizer = AutoTokenizer.from_pretrained(model_name)

# --- YAKE keyword extractor -------------------------------------------------
# Only the commented-out key_phrase_extraction implementation further down
# refers to kw_extractor.
language = "id"  # presumably the Indonesian language code -- confirm
max_ngram_size = 3
deduplication_threshold = 0.6
deduplication_algo = 'seqm'
windowSize = 3
numOfKeywords = 20
kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    dedupFunc=deduplication_algo,
    windowsSize=windowSize,
    top=numOfKeywords,
    features=None,
)

# --- Named-entity recognition pipeline --------------------------------------
# aggregation_strategy="simple" makes the pipeline return grouped entities;
# key_phrase_extraction reads their "word", "score" and "entity_group" keys.
ner_model = "syafiqfaray/indobert-model-ner"
ner = pipeline("ner", ner_model, aggregation_strategy="simple")
@app.get("/")
def greet_json():
    """Return a fixed greeting payload for GET requests on the root path."""
    greeting = {"Hello": "World!"}
    return greeting
@app.post("/key_phrase_extraction")
async def key_phrase_extraction(inp: InputText):
    """Return the entities found in ``inp.text`` by the NER pipeline.

    An entity is kept when its score is strictly greater than
    ``inp.threshold`` and its entity group is not ``"CRD"``.

    Returns:
        A list of ``{"label": <entity text>, "score": <float>}`` dicts, in the
        order the pipeline produced them.
    """
    phrases = []
    for entity in ner(inp.text):
        passes_threshold = entity["score"] > inp.threshold
        # "CRD" entities are never reported (presumably the cardinal-number
        # tag of this NER model -- confirm against the model card).
        if not passes_threshold or entity["entity_group"] == "CRD":
            continue
        # float() converts the pipeline's score into a plain Python float
        # for the JSON response.
        phrases.append({"label": entity["word"], "score": float(entity["score"])})
    return phrases
# @app.post("/key_phrase_extraction")
# async def key_phrase_extraction(inp:InputText):
# def merge_keyphrases(keyphrases):
# new_merged = keyphrases
# while True:
# merged = [new_merged[0]]
# for i in range(1, len(keyphrases)):
# keys = keyphrases[i]
# keys_prev = keyphrases[i-1]
# label = keys["label"]
# score = keys["score"]
# vectorizer = CountVectorizer(ngram_range=( 1,len(label.split(" ")) ), lowercase=False)
# analyzer = vectorizer.build_analyzer()
# for key in analyzer(label)[::-1]:
# key_prev = keys_prev["label"][::-1]
# if key == key_prev[:len(key)][::-1].strip():
# label = key_prev[len(key):][::-1].strip() + " " + label#.replace(key, "")
# score = max(keys_prev["score"],keys["score"])
# merged.pop()
# break
# merged.append({"label":label.strip(), "score":score})
# if new_merged == merged:
# break
# else:
# new_merged = merged
# return merged
# keywords = kw_extractor.extract_keywords(inp.text)
# return merge_keyphrases([{"label":key[0], "score":1-key[1]} for key in keywords if 1-key[1]>inp.threshold])
@app.post("/language_detection")
async def language_detection(inp: InputText):
    """Score ``inp.text`` against every language known to the classifier.

    Each class logit is passed through a sigmoid independently (the scores are
    therefore not normalised to sum to one). Classes whose score is greater
    than or equal to ``inp.threshold`` are returned, best score first.

    Returns:
        A list of ``{"label": <language label>, "score": <float>}`` dicts
        sorted by descending score.
    """
    # truncation=True clips over-long texts to the tokenizer's configured
    # maximum length; without it a long request body produces more tokens
    # than the encoder accepts and the forward pass fails.
    inputs = language_tokenizer(inp.text, return_tensors='pt', truncation=True)
    with torch.no_grad():
        logits = language_model(**inputs).logits

    # torch.sigmoid is equivalent to torch.nn.functional.sigmoid.
    scores = torch.sigmoid(logits)

    # Keep only the classes that reach the requested threshold.
    mask = scores >= inp.threshold
    kept_scores = scores[mask]

    # Rank the surviving scores, then translate positions inside the filtered
    # tensor back to class ids. Taking element [1] of nonzero() (the column
    # indices) relies on the batch holding exactly one text.
    order = torch.argsort(kept_scores, descending=True)
    class_ids = torch.nonzero(mask, as_tuple=True)[1][order]

    id2label = language_model.config.id2label
    return [
        {"label": id2label[class_id], "score": scores[0, class_id].item()}
        for class_id in class_ids.tolist()
    ]
@app.post("/sentiment_score")
async def sentiment_score(inp: InputText):
    """Return sentiment probabilities for ``inp.text``, including a "mixed" class.

    The classifier's logits are extended with one synthetic logit derived from
    the others, then soft-maxed together. ``inp.threshold`` is not used here:
    every class is returned.

    Returns:
        A list of ``{"label": <class label>, "score": <float>}`` dicts sorted
        by descending probability.
    """
    # Only the first 2500 characters of the text are tokenized.
    encoded = sentiment_tokenizer(inp.text[:2500], return_tensors='pt')
    with torch.no_grad():
        raw_logits = sentiment_model(**encoded).logits
    print(raw_logits)

    # Shift every logit up by the magnitude of the class-1 logit.
    shifted = raw_logits + raw_logits[0, 1].abs()

    # Building blocks of the synthetic logit:
    #   gap    -- distance between the first and the last class logit
    #   peak   -- largest magnitude among the even-indexed logits
    #             (columns 0 and 2 if the model has three classes -- confirm)
    #   weight -- 2 plus the floor-divided ratio of the class-1 logit to peak
    gap = torch.abs(shifted[0, 0] - shifted[0, -1])
    peak = torch.max(torch.abs(shifted[0, ::2]))
    weight = 2 + (shifted[0, 1] // peak)
    mixed_logit = (1 - gap * weight).reshape(1, 1)

    # Append the synthetic logit as one more class column; its label is looked
    # up in id2label like every other class.
    extended = torch.cat((shifted, mixed_logit), dim=-1)
    probabilities = torch.nn.functional.softmax(extended, dim=-1)

    id2label = sentiment_model.config.id2label
    ranking = probabilities.argsort(dim=-1, descending=True)[0]
    results = []
    for class_id in ranking.tolist():
        results.append(
            {"label": id2label[class_id], "score": probabilities[0, class_id].item()}
        )
    return results
|