from fastapi import FastAPI
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from pydantic import BaseModel
from sklearn.feature_extraction.text import CountVectorizer
import yake

app = FastAPI()


class InputText(BaseModel):
    text: str
    threshold: float = 0.0


# Sentiment model; a fourth "mixed" label is registered for the heuristic used in /sentiment_score.
model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name)
sentiment_tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment_model.config.id2label[3] = "mixed"

# Language identification model.
model_name = "qanastek/51-languages-classifier"
language_model = AutoModelForSequenceClassification.from_pretrained(model_name)
language_tokenizer = AutoTokenizer.from_pretrained(model_name)

# YAKE keyword extractor configured for Indonesian ("id").
language = "id"
max_ngram_size = 3
deduplication_threshold = 0.6
deduplication_algo = "seqm"
windowSize = 3
numOfKeywords = 20
kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    dedupFunc=deduplication_algo,
    windowsSize=windowSize,
    top=numOfKeywords,
    features=None,
)


@app.get("/")
def greet_json():
    return {"Hello": "World!"}


@app.post("/key_phrase_extraction")
async def key_phrase_extraction(inp: InputText):
    def merge_keyphrases(keyphrases):
        # Repeatedly merge adjacent keyphrases whose labels overlap
        # (the end of the previous label matches the start of the next one)
        # until a pass produces no further changes.
        if not keyphrases:
            return []
        new_merged = keyphrases
        while True:
            merged = [new_merged[0]]
            for i in range(1, len(new_merged)):
                keys = new_merged[i]
                keys_prev = new_merged[i - 1]
                label = keys["label"]
                score = keys["score"]
                vectorizer = CountVectorizer(ngram_range=(1, len(label.split(" "))), lowercase=False)
                analyzer = vectorizer.build_analyzer()
                # Check the n-grams of the current label, longest first, against
                # the tail of the previous label.
                for key in analyzer(label)[::-1]:
                    key_prev = keys_prev["label"][::-1]
                    if key == key_prev[:len(key)][::-1].strip():
                        # Overlap found: prepend the non-overlapping part of the
                        # previous label and keep the higher score.
                        label = key_prev[len(key):][::-1].strip() + " " + label
                        score = max(keys_prev["score"], keys["score"])
                        merged.pop()
                        break
                merged.append({"label": label.strip(), "score": score})
            if new_merged == merged:
                break
            new_merged = merged
        return merged

    keywords = kw_extractor.extract_keywords(inp.text)
    # YAKE scores are "lower is better"; convert to a 0-1 relevance score and
    # keep only phrases above the requested threshold.
    return merge_keyphrases(
        [{"label": key[0], "score": 1 - key[1]} for key in keywords if 1 - key[1] > inp.threshold]
    )


@app.post("/language_detection")
async def language_detection(inp: InputText):
    inputs = language_tokenizer(inp.text, return_tensors="pt")
    with torch.no_grad():
        logits = language_model(**inputs).logits
    scores = torch.sigmoid(logits)
    # Apply the threshold by creating a mask
    mask = scores >= inp.threshold
    # Filter the tensor based on the threshold
    filtered_x = scores[mask]
    # Get the sorted indices of the filtered tensor
    sorted_indices = torch.argsort(filtered_x, descending=True)
    # Map the sorted indices back to the original tensor indices
    original_indices = torch.nonzero(mask, as_tuple=True)[1][sorted_indices]
    return [
        {
            "label": language_model.config.id2label[predicted_class_id.tolist()],
            "score": scores[0, predicted_class_id].tolist(),
        }
        for predicted_class_id in original_indices
    ]


@app.post("/sentiment_score")
async def sentiment_score(inp: InputText):
    text = inp.text
    # Truncate long inputs (character cut plus tokenizer truncation to the model's max length).
    inputs = sentiment_tokenizer(text[:2500], return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = sentiment_model(**inputs).logits
    # Shift all logits by the magnitude of the neutral logit.
    logits = logits + logits[0, 1].abs()
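    # The model outputs three sentiment logits (negative, neutral, positive).
    # A fourth, heuristic "mixed" logit is appended below so that the softmax
    # also covers the id2label[3] = "mixed" entry added at startup: it is
    # largest when the negative and positive logits are close in value and
    # shrinks as they diverge, with an extra scaling term derived from the
    # neutral logit.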
    mixed_logit = (
        1 - torch.abs(logits[0, 0] - logits[0, -1])
        * (2 + (logits[0, 1] // torch.max(torch.abs(logits[0, ::2]))))
    )
    logits = torch.cat((logits, mixed_logit.unsqueeze(0).unsqueeze(0)), dim=-1)
    softmax = torch.nn.functional.softmax(logits, dim=-1)
    return [
        {
            "label": sentiment_model.config.id2label[predicted_class_id.tolist()],
            "score": softmax[0, predicted_class_id].tolist(),
        }
        for predicted_class_id in softmax.argsort(dim=-1, descending=True)[0]
    ]
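
# ---------------------------------------------------------------------------
# Usage sketch (assumptions: the file is saved as app.py and served with
# uvicorn; adjust the module path and port to match the actual deployment):
#
#   uvicorn app:app --host 0.0.0.0 --port 7860
#
#   curl -X POST http://localhost:7860/sentiment_score \
#        -H "Content-Type: application/json" \
#        -d '{"text": "Saya sangat senang hari ini", "threshold": 0.0}'
#
# Each POST endpoint accepts an InputText payload ({"text": ..., "threshold": ...})
# and returns a list of {"label": ..., "score": ...} objects sorted by score.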