jonathanjordan21
committed on
Commit
•
2a52076
1
Parent(s):
46b8bbd
Update app.py
Browse files
app.py
CHANGED
@@ -3,6 +3,9 @@ from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipe
|
|
3 |
import torch
|
4 |
from pydantic import BaseModel
|
5 |
from typing import Optional
|
|
|
|
|
|
|
6 |
|
7 |
app = FastAPI()
|
8 |
|
@@ -22,6 +25,15 @@ language_model = AutoModelForSequenceClassification.from_pretrained(model_name)
|
|
22 |
language_tokenizer = AutoTokenizer.from_pretrained(model_name)
|
23 |
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
|
27 |
|
@@ -32,6 +44,37 @@ def greet_json():
|
|
32 |
return {"Hello": "World!"}
|
33 |
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
|
36 |
@app.post("/language_detection")
|
37 |
async def language_detection(inp: InputText):
|
|
|
3 |
import torch
|
4 |
from pydantic import BaseModel
|
5 |
from typing import Optional
|
6 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
7 |
+
import yake
|
8 |
+
|
9 |
|
10 |
app = FastAPI()
|
11 |
|
|
|
25 |
language_tokenizer = AutoTokenizer.from_pretrained(model_name)
|
26 |
|
27 |
|
28 |
+
# --- YAKE keyword-extractor configuration -----------------------------------
# These module-level settings are passed straight into the shared extractor
# below; the extractor is built once at import time and reused per request.
language = "id"  # language code handed to YAKE (presumably Indonesian -- confirm)
max_ngram_size = 3  # longest candidate phrase, in words
deduplication_threshold = 0.6  # value passed as YAKE's `dedupLim`
deduplication_algo = 'seqm'  # value passed as YAKE's `dedupFunc`
windowSize = 3  # value passed as YAKE's `windowsSize`
numOfKeywords = 20  # upper bound on the number of keywords returned

kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    dedupFunc=deduplication_algo,
    windowsSize=windowSize,
    top=numOfKeywords,
    features=None,
)
|
36 |
+
|
37 |
|
38 |
|
39 |
|
|
|
44 |
return {"Hello": "World!"}
|
45 |
|
46 |
|
47 |
+
def _merge_keyphrases(keyphrases):
    """Merge consecutive keyphrases whose text overlaps.

    Two neighbouring entries are merged when the previous label ends with one
    of the word n-grams of the current label. The merged label is the previous
    label with that suffix removed, followed by the current label; the merged
    score is the larger of the two scores. Passes are repeated over the
    *current* merged list until a pass changes nothing.

    Args:
        keyphrases: list of ``{"label": str, "score": float}`` dicts, in the
            order produced by the extractor.

    Returns:
        A list of ``{"label": str, "score": float}`` dicts. An empty input
        yields an empty list.
    """
    # Nothing to merge: avoids indexing into an empty list below.
    if not keyphrases:
        return []

    current = keyphrases
    while True:
        merged = [current[0]]
        for keys in current[1:]:
            # Compare against the last *merged* entry (not the raw previous
            # input) so that a chain A -> B -> C keeps the words of A.
            prev = merged[-1]
            prev_label = prev["label"]
            label = keys["label"]
            score = keys["score"]

            # Enumerate every word n-gram of the current label, from 1 word up
            # to the full label; lowercase=False keeps the original casing so
            # the comparison with the previous label is case-sensitive.
            vectorizer = CountVectorizer(
                ngram_range=(1, len(label.split(" "))), lowercase=False
            )
            analyzer = vectorizer.build_analyzer()

            # Walk the n-grams in reverse of the analyzer's order and stop at
            # the first one that the previous label ends with.
            for key in analyzer(label)[::-1]:
                if prev_label.endswith(key):
                    head = prev_label[: len(prev_label) - len(key)].strip()
                    label = head + " " + label
                    score = max(prev["score"], score)
                    # The previous entry is absorbed into the merged one.
                    merged.pop()
                    break
            merged.append({"label": label.strip(), "score": score})

        # Each merging pass shortens the list, so this reaches a fixed point.
        if merged == current:
            return merged
        current = merged


@app.post("/key_phrase_extraction")
async def key_phrase_extraction(inp: InputText):
    """Extract key phrases from ``inp.text`` and merge overlapping ones.

    Keywords come from the module-level ``kw_extractor``. Each extractor value
    is inverted to ``1 - value`` (YAKE reports lower values for more relevant
    phrases), and only phrases whose inverted score is strictly greater than
    ``inp.threshold`` are kept before merging.

    Returns:
        A list of ``{"label": str, "score": float}`` dicts; empty when no
        phrase passes the threshold.
    """
    keywords = kw_extractor.extract_keywords(inp.text)

    candidates = []
    for key in keywords:
        score = 1 - key[1]
        if score > inp.threshold:
            candidates.append({"label": key[0], "score": score})

    return _merge_keyphrases(candidates)
|
77 |
+
|
78 |
|
79 |
@app.post("/language_detection")
|
80 |
async def language_detection(inp: InputText):
|