RUPunct_small - самая маленькая модель из семейства RUPunct. Идеально подходит для несложных текстов и там, где требуется высокая скорость работы на CPU.
Код инференса:
from transformers import pipeline
from transformers import AutoTokenizer
# Hub identifier of the checkpoint; the same id is used to load both the
# tokenizer and the model.
pt = "RUPunct/RUPunct_small"

# NOTE(review): strip_accents=False presumably keeps letters such as "й"/"ё"
# from being normalised away -- confirm against the tokenizer documentation.
tk = AutoTokenizer.from_pretrained(
    pt,
    strip_accents=False,
    add_prefix_space=True,
)

# Token-classification ("ner") pipeline. Each prediction it yields is read
# downstream through its 'word' and 'entity_group' keys.
classifier = pipeline(
    "ner",
    model=pt,
    tokenizer=tk,
    aggregation_strategy="first",
)
# Text appended after the word, keyed by the last component of the label.
# The dash ("TIRE") carries a leading space for every casing variant.
_PUNCT_SUFFIX = {
    "O": "",
    "PERIOD": ".",
    "COMMA": ",",
    "QUESTION": "?",
    "TIRE": " —",
    "DVOETOCHIE": ":",
    "VOSKL": "!",
    "PERIODCOMMA": ";",
    "DEFIS": "-",
    "MNOGOTOCHIE": "...",
    "QUESTIONVOSKL": "?!",
}

# Casing applied to the word, keyed by everything before the last "_".
_CASE_TRANSFORM = {
    "LOWER": lambda word: word,
    "UPPER": str.capitalize,
    "UPPER_TOTAL": str.upper,
}


def process_token(token, label):
    """Apply the casing and punctuation encoded in *label* to *token*.

    A label has the form ``<CASE>_<PUNCT>``, where ``<CASE>`` is ``LOWER``
    (leave the token as is), ``UPPER`` (``str.capitalize``) or
    ``UPPER_TOTAL`` (``str.upper``), and ``<PUNCT>`` names the punctuation
    to append (``O`` means none).

    Args:
        token: The word to restore.
        label: The predicted class name, e.g. ``"UPPER_PERIOD"``.

    Returns:
        The re-cased token followed by its punctuation. If the label is not
        a string or is not a known ``<CASE>_<PUNCT>`` combination, the token
        is returned unchanged so that callers can always concatenate the
        result.
    """
    if not isinstance(label, str):
        return token

    # Split on the LAST underscore so that "UPPER_TOTAL_PERIOD" yields
    # ("UPPER_TOTAL", "PERIOD") rather than ("UPPER", "TOTAL_PERIOD").
    case_name, _, punct_name = label.rpartition("_")
    recase = _CASE_TRANSFORM.get(case_name)
    suffix = _PUNCT_SUFFIX.get(punct_name)
    if recase is None or suffix is None:
        return token
    return recase(token) + suffix
# Interactive loop: read a line of text, run it through the classifier and
# print the text with casing and punctuation restored.
while True:
    try:
        input_text = input(":> ")
    except (EOFError, KeyboardInterrupt):
        # stdin exhausted / Ctrl-D, or Ctrl-C: finish without a traceback.
        print()
        break
    preds = classifier(input_text)
    # Join with single spaces instead of prefixing every word with " ",
    # which left a stray leading space in the printed result.
    output = " ".join(
        process_token(item['word'].strip(), item['entity_group'])
        for item in preds
    )
    print(">>>", output)
- Downloads last month
- 155
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social
visibility and check back later, or deploy to Inference Endpoints (dedicated)
instead.