Spaces:
Runtime error
Runtime error
File size: 4,710 Bytes
8505d54 84b8f8b 57a7aa0 151824c 8505d54 57a7aa0 8505d54 84b8f8b 8505d54 84b8f8b 57a7aa0 84b8f8b 8505d54 84b8f8b 8505d54 84b8f8b 9de8217 151824c 8505d54 84b8f8b 8505d54 5b88edd 84b8f8b 5b88edd 84b8f8b 8505d54 84b8f8b 5b88edd 8505d54 5b88edd 84b8f8b 5b88edd 84b8f8b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import re
import os
import gc
from cleantext import clean
import gradio as gr
from tqdm.auto import tqdm
from transformers import pipeline
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Hub identifiers of the two models used by this demo.
checker_model_name = "textattack/roberta-base-CoLA"
corrector_model_name = "pszemraj/flan-t5-large-grammar-synthesis"

# pipelines
# Grammar-quality checker: when HF_DEMO_NO_USE_ONNX is set, build the plain
# text-classification pipeline; otherwise load the model explicitly and pass
# it through BetterTransformer before building the pipeline.
if "HF_DEMO_NO_USE_ONNX" in os.environ:
    checker = pipeline("text-classification", checker_model_name)
else:
    from optimum.bettertransformer import BetterTransformer

    model_hf = AutoModelForSequenceClassification.from_pretrained(
        checker_model_name
    )
    tokenizer = AutoTokenizer.from_pretrained(checker_model_name)
    model = BetterTransformer.transform(model_hf, keep_original_model=False)
    checker = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Run a garbage-collection pass before the second model is loaded.
gc.collect()
# Grammar corrector: a text2text-generation pipeline over the corrector model.
if "HF_DEMO_NO_USE_ONNX" in os.environ:
    corrector = pipeline("text2text-generation", corrector_model_name)
else:
    # load onnx runtime unless HF_DEMO_NO_USE_ONNX is set
    # NOTE: this import rebinds the module-level name `pipeline` to optimum's.
    from optimum.pipelines import pipeline

    corrector = pipeline(
        "text2text-generation",
        model=corrector_model_name,
        accelerator="ort",
    )
def split_text(text: str, batch_size: int = 2) -> list:
    """Split *text* into sentences and group them into consecutive batches.

    A sentence boundary is one or more spaces that are preceded by ``.`` or
    ``?`` and followed by an uppercase ASCII letter. The lookbehind also
    requires that the character two positions before the punctuation is not
    an uppercase ASCII letter, so e.g. ``"Mr. Smith"`` is not split.

    Args:
        text: The text to split.
        batch_size: Maximum number of sentences per batch. The default of 2
            matches the batch size this function has always produced.

    Returns:
        A list of batches, each batch being a list of sentence strings in
        their original order. Only the final batch may hold fewer than
        ``batch_size`` sentences. Blank or whitespace-only input yields an
        empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Nothing to correct: avoid emitting a batch that holds an empty sentence.
    if not text.strip():
        return []

    # Split the text into sentences using regex
    sentences = re.split(r"(?<=[^A-Z].[.?]) +(?=[A-Z])", text)

    # Chunk by position rather than comparing each sentence with the last
    # one by value; a value comparison closes a batch too early whenever an
    # earlier sentence has the same text as the final sentence.
    return [
        sentences[start : start + batch_size]
        for start in range(0, len(sentences), batch_size)
    ]
def correct_text(text: str, checker, corrector, separator: str = " ") -> str:
    """Run grammar correction over *text*, one sentence batch at a time.

    The text is split with ``split_text``; each batch is joined with a single
    space and scored by ``checker``. A batch is kept verbatim only when the
    checker's top result is ``LABEL_1`` with a score that is not below 0.9
    (presumably "grammatically acceptable" for the CoLA checker - confirm
    against the model card). Every other batch is replaced by the
    ``generated_text`` of the first ``corrector`` result.

    Args:
        text: The text to correct.
        checker: Callable returning a sequence whose first item is a mapping
            with ``"label"`` and ``"score"`` keys.
        corrector: Callable returning a sequence whose first item is a
            mapping with a ``"generated_text"`` key.
        separator: String placed between the processed batches.

    Returns:
        The processed batches joined with ``separator``.
    """
    batches = split_text(text)
    pieces = []

    for sentences in tqdm(batches, total=len(batches), desc="correcting text.."):
        chunk = " ".join(sentences)
        verdict = checker(chunk)[0]

        # Confidently acceptable text is passed through untouched.
        if verdict["label"] == "LABEL_1" and not verdict["score"] < 0.9:
            pieces.append(chunk)
            continue

        # Anything else goes through the correction model.
        pieces.append(corrector(chunk)[0]["generated_text"])

    return separator.join(pieces)
def update(text: str):
    """Gradio callback: truncate, clean, and grammar-correct the input text.

    Only the first 4000 characters of *text* are used. They are passed
    through ``clean`` with ``lower=False`` and then through ``correct_text``
    using the module-level ``checker`` and ``corrector`` pipelines.

    Args:
        text: Raw text from the input textbox.

    Returns:
        The string returned by ``correct_text``.
    """
    truncated = text[:4000]
    cleaned = clean(truncated, lower=False)
    return correct_text(cleaned, checker, corrector)
# Build the Gradio interface: a title, usage notes, an input/output textbox
# pair, and a button that runs `update` on the input text.
# NOTE(review): the nesting below was reconstructed from flattened source -
# confirm that only the two textboxes belong inside the Row and that
# `demo.launch()` sits outside the Blocks context.
with gr.Blocks() as demo:
    gr.Markdown("# <center>Robust Grammar Correction with FLAN-T5</center>")
    gr.Markdown(
        "**Instructions:** Enter the text you want to correct in the textbox below (_text will be truncated to 4000 characters_). Click 'Process' to run."
    )
    # The string below is kept flush-left so its content is unchanged.
    gr.Markdown(
        """Models:
- `textattack/roberta-base-CoLA` for grammar quality detection
- `pszemraj/flan-t5-large-grammar-synthesis` for grammar correction
"""
    )
    with gr.Row():
        # Input box, pre-filled with a deliberately misspelled example.
        inp = gr.Textbox(
            label="input",
            placeholder="PUT TEXT TO CHECK & CORRECT BROSKI",
            value="I wen to the store yesturday to bye some food. I needd milk, bread, and a few otter things. The store was really crowed and I had a hard time finding everyting I needed. I finaly made it to the check out line and payed for my stuff.",
        )
        # Read-only box that receives the corrected text.
        out = gr.Textbox(label="output", interactive=False)
    btn = gr.Button("Process")
    # On click, pass the input textbox value to `update` and write its return
    # value to the output textbox.
    btn.click(fn=update, inputs=inp, outputs=out)
    gr.Markdown("---")
    gr.Markdown(
        "- see the [model card](https://huggingface.co/pszemraj/flan-t5-large-grammar-synthesis) for more info"
    )
    gr.Markdown("- if experiencing long wait times, feel free to duplicate the space!")
# Start the app.
demo.launch()
|