ismot pszemraj committed on
Commit
fc8e96d
0 Parent(s):

Duplicate from pszemraj/FLAN-grammar-correction

Browse files

Co-authored-by: Peter Szemraj <pszemraj@users.noreply.huggingface.co>

Files changed (4) hide show
  1. .gitattributes +34 -0
  2. README.md +14 -0
  3. app.py +116 -0
  4. requirements.txt +7 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: FLAN Grammar Correction
3
+ emoji: 🔥
4
+ colorFrom: blue
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 3.16.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: pszemraj/FLAN-grammar-correction
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+
4
+ from cleantext import clean
5
+ import gradio as gr
6
+ from tqdm.auto import tqdm
7
+ from transformers import pipeline
8
+
9
+
10
# Model identifiers handed to the two pipelines below.
checker_model_name = "textattack/roberta-base-CoLA"
corrector_model_name = "pszemraj/flan-t5-large-grammar-synthesis"

# Text-classification pipeline whose label/score decides whether a piece of
# text is sent to the corrector.
checker = pipeline("text-classification", checker_model_name)

# Text2text pipeline that produces the corrected text.
# Setting HF_DEMO_NO_USE_ONNX (to any value) opts out of the optimum/ONNX path.
if "HF_DEMO_NO_USE_ONNX" in os.environ:
    corrector = pipeline("text2text-generation", corrector_model_name)
else:
    # NOTE: this import rebinds the module-level name `pipeline`; from here on
    # it refers to the optimum implementation, not the one imported from
    # transformers at the top of the file.
    from optimum.pipelines import pipeline

    corrector = pipeline(
        "text2text-generation",
        model=corrector_model_name,
        accelerator="ort",
    )
28
+
29
+
30
def split_text(text: str, batch_size: int = 2) -> list:
    """Split text into sentences and group them into small batches.

    A sentence boundary is a run of spaces that follows ``.`` or ``?`` and is
    followed by an uppercase ASCII letter. The lookbehind additionally
    requires that the character two positions before the punctuation is not
    an uppercase ASCII letter, so a token such as ``Mr.`` does not end a
    sentence.

    Args:
        text: The text to split.
        batch_size: Maximum number of sentences per batch. The default of 2
            keeps the batching this function has always produced.

    Returns:
        A list of batches, each a list of sentence strings in their original
        order. Every batch holds ``batch_size`` sentences except possibly the
        last. Empty or whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Nothing to split: avoid handing an empty string to the models.
    if not text or not text.strip():
        return []

    sentences = re.split(r"(?<=[^A-Z].[.?]) +(?=[A-Z])", text)

    # Chunk by position rather than comparing each sentence with the last
    # one by value, which closed a batch early whenever an earlier sentence
    # had the same text as the final sentence.
    return [
        sentences[start:start + batch_size]
        for start in range(0, len(sentences), batch_size)
    ]
51
+
52
+
53
def correct_text(text: str, checker, corrector, separator: str = " ") -> str:
    """Run the checker over batches of sentences and correct the flagged ones.

    The text is divided with ``split_text``; each batch is joined with a
    single space and passed to ``checker``. A batch is sent to ``corrector``
    unless the checker's first result is ``LABEL_1`` with a score of at
    least 0.9; otherwise the batch text is kept as it is.

    Args:
        text: The text to process.
        checker: Callable returning a sequence whose first item is a mapping
            with ``"label"`` and ``"score"`` keys.
        corrector: Callable returning a sequence whose first item is a
            mapping with a ``"generated_text"`` key.
        separator: String placed between the processed batches in the result.

    Returns:
        The processed batches joined with ``separator``.
    """
    batches = split_text(text)
    pieces = []

    for group in tqdm(batches, total=len(batches), desc="correcting text.."):
        chunk = " ".join(group)
        verdict = checker(chunk)[0]

        # Anything other than a confident LABEL_1 goes through the corrector.
        needs_fix = verdict["label"] != "LABEL_1" or verdict["score"] < 0.9
        if needs_fix:
            pieces.append(corrector(chunk)[0]["generated_text"])
        else:
            pieces.append(chunk)

    return separator.join(pieces)
84
+
85
+
86
def update(text: str):
    """Button callback: truncate, clean and grammar-correct the input text.

    Only the first 4000 characters of ``text`` are used. They are passed
    through ``clean`` with ``lower=False`` and then through ``correct_text``
    with the module-level ``checker`` and ``corrector`` pipelines.

    Args:
        text: Raw text from the input textbox.

    Returns:
        The value returned by ``correct_text`` for the cleaned text.
    """
    truncated = text[:4000]
    cleaned = clean(truncated, lower=False)
    return correct_text(cleaned, checker, corrector)
89
+
90
+
91
+ with gr.Blocks() as demo:
92
+ gr.Markdown("# <center>Robust Grammar Correction with FLAN-T5</center>")
93
+ gr.Markdown(
94
+ "**Instructions:** Enter the text you want to correct in the textbox below (_text will be truncated to 4000 characters_). Click 'Process' to run."
95
+ )
96
+ gr.Markdown(
97
+ """Models:
98
+ - `textattack/roberta-base-CoLA` for grammar quality detection
99
+ - `pszemraj/flan-t5-large-grammar-synthesis` for grammar correction
100
+ """
101
+ )
102
+ with gr.Row():
103
+ inp = gr.Textbox(
104
+ label="input",
105
+ placeholder="PUT TEXT TO CHECK & CORRECT BROSKI",
106
+ value="I wen to the store yesturday to bye some food. I needd milk, bread, and a few otter things. The store was really crowed and I had a hard time finding everyting I needed. I finaly made it to the check out line and payed for my stuff.",
107
+ )
108
+ out = gr.Textbox(label="output", interactive=False)
109
+ btn = gr.Button("Process")
110
+ btn.click(fn=update, inputs=inp, outputs=out)
111
+ gr.Markdown("---")
112
+ gr.Markdown(
113
+ "- see the [model card](https://huggingface.co/pszemraj/flan-t5-large-grammar-synthesis) for more info"
114
+ )
115
+ gr.Markdown("- if experiencing long wait times, feel free to duplicate the space!")
116
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ transformers
2
+ gradio
3
+ tqdm
4
+ torch
5
+ clean-text
6
+ accelerate
7
+ optimum[onnxruntime]