ULMER Louis (T0240644) committed on
Commit 05e69cc
1 Parent(s): 610463f

pushing the app

__init__.py ADDED
File without changes
app.py ADDED
@@ -0,0 +1,32 @@
+ import os
+ import streamlit as st
+ from backend.data_augmenter import BackTranslatorAugmenter
+
+ os.environ['NO_PROXY'] = '127.0.0.1'
+ st.set_page_config(layout="wide", page_title="Paraphraser.AI", page_icon="🤖")
+ st.title('Paraphraser.AI 🤖')
+ st.header("An intelligent sentence paraphraser")
+
+ model_selection = st.sidebar.selectbox(
+     'Select a paraphraser:',
+     ['Vladimir 🧑🏼', 'Maria 👩🏽'],
+ )
+
+ input_text = st.text_area('Please type the text to paraphrase')
+
+ class DummyAugmenter:  # unused offline stand-in for BackTranslatorAugmenter
+     def __init__(self, in_lang="en", out_lang="ru") -> None:
+         pass
+     def back_translate(self, text):
+         return "La marche des vertueux est semée d'obstacles"
+
+
+ if model_selection == 'Vladimir 🧑🏼':
+     model = BackTranslatorAugmenter(in_lang="en", out_lang="ru")
+ elif model_selection == 'Maria 👩🏽':
+     model = BackTranslatorAugmenter(in_lang="en", out_lang="es")
+
+ if input_text:
+     st.header("Paraphrased text:")
+     st.write("".join(model.back_translate(input_text)))
+
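
app.py above re-instantiates BackTranslatorAugmenter on every Streamlit rerun, i.e. after each widget interaction. A minimal sketch of loading each checkpoint only once, assuming a Streamlit version that provides st.cache_resource (the load_model helper is illustrative, not part of this commit):

import streamlit as st
from backend.data_augmenter import BackTranslatorAugmenter

@st.cache_resource
def load_model(out_lang: str) -> BackTranslatorAugmenter:
    # Illustrative helper (not in the commit): built once per out_lang,
    # then reused across Streamlit reruns instead of reloading the models.
    return BackTranslatorAugmenter(in_lang="en", out_lang=out_lang)

model = load_model("ru" if model_selection == 'Vladimir 🧑🏼' else "es")
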
app_gradio.py ADDED
@@ -0,0 +1,21 @@
+ import gradio as gr
+ import os
+ os.environ['NO_PROXY'] = '127.0.0.1'
+
+ class DummyAugmenter:  # offline stand-in; defined but not wired up below
+     def __init__(self, in_lang="en", out_lang="ru") -> None:
+         pass
+     def back_translate(self, text):
+         return "La marche des vertueux est semée d'obstacles"
+
+ def greet(name):
+     return "Hello " + name + "!"
+
+ with gr.Blocks() as demo:
+     name = gr.Textbox(label="Please type the text to paraphrase")
+     output = gr.Textbox(label="Output Box")
+
+     greet_btn = gr.Button("Greet")
+     greet_btn.click(fn=greet, inputs=name, outputs=output)
+
+ demo.launch()
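
app_gradio.py wires the placeholder greet function to the button, so DummyAugmenter is defined but never used. A minimal sketch of hooking it up instead, inside the same gr.Blocks context (the paraphrase_btn name is illustrative):

dummy = DummyAugmenter()
paraphrase_btn = gr.Button("Paraphrase")
# back_translate takes the textbox string and returns a string, so it can
# serve directly as the click handler in place of greet.
paraphrase_btn.click(fn=dummy.back_translate, inputs=name, outputs=output)
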
backend/__init__.py ADDED
@@ -0,0 +1 @@
+ from .data_augmenter import BackTranslatorAugmenter
backend/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (226 Bytes).
backend/__pycache__/data_augmenter.cpython-310.pyc ADDED
Binary file (5.57 kB).
backend/data_augmenter.py ADDED
@@ -0,0 +1,178 @@
+ import argparse
+ import time
+ from tqdm import tqdm
+ import pandas as pd
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+ import os
+ import json
+ import torch
+ from dotenv import load_dotenv
+
+ load_dotenv()
+ from nltk.tokenize import sent_tokenize
+
+ wd = os.path.dirname(os.path.realpath(__file__))
+
+
+ class BackTranslatorAugmenter:
+     """
+     A class that performs back-translation in order to do data augmentation.
+     For best results we recommend using bottleneck languages (`out_lang`)
+     such as Russian (ru) and Spanish (es).
+
+     Example
+     -------
+     .. code-block:: python
+
+         data_augmenter = BackTranslatorAugmenter(out_lang="es")
+         text = "I want to augment this sentence"
+         print(text)
+         data_augmenter.back_translate(text, verbose=True)
+
+     :param in_lang: the text input language, defaults to "en"
+     :type in_lang: str, optional
+     :param out_lang: the bottleneck language to translate through, defaults to "ru"
+     :type out_lang: str, optional
+     """
+
+     def __init__(self, in_lang="en", out_lang="ru") -> None:
+         if torch.cuda.is_available():
+             self.device = "cuda"
+         else:
+             self.device = "cpu"
+
+         self.in_tokenizer = AutoTokenizer.from_pretrained(
+             f"Helsinki-NLP/opus-mt-{in_lang}-{out_lang}",
+             cache_dir=os.getenv("TRANSFORMERS_CACHE"),
+         )
+         self.in_model = AutoModelForSeq2SeqLM.from_pretrained(
+             f"Helsinki-NLP/opus-mt-{in_lang}-{out_lang}",
+             cache_dir=os.getenv("TRANSFORMERS_CACHE"),
+         ).to(self.device)
+         self.out_tokenizer = AutoTokenizer.from_pretrained(
+             f"Helsinki-NLP/opus-mt-{out_lang}-{in_lang}",
+             cache_dir=os.getenv("TRANSFORMERS_CACHE"),
+         )
+         self.out_model = AutoModelForSeq2SeqLM.from_pretrained(
+             f"Helsinki-NLP/opus-mt-{out_lang}-{in_lang}",
+             cache_dir=os.getenv("TRANSFORMERS_CACHE"),
+         ).to(self.device)
+
+     def back_translate(self, text, verbose=False):
+         if verbose:
+             tic = time.time()
+         # First leg: translate in_lang -> out_lang.
+         encoded_text = self.in_tokenizer(
+             text, return_tensors="pt", padding=True, truncation=True
+         ).to(self.device)
+         in_generated_ids = self.in_model.generate(**encoded_text)
+
+         in_preds = [
+             self.in_tokenizer.decode(
+                 gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True
+             )
+             for gen_id in in_generated_ids
+         ]
+         if verbose:
+             print("in_preds:", in_preds)
+         # Second leg: translate back out_lang -> in_lang.
+         encoded_text = self.out_tokenizer(
+             in_preds, return_tensors="pt", padding=True, truncation=True
+         ).to(self.device)
+         out_generated_ids = self.out_model.generate(**encoded_text)
+         out_preds = [
+             self.out_tokenizer.decode(
+                 gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True
+             )
+             for gen_id in out_generated_ids
+         ]
+
+         if verbose:
+             tac = time.time()
+             print("out_preds:", out_preds)
+             print("Elapsed time:", tac - tic)
+         return out_preds
+
+     def back_translate_long(self, text, verbose=False):
+         # Split long text into sentences, back-translate the batch, then rejoin.
+         sentences = sent_tokenize(text)
+         return [" ".join(self.back_translate(sentences, verbose=verbose))]
+
+
+ def do_backtranslation(**args):
+     df = pd.read_csv(args["input_data_path"])[:1]  # NOTE: [:1] restricts the run to the first row
+     data_augmenter = BackTranslatorAugmenter(
+         in_lang=args["in_lang"], out_lang=args["out_lang"]
+     )
+
+     dict_res = {col_name: [] for _, col_name in args["col_map"].items()}
+
+     for i in tqdm(range(0, len(df), args["batch_size"])):
+         for old_col, new_col in args["col_map"].items():
+             dict_res[new_col] += data_augmenter.back_translate(
+                 list(df[old_col].iloc[i : i + args["batch_size"]])
+             )
+
+     augmented_df = pd.DataFrame(dict_res)
+     os.makedirs(os.path.dirname(args["output_data_path"]), exist_ok=True)
+     augmented_df.to_csv(args["output_data_path"])
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(
+         description="Back-translate a dataset for better training"
+     )
+     parser.add_argument(
+         "-in_lang",
+         type=str,
+         default="en",
+         help="""the text input language, defaults to "en";
+         one can choose between {'es','ru','en','fr','de','pt','zh'},
+         but please have a look at https://huggingface.co/Helsinki-NLP to make sure
+         the language pair you ask for is available""",
+     )
+
+     parser.add_argument(
+         "-out_lang",
+         type=str,
+         default="ru",
+         help='the bottleneck language, defaults to "ru"; one can '
+         "choose between {'es','ru','en','fr','de','pt','zh'}, but please have a "
+         "look at https://huggingface.co/Helsinki-NLP to make sure the language "
+         "pair you ask for is available",
+     )
+
+     parser.add_argument(
+         "-input_data_path",
+         type=str,
+         default=os.path.join(wd, "dataset", "train_neurips_dataset.csv"),
+         help="dataset location; please note it should be a CSV file with two "
+         'columns: "text" and "summary"',
+     )
+
+     parser.add_argument(
+         "-output_data_path",
+         type=str,
+         default=os.path.join(
+             wd, "dataset", "augmented_datas", "augmented_dataset_output.csv"
+         ),
+         help="augmented dataset output location",
+     )
+
+     parser.add_argument(
+         "-columns_mapping",
+         "--col_map",
+         type=json.loads,
+         default={"abstract": "text", "tldr": "summary"},
+         help="column names to apply data augmentation on; "
+         "you have to give a key/value dict such as "
+         "{'input_column_name1': 'output_column_name1'}; by default "
+         "it is set to {'abstract': 'text', 'tldr': 'summary'}; "
+         "if you don't want to change the column names, "
+         "please provide a dict such that keys=values",
+     )
+
+     parser.add_argument("-batch_size", type=int, default=25, help="batch size")
+
+     args = parser.parse_args()
+     do_backtranslation(**vars(args))
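
The class docstring above already sketches the intended call pattern; for completeness, a minimal usage example of the module as committed (the first run downloads the Helsinki-NLP checkpoints, and back_translate_long additionally needs nltk's punkt tokenizer data):

from backend.data_augmenter import BackTranslatorAugmenter

augmenter = BackTranslatorAugmenter(in_lang="en", out_lang="es")
# back_translate accepts a string or a list of strings and returns a list.
print(augmenter.back_translate("I want to augment this sentence", verbose=True))
# back_translate_long splits longer text into sentences, back-translates
# them as one batch, and rejoins the results.
print(augmenter.back_translate_long("First sentence. Second sentence."))
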
env.yml ADDED
@@ -0,0 +1,27 @@
+ name: nlp_paraphraser_env
+
+ channels:
+   - conda-forge
+   - pytorch
+   - huggingface
+   - defaults
+
+ dependencies:
+   - python=3.10
+   - pytorch-gpu
+   - albumentations
+   - torchvision
+   - tqdm
+   - streamlit
+   - transformers
+   - fastapi
+   - ipython
+   - matplotlib
+   - pandas
+   - pip
+   - scikit-learn
+   - uvicorn
+   - plotly
+   - nltk            # used by backend/data_augmenter.py
+   - python-dotenv   # provides the `dotenv` import in backend/data_augmenter.py
+   - gradio          # used by app_gradio.py
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ # Runs on Python 3.10 (see env.yml); the interpreter version is not a pip requirement.
+ torch             # PyTorch for pip; the conda env uses pytorch-gpu instead
+ albumentations
+ torchvision
+ tqdm
+ streamlit
+ transformers
+ fastapi
+ ipython
+ matplotlib
+ pandas
+ scikit-learn
+ uvicorn
+ plotly
+ nltk              # used by backend/data_augmenter.py
+ python-dotenv     # provides the `dotenv` import
+ gradio            # used by app_gradio.py