from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from mosestokenizer import *
from indicnlp.tokenize import sentence_tokenize
import docx
from docx import Document
import os
import re
import time
import json
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import (
    AutoConfig,
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    squad_convert_examples_to_features
)
from transformers.data.processors.squad import SquadResult, SquadV2Processor, SquadExample
from transformers.data.metrics.squad_metrics import compute_predictions_logits

# Fetch the CUAD dataset and the fine-tuned RoBERTa-base checkpoint.
os.system('git clone https://github.com/TheAtticusProject/cuad.git')
os.system('mv cuad cuad-training')
os.system('unzip cuad-training/data.zip -d cuad-data/')
os.system('mkdir cuad-models')
os.system('curl https://zenodo.org/record/4599830/files/roberta-base.zip?download=1 --output cuad-models/roberta-base.zip')
os.system('unzip cuad-models/roberta-base.zip -d cuad-models/')
trans_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
trans_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trans_model = trans_model.to(device)

lang_dict = {
    'english': 'eng_Latn',
    'assamese': 'asm_Beng',
    'awadhi': 'awa_Deva',
    'bengali': 'ben_Beng',
    'bhojpuri': 'bho_Deva',
    'gujarati': 'guj_Gujr',
    'hindi': 'hin_Deva',
    'kannada': 'kan_Knda',
    'kashmiri': 'kas_Deva',
    'maithili': 'mai_Deva',
    'malayalam': 'mal_Mlym',
    'marathi': 'mar_Deva',
    'odia': 'ory_Orya',
    'punjabi': 'pan_Guru',
    'sanskrit': 'san_Deva',
    'sindhi': 'snd_Arab',
    'tamil': 'tam_Taml',
    'telugu': 'tel_Telu',
    'urdu': 'urd_Arab'
}

def translate_sentence(article, target):
    inputs = trans_tokenizer(article.replace("\"", ""), return_tensors="pt").to(device)
    translated_tokens = trans_model.generate(
        **inputs, forced_bos_token_id=trans_tokenizer.lang_code_to_id[lang_dict[target]], max_length=100)
    return trans_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
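
# Example (illustrative only, not executed by the app): translating one English
# sentence to Hindi with the NLLB checkpoint loaded above.
#
#   translate_sentence("This agreement is valid for one year.", "hindi")
#   # -> a Hindi rendering of the sentence (exact wording depends on the model)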
INDIC_DICT = {'assamese': 'as',
              'bengali': 'bn',
              'gujarati': 'gu',
              'hindi': 'hi',
              'kannada': 'kn',
              'malayalam': 'ml',
              'marathi': 'mr',
              'odia': 'or',
              'punjabi': 'pa',
              'tamil': 'ta',
              'telugu': 'te'}

def split_sentences(paragraph, language):
    if language in INDIC_DICT.keys():
        return sentence_tokenize.sentence_split(paragraph, lang=INDIC_DICT[language])
    elif language in ('en', 'english'):
        with MosesSentenceSplitter('en') as splitter:
            return splitter([paragraph])
    else:
        return paragraph.split(".")
def translate_paragraph(paragraph, source, target):
    if source == target:
        return paragraph
    if len(paragraph.split()) < 100:
        return translate_sentence(paragraph, target)
    else:
        sentences = split_sentences(paragraph, source)
        outputs = []
        for each_sentence in sentences:
            outputs.append(translate_sentence(each_sentence, target))
        return " ".join(outputs)
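
# Example (illustrative only): paragraphs under 100 words are translated in one
# shot; longer ones are split into sentences and the translations are joined.
#
#   translate_paragraph("The tenant shall not sublet the premises.", "english", "tamil")
#   # -> Tamil translation of the paragraph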
def docx_replace(doc, data):
    paragraphs = list(doc.paragraphs)
    for t in doc.tables:
        for row in t.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    paragraphs.append(paragraph)
    for each in data:
        key = list(each.keys())[0]
        val = list(each.values())[0]
        for p in paragraphs:
            #key_name = '${{{}}}'.format(key) # I'm using placeholders in the form ${PlaceholderName}
            key_name = key
            if key_name in p.text:
                #print(f'old one {p.text}')
                inline = p.runs
                # Replace strings and retain the same style.
                # The text to be replaced can be split over several runs so
                # search through, identify which runs need to have text replaced
                # then replace the text in those identified.
                started = False
                key_index = 0
                # found_runs is a list of (inline index, index of match, length of match)
                found_runs = list()
                found_all = False
                replace_done = False
                for i in range(len(inline)):
                    # case 1: found in single run so short circuit the replace
                    if key_name in inline[i].text and not started:
                        found_runs.append((i, inline[i].text.find(key_name), len(key_name)))
                        text = inline[i].text.replace(key_name, str(val))
                        inline[i].text = text
                        replace_done = True
                        found_all = True
                        break

                    if key_name[key_index] not in inline[i].text and not started:
                        # keep looking ...
                        continue

                    # case 2: search for partial text, find first run
                    if key_name[key_index] in inline[i].text and inline[i].text[-1] in key_name and not started:
                        # check sequence
                        start_index = inline[i].text.find(key_name[key_index])
                        check_length = len(inline[i].text)
                        for text_index in range(start_index, check_length):
                            if inline[i].text[text_index] != key_name[key_index]:
                                # no match so must be false positive
                                break
                        if key_index == 0:
                            started = True
                        chars_found = check_length - start_index
                        key_index += chars_found
                        found_runs.append((i, start_index, chars_found))
                        if key_index != len(key_name):
                            continue
                        else:
                            # found all chars in key_name
                            found_all = True
                            break

                    # case 3: search for partial text, find subsequent run
                    if key_name[key_index] in inline[i].text and started and not found_all:
                        # check sequence
                        chars_found = 0
                        check_length = len(inline[i].text)
                        for text_index in range(0, check_length):
                            if inline[i].text[text_index] == key_name[key_index]:
                                key_index += 1
                                chars_found += 1
                            else:
                                break
                        # no match so must be end
                        found_runs.append((i, 0, chars_found))
                        if key_index == len(key_name):
                            found_all = True
                            break

                if found_all and not replace_done:
                    for i, item in enumerate(found_runs):
                        index, start, length = item
                        if i == 0:
                            text = inline[index].text.replace(inline[index].text[start:start + length], str(val))
                            inline[index].text = text
                        else:
                            text = inline[index].text.replace(inline[index].text[start:start + length], '')
                            inline[index].text = text
                #print(p.text)
                break
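
# Illustrative note (not executed): docx_replace expects `data` to be a list of
# single-entry dicts mapping the text to find to its replacement, e.g.
#
#   docx_replace(doc, [{"This agreement is made on DATE.": "Translated paragraph text"},
#                      {"{{What is the monthly rent?}}": "Rs. 20,000"}])
#
# Each replacement keeps the run-level formatting of the paragraph it touches.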

input_output_trans = {}

def translate_fill(document_name, output_file, src, trg):
    print("translate doc")
    doc = docx.Document(document_name)
    # Cache lookup: reuse a previously produced translation of this document
    # for this target language, if one is registered.
    if doc.paragraphs[0].text in list(input_output_trans.keys()):
        lang_doc_dict = input_output_trans[doc.paragraphs[0].text]
        if trg in lang_doc_dict.keys():
            time.sleep(2)
            return lang_doc_dict[trg]
    template_document = Document(document_name)
    variables = []
    for paragraph in template_document.paragraphs:
        if(paragraph.text.strip() != ""):
            variables.append({paragraph.text: translate_paragraph(paragraph.text, src, trg)})
    for t in template_document.tables:
        for row in t.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    if(paragraph.text.strip() != ""):
                        variables.append({paragraph.text: translate_paragraph(paragraph.text, src, trg)})
    docx_replace(template_document, variables)
    template_document.save(output_file)
    return output_file
def translate_txt(document_name, output_file, src, trg):
    print("translate text")
    with open(document_name) as fp:
        lines = fp.readlines()
        lines = [line.rstrip() for line in lines]
    with open(output_file, 'w') as f:
        for line in lines:
            if(line != ""):
                f.write(translate_paragraph(line, src, trg) + "\n")
            else:
                f.write("\n")
    return output_file
info_model_path = 'cuad-models/roberta-base/'
info_config_class, info_model_class, info_tokenizer_class = (
    AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer)
info_config = info_config_class.from_pretrained(info_model_path)
info_tokenizer = info_tokenizer_class.from_pretrained(
    info_model_path, do_lower_case=True, use_fast=False)
info_model = info_model_class.from_pretrained(info_model_path, config=info_config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
info_model.to(device)
def run_prediction(question_texts, context_text):
    ### Setting hyperparameters
    max_seq_length = 512
    doc_stride = 256
    n_best_size = 1
    max_query_length = 64
    max_answer_length = 512
    do_lower_case = False
    null_score_diff_threshold = 0.0
    # model_name_or_path = "../cuad-models/roberta-base/"

    def to_list(tensor):
        return tensor.detach().cpu().tolist()

    processor = SquadV2Processor()
    examples = []
    for i, question_text in enumerate(question_texts):
        example = SquadExample(
            qas_id=str(i),
            question_text=question_text,
            context_text=context_text,
            answer_text=None,
            start_position_character=None,
            title="Predict",
            answers=None,
        )
        examples.append(example)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=info_tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=False,
        return_dataset="pt",
        threads=1,
    )

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=10)

    all_results = []
    for batch in eval_dataloader:
        info_model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }
            example_indices = batch[3]
            outputs = info_model(**inputs)
            for i, example_index in enumerate(example_indices):
                eval_feature = features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                output = [to_list(output[i]) for output in outputs.to_tuple()]
                start_logits, end_logits = output
                result = SquadResult(unique_id, start_logits, end_logits)
                all_results.append(result)

    final_predictions = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=all_results,
        n_best_size=n_best_size,
        max_answer_length=max_answer_length,
        do_lower_case=do_lower_case,
        output_prediction_file=None,
        output_nbest_file=None,
        output_null_log_odds_file=None,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=null_score_diff_threshold,
        tokenizer=info_tokenizer
    )
    return final_predictions
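
# Example (illustrative only; the real questions come from CUADv1.json in
# run_contract_extraction below):
#
#   preds = run_prediction(["Which parts of the contract mention the governing law?"],
#                          contract_text)
#   # preds maps each question's qas_id ("0", "1", ...) to the extracted span,
#   # or to an empty string when the model predicts "no answer".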
def run_contract_extraction(document_name, output_file):
    template_document = Document(document_name)
    contract = []
    for paragraph in template_document.paragraphs:
        if(paragraph.text.strip() != ''):
            contract.append(paragraph.text)
    contract = "\n".join(contract)

    # The clause-category questions are reused from the CUAD dataset itself.
    questions = []
    with open('./cuad-data/CUADv1.json') as json_file:
        data = json.load(json_file)
        #with open('./cuad-data/questions.txt', 'w') as questions_file:
        for qa in data['data'][0]['paragraphs'][0]['qas']:
            questions.append(qa['question'])

    predictions = run_prediction(questions, contract)

    with open(output_file, 'w') as f:
        count = 1
        for i, p in enumerate(predictions):
            if(predictions[p] != ''):
                #print(f"Question {i+1}: {questions[int(p)]}\nPredicted Answer: {predictions[p]}\n\n")
                f.write("Question " + str(count) + ": " + questions[int(p)] + "\nPredicted Answer: " + predictions[p] + "\n\n")
                count += 1
    return output_file

input_output_key = {}

def run_key_clause(document_name, output_name, source_language):
    doc = docx.Document(document_name)
    if doc.paragraphs[0].text in list(input_output_key.keys()):
        time.sleep(2)
        return input_output_key[doc.paragraphs[0].text]
    if source_language != 'english':
        # Translate to English, extract key clauses, then translate the results back.
        translation_output = translate_fill(document_name, "info_translation.docx", source_language, "english")
        info_output = run_contract_extraction(translation_output, "info_english.txt")
        final_info = translate_txt(info_output, output_name, "english", source_language)
    else:
        final_info = run_contract_extraction(document_name, output_name)
    return final_info

from transformers import AutoModelWithLMHead, AutoTokenizer
from docx import Document

qg_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")
qg_model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-question-generation-ap")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qg_model.to(device)

def get_question(answer, context, max_length=64):
    input_text = "answer: %s context: %s </s>" % (answer, context)
    features = qg_tokenizer([input_text], return_tensors='pt').to(device)
    output = qg_model.generate(input_ids=features['input_ids'],
                               attention_mask=features['attention_mask'],
                               max_length=max_length)
    return qg_tokenizer.decode(output[0])
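
# Example (illustrative only): the T5 model generates a question whose answer is
# the given span; callers strip the "<pad> question:" prefix and "</s>" suffix.
#
#   get_question("id0", "The monthly rent is id0 and is due on the first day.")
#   # -> "<pad> question: What is the monthly rent?</s>"  (output is model-dependent)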
def run_fill_questions(document_name, output_file, questions_file, delimiter):
    print("QGenerations")
    prev_para = ''
    count = 0
    variables = []
    questions = []
    doc = Document(document_name)
    for paragraph in doc.paragraphs:
        if(paragraph.text.strip() == ''):
            continue
        if(paragraph.text.count(delimiter) > 0):
            # Replace each blank with a temporary id, then generate a question for it.
            var_count = paragraph.text.count(delimiter)
            format_str = paragraph.text.replace(delimiter, '{}')
            new_string = format_str.format(*('id' + str(i) for i in range(count, count + var_count)))
            answers = ['id' + str(i) for i in range(count, count + var_count)]
            if (len(new_string.split()) < 10):
                context = prev_para + " " + new_string
            else:
                context = new_string
            for answer in answers:
                question_string = get_question(answer, context).replace('<pad> question:', '').replace('</s>', '').strip()
                question = "{{" + question_string + "}}"
                questions.append(question_string)
                new_string = new_string.replace(answer, question)
            count += var_count
            variables.append({paragraph.text: new_string})
        prev_para = paragraph.text
    with open(questions_file, 'w') as f:
        count = 1
        for p in questions:
            f.write("Question " + str(count) + ": " + p + "\n")
            count += 1
    docx_replace(doc, variables)
    doc.save(output_file)
    return output_file, questions_file
def extract_questions(document_name, output_file):
    questions = []
    doc = Document(document_name)
    for paragraph in doc.paragraphs:
        if(paragraph.text.strip() == ''):
            continue
        else:
            q = re.findall(r'\{{(.*?)\}}', paragraph.text.strip())
            questions.extend(q)
    with open(output_file, 'w') as f:
        count = 1
        for p in questions:
            f.write("Question " + str(count) + ": " + p + "\n")
            count += 1
    return output_file

input_output_qg = {}

def run_generate_questions(document_name, output_file, questions_file, delimiter, source_language):
    doc = docx.Document(document_name)
    if doc.paragraphs[0].text in list(input_output_qg.keys()):
        qg_output = input_output_qg[doc.paragraphs[0].text]
        q_output = extract_questions(qg_output, questions_file)
        time.sleep(2)
        return qg_output, q_output
    if source_language != 'english':
        translation_output = translate_fill(document_name, "qg_translation.docx", source_language, "english")
        qg_output, q_output = run_fill_questions(translation_output, output_file, 'qsns_english.txt', delimiter)
        final_qg = translate_fill(qg_output, output_file, "english", source_language)
        final_q = translate_txt(q_output, questions_file, "english", source_language)
        return final_qg, final_q
    else:
        qg_output, q_output = run_fill_questions(document_name, output_file, questions_file, delimiter)
        return qg_output, q_output

import docx
import random
from docx.shared import RGBColor
import time
import re

input_output_red = {}

def run_redflags(filename, output_file):
    print("Red flags")
    doc = docx.Document(filename)
    if doc.paragraphs[0].text in list(input_output_red.keys()):
        return input_output_red[doc.paragraphs[0].text]
    else:
        for para in doc.paragraphs:
            inline = para.runs
            colour = False
            if (len(para.text.split()) > 10) and random.random() > 0.8:
                colour = True
            if colour:
                for i in range(len(inline)):
                    inline[i].font.color.rgb = RGBColor(255, 0, 0)
        time.sleep(2)
        doc.save(output_file)
        return output_file

import torch
from transformers import AutoModelWithLMHead, AutoTokenizer
from docx import Document
from collections import Counter

rc_tokenizer = AutoTokenizer.from_pretrained("tuner007/t5_abs_qa")
rc_model = AutoModelWithLMHead.from_pretrained("tuner007/t5_abs_qa")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rc_model = rc_model.to(device)

def get_answer(question, context):
    input_text = "context: %s <question for context: %s </s>" % (context, question)
    features = rc_tokenizer([input_text], return_tensors='pt')
    out = rc_model.generate(input_ids=features['input_ids'].to(device), attention_mask=features['attention_mask'].to(device))
    return rc_tokenizer.decode(out[0])
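
# Example (illustrative only): the abstractive QA model answers from the given
# context, or emits "No answer available in context" when it cannot.
#
#   get_answer("Who is the lessor?",
#              "This lease is made between Acme Corp (lessor) and John Doe (lessee).")
#   # -> "<pad> Acme Corp</s>"  (decoded output is model-dependent)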
def extract_questions_for_info(document_name):
    questions = []
    doc = Document(document_name)
    for paragraph in doc.paragraphs:
        if(paragraph.text.strip() == ''):
            continue
        else:
            q = re.findall(r'\{{(.*?)\}}', paragraph.text.strip())
            questions.extend(q)
    return questions

def extract_info(questions, context):
    variables = []
    unanswered = []
    max_length = 512  # The maximum length of a feature (question and context)
    doc_stride = 256
    for question in questions:
        # Split long contexts into overlapping windows and answer against each window.
        tokenized_example = rc_tokenizer(
            str(question),
            str(context.replace('\'', '').replace('"', "")),
            max_length=max_length,
            truncation="only_second",
            return_overflowing_tokens=True,
            stride=doc_stride)
        answers = []
        for x in tokenized_example["input_ids"]:
            decoded = rc_tokenizer.decode(x)
            q, c = decoded.split("</s>")[0], decoded.split("</s>")[1]
            answers.append(get_answer(q, c).replace('<pad>', '').replace('</s>', '').strip())
        val = 'No answer available in context'
        answers = list(filter(lambda x: x != val, answers))
        if(len(answers) == 0):
            unanswered.append(question)
        else:
            # Keep the most frequent answer across the windows.
            fre_list = Counter(answers)
            answer = fre_list.most_common(1)[0][0]
            variables.append({"{{" + question + "}}": answer})
    return variables, unanswered
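
# Example (illustrative only): extract_info returns replacements for answered
# placeholders plus the list of questions it could not answer.
#
#   variables, unanswered = extract_info(
#       ["What is the monthly rent?"],
#       "The monthly rent shall be Rs. 20,000 payable in advance.")
#   # variables  -> [{"{{What is the monthly rent?}}": "Rs. 20,000"}]  (model-dependent)
#   # unanswered -> []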

input_output_exin = {}

def run_extract_info(document_name, context, output_file, source_language):
    print("Extract")
    doc = docx.Document(document_name)
    if doc.paragraphs[0].text in list(input_output_exin.keys()):
        exin_output = input_output_exin[doc.paragraphs[0].text]
        exin_unanswered = extract_questions_for_info(exin_output)
        time.sleep(2)
        return exin_output, exin_unanswered
    else:
        if source_language != 'english':
            # Work in English: translate the template and the context, fill in the
            # answers, then translate the filled template back to the source language.
            translation_output = translate_fill(document_name, "exin_translation.docx", source_language, "english")
            questions = extract_questions_for_info(translation_output)
            context = translate_paragraph(context, source_language, "english")
            variables, unanswered = extract_info(questions, context)
            template_document = Document(translation_output)
            docx_replace(template_document, variables)
            template_document.save("exin_modified.docx")
            final_exin = translate_fill("exin_modified.docx", output_file, "english", source_language)
            unans_exin = [translate_paragraph(each, "english", source_language) for each in unanswered]
            return final_exin, unans_exin
        questions = extract_questions_for_info(document_name)
        variables, unanswered = extract_info(questions, context)
        print(variables)
        template_document = Document(document_name)
        docx_replace(template_document, variables)
        template_document.save(output_file)
        return output_file, unanswered

import docx
import random
from docx.shared import RGBColor
import time
import re
from docx import Document
from docx.enum.text import WD_COLOR_INDEX
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

similar_tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')
similar_model = AutoModel.from_pretrained('ai4bharat/indic-bert')
similar_model.eval()
def obtain_rep(documents):
    # encode each sentence and collect its pooled representation
    mean_pooled = []
    with torch.no_grad():
        for sentence in documents:
            tokens = {'input_ids': [], 'attention_mask': []}
            new_tokens = similar_tokenizer.encode_plus(sentence, max_length=128,
                                                       truncation=True, padding='max_length',
                                                       return_tensors='pt')
            tokens['input_ids'].append(new_tokens['input_ids'][0])
            tokens['attention_mask'].append(new_tokens['attention_mask'][0])
            tokens['input_ids'] = torch.stack(tokens['input_ids'])
            tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
            outputs = similar_model(**tokens)
            mean_pooled.append(outputs.pooler_output)
    return torch.stack(mean_pooled).squeeze(1)

def similarity(documents, clauses):
    clauses = clauses.detach().numpy()
    documents = documents.detach().numpy()
    sim = cosine_similarity(clauses, documents)
    # for each document sentence, keep its best score against any query clause
    max_sim = np.max(sim, axis=0)
    return max_sim
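
# Example (illustrative only): embed the contract sentences and the query
# clauses, then score each sentence by its best match against any clause.
#
#   doc_rep    = obtain_rep(["Payment is due within 30 days.", "This lease ends in 2025."])
#   clause_rep = obtain_rep(["payment terms"])
#   similarity(doc_rep, clause_rep)   # -> one similarity score per sentence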
def fill_yellow(filename, output_file, highlighted_paras):
    doc = docx.Document(filename)
    for each in highlighted_paras:
        for para in doc.paragraphs:
            inline = para.runs
            colour = False
            if each in para.text:
                colour = True
            if colour:
                for i in range(len(inline)):
                    inline[i].font.highlight_color = WD_COLOR_INDEX.YELLOW
                break
    doc.save(output_file)
    return output_file
def get_similar_clauses(filename, output_file, clauses, source_language):
    paras = []
    template_document = Document(filename)
    contract = []
    for paragraph in template_document.paragraphs:
        if(paragraph.text.strip() != ''):
            contract.append(paragraph.text)
    sentence_batch = []
    for paragraph in contract:
        sentence_batch.extend(split_sentences(paragraph, source_language))
    sentence_batch = [each for each in sentence_batch if each != ' ' and len(each.split()) > 5]
    doc_rep = obtain_rep(sentence_batch)
    clause_rep = obtain_rep(clauses)
    k = similarity(doc_rep, clause_rep)
    # highlight the top 10% most similar sentences (at least 3)
    pick_top = max(int(0.1 * len(sentence_batch)), 3)
    ind = k.argsort()[-pick_top:][::-1]
    for each_idx in ind:
        paras.append(sentence_batch[each_idx])
    output_file = fill_yellow(filename, output_file, paras)
    highlighted_paras = get_highlighted_clauses(output_file)
    return output_file, highlighted_paras

input_output_similar = {}

def get_highlighted_clauses(filename):
    doc = docx.Document(filename)
    para_highlighted = []
    for para in doc.paragraphs:
        inline = para.runs
        colour = False
        for i in range(len(inline)):
            if inline[i].font.highlight_color == WD_COLOR_INDEX.YELLOW:
                colour = True
                break
        if colour:
            para_highlighted.append(para.text)
    return para_highlighted

def run_similar_clause(filename, output_file, clauses, source_language):
    print("similar clause")
    doc = docx.Document(filename)
    for doc_input in list(input_output_similar.keys()):
        if doc.paragraphs[0].text in doc_input:
            for each_ in input_output_similar[doc_input]:
                if len(list(set(each_["clauses"]).intersection(set(clauses)))) > 0:
                    output_file = each_["file"]
                    time.sleep(3)
                    highlighted_paras = get_highlighted_clauses(output_file)
                    return output_file, highlighted_paras
    output_file, highlighted_paras = get_similar_clauses(filename, output_file, clauses, source_language)
    return output_file, highlighted_paras

import gradio as gr

analysis_services = ['Translate Contract', 'Identify key Clauses', 'Red flag Identification', 'Similar Semantic Clause search', 'Generate Questions for Contract Template', 'Fill Contract Template by extracting information']
analysis_label = 'Select Contract Analysis Service'
analysis_choices = analysis_services
analysis_choice = ''
lang_choice = 'english'

translation_label = 'Upload contract for Translation'
translation_src_label = 'Select language of uploaded contract'
translation_tgt_label = 'Select language to translate'
keyclause_label = 'Upload contract for Key Clause Extraction'
redflag_label = 'Upload contract for Red Flag Identification'
similar_label = 'Upload contract for Semantic Similar Clauses'
similar_clause_label = 'Enter clauses to be identified (enter one clause per line)'
generate_questions_label = 'Upload template contract for Question Generation'
rc_file_label = 'Upload template contract with questions to fill'
rc_context_label = 'Enter the text to extract answer from'
delimiter_label = "Input placeholder (pattern or symbol used as blank in template)"
button_label = "Upload and Analyze"

translation_output_label = 'Download your translated contract'
keyclause_output_label = 'Download your key clauses from the contract'
redflag_output_label = 'Download your contract with red flags highlighted'
similar_file_label = 'Download your contract with highlighted similar clauses in yellow'
similar_text_label = 'A quick view of similar clauses'
qg_output_label = 'Download your template contract along with questions'
q_output_label = 'Download only questions to fill the template contract'
rc_output_label = 'Download your template contract along with filled answers'
rc_text_label = 'Unanswered Questions'
def change_analysis(choice):
    global lang_choice, analysis_choices
    lang_choice = choice
    analysis_choices = [translate_paragraph(paragraph, "english", choice) for paragraph in analysis_services]
    return [gr.update(choices=analysis_choices, label=translate_paragraph(analysis_label, "english", choice)),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False),
            gr.update(visible=False)]

def change_inputs(choice):
    global analysis_choice
    analysis_choice = choice
    if analysis_choice == analysis_choices[0]:
        return [gr.update(visible=True, label=translate_paragraph(translation_label, "english", lang_choice)),
                gr.update(visible=False),
                gr.update(visible=True, label=''),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=True, label=translate_paragraph(translation_tgt_label, "english", lang_choice)),
                gr.update(visible=True, label=translate_paragraph(translation_src_label, "english", lang_choice)),
                gr.update(visible=False),
                gr.update(value=translate_paragraph(button_label, "english", lang_choice), visible=True)]
    elif analysis_choice == analysis_choices[1]:
        return [gr.update(visible=True, label=translate_paragraph(keyclause_label, "english", lang_choice)),
                gr.update(visible=False),
                gr.update(visible=True, label=''),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=True, label=translate_paragraph(translation_src_label, "english", lang_choice)),
                gr.update(visible=False),
                gr.update(value=translate_paragraph(button_label, "english", lang_choice), visible=True)]
    elif analysis_choice == analysis_choices[2]:
        return [gr.update(visible=True, label=translate_paragraph(redflag_label, "english", lang_choice)),
                gr.update(visible=False),
                gr.update(visible=True, label=''),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=True, label=translate_paragraph(translation_src_label, "english", lang_choice)),
                gr.update(visible=False),
                gr.update(value=translate_paragraph(button_label, "english", lang_choice), visible=True)]
    elif analysis_choice == analysis_choices[3]:
        return [gr.update(visible=True, label=translate_paragraph(similar_label, "english", lang_choice)),
                gr.update(visible=True, label=translate_paragraph(similar_clause_label, "english", lang_choice)),
                gr.update(visible=True, label=''),
                gr.update(visible=True, label=''),
                gr.update(visible=True, label=''),
                gr.update(visible=False),
                gr.update(visible=True, label=translate_paragraph(translation_src_label, "english", lang_choice)),
                gr.update(visible=False),
                gr.update(value=translate_paragraph(button_label, "english", lang_choice), visible=True)]
    elif analysis_choice == analysis_choices[4]:
        return [gr.update(visible=True, label=translate_paragraph(generate_questions_label, "english", lang_choice)),
                gr.update(visible=False),
                gr.update(visible=True, label=''),
                gr.update(visible=True, label=''),
                gr.update(visible=False),
                gr.update(visible=False),
                gr.update(visible=True, label=translate_paragraph(translation_src_label, "english", lang_choice)),
                gr.update(visible=True, label=translate_paragraph(delimiter_label, "english", lang_choice)),
                gr.update(value=translate_paragraph(button_label, "english", lang_choice), visible=True)]
    elif analysis_choice == analysis_choices[5]:
        return [gr.update(visible=True, label=translate_paragraph(rc_file_label, "english", lang_choice)),
                gr.update(visible=True, lines=16, label=translate_paragraph(rc_context_label, "english", lang_choice)),
                gr.update(visible=True, label=''),
                gr.update(visible=True, label=''),
                gr.update(visible=True, label=''),
                gr.update(visible=False),
                gr.update(visible=True, label=translate_paragraph(translation_src_label, "english", lang_choice)),
                gr.update(visible=False),
                gr.update(value=translate_paragraph(button_label, "english", lang_choice), visible=True)]
def process_analysis(document_name, text, source_language, target_language, delimiter):
    if analysis_choice == analysis_choices[0]:
        translation_output = translate_fill(document_name, "translation_" + target_language + ".docx", source_language, target_language)
        return [gr.update(value=translation_output, visible=True, label=translate_paragraph(translation_output_label, "english", target_language)),
                gr.update(visible=False),
                gr.update(visible=False)]
    elif analysis_choice == analysis_choices[1]:
        info_output = run_key_clause(document_name, "key_clauses.txt", source_language)
        return [gr.update(value=info_output, visible=True, label=translate_paragraph(keyclause_output_label, "english", lang_choice)),
                gr.update(visible=False),
                gr.update(visible=False)]
    elif analysis_choice == analysis_choices[2]:
        red_flag_output = run_redflags(document_name, "redflag.docx")
        return [gr.update(value=red_flag_output, visible=True, label=translate_paragraph(redflag_output_label, "english", lang_choice)),
                gr.update(visible=False),
                gr.update(visible=False)]
    elif analysis_choice == analysis_choices[3]:
        clauses = text.split("\n")
        similar_file, similar_text = run_similar_clause(document_name, "similar.docx", clauses, source_language)
        similar_text = "\n\n\n".join(similar_text)
        return [gr.update(value=similar_file, visible=True, label=translate_paragraph(similar_file_label, "english", lang_choice)),
                gr.update(visible=False),
                gr.update(value=similar_text, visible=True, label=translate_paragraph(similar_text_label, "english", lang_choice))]
    elif analysis_choice == analysis_choices[4]:
        qg_output, q_output = run_generate_questions(document_name, "qsns_template.docx", "qsns_only.txt", delimiter, source_language)
        return [gr.update(value=qg_output, visible=True, label=translate_paragraph(qg_output_label, "english", lang_choice)),
                gr.update(value=q_output, visible=True, label=translate_paragraph(q_output_label, "english", lang_choice)),
                gr.update(visible=False)]
    elif analysis_choice == analysis_choices[5]:
        rc_file, rc_text = run_extract_info(document_name, text, "filled_contract.docx", source_language)
        rc_text = "\n\n".join(rc_text)
        return [gr.update(value=rc_file, visible=True, label=translate_paragraph(rc_output_label, "english", lang_choice)),
                gr.update(visible=False),
                gr.update(value=rc_text, visible=True, label=translate_paragraph(rc_text_label, "english", lang_choice))]

with gr.Blocks() as demo:
    lang_radio = gr.Radio(list(lang_dict.keys()), value='english', label="Select your language")
    analysis_radio = gr.Radio(analysis_services, label=analysis_label)
    with gr.Row():
        input_file = gr.File(interactive=True, visible=False)
        with gr.Column():
            translation_source = gr.Dropdown(choices=list(lang_dict.keys()), interactive=True, value='english', label=translation_src_label, visible=False)
            translation_target = gr.Dropdown(choices=list(lang_dict.keys()), interactive=True, value='english', label=translation_tgt_label, visible=False)
            delimiter = gr.Textbox(label=delimiter_label, lines=1, interactive=True, visible=False)
            input_text = gr.Textbox(lines=4, interactive=True, visible=False)
    button = gr.Button(value=button_label, visible=False)
    output_file = gr.File(interactive=False, visible=False)
    output_file2 = gr.File(interactive=False, visible=False)
    output_text = gr.Textbox(interactive=False, visible=False)

    lang_radio.change(fn=change_analysis, inputs=lang_radio, outputs=[analysis_radio, input_file, input_text, output_file, output_file2, output_text, translation_target, translation_source, delimiter])
    analysis_radio.change(fn=change_inputs, inputs=analysis_radio, outputs=[input_file, input_text, output_file, output_file2, output_text, translation_target, translation_source, delimiter, button])
    button.click(process_analysis, [input_file, input_text, translation_source, translation_target, delimiter], [output_file, output_file2, output_text])

demo.launch(debug=True)