from tenacity import retry, stop_after_attempt, wait_random_exponential
from tqdm import tqdm
import time
import sys
# import openai
# import pandas as pd
# import tiktoken
import random
import csv
import os
import pickle
import json
import string
import re
import difflib
from typing import List
from collections import defaultdict

import numpy as np
import streamlit as st

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Ensure the NLTK tokenizer models and stopword lists are available
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

from retrieve import get_retrieved_results, get_slide
# Get the parent directory
# parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Add the parent directory to the system path
# sys.path.append(parent_dir)
from utils import AzureModels, write_to_file, read_from_file
# from utils_open import OpenModels
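
# This Streamlit app walks a four-step pipeline (Submit, Retrieve, Organize,
# Summarize), with each step gated behind a session-state flag so the buttons
# unlock in order. Intermediate results are written to disk between steps
# because each button click triggers a full script rerun.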
# Function to calculate similarity
def calculate_similarity(sentence1: str, sentence2: str) -> float:
    return difflib.SequenceMatcher(None, sentence1, sentence2).ratio()
# Function to highlight sentences based on similarity
def highlight_sentences(predicted: str, ground_truth: str) -> str:
    ground_truth_sentences = nltk.sent_tokenize(ground_truth)
    predicted_sentences = nltk.sent_tokenize(predicted)
    highlighted_text = ""
    for pred_sentence in predicted_sentences:
        max_similarity = 0
        for gt_sentence in ground_truth_sentences:
            similarity = calculate_similarity(pred_sentence, gt_sentence)
            if similarity > max_similarity:
                max_similarity = similarity
        # Determine shade of green; max_similarity is already in [0, 1]
        shade = max_similarity
        highlighted_text += f'<span style="background-color: rgba(0, 255, 0, {shade})">{pred_sentence}</span> '
    return highlighted_text
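
# Usage note: highlight_sentences is currently only referenced from the
# commented-out call in the Summarize step below. For example,
#   highlight_sentences("The cat sat.", "A cat sat down. It purred.")
# wraps "The cat sat." in a <span> whose green alpha equals its best
# SequenceMatcher ratio against any ground-truth sentence.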
st.title('Multi-Document Narrative Generation')

# options = ["Select", "Adobe Firefly", "Adobe Acrobat"]
# selection = st.selectbox('Select an example', options)
selection = "Adobe Firefly"

# Input for Presentation Title
presentation_title = st.text_input("Presentation Title")
# Input for Slide Title
slide_title = st.text_input("Slide Title")

# Option for uploading a folder (simulated by allowing multiple file uploads)
uploaded_files = st.file_uploader(
    "Upload source documents (multiple .txt files allowed)",
    accept_multiple_files=True,
    type="txt"
)
if selection == "Select":
    pass
elif selection == "Adobe Firefly":
    # with open('wiki_1.json', 'r') as fr:
    #     list_1 = json.load(fr)
    with open('wiki_2.json', 'r') as fr:
        list_2 = json.load(fr)
    # Collect the uploaded .txt files as {filename: content}
    tmp_ref_abstract = {}
    file_count = 0
    for file in uploaded_files:
        tmp_filename = file.name
        tmp_content = file.read().decode('utf-8').strip()
        tmp_ref_abstract[tmp_filename] = tmp_content
        file_count += 1
    document_name = presentation_title
    section_names = [slide_title] * file_count
    ref_doc_indices = np.arange(1, file_count + 1).tolist()
    list_1 = [
        {
            "abstract": "Write the '{}' section of the article titled '{}'.".format(slide_title, presentation_title),
            "ref_abstract": tmp_ref_abstract,
            "related_work": ""
        }
    ]
else:
    with open('wiki_2.json', 'r') as fr:
        list_1 = json.load(fr)
    with open('wiki_1.json', 'r') as fr:
        list_2 = json.load(fr)
    document_name = "Adobe Acrobat"
    section_names = ["Introduction"] * 3 + ["History"] * 3 + ["Document Cloud"] * 2
    ref_doc_indices = np.arange(1, 4).tolist() + np.arange(1, 4).tolist() + np.arange(1, 3).tolist()
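
# With the hardcoded "Adobe Firefly" selection, list_1 holds the target section
# spec built from the uploads and list_2 the example pool from wiki_2.json; the
# two swap roles for the second retrieval call in the Retrieve step below.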
# Initialize session state
if 'submit_clicked' not in st.session_state:
    st.session_state.submit_clicked = False

inp_doc_list = []
inp_keys_list = []
retrieved_doc_list = []
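
# Step 1 (Submit): display the uploaded source documents as editable text
# areas and persist the inputs for the later steps.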
if st.button('Submit'):
    if 'retrieve_clicked' not in st.session_state:
        st.session_state.retrieve_clicked = False
    st.session_state.submit_clicked = True
    # Flatten the reference documents and their filenames for display
    for item in list_1:
        for key in item['ref_abstract']:
            inp_doc_list.append(item['ref_abstract'][key])
            inp_keys_list.append(key)

    retrieve_prompt_template = "{} : Document {} for the '{}' Section of the Article titled '{}'"
    ui_doc_list = []
    ui_retrieved_doc_list = []

    # One editable text box per input document
    st.header('Input Documents')
    for i in range(len(section_names)):
        ui_doc_list.append(st.text_area(retrieve_prompt_template.format(inp_keys_list[i], ref_doc_indices[i], section_names[i], document_name), value=inp_doc_list[i]))

    # Persist the inputs so later steps (which trigger Streamlit reruns) can reload them
    write_to_file("inp_keys_list.json", inp_keys_list)
    write_to_file("section_names.json", section_names)
    write_to_file("document_name.pickle", document_name)
if st.session_state.submit_clicked:
    if st.button('Retrieve'):
        # Reload the inputs persisted by the Submit step
        inp_keys_list = read_from_file("inp_keys_list.json")
        section_names = read_from_file("section_names.json")
        document_name = read_from_file("document_name.pickle")
        ui_retrieved_doc_list = []
        if 'organize_clicked' not in st.session_state:
            st.session_state.organize_clicked = False
        st.session_state.retrieve_clicked = True

        # Run retrieval in both directions and cache the results on disk
        retrieved_out = get_retrieved_results("gpt4o", 0, "fixed", list_2, list_1)
        write_to_file("retrieved_docs.json", retrieved_out)
        retrieved_out_train = get_retrieved_results("gpt4o", 0, "fixed", list_1, list_2)
        write_to_file("retrieved_docs_train.json", retrieved_out_train)

        for ret_item in retrieved_out:
            for key in ret_item['ref_abstract']:
                retrieved_doc_list.append(ret_item['ref_abstract'][key]['abstract'])

        st.header('Retrieved Documents')
        retrieve_prompt_template = "{} : Document {} for the '{}' Section of the Article titled '{}'"
        for i in range(len(section_names)):
            ui_retrieved_doc_list.append(st.text_area(retrieve_prompt_template.format(inp_keys_list[i], ref_doc_indices[i], section_names[i], document_name), value=retrieved_doc_list[i]))
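
# Step 3 (Organize): organize the retrieved documents into the narrative
# structure and cache the result for the Summarize step.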
# Use .get() so a fresh session (flag not yet set) does not raise AttributeError
if st.session_state.get('retrieve_clicked', False):
    if st.button('Organize'):
        if 'summarize_clicked' not in st.session_state:
            st.session_state.summarize_clicked = False
        st.session_state.organize_clicked = True
        st.header("Organization of the documents in the narrative")
        topics_list = [slide_title]
        organize_list = []
        ui_organize_list = []
        test_list = read_from_file("retrieved_docs.json")
        train_list = read_from_file("retrieved_docs_train.json")
        organize_out = get_retrieved_results("gpt4o", 1, "fixed", train_list, test_list, True)
        for i in range(len(organize_out)):
            organize_list.append(organize_out[i])
            ui_organize_list.append(st.text_area("Section: " + topics_list[i], value=organize_out[i]))
        write_to_file("organized_docs.json", organize_out)
if st.session_state.get('organize_clicked', False):
    if st.button("Summarize"):
        # if 'narrative_clicked' not in st.session_state:
        #     st.session_state.narrative_clicked = False
        st.session_state.summarize_clicked = True
        st.header("Intent-based multi-document summary")
        topics_list = [slide_title]
        generate_list = []
        ui_generate_list = []
        slides_list = []
        test_list = read_from_file("retrieved_docs.json")
        train_list = read_from_file("retrieved_docs_train.json")
        organize_out = read_from_file("organized_docs.json")
        gen_summary_dict = get_retrieved_results("gpt4o", 1, "fixed", train_list, test_list, False, organize_out)
        for i in range(len(gen_summary_dict)):
            # highlighted_summary = highlight_sentences(gen_summary_dict[i], test_list[i]['abstract'])
            slides_list.append(get_slide(topics_list[i], gen_summary_dict[i]))
            st.markdown(f"## {topics_list[i]}")
            st.markdown(f"{gen_summary_dict[i]}")
            # st.markdown(highlighted_summary, unsafe_allow_html=True)

        st.header("Generated Narrative")
        for i in range(len(slides_list)):
            st.markdown("---")
            st.markdown(slides_list[i])
        st.markdown("---")
# if st.session_state.summarize_clicked:
#     if st.button("Narrative"):
#         st.session_state.narrative_clicked = True