deploy at 2024-08-12 06:04:48.732585
- Dockerfile +10 -0
- config.ini +5 -0
- main.py +241 -0
- requirements.txt +11 -0
Dockerfile
ADDED
@@ -0,0 +1,10 @@
FROM python:3.10
WORKDIR /code
COPY --link --chown=1000 . .
RUN mkdir -p /tmp/cache/
RUN chmod a+rwx -R /tmp/cache/
# Point the Hugging Face Hub cache at the writable directory created above
ENV HF_HUB_CACHE=/tmp/cache/
RUN pip install --no-cache-dir -r requirements.txt

ENV PYTHONUNBUFFERED=1 PORT=7860
CMD ["python", "main.py"]
config.ini
ADDED
@@ -0,0 +1,5 @@
[DEFAULT]
dataset_id = space-backup
db_dir = data
private_backup = True
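These values appear to be what fasthtml_hf's setup_hf_backup (called at the bottom of main.py) uses to back the Space's data up to a Hugging Face dataset. As a rough illustration only, assuming standard INI semantics rather than the library's actual loader, the three keys read like this:

import configparser

# Illustrative sketch: read config.ini with the standard library
config = configparser.ConfigParser()
config.read("config.ini")

dataset_id = config["DEFAULT"]["dataset_id"]                     # "space-backup"
db_dir = config["DEFAULT"]["db_dir"]                             # "data"
private_backup = config["DEFAULT"].getboolean("private_backup")  # True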
main.py
ADDED
@@ -0,0 +1,241 @@
from fasthtml_hf import setup_hf_backup
import io
import os
import traceback
from typing import List
from pydantic_core import from_json
from fasthtml.common import *
from PyPDF2 import PdfReader
from langchain.chains.summarize import load_summarize_chain
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from pydantic import BaseModel, Field, ValidationError
from langchain.output_parsers import PydanticOutputParser

# Initialize the FastHTML application
app, rt = fast_app()

# Define Pydantic models for structured output

# SummaryLine represents a single summary item with its keywords and description
class SummaryLine(BaseModel):
    summary_item: str = Field(description="The actual summary sentence, highlighting key data points or information.",
                              max_length=200)
    keywords: List[str] = Field(description="A list of exact words or phrases in the summary item that highlight the most important data points or key ideas.")
    brief_descripton_of_summary: str = Field(description="An elaborate description that provides context or background for the summary item.",
                                             min_length=200,
                                             max_length=500)

# TopicSummaries represents a collection of summaries for a specific topic
class TopicSummaries(BaseModel):
    topic: str = Field(description="Topic of the summary, as named in the instructions.")
    summaries: List[SummaryLine] = Field(description="A list of summaries for a topic, each with its own keywords and context.",
                                         min_items=3,
                                         max_items=5)

# CompleteSummary is the top-level model containing all topic summaries
class CompleteSummary(BaseModel):
    summaries_list: List[TopicSummaries]
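# For reference, a model response that satisfies the schema above has roughly the
# JSON shape sketched below. The values are invented for illustration only: a real
# brief_descripton_of_summary must be 200-500 characters and each topic needs 3-5
# summary lines.
_example_response_shape = {
    "summaries_list": [
        {
            "topic": "Factual",
            "summaries": [
                {
                    "summary_item": "Revenue grew 18% between 2020 and 2022.",
                    "keywords": ["18%", "2020", "2022"],
                    "brief_descripton_of_summary": "Context or background for the data point...",
                },
            ],
        },
    ],
}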
# Define the template for summarization
# This template provides instructions to the AI model on how to structure the summary
summarize_template = """
Write a concise summary of the case study given in the context. The summary should be based on the following topics.
"""

# Define the specific sections to be included in the summary
summary_sections = """
- Factual: Facts or information containing numbers, dates, events, etc. that are mostly quantitative or qualitative data
- SWOT: Key strengths, weaknesses, opportunities or threats that are mentioned in the case study
- Decisions and Outcomes: Key decisions taken and their successful or failed outcomes and reasons
- Ethical and Governance: Key considerations from an ethical and governance perspective

"""

# Define the context string for one-pass summarization
# This string provides additional formatting instructions for the summary
context_str = """
<context>
{context_content}
</context>

The response must strictly follow the schema below. There will be a penalty for not following the schema.
"""

# Define the template for the reduce step in map-reduce summarization (currently unused)
# This template instructs the model to consolidate multiple summaries into a final summary
refine_str = """The following is a set of summaries given in markdown format:

{previous_summary}

Now combine the summary above with the additional context given below and create a final summary, which should contain the following sections.
"""

# Function to get the appropriate language model based on user selection
def getModel(model, key):
    if model == 'OpenAI':
        os.environ['OPENAI_API_KEY'] = key
        return ChatOpenAI(temperature=0,       # Set to 0 for deterministic output
                          model="gpt-4o",      # Use the GPT-4o model
                          max_tokens=4096)     # Limit the response length
    else:
        os.environ['ANTHROPIC_API_KEY'] = key
        return ChatAnthropic(model='claude-3-5-sonnet-20240620')  # Use Claude 3.5 Sonnet
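# Illustrative usage of getModel; the key values below are placeholders, not real keys:
#   llm = getModel('OpenAI', 'sk-...')   # sets OPENAI_API_KEY and returns a ChatOpenAI client
#   llm = getModel('Anthropic', '...')   # any other choice sets ANTHROPIC_API_KEY and returns ChatAnthropic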
# Function to highlight specific keywords in the text
def highlight_text(text, key_words):
    # Wrap each keyword in a red, bold span
    for word in key_words:
        text = text.replace(word, f'<span style="color:red;"><b>{word}</b></span>')
    html_text = "<div>" + text + "</div>"
    # html2ft converts the HTML string into FastHTML (FT) component code,
    # which eval turns into the actual component tree
    return eval(html2ft(html_text))
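# Illustrative example: highlight_text("Revenue grew 18% in 2022", ["18%"]) returns
# an FT component that renders as:
#   <div>Revenue grew <span style="color:red;"><b>18%</b></span> in 2022</div>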
# Function to generate an HTML table from the summary object
def generate_table(summaries_obj):
    column_names = ['Topic', 'Summary']
    table_header = Thead(Tr(*[Th(key) for key in column_names]))
    table_rows = []
    for topic_summary in summaries_obj.summaries_list:
        first_row = True
        for summary in topic_summary.summaries:
            if first_row:
                # The first row of each topic carries the topic cell, which spans
                # all of that topic's summary rows via rowspan
                table_rows.append(Tr(Td(topic_summary.topic,
                                        rowspan=f"{len(topic_summary.summaries)}",
                                        style="width: 10%;"),
                                     Td(highlight_text(summary.summary_item, summary.keywords),
                                        style="width: 60%;"),
                                     Td(Div(Details(Summary("Learn More:",
                                                            role="button",
                                                            cls="outline"),
                                                    P(summary.brief_descripton_of_summary)),
                                            style="padding: 0.5em 0.5em 0;"),
                                        style="width: 30%;")))
                first_row = False
            else:
                # Subsequent rows omit the topic cell, which is already spanned from above
                table_rows.append(Tr(Td(highlight_text(summary.summary_item, summary.keywords),
                                        style="width: 60%;"),
                                     Td(Div(Details(Summary("Learn More:",
                                                            role="button",
                                                            cls="outline"),
                                                    P(summary.brief_descripton_of_summary)),
                                            style="padding: 0.5em 0.5em 0;"),
                                        style="width: 30%;")))

    return Div(Card(Table(table_header, Tbody(*table_rows))))

# Function to perform one-pass summarization on the given text
def onepass_summarize(pages, summary_sections, model):
    """
    Perform one-pass summarization on the given text.

    This function creates a summarization chain using the provided topic sections
    and model, then applies it to the input text to generate a structured summary.

    Args:
        pages (str): Text extracted from the uploaded document
        summary_sections (str): Topics (with one-line descriptions) to summarize against
        model: Chat model instance to use for summarization (ChatOpenAI or ChatAnthropic)

    Returns:
        CompleteSummary: Structured summary parsed from the model response
    """
    onepass_summary_template = summarize_template + summary_sections + context_str + "{format_instructions}"
    print("Onepass instruction: " + onepass_summary_template)

    output_parser = PydanticOutputParser(pydantic_object=CompleteSummary)
    format_instructions = output_parser.get_format_instructions()
    print("Format instructions: " + format_instructions)

    # Create a prompt template combining the instructions and context
    prompt = PromptTemplate.from_template(onepass_summary_template)
    # Chain the prompt, the model and the output parser
    summary_chain = prompt | model | output_parser

    print("Getting Summary......")
    # Invoke the chain on the input text and return the parsed summary
    summaries = summary_chain.invoke({"context_content": pages,
                                      "format_instructions": format_instructions})
    return summaries

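# Illustrative call, assuming text_content holds the extracted PDF text:
#   result = onepass_summarize(text_content, summary_sections, getModel('OpenAI', 'sk-...'))
#   result is a CompleteSummary instance, ready to be passed to generate_table(result)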
# Function to generate the configuration form for the web interface
def getConfigForm():
    return Card(Form(hx_post="/submit", hx_target="#result", hx_swap_oob="innerHTML", hx_indicator="#indicator")(
        Div(
            Label(Strong("Model and Prompt Instruction: "), style="color:#3498db; font-size:25px;")
        ),
        Div(
            Label(Strong('Model: ')),
            Select(Option("OpenAI"), Option("Anthropic"), id="model")
        ),
        Div(
            Label(Strong('Secret Key: ')),
            Input(id="secret", type="password", placeholder="Key: "),
        ),
        Div(
            Label(Strong('Upload File: '), "Upload a PDF file only, with a maximum size of 1 MB"),
            Input(id="file", type='file', placeholder="Key: ", accept=".pdf", max='1024000'),
        ),
        Div(
            Label(Strong('Instruction: ')),
            P('Provide the list of topics and a one-line description of each, as shown in the example. The summary will have these sections.',
              style='font-size: 12px;'),
            Textarea(summary_sections, id="instruction",
                     style="height:250px")
        ),
        Div(
            Button("Summarize")
        ),
        Div(
            Br(),
            A("Developed by Manaranjan Pradhan", href="http://www.manaranjanp.com/",
              target="_blank",
              style='color: red; font-size: 16px;')
        )))

# Define the route for the homepage
@app.get('/')
def homepage():
    return Titled('Document Summarization',
                  Grid(getConfigForm(),
                       Div(
                           Div(Label(Strong('Summarizing the document.... take a deep breath....')),
                               Progress(), id="indicator", cls="htmx-indicator"),
                           Div(id="result", style="font-family: Helvetica; font-size: 24pt;")
                       ),
                       style="grid-template-columns: 400px 1000px; gap: 50px;"
                       ))

# Define the route for form submission
@app.post('/submit')
async def post(d: dict):
    try:
        # Check if a file was uploaded
        if "file" in d.keys():
            pages = await d['file'].read(-1)
            pdf_reader = PdfReader(io.BytesIO(pages))
        else:
            return Div("File not uploaded.", cls='alert')

        # Extract text from each page of the PDF
        text_content = ""
        for page in pdf_reader.pages:
            text_content += page.extract_text() + "\n"

        # Get the appropriate language model
        model = getModel(d['model'], d['secret'])

        # Perform one-pass summarization
        summaries = onepass_summarize(text_content, d['instruction'], model)

        print(f"Summary Obtained: {summaries}")

        # Generate and return the HTML table with the summaries
        return generate_table(summaries)

    except BaseException as e:
        print(traceback.format_exc())
        return str(e)

# Back the Space's data up to the Hugging Face dataset configured in config.ini
setup_hf_backup(app)

# Start the FastHTML server
serve()
requirements.txt
ADDED
@@ -0,0 +1,11 @@
python-fasthtml==0.2.4
streamlit==1.37.0
pandas==2.2.1
pypdf==4.2.0
PyPDF2==3.0.1
langchain==0.2.7
langchain-community==0.2.7
langchain-core==0.2.13
langchain-openai==0.1.10
openai==1.34.0
fasthtml-hf==0.1.4