Manaranjan commited on
Commit
27b5d83
·
verified ·
1 Parent(s): 3438cbb

deploy at 2024-08-12 06:04:48.732585

Browse files
Files changed (4) hide show
  1. Dockerfile +10 -0
  2. config.ini +5 -0
  3. main.py +241 -0
  4. requirements.txt +11 -0
Dockerfile ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.10
2
+ WORKDIR /code
3
+ COPY --link --chown=1000 . .
4
+ RUN mkdir -p /tmp/cache/
5
+ RUN chmod a+rwx -R /tmp/cache/
6
+ ENV HF_HUB_CACHE=HF_HOME
7
+ RUN pip install --no-cache-dir -r requirements.txt
8
+
9
+ ENV PYTHONUNBUFFERED=1 PORT=7860
10
+ CMD ["python", "main.py"]
config.ini ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ [DEFAULT]
2
+ dataset_id = space-backup
3
+ db_dir = data
4
+ private_backup = True
5
+
main.py ADDED
@@ -0,0 +1,241 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fasthtml_hf import setup_hf_backup
2
+ import io
3
+ import os
4
+ import traceback
5
+ from pydantic_core import from_json
6
+ from fasthtml.common import *
7
+ from PyPDF2 import PdfReader
8
+ from PyPDF2 import PdfReader
9
+ from langchain.chains.summarize import load_summarize_chain
10
+ from langchain_core.prompts import PromptTemplate
11
+ from langchain_openai import ChatOpenAI
12
+ from langchain_anthropic import ChatAnthropic
13
+ from pydantic import BaseModel, Field, ValidationError
14
+ from langchain.output_parsers import PydanticOutputParser
15
+
16
+ # Initialize the fastHtml application
17
+ app, rt = fast_app()
18
+
19
+ # Define Pydantic models for structured output
20
+
21
+ # SummaryLine represents a single summary item with its keywords and description
22
class SummaryLine(BaseModel):
    # One summary sentence, the phrases to highlight inside it, and a longer
    # background note. Documented with comments rather than a class docstring
    # so the schema handed to the model stays exactly as before.
    summary_item: str = Field(
        description="Actual summary sentence that contains highlighting key data points or information.",
        max_length=200,
    )
    keywords: List[str] = Field(
        description="A list of exact words or phrases in the summary item that highlights most important data points or key ideas.",
    )
    # The field name keeps its original spelling because generate_table reads it.
    brief_descripton_of_summary: str = Field(
        description="This is elaborate description to provide context or background to the summary item.",
        min_length=200,
        max_length=500,
    )
29
+
30
+ # TopicSummaries represents a collection of summaries for a specific topic
31
class TopicSummaries(BaseModel):
    # All summary lines produced for one topic of the user's instructions.
    topic: str = Field(description="Topics of summary as mentioned in the instructions.")
    # min_length/max_length are the pydantic v2 spellings of the deprecated
    # min_items/max_items list-size constraints (3 to 5 entries per topic).
    summaries: List[SummaryLine] = Field(
        description="This a list summary for a topic with each one having it's own keywords and context.",
        min_length=3,
        max_length=5,
    )


# CompleteSummary is the top-level model containing all topic summaries
class CompleteSummary(BaseModel):
    # One TopicSummaries entry per topic; this is the object the output parser returns.
    summaries_list: List[TopicSummaries]
40
+
41
+ # Define the template for summarization
42
+ # This template provides instructions to the AI model on how to structure the summary
43
+ summarize_template = """
44
+ Write a concise summary of the case study given in the context. The summary should be based on the following topics.
45
+ """
46
+
47
+ # Define the specific sections to be included in the summary
48
+ summary_sections = """
49
+ - Factual: Facts or information that contains numbers, dates, events etc. that are mostly quantitative or qualitative data
50
+ - SWOT: Key Strength, weakness, opportunities or threats that are mentioned in the case study
51
+ - Decisions and Outcomes: Key decisions taken and it's successful or failed outcomes and reasons
52
+ - Ethical and Governance: Key considerations from ethical and governance perspective
53
+
54
+ """
55
+
56
+ # Define the context string for one-pass summarization
57
+ # This string provides additional formatting instructions for the summary
58
+ context_str = """
59
+ <context>
60
+ {context_content}
61
+ </context>
62
+
63
+ The response must follow the following schema strictly. There will be penalty for not following the schema.
64
+ """
65
+
66
+ # Define the template for the refine step of iterative (refine-style) summarization
67
+ # This template instructs the model to merge the previous summary with additional context into a final summary
68
+ refine_str = """The following are set of summaries given in a markdown format:
69
+
70
+ {previous_summary}
71
+
72
+ Now add the above summary with more context given below and create final summary, which should contain the following sections.
73
+ """
74
+
75
+ # Function to get the appropriate language model based on user selection
76
def getModel(model, key):
    """Build the chat model for the provider selected in the form.

    Args:
        model: Provider name. 'OpenAI' selects ChatOpenAI; any other value
            selects ChatAnthropic.
        key: API key supplied by the user for the chosen provider.

    Returns:
        A ChatOpenAI or ChatAnthropic instance configured with ``key``.
    """
    # The key is handed to the client directly instead of being written to
    # os.environ: the environment is process-wide, so a key stored there would
    # outlive the request and could be picked up by another user's request.
    if model == 'OpenAI':
        return ChatOpenAI(api_key=key,
                          temperature=0,     # Set to 0 for deterministic output
                          model="gpt-4o",    # Using the GPT-4o model
                          max_tokens=4096)   # Limit the response length
    # No temperature / max_tokens are set here, so the library defaults apply.
    return ChatAnthropic(api_key=key, model='claude-3-5-sonnet-20240620')
85
+
86
+ # Function to highlight specific keywords in the text
87
def highlight_text(text, key_words):
    """Return a Div in which every occurrence of a keyword is shown bold and red.

    Args:
        text: Summary sentence (model output, treated as untrusted).
        key_words: Exact words or phrases to highlight inside ``text``.

    Returns:
        The component obtained by evaluating ``html2ft`` on the generated markup.
    """
    import html
    import re

    # Empty keywords are dropped (str.replace("", ...) would wrap every
    # character); longest-first ordering lets a phrase win over a word it contains.
    words = sorted({word for word in key_words if word}, key=len, reverse=True)

    if words:
        # A single regex pass over the ORIGINAL text means a later keyword can
        # never match inside markup inserted for an earlier one.
        pattern = re.compile("|".join(re.escape(word) for word in words))
        pieces = []
        cursor = 0
        for match in pattern.finditer(text):
            pieces.append(html.escape(text[cursor:match.start()]))
            pieces.append('<span style="color:red;"><b>'
                          + html.escape(match.group(0))
                          + '</b></span>')
            cursor = match.end()
        pieces.append(html.escape(text[cursor:]))
        body = "".join(pieces)
    else:
        body = html.escape(text)

    html_text = "<div>" + body + "</div>"
    # NOTE(security): eval() runs the Python source that html2ft generates from
    # this markup. The text is escaped above so model output cannot introduce
    # tags of its own, but building the components directly (Div/Span/B) would
    # remove the eval entirely and should be preferred in a follow-up.
    return eval(html2ft(html_text))
92
+
93
+ # Function to generate an HTML table from the summary object
94
def generate_table(summaries_obj):
    """Render the structured summary as a table wrapped in a card.

    Each topic spans as many rows as it has summaries: the topic cell is
    emitted once (with ``rowspan``) on the topic's first row, and every row
    carries the highlighted summary plus a collapsible "Learn More:" cell.

    Args:
        summaries_obj: Object exposing ``summaries_list``; each entry has
            ``topic`` and ``summaries``, and each summary has ``summary_item``,
            ``keywords`` and ``brief_descripton_of_summary``.

    Returns:
        Div(Card(Table(...))) ready to be returned from a route.
    """
    def _summary_cells(summary):
        # The two cells shared by every row: highlighted text and the details box.
        return (
            Td(highlight_text(summary.summary_item, summary.keywords),
               style="width: 60%;"),
            Td(Div(Details(Summary("Learn More:",
                                   role="button",
                                   cls="outline"),
                           P(summary.brief_descripton_of_summary)),
                   style="padding: 0.5em 0.5em 0;"),
               style="width: 30%;"),
        )

    column_names = ['Topic', "Summary"]
    table_header = Thead(Tr(*[Th(key) for key in column_names]))
    table_rows = []
    for topic_summary in summaries_obj.summaries_list:
        for index, summary in enumerate(topic_summary.summaries):
            cells = _summary_cells(summary)
            if index == 0:
                # rowspan is an HTML attribute of the topic cell only; it was
                # previously also pasted into the CSS of later rows, where it
                # is not valid.
                topic_cell = Td(topic_summary.topic,
                                rowspan=f"{len(topic_summary.summaries)}",
                                style="width: 10%;")
                cells = (topic_cell, *cells)
            table_rows.append(Tr(*cells))

    return Div(Card(Table(table_header, Tbody(*table_rows))))
125
+
126
+ # Function to perform one-pass summarization on the given pages
127
def onepass_summarize(pages, summary_sections, model):
    """Summarize the document text in a single model call.

    The prompt is the fixed summarization preamble, the caller's topic list,
    the document wrapped in <context> tags, and the parser's format
    instructions. The model reply is parsed into a CompleteSummary.

    Args:
        pages: Text of the document to summarize.
        summary_sections: Topic list / instructions describing the sections
            the summary must contain (user supplied).
        model: Chat model to run the prompt with.

    Returns:
        The parsed CompleteSummary produced by the output parser.
    """
    # Same text the prompt will contain, kept only for the log line below.
    onepass_summary_template = summarize_template + summary_sections + context_str + "{format_instructions}"
    print("Onepass instruction: " + onepass_summary_template)

    output_parser = PydanticOutputParser(pydantic_object=CompleteSummary)
    format_instructions = output_parser.get_format_instructions()
    print("Format instructions: " + format_instructions)

    # The user's instructions are injected as a template VARIABLE rather than
    # concatenated into the template text, so any '{' or '}' they contain is
    # treated as literal text and not as a placeholder.
    prompt = PromptTemplate.from_template(
        summarize_template + "{summary_sections}" + context_str + "{format_instructions}"
    )
    # prompt -> model -> parser
    summary_chain = prompt | model | output_parser

    print("Getting Summary......")
    return summary_chain.invoke({"summary_sections": summary_sections,
                                 "context_content": pages,
                                 "format_instructions": format_instructions})
159
+
160
+ # Function to generate the configuration form for the web interface
161
def getConfigForm():
    """Build the card holding the model / key / file / instruction form.

    The form posts to /submit, targets #result and uses #indicator as its
    htmx indicator.
    """
    heading = Div(
        Label(Strong("Model and Prompt Instruction: "), style="color:#3498db; font-size:25px;")
    )
    model_picker = Div(
        Label(Strong('Model: ')),
        Select(Option("OpenAI"), Option("Anthropic"), id="model")
    )
    key_entry = Div(
        Label(Strong('Secret Key: ')),
        Input(id="secret", type="password", placeholder="Key: "),
    )
    file_entry = Div(
        Label(Strong('Upload File: '), "Upload only pdf file with max size of 1 MB"),
        Input(id="file", type='file', placeholder="Key: ", accept=".pdf", max='1024000'),
    )
    # The textarea is pre-filled with the module-level default topic list.
    instruction_entry = Div(
        Label(Strong('Instruction: ')),
        P('Provide the list of topics and their one line description for summarization as shown in example. Summarization will have these sections.',
          style='font-size: 12px;'),
        Textarea(summary_sections, id="instruction",
                 style="height:250px")
    )
    submit_row = Div(
        Button("Summarize")
    )
    credit_row = Div(
        Br(),
        A("Developed by Manaranjan Pradhan", href="http://www.manaranjanp.com/",
          target="_blank",
          style='color: red; font-size: 16px;')
    )

    form = Form(hx_post="/submit", hx_target="#result", hx_swap_oob="innerHTML", hx_indicator="#indicator")
    return Card(form(heading,
                     model_picker,
                     key_entry,
                     file_entry,
                     instruction_entry,
                     submit_row,
                     credit_row))
194
+
195
+ # Define the route for the homepage
196
@app.get('/')
def homepage():
    """Serve the landing page: the config form on the left, results on the right."""
    form_card = getConfigForm()

    # Progress indicator referenced by the form's hx_indicator.
    busy_indicator = Div(Label(Strong('Summarizing the document.... take a deep breath....')),
                         Progress(), id="indicator", cls="htmx-indicator")
    # Target that the /submit response is swapped into.
    # NOTE(review): "font-size=24pt" is not valid CSS (':' expected); kept
    # as-is here because correcting it would visibly change the rendering.
    result_area = Div(id="result", style="font-family:Helvetica; font-size=24pt;")

    layout = Grid(form_card,
                  Div(busy_indicator, result_area),
                  style="grid-template-columns: 400px 1000px; gap: 50px;")
    return Titled('Document Summarization', layout)
206
+
207
+ # Define the route for form submission
208
@app.post('/submit')
async def post(d:dict):
    """Handle the form submission: read the PDF, summarize it, return the table.

    Args:
        d: Submitted form data; reads the 'file', 'model', 'secret' and
            'instruction' entries.

    Returns:
        The summary table on success, an alert Div when no file content was
        uploaded, or the error message as a string when anything else fails.
    """
    try:
        # Check if a file was uploaded
        if "file" not in d:
            return Div("File not uploaded.", cls = 'alert', )

        pages = await d['file'].read(-1)
        # A file field submitted with no content reads as zero bytes; report it
        # as a missing upload instead of letting the PDF parser fail on it.
        if not pages:
            return Div("File not uploaded.", cls = 'alert', )
        pdf_reader = PdfReader(io.BytesIO(pages))

        # Extract text from each page of the PDF (newline after every page)
        text_content = "".join((page.extract_text() or "") + "\n"
                               for page in pdf_reader.pages)

        # Get the appropriate language model
        model = getModel(d['model'], d['secret'])

        # Perform one-pass summarization
        summaries = onepass_summarize(text_content, d['instruction'], model)

        print(f"Summary Obtained: {summaries}")

        # Generate and return the HTML table with the summaries
        return generate_table(summaries)

    except Exception as e:
        # Exception rather than BaseException: task cancellation and
        # KeyboardInterrupt must propagate instead of becoming a response body.
        print(traceback.format_exc())
        return str(e)
237
+
238
+ setup_hf_backup(app)
239
+
240
+ # Start the FastHTML server
241
+ serve()
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ python-fasthtml==0.2.4
2
+ streamlit==1.37.0
3
+ pandas==2.2.1
4
+ pypdf==4.2.0
5
+ PyPDF2==3.0.1
6
+ langchain==0.2.7
7
+ langchain-community==0.2.7
8
+ langchain-core==0.2.13
9
+ langchain-openai==0.1.10
10
+ openai==1.34.0
11
+ fasthtml-hf==0.1.4