httpdaniel committed on
Commit
d23f7e7
·
1 Parent(s): 2fe07e8

Adding summariser

Browse files
Files changed (1) hide show
  1. app.py +74 -8
app.py CHANGED
@@ -1,25 +1,91 @@
1
  import gradio as gr
2
  from langchain_community.document_loaders import PyPDFLoader
 
 
3
 
4
- def summarise_pdf(pdf, progress=gr.Progress()):
5
 
6
- return "Summarised", "Complete!"
 
 
7
 
8
 
9
- with gr.Blocks() as demo:
 
 
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  gr.Markdown("<H1>PDF Summariser</H1>")
12
  gr.Markdown("<H3>Upload a PDF file and generate a summary</H3>")
13
- gr.Markdown("<H6>This project uses a MapReduce method to split the PDF into chunks, generate summaries of each of the chunks, and reduce them into a single final summary. Documents less than 3 pages use a Stuff method to simply stuff the entire document into the context window.</H6>")
 
 
 
 
 
14
 
15
  with gr.Row():
16
  with gr.Column(scale=1):
17
- pdf = gr.File(label="1. Upload PDF")
18
- summarise_btn = gr.Button(value="3. Summarise PDF", variant="primary")
19
- summary_progress = gr.Textbox(value="Not Started", label="Summary Progress")
20
  with gr.Column(scale=3):
21
  summary = gr.TextArea(label="Summary")
22
 
23
- summarise_btn.click(fn=summarise_pdf, inputs=pdf, outputs=[summary, summary_progress])
24
 
25
  demo.launch()
 
1
  import gradio as gr
2
  from langchain_community.document_loaders import PyPDFLoader
3
+ from huggingface_hub import AsyncInferenceClient, InferenceClient
4
+ import asyncio
5
 
 
6
 
7
# Hosted model used for both the per-chunk and the final summarisation calls.
model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Two clients bound to the same model: the async one serves the concurrent
# per-chunk requests, the sync one serves the single final reduce request.
async_client = AsyncInferenceClient(model=model_name)
sync_client = InferenceClient(model=model_name)
10
 
11
 
12
def summarise_pdf(pdf):
    """Summarise an uploaded PDF and return the summary text.

    Args:
        pdf: The value of the file input. Either ``None`` (nothing
            uploaded), a path string, or an object whose ``name`` attribute
            holds the path of the uploaded file.

    Returns:
        The final summary as a string, or a short user-facing message when
        there is nothing to summarise.
    """
    # An empty file input arrives as None; guard before reading ``pdf.name``,
    # which would otherwise raise AttributeError.
    if pdf is None:
        return "Please upload a PDF to summarise."

    # Accept both file-like objects exposing ``.name`` and plain path strings.
    pdf_path = getattr(pdf, "name", pdf)
    pages = PyPDFLoader(pdf_path).load()

    # Without any pages there is no text to send to the model.
    if not pages:
        return "No pages could be read from the PDF."

    # The map step is a coroutine; drive it to completion from this
    # synchronous handler.
    return asyncio.run(map_method(pages))
19
+
20
+
21
async def map_method(pages, chunk_size=10):
    """Summarise pages in fixed-size chunks, then merge the chunk summaries.

    Args:
        pages: Sequence of loaded page objects exposing ``page_content``.
        chunk_size: Number of consecutive pages summarised per request.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        The final summary string, or ``""`` when ``pages`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    # Nothing to summarise: skip the model calls entirely instead of asking
    # for a synthesis of an empty collection.
    if not pages:
        return ""

    chunks = [pages[i : i + chunk_size] for i in range(0, len(pages), chunk_size)]

    # Map step: one summarisation request per chunk, awaited concurrently.
    # gather() returns the results in the same order as the chunks.
    chunk_summaries = await asyncio.gather(
        *(summarise_chunk(combine_pages(chunk)) for chunk in chunks)
    )

    # Reduce step: merge the per-chunk summaries into a single summary.
    return reduce_summaries(chunk_summaries)
35
+
36
+
37
def combine_pages(pages):
    """Return the text of ``pages`` joined into one string.

    Each page's ``page_content`` is taken in order and separated from the
    next by a blank line.
    """
    page_texts = (page.page_content for page in pages)
    return "\n\n".join(page_texts)
40
+
41
+
42
async def summarise_chunk(chunk):
    """Ask the model for a 150-300 word summary of one chunk of text.

    Args:
        chunk: The combined text of the pages in this chunk.

    Returns:
        The model's summary with surrounding whitespace removed, or ``""``
        when the chunk contains no text or the response has no content.
    """
    # A chunk without any text gives the model nothing to summarise; skip the
    # request rather than ask for a summary of an empty document.
    if not chunk or not chunk.strip():
        return ""

    prompt = f"""Summarize the following document in 150-300 words, ensuring the most important ideas and main themes are highlighted:\n\n{chunk}"""

    message = [{"role": "user", "content": prompt}]

    result = await async_client.chat_completion(
        messages=message,
        max_tokens=2048,
        temperature=0.1,
    )

    # Treat a missing content field as an empty summary instead of failing
    # on ``None.strip()``.
    content = result.choices[0].message["content"]
    return (content or "").strip()
54
+
55
+
56
def reduce_summaries(summaries):
    """Merge several chunk summaries into one final summary via the model.

    Args:
        summaries: Iterable of summary strings, one per chunk.

    Returns:
        The synthesised summary with surrounding whitespace removed, or
        ``""`` when there is no non-blank summary to merge.
    """
    # Drop blank entries so the prompt only contains real summaries.
    usable = [text for text in summaries if text and text.strip()]

    # With nothing to merge, skip the request instead of asking the model to
    # synthesise an empty collection.
    if not usable:
        return ""

    combined_summaries = "\n\n".join(usable)

    reduce_prompt = f"Below is a collection of summaries, please synthesize them into a cohesive final summary, highlighting the key themes. Ensure the summary is concise and does not exceed 400 words:\n\n{combined_summaries}"

    message = [{"role": "user", "content": reduce_prompt}]

    result = sync_client.chat_completion(
        messages=message,
        max_tokens=2048,
        temperature=0.1,
    )

    # Treat a missing content field as an empty summary instead of failing
    # on ``None.strip()``.
    content = result.choices[0].message["content"]
    return (content or "").strip()
70
+
71
+
72
# Explanatory notes rendered under the page title.
_METHOD_NOTE = "<H6>This project uses a MapReduce method to split the PDF into chunks, generate summaries of each of the chunks asynchronously, and reduce them into a single final summary.</H6>"
_DEFAULT_PDF_NOTE = "<H6>Note: I have included The Metamorphosis by Franz Kafka as a default PDF to demonstrate its working on a large document. Replace this with any PDF you would like to summarise.</H6>"

with gr.Blocks(theme=gr.themes.Base()) as demo:
    # Page header and description.
    gr.Markdown("<H1>PDF Summariser</H1>")
    gr.Markdown("<H3>Upload a PDF file and generate a summary</H3>")
    gr.Markdown(_METHOD_NOTE)
    gr.Markdown(_DEFAULT_PDF_NOTE)

    with gr.Row():
        # Narrow left column: file input (pre-filled with the bundled PDF)
        # and the trigger button.
        with gr.Column(scale=1):
            pdf = gr.File(label="Upload PDF", value="./TheMetamorphosis.pdf")
            summarise_btn = gr.Button(value="Summarise PDF 🚀", variant="primary")

        # Wide right column: the generated summary.
        with gr.Column(scale=3):
            summary = gr.TextArea(label="Summary")

    # Clicking the button runs the summariser on the selected file and writes
    # the result into the summary box.
    summarise_btn.click(fn=summarise_pdf, inputs=pdf, outputs=summary)

demo.launch()