Shreyas094 committed
Commit 0bbc003 · verified · 1 Parent(s): 9739d19

Update app.py

Files changed (1)
  1. app.py +111 -80
app.py CHANGED
@@ -4,98 +4,129 @@ from PyPDF2 import PdfReader
 import requests
 from dotenv import load_dotenv
 import tiktoken
+
 # Load environment variables
 load_dotenv()
+
 # Get the Hugging Face API token
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
+
 # Initialize the tokenizer
 tokenizer = tiktoken.get_encoding("cl100k_base")
+
 def count_tokens(text):
-    return len(tokenizer.encode(text))
+    return len(tokenizer.encode(text))
+
 def summarize_text(text, instructions, agent_name):
-    print(f"{agent_name}: Starting summarization")
-    API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
-    headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
-    payload = {
-        "inputs": f"{instructions}\n\nText to summarize:\n{text}",
-        "parameters": {"max_length": 500}
-    }
-    print(f"{agent_name}: Sending request to API")
-    response = requests.post(API_URL, headers=headers, json=payload)
-    print(f"{agent_name}: Received response from API")
-    return response.json()[0]["generated_text"]
+    print(f"{agent_name}: Starting summarization")
+    API_URL = "https://api-inference.huggingface.co/models/meta-llama/Meta-Llama-3-8B-Instruct"
+    headers = {"Authorization": f"Bearer {HUGGINGFACE_TOKEN}"}
+
+    payload = {
+        "inputs": f"{instructions}\n\nText to summarize:\n{text}",
+        "parameters": {"max_length": 500}
+    }
+
+    print(f"{agent_name}: Sending request to API")
+    response = requests.post(API_URL, headers=headers, json=payload)
+    print(f"{agent_name}: Received response from API")
+
+    # Extracting only the generated summary from the response
+    generated_text = response.json()[0]["generated_text"]
+
+    # Assuming the model returns the entire input followed by the summary
+    # Split the generated text by the delimiter "\n\n" and take the last part as the summary
+    summary = generated_text.split("\n\n")[-1]
+
+    return summary
+
 def process_pdf(pdf_file, chunk_instructions, window_instructions, final_instructions):
-    print("Starting PDF processing")
-    # Read PDF
-    reader = PdfReader(pdf_file)
-    text = ""
-    for page in reader.pages:
-        text += page.extract_text() + "\n\n"
-    print(f"Extracted {len(reader.pages)} pages from PDF")
-    # Chunk the text (simple splitting by pages for this example)
-    chunks = text.split("\n\n")
-    print(f"Split text into {len(chunks)} chunks")
-    # Agent 1: Summarize each chunk
-    agent1_summaries = []
-    for i, chunk in enumerate(chunks):
-        print(f"Agent 1: Processing chunk {i+1}/{len(chunks)}")
-        summary = summarize_text(chunk, chunk_instructions, "Agent 1")
-        agent1_summaries.append(summary)
-    print("Agent 1: Finished processing all chunks")
-    # Concatenate Agent 1 summaries
-    concatenated_summary = "\n\n".join(agent1_summaries)
-    print(f"Concatenated Agent 1 summaries (length: {len(concatenated_summary)})")
-    print(f"Concatenated Summary:{concatenated_summary}")
-    # Sliding window approach
-    window_size = 3500 # in tokens
-    step_size = 3000 # overlap of 500 tokens
-    windows = []
-    current_position = 0
-    while current_position < len(concatenated_summary):
-        window_end = current_position
-        window_text = ""
-        while count_tokens(window_text) < window_size and window_end < len(concatenated_summary):
-            window_text += concatenated_summary[window_end]
-            window_end += 1
-        windows.append(window_text)
-        current_position += step_size
-    print(f"Created {len(windows)} windows for intermediate summarization")
-    # Intermediate summarization
-    intermediate_summaries = []
-    for i, window in enumerate(windows):
-        print(f"Processing window {i+1}/{len(windows)}")
-        summary = summarize_text(window, window_instructions, f"Window {i+1}")
-        intermediate_summaries.append(summary)
-    # Final summarization
-    final_input = "\n\n".join(intermediate_summaries)
-    print(f"Final input length: {count_tokens(final_input)} tokens")
-    final_summary = summarize_text(final_input, final_instructions, "Agent 2")
-    print("Agent 2: Finished final summarization")
-    return final_summary
+    print("Starting PDF processing")
+    # Read PDF
+    reader = PdfReader(pdf_file)
+    text = ""
+    for page in reader.pages:
+        text += page.extract_text() + "\n\n"
+
+    print(f"Extracted {len(reader.pages)} pages from PDF")
+
+    # Chunk the text (simple splitting by pages for this example)
+    chunks = text.split("\n\n")
+    print(f"Split text into {len(chunks)} chunks")
+
+    # Agent 1: Summarize each chunk
+    agent1_summaries = []
+    for i, chunk in enumerate(chunks):
+        print(f"Agent 1: Processing chunk {i+1}/{len(chunks)}")
+        summary = summarize_text(chunk, chunk_instructions, "Agent 1")
+        agent1_summaries.append(summary)
+
+    print("Agent 1: Finished processing all chunks")
+
+    # Concatenate Agent 1 summaries
+    concatenated_summary = "\n\n".join(agent1_summaries)
+    print(f"Concatenated Agent 1 summaries (length: {len(concatenated_summary)})")
+    print(f"Concatenated Summary: {concatenated_summary}")
+
+    # Sliding window approach
+    window_size = 3500 # in tokens
+    step_size = 3000 # overlap of 500 tokens
+    windows = []
+    current_position = 0
+
+    while current_position < len(concatenated_summary):
+        window_end = current_position
+        window_text = ""
+        while count_tokens(window_text) < window_size and window_end < len(concatenated_summary):
+            window_text += concatenated_summary[window_end]
+            window_end += 1
+        windows.append(window_text)
+        current_position += step_size
+
+    print(f"Created {len(windows)} windows for intermediate summarization")
+
+    # Intermediate summarization
+    intermediate_summaries = []
+    for i, window in enumerate(windows):
+        print(f"Processing window {i+1}/{len(windows)}")
+        summary = summarize_text(window, window_instructions, f"Window {i+1}")
+        intermediate_summaries.append(summary)
+
+    # Final summarization
+    final_input = "\n\n".join(intermediate_summaries)
+    print(f"Final input length: {count_tokens(final_input)} tokens")
+    final_summary = summarize_text(final_input, final_instructions, "Agent 2")
+    print("Agent 2: Finished final summarization")
+
+    return final_summary
+
 def pdf_summarizer(pdf_file, chunk_instructions, window_instructions, final_instructions):
-    if pdf_file is None:
-        print("Error: No PDF file uploaded")
-        return "Please upload a PDF file."
-    try:
-        print(f"Starting summarization process for file: {pdf_file.name}")
-        summary = process_pdf(pdf_file.name, chunk_instructions, window_instructions, final_instructions)
-        print("Summarization process completed successfully")
-        return summary
-    except Exception as e:
-        print(f"An error occurred: {str(e)}")
-        return f"An error occurred: {str(e)}"
+    if pdf_file is None:
+        print("Error: No PDF file uploaded")
+        return "Please upload a PDF file."
+
+    try:
+        print(f"Starting summarization process for file: {pdf_file.name}")
+        summary = process_pdf(pdf_file.name, chunk_instructions, window_instructions, final_instructions)
+        print("Summarization process completed successfully")
+        return summary
+    except Exception as e:
+        print(f"An error occurred: {str(e)}")
+        return f"An error occurred: {str(e)}"
+
 # Gradio interface
 iface = gr.Interface(
-    fn=pdf_summarizer,
-    inputs=[
-        gr.File(label="Upload PDF"),
-        gr.Textbox(label="Chunk Instructions", placeholder="Instructions for summarizing each chunk"),
-        gr.Textbox(label="Window Instructions", placeholder="Instructions for summarizing each window"),
-        gr.Textbox(label="Final Instructions", placeholder="Instructions for final summarization")
-    ],
-    outputs=gr.Textbox(label="Summary"),
-    title="PDF Earnings Summary Generator",
-    description="Upload a PDF of an earnings summary or transcript to generate a concise summary."
+    fn=pdf_summarizer,
+    inputs=[
+        gr.File(label="Upload PDF"),
+        gr.Textbox(label="Chunk Instructions", placeholder="Instructions for summarizing each chunk"),
+        gr.Textbox(label="Window Instructions", placeholder="Instructions for summarizing each window"),
+        gr.Textbox(label="Final Instructions", placeholder="Instructions for final summarization")
+    ],
+    outputs=gr.Textbox(label="Summary"),
+    title="PDF Earnings Summary Generator",
+    description="Upload a PDF of an earnings summary or transcript to generate a concise summary."
 )
+
 print("Launching Gradio interface")
 iface.launch()
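
Note: the main behavioral change in this commit is in summarize_text(), whose new comment assumes the model returns the entire input followed by the summary, so only the text after the last "\n\n" is kept. A minimal sketch of that extraction step, using a made-up response string (the sample text below is hypothetical, not output from the app):

# Hedged sketch (not part of the commit): illustrates the new split-based
# extraction in summarize_text() on a response that echoes the prompt.
generated_text = (
    "Summarize the key financials.\n\n"
    "Text to summarize:\nRevenue grew 12% year over year.\n\n"
    "Revenue grew 12%, driven by strong cloud demand."
)
summary = generated_text.split("\n\n")[-1]
print(summary)  # Revenue grew 12%, driven by strong cloud demand.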