Yoxas committed on
Commit
5ec4eda
·
verified ·
1 Parent(s): bb2e8f1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -0
app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import PyPDF2
4
+ import pandas as pd
5
+ from transformers import pipeline, AutoTokenizer
6
+ import gradio as gr
7
+
8
+ # Function to clean text by keeping only alphanumeric characters and spaces
9
def clean_text(text):
    """Remove every character that is not an ASCII letter, digit, or whitespace.

    Args:
        text: The string to sanitise.

    Returns:
        A copy of ``text`` with all other characters (punctuation, symbols,
        non-ASCII letters) deleted. Whitespace is left exactly as it was.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', text)
11
+
12
+ # Function to extract text from PDF files
13
def extract_text(pdf_file):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_file: The PDF to read; handed straight to ``PyPDF2.PdfReader``
            (presumably a path or a binary file-like object -- confirm
            against what the Gradio file input actually delivers).

    Returns:
        The text of all pages in page order, separated by newlines. An empty
        string when the document has no pages or no extractable text.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # A newline between pages keeps the last word of one page from fusing with
    # the first word of the next; downstream code only splits on whitespace.
    # ``or ''`` tolerates a page whose extraction yields nothing.
    return '\n'.join(page.extract_text() or '' for page in pdf_reader.pages)
19
+
20
+ # Function to split text into chunks of a specified size
21
def split_text(text, chunk_size=1024):
    """Lazily yield consecutive chunks of ``text``, measured in words.

    Args:
        text: The string to split; words are whitespace-delimited.
        chunk_size: Maximum number of words per chunk.

    Yields:
        Each chunk as a single-space-joined string. The final chunk may hold
        fewer than ``chunk_size`` words; empty input yields nothing.
    """
    tokens = text.split()
    starts = range(0, len(tokens), chunk_size)
    yield from (' '.join(tokens[begin:begin + chunk_size]) for begin in starts)
25
+
26
# Load the LED tokenizer once at import time; the classification pipeline
# below reuses it.
led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")

# Classification pipeline, built on first use (see _get_classifier).
_classifier = None


def _get_classifier():
    """Return the text-classification pipeline, building it on first use."""
    global _classifier
    if _classifier is None:
        # NOTE(review): this checkpoint looks like a seq2seq summarization
        # model; confirm it really provides a usable classification head.
        _classifier = pipeline(
            "text-classification",
            model="allenai/led-base-16384-multi_lexsum-source-long",
            tokenizer=led_tokenizer,
            framework="pt",
        )
    return _classifier


def classify_text(text):
    """Return the label the LED-based classifier assigns to ``text``.

    The pipeline is created once and reused, instead of reloading the model
    on every call.

    Args:
        text: The text to classify.

    Returns:
        The ``'label'`` of the first prediction, or ``"Unable to classify"``
        when ``text`` is empty/blank or the pipeline returns no prediction.
    """
    if not text or not text.strip():
        return "Unable to classify"
    try:
        return _get_classifier()(text)[0]['label']
    except IndexError:
        return "Unable to classify"
36
+
37
+ # Function to summarize text using the DistilBART (sshleifer/distilbart-cnn-12-6) model
38
# Summarization pipeline, built on first use (see _get_summarizer).
_SUMMARIZER_MODEL = "sshleifer/distilbart-cnn-12-6"
_summarizer = None


def _get_summarizer():
    """Return the DistilBART summarization pipeline, building it on first use."""
    global _summarizer
    if _summarizer is None:
        _summarizer = pipeline(
            "summarization",
            model=_SUMMARIZER_MODEL,
            tokenizer=_SUMMARIZER_MODEL,
            framework="pt",
        )
    return _summarizer


def summarize_text(text, max_length=100, min_length=30):
    """Summarize ``text`` with the DistilBART CNN summarization model.

    The pipeline is created once and reused, instead of reloading the model
    on every call.

    Args:
        text: The text to summarize.
        max_length: Upper bound on the generated summary length, forwarded
            to the pipeline.
        min_length: Lower bound on the generated summary length, forwarded
            to the pipeline.

    Returns:
        The generated summary, or ``"Unable to summarize"`` when ``text`` is
        empty/blank or the pipeline raises ``IndexError``.
    """
    # Blank input carries nothing to summarize; skip the model entirely.
    if not text or not text.strip():
        return "Unable to summarize"
    try:
        result = _get_summarizer()(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            # Truncate over-long inputs to the model's limit instead of
            # letting them fail and losing the whole chunk.
            truncation=True,
        )
        return result[0]['summary_text']
    except IndexError:
        return "Unable to summarize"
44
+
45
+ # Function to extract a title-like summary from the beginning of the text
46
# Summarization pipeline, built on first use (see _get_summarizer).
_SUMMARIZER_MODEL = "sshleifer/distilbart-cnn-12-6"
_summarizer = None


def _get_summarizer():
    """Return the DistilBART summarization pipeline, building it on first use."""
    global _summarizer
    if _summarizer is None:
        _summarizer = pipeline(
            "summarization",
            model=_SUMMARIZER_MODEL,
            tokenizer=_SUMMARIZER_MODEL,
            framework="pt",
        )
    return _summarizer


def extract_title(text, max_length=20):
    """Generate a short, title-like summary of ``text``.

    The pipeline is created once and reused, instead of reloading the model
    on every call.

    Args:
        text: The text to derive a title from (the caller passes the opening
            portion of a document).
        max_length: Upper bound on the generated title length, forwarded to
            the pipeline. The lower bound is fixed at 5.

    Returns:
        The generated title, or ``"Unable to extract title"`` when ``text``
        is empty/blank or the pipeline raises ``IndexError``.
    """
    # Blank input carries nothing to title; skip the model entirely.
    if not text or not text.strip():
        return "Unable to extract title"
    try:
        result = _get_summarizer()(
            text,
            max_length=max_length,
            min_length=5,
            do_sample=False,
            # Truncate over-long inputs to the model's limit instead of
            # letting them fail.
            truncation=True,
        )
        return result[0]['summary_text']
    except IndexError:
        return "Unable to extract title"
52
+
53
+ # Function to process PDF files and generate summaries
54
def _summarize_document(text):
    """Return ``[title, abstract, cleaned_content]`` for one document's text."""
    # The title is generated from the first 512 whitespace-separated words
    # (words, not model tokens).
    title = extract_title(' '.join(text.split()[:512]))

    # Summarize and clean the document in 512-word chunks, then stitch the
    # per-chunk results back together with single spaces.
    chunks = list(split_text(text, chunk_size=512))
    abstract = ' '.join(summarize_text(chunk) for chunk in chunks)
    cleaned_content = ' '.join(clean_text(chunk) for chunk in chunks)
    return [title, abstract, cleaned_content]


def process_pdfs(pdf_files):
    """Summarize the given PDFs and write the results to a CSV file.

    For each PDF the text is extracted, a title is generated from its opening
    words, and the full text is summarized and cleaned chunk by chunk.

    Args:
        pdf_files: Iterable of PDF inputs accepted by ``extract_text``.
            ``None`` (e.g. nothing uploaded) is treated as an empty list.

    Returns:
        The path of the written CSV, ``'processed_pdfs.csv'``, relative to
        the current working directory. The file has the columns ``Title``,
        ``Abstract`` and ``Content`` and one row per input PDF; it is
        overwritten on every call.
    """
    rows = [
        _summarize_document(extract_text(pdf_file))
        for pdf_file in (pdf_files or [])
    ]

    df = pd.DataFrame(rows, columns=['Title', 'Abstract', 'Content'])

    # Written to the current working directory under a fixed name.
    csv_file_path = 'processed_pdfs.csv'
    df.to_csv(csv_file_path, index=False)

    return csv_file_path
93
+
94
# Gradio interface.
# The components are built with the top-level ``gr.File`` class rather than
# the deprecated ``gr.inputs`` / ``gr.outputs`` namespaces, and multi-file
# upload is requested via ``file_count="multiple"``.
# NOTE(review): ``type`` is left at the library default because its accepted
# values differ between Gradio releases -- confirm that what the component
# delivers is accepted by ``extract_text``.
pdf_input = gr.File(label="Upload PDF Files", file_count="multiple")
csv_output = gr.File(label="Download CSV")

demo = gr.Interface(
    fn=process_pdfs,
    inputs=pdf_input,
    outputs=csv_output,
    title="PDF Summarizer",
    description="Upload PDF files and get a summarized CSV file.",
)
demo.launch()