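"""Gradio app for turning research-paper PDFs into a CSV dataset.

Uploaded PDFs are read with PyPDF2, split into word chunks, summarized with
sshleifer/distilbart-cnn-12-6, and written out as a CSV with Title, Abstract,
and Content columns.
"""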
import re
import torch
import pandas as pd
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification
import gradio as gr
import spaces

# Load the tokenizer and the summarization pipeline
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-v2-m3")
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")

# Load the sequence-classification model separately
model = AutoModelForSequenceClassification.from_pretrained("BAAI/bge-reranker-v2-m3")

# Move the model to CUDA if available
if torch.cuda.is_available():
    model = model.to("cuda")

# Build a text-classification pipeline so classify_text() has a classifier to call
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Function to clean text by keeping only alphanumeric characters and spaces
def clean_text(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
# Function to extract text from PDF files
def extract_text(pdf_file):
    try:
        pdf_reader = PdfReader(pdf_file)
        if pdf_reader.is_encrypted:
            print(f"Skipping encrypted file: {pdf_file}")
            return None
        text = ''
        for page in pdf_reader.pages:
            text += page.extract_text() or ''
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_file}: {e}")
        return None

# Function to split text into chunks of a specified size
def split_text(text, chunk_size=1024):
    words = text.split()
    for i in range(0, len(words), chunk_size):
        yield ' '.join(words[i:i + chunk_size])
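# Note: chunk_size counts whitespace-separated words, not model tokens, so it only
# approximates the summarizer's 1024-token input limit.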

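# @spaces.GPU (Hugging Face ZeroGPU) requests a GPU for the decorated call and
# releases it afterwards; `duration` is the maximum number of seconds allotted.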
# Function to classify text using the text-classification pipeline
@spaces.GPU(duration=120)
def classify_text(text):
    try:
        return classifier(text)[0]['label']
    except IndexError:
        return "Unable to classify"

# Function to summarize text using the summarizer model
@spaces.GPU(duration=120)
def summarize_text(text, max_length=100, min_length=30):
    try:
        return summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']
    except IndexError:
        return "Unable to summarize"

# Function to extract a title-like summary from the beginning of the text
@spaces.GPU(duration=120)
def extract_title(text, max_length=20):
    try:
        return summarizer(text, max_length=max_length, min_length=5, do_sample=False)[0]['summary_text']
    except IndexError:
        return "Unable to extract title"

# Define the folder path and CSV file path
# output_folder_path = '/content/drive/My Drive/path_to_output'  # Adjust this to your actual path

# Process the uploaded PDF files and write the results to a CSV for download
@spaces.GPU(duration=120)
def process_files(pdf_files):
    data = []
    for pdf_file in pdf_files:
        text = extract_text(pdf_file)

        # Skip encrypted files
        if text is None:
            continue

        # Extract a title from the beginning of the text
        title_text = ' '.join(text.split()[:512])  # Take the first 512 words for title extraction
        title = extract_title(title_text)

        # Initialize placeholders for combined results
        combined_abstract = []
        combined_cleaned_text = []

        # Split text into chunks and process each chunk
        for chunk in split_text(text, chunk_size=512):
            # Summarize the text chunk
            abstract = summarize_text(chunk)
            combined_abstract.append(abstract)

            # Clean the text chunk
            cleaned_text = clean_text(chunk)
            combined_cleaned_text.append(cleaned_text)

        # Combine results from all chunks
        final_abstract = ' '.join(combined_abstract)
        final_cleaned_text = ' '.join(combined_cleaned_text)

        # Append the data to the list
        data.append([title, final_abstract, final_cleaned_text])

    # Create a DataFrame from the data list
    df = pd.DataFrame(data, columns=['Title', 'Abstract', 'Content'])

    # Save the DataFrame to a CSV file
    output_file_path = 'processed_pdfs.csv'
    df.to_csv(output_file_path, index=False)

    return output_file_path
    
# Gradio interface
pdf_input = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")
csv_output = gr.File(label="Download CSV")

gr.Interface(
    fn=process_files, 
    inputs=pdf_input, 
    outputs=csv_output,
    title="Dataset creation",
    description="Upload PDF files and get a summarized CSV file.",
    article="""<p>This is an experimental app that allows you to create a dataset from research papers.</p>
                <p>This app uses the BAAI/bge-reranker-v2-m3 and sshleifer/distilbart-cnn-12-6 models.</p>
                <p>The output file is a CSV with 3 columns: title, abstract, and content.</p>"""
).launch(share=True)
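
# Local usage (assuming this file is saved as app.py):
#   pip install gradio spaces transformers torch PyPDF2 pandas
#   python app.py
# share=True additionally prints a temporary public gradio.live URL when run outside Spaces.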