import re

import torch
import pandas as pd
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification
import gradio as gr
import spaces
# Load the tokenizer and model for classification
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-v2-m3")
model = AutoModelForSequenceClassification.from_pretrained("BAAI/bge-reranker-v2-m3")

# Run the pipelines on GPU when CUDA is available
device = 0 if torch.cuda.is_available() else -1

# Classification pipeline used by classify_text below
classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, device=device)

# Summarization pipeline
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    tokenizer="sshleifer/distilbart-cnn-12-6",
    framework="pt",
    device=device,
)

# Function to clean text by keeping only alphanumeric characters and spaces
def clean_text(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)
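
# For example, clean_text("Fig. 3: results!") returns "Fig 3 results".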

# Function to extract text from PDF files
def extract_text(pdf_file):
    try:
        pdf_reader = PdfReader(pdf_file)
        if pdf_reader.is_encrypted:
            print(f"Skipping encrypted file: {pdf_file}")
            return None
        text = ''
        for page in pdf_reader.pages:
            text += page.extract_text() or ''
        return text
    except Exception as e:
        print(f"Error extracting text from {pdf_file}: {e}")
        return None

# Function to split text into chunks of a specified number of words
def split_text(text, chunk_size=1024):
    words = text.split()
    for i in range(0, len(words), chunk_size):
        yield ' '.join(words[i:i + chunk_size])
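
# A token-aware alternative (a sketch, not wired into process_files below):
# summarization limits are measured in model tokens rather than whitespace-split
# words, so chunking with the pipeline's own tokenizer avoids overlong inputs on
# token-dense text. The function name is illustrative.
def split_text_by_tokens(text, max_tokens=512):
    summ_tokenizer = summarizer.tokenizer
    token_ids = summ_tokenizer.encode(text, add_special_tokens=False)
    for i in range(0, len(token_ids), max_tokens):
        yield summ_tokenizer.decode(token_ids[i:i + max_tokens], skip_special_tokens=True)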

# Function to classify text using the BGE reranker model (not called by process_files)
@spaces.GPU(duration=120)
def classify_text(text):
    try:
        return classifier(text)[0]['label']
    except IndexError:
        return "Unable to classify"

# Function to summarize text using the summarizer model
@spaces.GPU(duration=120)
def summarize_text(text, max_length=100, min_length=30):
    try:
        return summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)[0]['summary_text']
    except IndexError:
        return "Unable to summarize"

# Function to extract a title-like summary from the beginning of the text
@spaces.GPU(duration=120)
def extract_title(text, max_length=20):
    try:
        return summarizer(text, max_length=max_length, min_length=5, do_sample=False)[0]['summary_text']
    except IndexError:
        return "Unable to extract title"

# Function to process uploaded PDFs into a CSV of titles, abstracts, and content
@spaces.GPU(duration=120)
def process_files(pdf_files):
    data = []
    for pdf_file in pdf_files:
        text = extract_text(pdf_file)
        # Skip encrypted or unreadable files
        if text is None:
            continue
        # Extract a title from the beginning of the text
        title_text = ' '.join(text.split()[:512])  # Take the first 512 words for title extraction
        title = extract_title(title_text)
        # Initialize placeholders for combined results
        combined_abstract = []
        combined_cleaned_text = []
        # Split text into chunks and process each chunk
        for chunk in split_text(text, chunk_size=512):
            # Summarize the text chunk
            abstract = summarize_text(chunk)
            combined_abstract.append(abstract)
            # Clean the text chunk
            cleaned_text = clean_text(chunk)
            combined_cleaned_text.append(cleaned_text)
        # Combine results from all chunks
        final_abstract = ' '.join(combined_abstract)
        final_cleaned_text = ' '.join(combined_cleaned_text)
        # Append the row for this PDF
        data.append([title, final_abstract, final_cleaned_text])
    # Create a DataFrame from the data list and save it to a CSV file
    df = pd.DataFrame(data, columns=['Title', 'Abstract', 'Content'])
    output_file_path = 'processed_pdfs.csv'
    df.to_csv(output_file_path, index=False)
    return output_file_path
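
# The function can also be exercised without the UI, e.g. (hypothetical paths):
#   process_files(["paper1.pdf", "paper2.pdf"])
# which writes 'processed_pdfs.csv' and returns its path.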

# Gradio interface for file upload and CSV download
pdf_input = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")
csv_output = gr.File(label="Download CSV")

gr.Interface(
    fn=process_files,
    inputs=pdf_input,
    outputs=csv_output,
    title="Dataset creation",
    description="Upload PDF files and get a summarized CSV file.",
    article="""<p>This is an experimental app that allows you to create a dataset from research papers.</p>
    <p>This app uses the BAAI/bge-reranker-v2-m3 and sshleifer/distilbart-cnn-12-6 AI models.</p>
    <p>The output file is a CSV with 3 columns: title, abstract, and content.</p>"""
).launch(share=True)