"""Gradio app that turns uploaded PDF files into a summarized CSV dataset.

For every readable PDF the app extracts the text, generates a short
title-like summary, summarizes the text chunk by chunk, and stores the
title, the combined abstract and the cleaned text as one CSV row.
"""

import os
import re

import pandas as pd
import torch
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification
from gradio import Interface, File
import gradio as gr
import spaces

# Load the tokenizer and the summarization pipeline
led_tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-reranker-v2-m3")
summarizer = pipeline(
    "summarization",
    model="sshleifer/distilbart-cnn-12-6",
    tokenizer="sshleifer/distilbart-cnn-12-6",
    framework="pt",
)

# Load the sequence-classification model separately
model = AutoModelForSequenceClassification.from_pretrained("BAAI/bge-reranker-v2-m3")

# Move the model to CUDA if available
if torch.cuda.is_available():
    model = model.to("cuda")

# Classification pipeline built from the model and tokenizer loaded above.
# classify_text() used to reference an undefined name `classifier`; the device
# is passed explicitly so the pipeline does not relocate the model.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=led_tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Matches every character that is not an ASCII letter, a digit or whitespace.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')


def clean_text(text):
    """Return *text* with everything except ASCII letters, digits and whitespace removed."""
    return _NON_ALNUM_RE.sub('', text)


def extract_text(pdf_file):
    """Extract the text of all pages of a PDF.

    Args:
        pdf_file: Anything ``PdfReader`` accepts (a path or a file object).

    Returns:
        The concatenated page text, or ``None`` when the file is encrypted
        or could not be read (a message is printed in both cases).
    """
    try:
        pdf_reader = PdfReader(pdf_file)
        if pdf_reader.is_encrypted:
            print(f"Skipping encrypted file: {pdf_file}")
            return None
        # extract_text() may return None for a page; treat that as empty.
        return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole batch.
        print(f"Error extracting text from {pdf_file}: {e}")
        return None


def split_text(text, chunk_size=1024):
    """Yield consecutive chunks of *text*, each holding at most *chunk_size* words.

    Words are whitespace-separated; each chunk is re-joined with single spaces.
    """
    words = text.split()
    for start in range(0, len(words), chunk_size):
        yield ' '.join(words[start:start + chunk_size])


@spaces.GPU(duration=120)
def classify_text(text):
    """Return the label of the first prediction of the classification pipeline.

    Returns ``"Unable to classify"`` when the pipeline raises ``IndexError``.
    """
    try:
        return classifier(text)[0]['label']
    except IndexError:
        return "Unable to classify"


@spaces.GPU(duration=120)
def summarize_text(text, max_length=100, min_length=30):
    """Summarize *text* with the summarization pipeline.

    Args:
        text: The text to summarize.
        max_length: Forwarded to the pipeline as ``max_length``.
        min_length: Forwarded to the pipeline as ``min_length``.

    Returns:
        The summary text, or ``"Unable to summarize"`` when *text* is blank
        or the pipeline raises ``IndexError``.
    """
    # Nothing to summarize; avoid asking the model for a summary of nothing.
    if not text.strip():
        return "Unable to summarize"
    try:
        return summarizer(
            text, max_length=max_length, min_length=min_length, do_sample=False
        )[0]['summary_text']
    except IndexError:
        # NOTE(review): presumably raised when the input is longer than the
        # model supports - confirm before switching to truncation.
        return "Unable to summarize"


@spaces.GPU(duration=120)
def extract_title(text, max_length=20):
    """Produce a short, title-like summary of *text*.

    Args:
        text: The text to derive a title from.
        max_length: Forwarded to the pipeline as ``max_length``.

    Returns:
        The summary text, or ``"Unable to extract title"`` when *text* is
        blank or the pipeline raises ``IndexError``.
    """
    if not text.strip():
        return "Unable to extract title"
    try:
        return summarizer(
            text, max_length=max_length, min_length=5, do_sample=False
        )[0]['summary_text']
    except IndexError:
        return "Unable to extract title"


@spaces.GPU(duration=120)
def process_files(pdf_files):
    """Build a CSV with a title, abstract and cleaned content per uploaded PDF.

    Args:
        pdf_files: The uploaded files as delivered by the Gradio file input;
            ``None`` or an empty list produces a CSV with only the header.

    Returns:
        The path of the written CSV file (``processed_pdfs.csv``).
    """
    data = []
    # Gradio passes None when nothing was uploaded.
    for pdf_file in pdf_files or []:
        text = extract_text(pdf_file)

        # Skip encrypted or unreadable files
        if text is None:
            continue

        # Use the first 512 whitespace-separated words for title extraction
        title_text = ' '.join(text.split()[:512])
        title = extract_title(title_text)

        # Summarize and clean the text chunk by chunk
        combined_abstract = []
        combined_cleaned_text = []
        for chunk in split_text(text, chunk_size=512):
            combined_abstract.append(summarize_text(chunk))
            combined_cleaned_text.append(clean_text(chunk))

        # Combine results from all chunks into one row
        final_abstract = ' '.join(combined_abstract)
        final_cleaned_text = ' '.join(combined_cleaned_text)
        data.append([title, final_abstract, final_cleaned_text])

    # Create a DataFrame from the collected rows and save it as CSV
    df = pd.DataFrame(data, columns=['Title', 'Abstract', 'Content'])
    output_file_path = 'processed_pdfs.csv'
    df.to_csv(output_file_path, index=False)

    return output_file_path


# Gradio interface
pdf_input = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")
csv_output = gr.File(label="Download CSV")

# NOTE(review): the article text below names the model
# allenai/led-base-16384-multi_lexsum-source-long, which this file does not
# load (it loads BAAI/bge-reranker-v2-m3) - confirm and update the text.
gr.Interface(
    fn=process_files,
    inputs=pdf_input,
    outputs=csv_output,
    title="Dataset creation",
    description="Upload PDF files and get a summarized CSV file.",
    article="""

This is an experimental app that allows you to create a dataset from research papers.

This app uses the allenai/led-base-16384-multi_lexsum-source-long and sshleifer/distilbart-cnn-12-6 AI models.

The output file is a CSV with 3 columns: title, abstract, and content.

"""
).launch(share=True)