Spaces:

Yoxas
/

Creatingdataset

Runtime error

App Files Files Community

Creatingdataset / app.py

Yoxas

Create app.py

5ec4eda verified 11 months ago

raw

history blame

3.87 kB

	import functools
	import os
	import re

	import PyPDF2
	import pandas as pd
	from transformers import pipeline, AutoTokenizer
	import gradio as gr

	# Function to clean text by keeping only alphanumeric characters and spaces
	def clean_text(text):
	    """Strip every character that is not an ASCII letter, digit, or whitespace.

	    Args:
	        text: The string to clean.

	    Returns:
	        A copy of ``text`` with punctuation, symbols, and non-ASCII letters
	        removed. Whitespace (including newlines) is left untouched.
	    """
	    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

	# Function to extract text from PDF files
	def extract_text(pdf_file):
	    """Return the concatenated text of every page in a PDF.

	    Args:
	        pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source
	            (it is passed through unchanged).

	    Returns:
	        The text of all pages joined together with no separator, in page
	        order. Pages that yield no text contribute an empty string.
	    """
	    pdf_reader = PyPDF2.PdfReader(pdf_file)
	    # Fall back to '' when a page yields a falsy value so the join never
	    # sees a non-string.
	    return ''.join(page.extract_text() or '' for page in pdf_reader.pages)

	# Function to split text into chunks of a specified size
	def split_text(text, chunk_size=1024):
	    """Yield consecutive chunks of ``text`` containing at most ``chunk_size`` words.

	    Words are whatever ``str.split()`` produces, so runs of whitespace are
	    collapsed and each chunk is re-joined with single spaces.

	    Args:
	        text: The string to split.
	        chunk_size: Maximum number of whitespace-separated words per chunk.

	    Yields:
	        Each chunk as a single space-joined string. Nothing is yielded for
	        empty or whitespace-only input.

	    Raises:
	        ValueError: If ``chunk_size`` is less than 1. Because this is a
	            generator, the error surfaces when iteration starts.
	    """
	    if chunk_size < 1:
	        raise ValueError("chunk_size must be a positive integer")
	    words = text.split()
	    for start in range(0, len(words), chunk_size):
	        yield ' '.join(words[start:start + chunk_size])

	# Load the tokenizer for the LED checkpoint once, at import time; classify_text
	# passes this object to its text-classification pipeline.
	# NOTE(review): the checkpoint name suggests a long-document summarization model
	# (multi_lexsum) — confirm it is actually suitable for classification.
	led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")

	# Function to classify text using LED model
	@functools.lru_cache(maxsize=1)
	def _get_classifier():
	    """Build the text-classification pipeline once and reuse it on later calls."""
	    return pipeline(
	        "text-classification",
	        model="allenai/led-base-16384-multi_lexsum-source-long",
	        tokenizer=led_tokenizer,
	        framework="pt",
	    )


	def classify_text(text):
	    """Return the top predicted label for ``text``.

	    The underlying pipeline is created lazily on first use and cached, so
	    repeated calls do not reload the model.

	    NOTE(review): the configured checkpoint looks like a summarization model
	    rather than a classifier — confirm the labels it produces are meaningful.

	    Args:
	        text: The text to classify.

	    Returns:
	        The ``label`` of the first prediction, or ``"Unable to classify"``
	        when ``text`` is blank or the pipeline raises ``IndexError``.
	    """
	    # Blank input carries nothing to classify; skip loading/running the model.
	    if not text or not text.strip():
	        return "Unable to classify"
	    try:
	        return _get_classifier()(text)[0]['label']
	    except IndexError:
	        return "Unable to classify"

	# Function to summarize text using the DistilBART (distilbart-cnn-12-6) model
	@functools.lru_cache(maxsize=1)
	def _get_summarizer():
	    """Build the DistilBART summarization pipeline once and reuse it on later calls."""
	    return pipeline(
	        "summarization",
	        model="sshleifer/distilbart-cnn-12-6",
	        tokenizer="sshleifer/distilbart-cnn-12-6",
	        framework="pt",
	    )


	def _run_summarizer(text, max_length, min_length, fallback):
	    """Summarize ``text`` with the shared pipeline, returning ``fallback`` on failure."""
	    # Blank input carries nothing to summarize; skip loading/running the model.
	    if not text or not text.strip():
	        return fallback
	    try:
	        result = _get_summarizer()(
	            text,
	            max_length=max_length,
	            min_length=min_length,
	            do_sample=False,
	            # Inputs longer than the model's limit are cut off instead of
	            # overflowing its position embeddings.
	            truncation=True,
	        )
	        return result[0]['summary_text']
	    except IndexError:
	        return fallback


	def summarize_text(text, max_length=100, min_length=30):
	    """Return a summary of ``text``.

	    Args:
	        text: The text to summarize.
	        max_length: Upper bound passed to the pipeline for the summary length.
	        min_length: Lower bound passed to the pipeline for the summary length.

	    Returns:
	        The generated ``summary_text``, or ``"Unable to summarize"`` when
	        ``text`` is blank or the pipeline raises ``IndexError``.
	    """
	    return _run_summarizer(text, max_length, min_length, "Unable to summarize")


	# Function to extract a title-like summary from the beginning of the text
	def extract_title(text, max_length=20):
	    """Return a very short, title-like summary of ``text``.

	    Uses the same summarization pipeline as ``summarize_text`` with a fixed
	    minimum length of 5.

	    Args:
	        text: The text to derive a title from.
	        max_length: Upper bound passed to the pipeline for the title length.

	    Returns:
	        The generated ``summary_text``, or ``"Unable to extract title"`` when
	        ``text`` is blank or the pipeline raises ``IndexError``.
	    """
	    return _run_summarizer(text, max_length, 5, "Unable to extract title")

	# Function to process PDF files and generate summaries
	def process_pdfs(pdf_files):
	    """Summarize each PDF and write the results to a CSV file.

	    For every PDF the text is extracted, a title is generated from its first
	    512 words, and the text is split into 512-word chunks. Each chunk is
	    summarized and cleaned; the per-chunk results are joined with spaces.

	    Args:
	        pdf_files: Iterable of PDF sources accepted by ``extract_text``.
	            ``None`` or an empty iterable produces a CSV with only a header.

	    Returns:
	        The path of the written CSV file (``'processed_pdfs.csv'``, relative
	        to the current working directory). The file has the columns
	        ``Title``, ``Abstract`` and ``Content`` and is overwritten on each call.
	    """
	    rows = []

	    # Treat a missing upload (None) the same as an empty list.
	    for pdf_file in pdf_files or []:
	        text = extract_text(pdf_file)

	        # The title is generated from the first 512 whitespace-separated
	        # words (not model tokens) of the document.
	        title = extract_title(' '.join(text.split()[:512]))

	        # Materialize the chunks because they are consumed twice below.
	        chunks = list(split_text(text, chunk_size=512))
	        final_abstract = ' '.join(summarize_text(chunk) for chunk in chunks)
	        final_cleaned_text = ' '.join(clean_text(chunk) for chunk in chunks)

	        rows.append([title, final_abstract, final_cleaned_text])

	    df = pd.DataFrame(rows, columns=['Title', 'Abstract', 'Content'])

	    # Saved under a fixed name in the working directory.
	    csv_file_path = 'processed_pdfs.csv'
	    df.to_csv(csv_file_path, index=False)

	    return csv_file_path

	# Gradio interface
	# The legacy ``gr.inputs`` / ``gr.outputs`` namespaces are gone from current
	# Gradio releases; components are created directly from the top-level module.
	# Multiple uploads are requested with ``file_count="multiple"``.
	pdf_input = gr.File(label="Upload PDF Files", file_count="multiple", file_types=[".pdf"])
	csv_output = gr.File(label="Download CSV")

	# Web UI wiring: uploaded PDFs go to process_pdfs, whose returned CSV path is
	# offered back to the user as a download.
	demo = gr.Interface(
	    fn=process_pdfs,
	    inputs=pdf_input,
	    outputs=csv_output,
	    title="PDF Summarizer",
	    description="Upload PDF files and get a summarized CSV file."
	)

	# Only start the server when run as a script, so importing this module does
	# not block on the web server.
	if __name__ == "__main__":
	    demo.launch()