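"""
Gradio app that extracts the abstract from an uploaded PDF, condenses it to a
one-sentence summary, and converts that summary to speech.

Pipeline: PyPDF2 (text extraction) -> regex (abstract isolation) ->
a transformers summarization pipeline -> a fairseq FastSpeech 2 TTS model ->
a WAV file served through the Gradio UI.
"""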
# Standard library imports
import re
import tempfile

# Third-party imports
import gradio as gr
import PyPDF2
import soundfile as sf
import torch
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
from transformers import pipeline
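
# Note: PyPDF2, transformers, fairseq, gradio, soundfile, and torch must be
# installed (e.g. via pip); soundfile also relies on the libsndfile library,
# which pip wheels bundle on most platforms.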

# Function to extract and clean the abstract from a PDF
def extract_and_clean_abstract(uploaded_file):
    if uploaded_file is None:
        return "No file uploaded."

    # Read the file using its temporary file path
    with open(uploaded_file.name, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        full_text = ""
        for page in reader.pages:
            # extract_text() can return None for pages without a text layer
            full_text += page.extract_text() or ""

    # Find the abstract: the text between the "Abstract" heading and the next
    # section heading ("Introduction", optionally preceded by a section number
    # like "1.", or "Keywords"), matched case-insensitively
    pattern = r"abstract(.*?)(?:(?:1\s*\.?\s*)?introduction|keywords)"
    match = re.search(pattern, full_text, re.DOTALL | re.IGNORECASE)

    if match:
        abstract = match.group(1).strip()
    else:
        return "Abstract not found."

    # Clean the abstract: join lines and repair words hyphenated across line breaks
    cleaned_abstract = abstract.replace('\n', ' ').replace('- ', '')

    return cleaned_abstract

# Function to summarize text
def summarize_text(text):
    # Build a summarization pipeline with the LED book-summary model
    # (GPU device 0 if available, otherwise CPU)
    summarizer = pipeline(
        "summarization",
        "pszemraj/led-base-book-summary",
        device=0 if torch.cuda.is_available() else -1,
    )
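
    # Note: the pipeline (and its model weights) is rebuilt on every call; for
    # a long-running app it could be created once at module scope and reused.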

    # Generate the summary
    result = summarizer(
        text,
        min_length=8,
        max_length=25,
        no_repeat_ngram_size=3,
        encoder_no_repeat_ngram_size=3,
        repetition_penalty=3.5,
        num_beams=4,
        do_sample=False,
        early_stopping=True,
    )
    # Extract the first sentence from the summary (split on whitespace that
    # follows sentence-ending punctuation, keeping the punctuation itself)
    first_sentence = re.split(r'(?<=[.:;!?])\s', result[0]['summary_text'])[0]

    return first_sentence

# Function for text-to-speech
def text_to_speech(text):
    # Check if CUDA is available and set the device accordingly
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load the TTS model and task from Hugging Face Hub
    models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
        "facebook/fastspeech2-en-ljspeech", # Or another TTS model of your choice
        arg_overrides={"vocoder": "hifigan", "fp16": False}
    )
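
    # As with the summarizer above, the TTS model is downloaded/loaded on every
    # call; caching it at module scope would avoid the repeated startup cost.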

    # Ensure the model is on the correct device
    model = models[0].to(device)

    # Update the config with the data config from the task
    TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)

    # Build the generator
    generator = task.build_generator([model], cfg)

    # Get the model input from the text
    sample = TTSHubInterface.get_model_input(task, text)
    sample["net_input"]["src_tokens"] = sample["net_input"]["src_tokens"].to(device)
    sample["net_input"]["src_lengths"] = sample["net_input"]["src_lengths"].to(device)

    # Generate the waveform
    wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)

    # Move the waveform to CPU if it's on GPU
    if wav.is_cuda:
        wav = wav.cpu()

    # Write the waveform to a temporary file and return the file path
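    # (delete=False keeps the file on disk after the handle closes so Gradio
    # can read and serve it; cleanup is left to the OS or the app.)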
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
        sf.write(tmp_file.name, wav.numpy(), rate)
        return tmp_file.name

def process_pdf(uploaded_file):
    """
    Process the uploaded PDF: extract the abstract, summarize it, and convert
    the summary to speech.
    """
    abstract = extract_and_clean_abstract(uploaded_file)
    # Surface extraction failures in the UI rather than summarizing the error text
    if abstract in ("No file uploaded.", "Abstract not found."):
        raise gr.Error(abstract)
    summary = summarize_text(abstract)
    audio_output = text_to_speech(summary)
    return audio_output

# Create the Gradio interface
iface = gr.Interface(
    fn=process_pdf,
    inputs=gr.File(label="Upload PDF"),
    outputs=gr.Audio(label="Audio Summary"),
    title="PDF Abstract Summary to Speech",
    description="Upload a PDF that contains an abstract. The app extracts the abstract, summarizes it, and converts the summary to speech.",
    examples=[
        ["Article 11 Hidden Technical Debt in Machine Learning Systems.pdf"],
        ["Article 7 Efficient Estimation of Word Representations in Vector Space.pdf"],
        ["Article 6 BloombergGPT_ A Large Language Model for Finance.pdf"],
    ],
)

# Run the Gradio app
if __name__ == "__main__":
    iface.launch()