# https://huggingface.co/spaces/rashisinghal/ai_speech_application
# Here are the imports
"""
!pip install pymupdf
!pip install git+https://github.com/huggingface/transformers.git
!pip install datasets sentencepiece
!pip install unidecode
!pip install transformers
!pip install gradio
"""
import gradio as gr
import fitz
import torch
from unidecode import unidecode
import pandas as pd
import numpy as np
import re
import soundfile as sf
from IPython.display import Audio
from datasets import load_dataset
from transformers import pipeline
from transformers import SpeechT5HifiGan
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
# Here is the code
def pdf_to_speech(pdf_path):
    # "doc" is PyMuPDF's Document object representing the whole file; everything we need, including the text, comes from it.
    doc = fitz.open(pdf_path)

    # To search for the Abstract paragraph we need to isolate the sections of each page. Passing "blocks" to get_text()
    # returns a list of tuples, one per block, each of the form:
    # (x0, y0, x1, y1, "lines in the block", block_no, block_type)
    # Since the PDF may span several pages, loop over the document to pull the plain text and the block list of every page.
    for page in doc:
        text = page.get_text()
        output = page.get_text("blocks")
    # ANALYZING THE TEXT TO EXTRACT THE ABSTRACT
    # A span is an inline container that marks up a part of the text, i.e. a small chunk of text with a single style.
    # Passing "dict" to get_text() returns the spans of a page.
    # "block_dict" maps each page number to the detailed span information of that page.
    block_dict = {}
    page_num = 1
    for page in doc:                        # Iterate over all pages in the document
        file_dict = page.get_text('dict')   # Get the page dictionary
        block = file_dict['blocks']         # Get the block information
        block_dict[page_num] = block        # Store it in the block dictionary
        page_num += 1                       # Increase the page number by 1
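    # For reference, the dictionary returned by page.get_text('dict') is roughly nested like this
    # (illustrative values, keys trimmed to the ones used below; exact fields depend on the PyMuPDF version):
    # {'blocks': [{'type': 0, 'bbox': (...), 'lines': [{'spans': [{'bbox': (...), 'size': 9.96,
    #                                                              'font': 'Times-Bold', 'text': 'Abstract'}]}]}]}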
    # Next, collect every span into a DataFrame.
    # The code loops over the pages, blocks, and lines of the document, and then over every span in a line.
    # A span has several properties, but we only care about the bbox (bounding box), size, font, and text.
    rows = []
    for page_num, blocks in block_dict.items():
        for block in blocks:
            if block['type'] == 0:  # type 0 is a text block (type 1 would be an image block)
                for line in block['lines']:
                    for span in line['spans']:
                        xmin, ymin, xmax, ymax = list(span['bbox'])
                        font_size = span['size']
                        text = unidecode(span['text'])
                        span_font = span['font']
                        is_upper = False
                        is_bold = False
                        if "bold" in span_font.lower():
                            is_bold = True
                        if re.sub(r"[\(\[].*?[\)\]]", "", text).isupper():
                            is_upper = True
                        if text.replace(" ", "") != "":
                            rows.append((xmin, ymin, xmax, ymax, text, is_upper, is_bold, span_font, font_size))
    span_df = pd.DataFrame(rows, columns=['xmin', 'ymin', 'xmax', 'ymax', 'text', 'is_upper', 'is_bold', 'span_font', 'font_size'])
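    # Side note (not part of the original logic): PyMuPDF also exposes a bitmask span['flags'] in which
    # bit 4 (value 16) marks bold text, so bool(span['flags'] & 16) is another way to detect bold
    # when the font name does not contain the word "bold".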
    # Score each span: start from its rounded font size, then add one point for bold and one for upper-case text.
    # Spans containing special characters keep their base font-size score.
    span_scores = []
    special = '[(_:/,#%=@)]'
    for index, span_row in span_df.iterrows():
        score = round(span_row.font_size)
        text = span_row.text
        if not re.search(special, text):
            if span_row.is_bold:
                score += 1
            if span_row.is_upper:
                score += 1
        span_scores.append(score)
    # From this we want the number of unique text styles in the document and how often each one occurs.
    values, counts = np.unique(span_scores, return_counts=True)
    style_dict = {}
    for value, count in zip(values, counts):
        style_dict[value] = count
    # The most frequent score is taken to be the paragraph style: more occurrences means body text, not a heading.
    p_size = max(style_dict, key=style_dict.get)
    # Add a tag column to the span DataFrame: 'p' for paragraph text, 'h1', 'h2', ... for progressively
    # smaller headings (scores above the paragraph score) and 's1', 's2', ... for the smaller styles.
    idx = 0
    tag = {}
    for size in sorted(values, reverse=True):
        idx += 1
        if size == p_size:
            idx = 0
            tag[size] = 'p'
        if size > p_size:
            tag[size] = 'h{0}'.format(idx)
        if size < p_size:
            tag[size] = 's{0}'.format(idx)
    span_tags = [tag[score] for score in span_scores]
    span_df['tag'] = span_tags
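    # Worked example (hypothetical scores): if span_scores contained the values [20, 16, 10, 10, 10, 8],
    # then p_size would be 10 (the most common score) and the tag mapping would come out as
    # {20: 'h1', 16: 'h2', 10: 'p', 8: 's1'}, i.e. larger styles become headings and smaller ones sub-text.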
    # We now know which spans are headings and which are content. Since we want all paragraphs below a
    # heading to be grouped together, we build a new DataFrame that stores the text per heading;
    # that makes it easy to pull out content by heading name.
    headings_list = []
    text_list = []
    tmp = []
    heading = ''
    for index, span_row in span_df.iterrows():
        text = span_row.text
        tag = span_row.tag
        if 'h' in tag:
            headings_list.append(text)
            text_list.append('\n'.join(tmp))
            tmp = []
            heading = text
        else:
            tmp.append(text)
    text_list.append('\n'.join(tmp))
    text_list = text_list[1:]  # drop the text collected before the first heading
    text_df = pd.DataFrame(zip(headings_list, text_list), columns=['heading', 'content'])
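    # Note (assumption, not part of the original logic): the .item() lookup below requires exactly one
    # heading that is literally "Abstract"; a more forgiving variant could match case-insensitively, e.g.:
    #   mask = text_df['heading'].str.strip().str.lower() == 'abstract'
    #   str_abstract = text_df.loc[mask, 'content'].iloc[0]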
    # Extract the 'content' value of the row whose heading is "Abstract",
    # i.e. the text of the abstract paragraph.
    str_abstract = text_df.loc[text_df['heading'] == 'Abstract', 'content'].item()
    # Summarise the abstract with the pszemraj/long-t5-tglobal-base-sci-simplify model via the summarization pipeline.
    new_summarized_pipeline = pipeline(task="summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify")
    summarized_text = new_summarized_pipeline(str_abstract)
    # Build a single string from the list of dictionaries returned by the pipeline.
    str_summary = ",".join([item['summary_text'] for item in summarized_text])
    # Tokenize the summary string with the SpeechT5 processor.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    inputs = processor(text=str_summary, return_tensors="pt")
    # SpeechT5 needs a speaker embedding (x-vector) to define the voice; use one from the CMU ARCTIC x-vector set.
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    # Generate the speech of the summarised abstract, letting the HiFi-GAN vocoder turn the spectrogram into a waveform.
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    with torch.no_grad():
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    # SpeechT5 outputs audio at 16 kHz.
    sr = 16000
    return (sr, speech.numpy())
# Audio(speech, rate=16000)
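# Local sanity check (hypothetical file name, not executed by the Space): the returned tuple can be
# written to disk with the soundfile import above, e.g.
#   sr, audio = pdf_to_speech("paper_with_abstract.pdf")
#   sf.write("abstract_summary.wav", audio, sr)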
# Use a Gradio Interface to wire the function to a file input and an audio output.
app = gr.Interface(fn=pdf_to_speech,
                   inputs="file",
                   outputs="audio",
                   title="PDF Abstract to Audio Application",
                   description="This app accepts a PDF that contains an Abstract, summarises it and converts the summary into speech. Click to upload a PDF with an abstract.",
                   theme="soft")
app.launch()