# https://huggingface.co/spaces/rashisinghal/ai_speech_application

# Here are the imports
"""
!pip install pymupdf
!pip install git+https://github.com/huggingface/transformers.git
!pip install datasets sentencepiece
!pip install unidecode
!pip install transformers
!pip install gradio
"""

import gradio as gr
import fitz
import torch
from unidecode import unidecode
import pandas as pd
import numpy as np
import re
import soundfile as sf
from IPython.display import Audio
from datasets import load_dataset
from transformers import pipeline
from transformers import SpeechT5HifiGan
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
# Here is the code
def pdf_to_speech(pdf_path):
    # "doc" is a PyMuPDF Document object representing the whole file. We will get every piece of
    # information we need from it, including the text.
    doc = fitz.open(pdf_path)
    # We need to isolate the various sections of each page in order to find the Abstract paragraph.
    # This can be done by passing the parameter "blocks" to the get_text() method.
    # The output is a list of tuples, each of which looks like this:
    # (x0, y0, x1, y1, "lines in the block", block_no, block_type)
    # Since our PDF is a multi-page document, we loop over the pages to get the plain text.
    for page in doc:
        text = page.get_text()
        output = page.get_text("blocks")
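    # For illustration only (hypothetical values), one "blocks" tuple might look like:
    # (56.7, 72.1, 538.9, 96.3, "Attention Is All You Need\n", 0, 0)
    # where the last two fields are the block number and the block type (0 = text, 1 = image).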
    # ANALYZING THE TEXT TO EXTRACT THE ABSTRACT
    # A span is an inline container that marks up a part of a text or a document. In short, a span is a small chunk of text.
    # To get the spans from the PDF file, we pass the parameter "dict" to the get_text() method of each page.
    # "block_dict" is a dictionary containing detailed information on all spans in the document.
    block_dict = {}
    page_num = 1
    for page in doc:                         # Iterate over all pages in the document
        file_dict = page.get_text('dict')    # Get the page dictionary
        block = file_dict['blocks']          # Get the block information
        block_dict[page_num] = block         # Store it in the block dictionary
        page_num += 1                        # Increase the page number by 1
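    # For reference, each entry of file_dict['blocks'] is a nested structure roughly of the form
    # (simplified to the keys used below):
    # {'type': 0, 'bbox': (...), 'lines': [{'spans': [{'size': ..., 'font': ..., 'text': ..., 'bbox': (...)}, ...]}, ...]}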
    # Next we retrieve the spans and store them in a DataFrame.
    # The code loops over the pages, blocks, and lines in the document, and then over every span in each line.
    # Although a span has several properties, we only care about the bbox (the bounding box), size, font, and text.
    rows = []
    for page_num, blocks in block_dict.items():
        for block in blocks:
            if block['type'] == 0:
                for line in block['lines']:
                    for span in line['spans']:
                        xmin, ymin, xmax, ymax = list(span['bbox'])
                        font_size = span['size']
                        text = unidecode(span['text'])
                        span_font = span['font']
                        is_upper = False
                        is_bold = False
                        if "bold" in span_font.lower():
                            is_bold = True
                        if re.sub(r"[\(\[].*?[\)\]]", "", text).isupper():
                            is_upper = True
                        if text.replace(" ", "") != "":
                            rows.append((xmin, ymin, xmax, ymax, text, is_upper, is_bold, span_font, font_size))
    span_df = pd.DataFrame(rows, columns=['xmin', 'ymin', 'xmax', 'ymax', 'text', 'is_upper', 'is_bold', 'span_font', 'font_size'])
    # Score each span: the score starts from the font size and is bumped for bold or upper-case text.
    span_scores = []
    span_num_occur = {}
    special = r'[(_:/,#%\=@)]'
    for index, span_row in span_df.iterrows():
        score = round(span_row.font_size)
        text = span_row.text
        if not re.search(special, text):
            if span_row.is_bold:
                score += 1
            if span_row.is_upper:
                score += 1
        span_scores.append(score)
    # From this, we want to know the number of unique text styles in the document and how often each occurs.
    values, counts = np.unique(span_scores, return_counts=True)
    style_dict = {}
    for value, count in zip(values, counts):
        style_dict[value] = count
    sorted(style_dict.items(), key=lambda x: x[1])
    # From this, we can create a new column in our span DataFrame for the tag information.
    # The most frequent score corresponds to paragraph text rather than headings.
    p_size = max(style_dict, key=style_dict.get)
    idx = 0
    tag = {}
    for size in sorted(values, reverse=True):
        idx += 1
        if size == p_size:
            idx = 0
            tag[size] = 'p'
        if size > p_size:
            tag[size] = 'h{0}'.format(idx)
        if size < p_size:
            tag[size] = 's{0}'.format(idx)
    span_tags = [tag[score] for score in span_scores]
    span_df['tag'] = span_tags
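    # A small worked example of this heuristic (hypothetical counts): if the score counts were
    # {10: 900, 12: 40, 16: 8}, p_size would be 10, so spans scored 16 become 'h1', spans scored 12
    # become 'h2', spans scored 10 become 'p', and any score below 10 would become 's1', 's2', ...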
    # We now know which text is a heading and which is body content. This is very useful when
    # extracting information, since we want all paragraphs below a heading to be grouped together.
    # We create a new DataFrame that stores the text by heading, so we can easily extract
    # information based on headings.
    headings_list = []
    text_list = []
    tmp = []
    heading = ''
    for index, span_row in span_df.iterrows():
        text = span_row.text
        tag = span_row.tag
        if 'h' in tag:
            headings_list.append(text)
            text_list.append('\n'.join(tmp))
            tmp = []
            heading = text
        else:
            tmp.append(text)
    text_list.append('\n'.join(tmp))
    text_list = text_list[1:]
    text_df = pd.DataFrame(zip(headings_list, text_list), columns=['heading', 'content'])
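    # text_df now pairs each detected heading with the body text that follows it, e.g. (hypothetical):
    #     heading             content
    #     "Abstract"          "We propose ..."
    #     "1 Introduction"    "Recent work has ..."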
    # Extract the content of the DataFrame row whose heading is 'Abstract',
    # i.e. the text of the abstract paragraph.
    str_abstract = text_df.loc[text_df['heading'] == 'Abstract', 'content'].item()
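    # Note: this lookup assumes the heading span reads exactly 'Abstract'; .item() raises a ValueError
    # if no row (or more than one row) matches, e.g. for papers whose heading is 'ABSTRACT'.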
    # Use the summarization model pszemraj/long-t5-tglobal-base-sci-simplify in a pipeline
    # to generate a summary of the abstract.
    new_summarized_pipeline = pipeline(task="summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify")
    summarized_text = new_summarized_pipeline(str_abstract)
    # Create a single string from the list of dictionaries returned by the pipeline.
    str_summary = ",".join([item['summary_text'] for item in summarized_text])
    # We tokenize the input with the processor. The input is the summary string generated above.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    inputs = processor(text=str_summary, return_tensors="pt")
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
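    # Index 7306 of the cmu-arctic-xvectors validation split is the speaker embedding used in the
    # SpeechT5 examples; choosing a different row would select a different voice.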
    # Load the HiFi-GAN vocoder and generate the speech of the summarized abstract.
    # Passing the vocoder to generate_speech() converts the spectrogram to a waveform in one call,
    # so a separate vocoder pass over the spectrogram is not needed.
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    sr = 16000
    return (sr, speech.numpy())
    # Audio(speech, rate=16000)
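# Gradio's "audio" output component accepts a (sample_rate, numpy_array) tuple, which is why
# pdf_to_speech returns (sr, speech.numpy()) with sr = 16000 (SpeechT5's vocoder produces 16 kHz audio).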
# Using the Gradio Interface to specify the function, inputs and outputs
app = gr.Interface(fn=pdf_to_speech,
                   inputs="file",
                   outputs="audio",
                   title="PDF Abstract to Audio Application",
                   description="This app accepts a PDF that has an Abstract section, summarises it, and converts the summary into speech. Click to upload a PDF with an abstract.",
                   theme="soft")
app.launch()
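
# A minimal local sanity check, kept commented out so it does not run inside the Space
# (the file and output names below are hypothetical):
# sr, audio = pdf_to_speech("example_paper.pdf")
# sf.write("abstract_summary.wav", audio, sr)
# Note: depending on the installed Gradio version, the "file" input shortcut may pass a temporary
# file object rather than a path string; in that case pdf_to_speech would need pdf_path.name.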