# https://huggingface.co/spaces/rashisinghal/ai_speech_application
# Here are the imports
"""
!pip install pymupdf
!pip install git+https://github.com/huggingface/transformers.git
!pip install datasets sentencepiece
!pip install unidecode
!pip install transformers
!pip install gradio
"""
import gradio as gr
import fitz
import torch
from unidecode import unidecode
import pandas as pd
import numpy as np
import re
import soundfile as sf
from IPython.display import Audio
from datasets import load_dataset
from transformers import pipeline
from transformers import SpeechT5HifiGan
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
# Here is the code
def pdf_to_speech(pdf_path):
    # "doc" is PyMuPDF's Document object representing the whole file; everything we need, including the text, comes from it.
    doc = fitz.open(pdf_path)

    # To search for the Abstract paragraph we need to isolate the sections of each page. Passing "blocks" to get_text()
    # returns a list of tuples, one per block, each of the form:
    # (x0, y0, x1, y1, "lines in the block", block_no, block_type)
    # Since the PDF may span several pages, loop over the document to pull the plain text and the block list of every page.
    for page in doc:
        text = page.get_text()
        output = page.get_text("blocks")
    # ANALYZING THE TEXT TO EXTRACT THE ABSTRACT
    # A span is an inline container that marks up a part of the text, i.e. a small chunk of text with a single style.
    # Passing "dict" to get_text() returns the spans of a page.
    # "block_dict" maps each page number to the detailed span information of that page.
    block_dict = {}
    page_num = 1
    for page in doc:                        # Iterate over all pages in the document
        file_dict = page.get_text('dict')   # Get the page dictionary
        block = file_dict['blocks']         # Get the block information
        block_dict[page_num] = block        # Store it in the block dictionary
        page_num += 1                       # Increase the page number by 1
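    # For reference, the dictionary returned by page.get_text('dict') is roughly nested like this
    # (illustrative values, keys trimmed to the ones used below; exact fields depend on the PyMuPDF version):
    # {'blocks': [{'type': 0, 'bbox': (...), 'lines': [{'spans': [{'bbox': (...), 'size': 9.96,
    #                                                              'font': 'Times-Bold', 'text': 'Abstract'}]}]}]}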
    # Next, collect every span into a DataFrame.
    # The code loops over the pages, blocks, and lines of the document, and then over every span in a line.
    # A span has several properties, but we only care about the bbox (bounding box), size, font, and text.
    rows = []
    for page_num, blocks in block_dict.items():
        for block in blocks:
            if block['type'] == 0:  # type 0 is a text block (type 1 would be an image block)
                for line in block['lines']:
                    for span in line['spans']:
                        xmin, ymin, xmax, ymax = list(span['bbox'])
                        font_size = span['size']
                        text = unidecode(span['text'])
                        span_font = span['font']
                        is_upper = False
                        is_bold = False
                        if "bold" in span_font.lower():
                            is_bold = True
                        if re.sub(r"[\(\[].*?[\)\]]", "", text).isupper():
                            is_upper = True
                        if text.replace(" ", "") != "":
                            rows.append((xmin, ymin, xmax, ymax, text, is_upper, is_bold, span_font, font_size))
    span_df = pd.DataFrame(rows, columns=['xmin', 'ymin', 'xmax', 'ymax', 'text', 'is_upper', 'is_bold', 'span_font', 'font_size'])
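    # Side note (not part of the original logic): PyMuPDF also exposes a bitmask span['flags'] in which
    # bit 4 (value 16) marks bold text, so bool(span['flags'] & 16) is another way to detect bold
    # when the font name does not contain the word "bold".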
    # Score each span: start from its rounded font size, then add one point for bold and one for upper-case text.
    # Spans containing special characters keep their base font-size score.
    span_scores = []
    special = '[(_:/,#%=@)]'
    for index, span_row in span_df.iterrows():
        score = round(span_row.font_size)
        text = span_row.text
        if not re.search(special, text):
            if span_row.is_bold:
                score += 1
            if span_row.is_upper:
                score += 1
        span_scores.append(score)
    # From this we want the number of unique text styles in the document and how often each one occurs.
    values, counts = np.unique(span_scores, return_counts=True)
    style_dict = {}
    for value, count in zip(values, counts):
        style_dict[value] = count
    # The most frequent score is taken to be the paragraph style: more occurrences means body text, not a heading.
    p_size = max(style_dict, key=style_dict.get)
    # Add a tag column to the span DataFrame: 'p' for paragraph text, 'h1', 'h2', ... for progressively
    # smaller headings (scores above the paragraph score) and 's1', 's2', ... for the smaller styles.
    idx = 0
    tag = {}
    for size in sorted(values, reverse=True):
        idx += 1
        if size == p_size:
            idx = 0
            tag[size] = 'p'
        if size > p_size:
            tag[size] = 'h{0}'.format(idx)
        if size < p_size:
            tag[size] = 's{0}'.format(idx)
    span_tags = [tag[score] for score in span_scores]
    span_df['tag'] = span_tags
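    # Worked example (hypothetical scores): if span_scores contained the values [20, 16, 10, 10, 10, 8],
    # then p_size would be 10 (the most common score) and the tag mapping would come out as
    # {20: 'h1', 16: 'h2', 10: 'p', 8: 's1'}, i.e. larger styles become headings and smaller ones sub-text.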
    # We now know which spans are headings and which are content. Since we want all paragraphs below a
    # heading to be grouped together, we build a new DataFrame that stores the text per heading;
    # that makes it easy to pull out content by heading name.
    headings_list = []
    text_list = []
    tmp = []
    heading = ''
    for index, span_row in span_df.iterrows():
        text = span_row.text
        tag = span_row.tag
        if 'h' in tag:
            headings_list.append(text)
            text_list.append('\n'.join(tmp))
            tmp = []
            heading = text
        else:
            tmp.append(text)
    text_list.append('\n'.join(tmp))
    text_list = text_list[1:]  # drop the text collected before the first heading
    text_df = pd.DataFrame(zip(headings_list, text_list), columns=['heading', 'content'])
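    # Note (assumption, not part of the original logic): the .item() lookup below requires exactly one
    # heading that is literally "Abstract"; a more forgiving variant could match case-insensitively, e.g.:
    #   mask = text_df['heading'].str.strip().str.lower() == 'abstract'
    #   str_abstract = text_df.loc[mask, 'content'].iloc[0]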
    # Extract the 'content' value of the row whose heading is "Abstract",
    # i.e. the text of the abstract paragraph.
    str_abstract = text_df.loc[text_df['heading'] == 'Abstract', 'content'].item()
    # Summarise the abstract with the pszemraj/long-t5-tglobal-base-sci-simplify model via the summarization pipeline.
    new_summarized_pipeline = pipeline(task="summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify")
    summarized_text = new_summarized_pipeline(str_abstract)
    # Build a single string from the list of dictionaries returned by the pipeline.
    str_summary = ",".join([item['summary_text'] for item in summarized_text])
    # Tokenize the summary string with the SpeechT5 processor.
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    inputs = processor(text=str_summary, return_tensors="pt")
    # SpeechT5 needs a speaker embedding (x-vector) to define the voice; use one from the CMU ARCTIC x-vector set.
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    # Generate the speech of the summarised abstract, letting the HiFi-GAN vocoder turn the spectrogram into a waveform.
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    with torch.no_grad():
        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    # SpeechT5 outputs audio at 16 kHz.
    sr = 16000
    return (sr, speech.numpy())
# Audio(speech, rate=16000)
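# Local sanity check (hypothetical file name, not executed by the Space): the returned tuple can be
# written to disk with the soundfile import above, e.g.
#   sr, audio = pdf_to_speech("paper_with_abstract.pdf")
#   sf.write("abstract_summary.wav", audio, sr)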
# Use a Gradio Interface to wire the function to a file input and an audio output.
app = gr.Interface(fn=pdf_to_speech,
                   inputs="file",
                   outputs="audio",
                   title="PDF Abstract to Audio Application",
                   description="This app accepts a PDF that contains an Abstract, summarises it and converts the summary into speech. Click to upload a PDF with an abstract.",
                   theme="soft")
app.launch()