# https://huggingface.co/spaces/rashisinghal/ai_speech_application
"""Gradio app: extract the abstract of a PDF, summarise it and read it aloud.

Pipeline: PyMuPDF span extraction -> font-style scoring to separate headings
from body text -> summarisation of the "Abstract" section -> SpeechT5
text-to-speech.

Notebook-style setup commands:

    !pip install pymupdf
    !pip install git+https://github.com/huggingface/transformers.git
    !pip install datasets sentencepiece
    !pip install unidecode
    !pip install transformers
    !pip install gradio
"""

# Here are the imports
import os
import re
from collections import Counter
from functools import lru_cache

import gradio as gr
from pymupdf import fitz
import torch
from unidecode import unidecode
import pandas as pd
import numpy as np
import soundfile as sf
from IPython.display import Audio
from datasets import load_dataset
from transformers import pipeline
from transformers import SpeechT5HifiGan
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

# Text inside (...) or [...] is ignored when deciding whether a span is
# written in upper case.
_BRACKETED = re.compile(r"[\(\[].*?[\)\]]")
# Spans containing any of these characters never receive the bold/upper-case
# bonus when they are scored.
_SPECIAL = re.compile(r"[(_:/,#%=@)]")

# Sample rate reported to Gradio together with the generated waveform.
_SAMPLE_RATE = 16000
# Row of the x-vector dataset used as the speaker embedding.
_SPEAKER_INDEX = 7306


# Here is the code
def _extract_scored_spans(doc):
    """Return ``(text, score)`` for every non-blank text span in *doc*.

    The score is the rounded font size, plus one if the font name contains
    "bold" and plus one if the text (ignoring bracketed parts) is upper case.
    Both bonuses are skipped when the text contains a character matched by
    ``_SPECIAL``.
    """
    scored = []
    for page in doc:
        # "dict" output nests blocks -> lines -> spans.
        for block in page.get_text("dict")["blocks"]:
            if block["type"] != 0:
                # Only type-0 blocks are processed; they are the ones that
                # carry "lines".
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    text = unidecode(span["text"])
                    if not text.replace(" ", ""):
                        continue
                    score = round(span["size"])
                    if not _SPECIAL.search(text):
                        if "bold" in span["font"].lower():
                            score += 1
                        if _BRACKETED.sub("", text).isupper():
                            score += 1
                    scored.append((text, score))
    return scored


def _tags_for_scores(scores):
    """Map each score to a tag: ``p`` (body), ``hN`` (heading) or ``sN`` (small).

    The most frequent score is taken to be body text; ties go to the smallest
    score. Larger scores become ``h1``, ``h2``, ... from the largest down, and
    smaller scores become ``s1``, ``s2``, ... from the largest down.
    """
    counts = Counter(scores)
    # Iterating in ascending order makes max() pick the smallest score on ties.
    body_score = max(sorted(counts), key=counts.get)
    tag_of = {}
    rank = 0
    for score in sorted(counts, reverse=True):
        rank += 1
        if score == body_score:
            rank = 0  # restart numbering for the sub-paragraph styles
            tag_of[score] = "p"
        elif score > body_score:
            tag_of[score] = "h{0}".format(rank)
        else:
            tag_of[score] = "s{0}".format(rank)
    return [tag_of[score] for score in scores]


def _group_by_heading(texts, tags):
    """Return ``(heading, content)`` pairs in document order.

    Every heading span starts a new section; the non-heading spans that follow
    it are joined with newlines to form its content. Text that appears before
    the first heading is dropped.
    """
    sections = []
    current_heading = None
    body = []
    for text, tag in zip(texts, tags):
        if tag.startswith("h"):
            if current_heading is not None:
                sections.append((current_heading, "\n".join(body)))
            current_heading = text
            body = []
        else:
            body.append(text)
    if current_heading is not None:
        sections.append((current_heading, "\n".join(body)))
    return sections


def _find_abstract(sections):
    """Return the content of the first non-empty section headed "Abstract".

    The heading is compared case-insensitively after stripping surrounding
    whitespace and a trailing ``:`` or ``.``.

    Raises:
        ValueError: if no such section exists.
    """
    for heading, content in sections:
        normalised = heading.strip().rstrip(":.").strip().casefold()
        if normalised == "abstract" and content.strip():
            return content
    raise ValueError("No 'Abstract' section with text was found in the PDF.")


@lru_cache(maxsize=1)
def _summarizer():
    """Build the summarisation pipeline once and reuse it across requests."""
    return pipeline(task="summarization", model="pszemraj/long-t5-tglobal-base-sci-simplify")


@lru_cache(maxsize=1)
def _tts_components():
    """Load the processor, TTS model, vocoder and speaker embedding once."""
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(
        embeddings_dataset[_SPEAKER_INDEX]["xvector"]
    ).unsqueeze(0)
    return processor, model, vocoder, speaker_embeddings


def pdf_to_speech(pdf_path):
    """Summarise the abstract of a PDF and synthesise it as speech.

    Args:
        pdf_path: Path to the PDF (``str`` or ``os.PathLike``), or an uploaded
            file object exposing the path as ``.name``.

    Returns:
        A ``(sample_rate, waveform)`` tuple where ``waveform`` is the NumPy
        array produced by the vocoder.

    Raises:
        ValueError: if the PDF has no extractable text or no "Abstract"
            section.
    """
    # A file wrapper is resolved through .name. Path-like objects are passed
    # through untouched, because Path.name is only the base name.
    if isinstance(pdf_path, (str, os.PathLike)):
        path = pdf_path
    else:
        path = pdf_path.name

    # The context manager closes the document once the spans are extracted.
    with fitz.open(path) as doc:
        scored_spans = _extract_scored_spans(doc)
    if not scored_spans:
        raise ValueError("No text could be extracted from the PDF.")

    texts = [text for text, _ in scored_spans]
    scores = [score for _, score in scored_spans]
    sections = _group_by_heading(texts, _tags_for_scores(scores))
    str_abstract = _find_abstract(sections)

    # The pipeline returns a list of dicts; join their summaries into one string.
    summarized_text = _summarizer()(str_abstract)
    str_summary = ",".join([item['summary_text'] for item in summarized_text])

    processor, model, vocoder, speaker_embeddings = _tts_components()
    inputs = processor(text=str_summary, return_tensors="pt")
    # Passing the vocoder makes generate_speech return the waveform directly,
    # so a separate spectrogram + vocoder pass is not needed.
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    return (_SAMPLE_RATE, speech.numpy())


# Using Gradio Interface to specify the function name, inputs and outputs
app = gr.Interface(
    fn=pdf_to_speech,
    inputs="file",
    outputs="audio",
    title="PDF Abstract to Audio Application",
    description="This App accepts PDF which has Abstract , summarises it and converts into Speech. Click to upload PDF with abstract.",
    theme="soft",
)
app.launch()