"""Gradio demo that highlights named entities found in an uploaded PDF."""
from io import BytesIO, StringIO

import gradio as gr
import numpy as np
import pandas as pd
from pdfminer.high_level import extract_text
from transformers import pipeline

# NER pipeline, created once at import time and reused for every request.
nlp = pipeline("ner", model="carblacac/ner-investing", tokenizer="carblacac/ner-investing")


class Group():
    """Hand out an integer id per run of consecutive, equal values."""

    def __init__(self):
        self.id = 0     # id of the current run (0 until a non-empty value is seen)
        self.text = ''  # value that opened the current run

    def getgroup(self, text):
        """Return the run id for ``text``.

        The id is incremented whenever ``text`` differs from the value passed
        on the previous call; repeated values keep the current id.
        """
        if self.text != text:
            self.id += 1
            self.text = text
        return self.id


# Shared instance: its counter keeps growing across calls to entities_to_df,
# so group ids are unique per process rather than restarting for each document.
grp_gen = Group()


def entities_to_df(entities):
    """Merge consecutive same-label token predictions into one row each.

    Args:
        entities: list of dicts with at least the keys ``entity``, ``start``,
            ``end`` and ``word`` (the output of ``nlp``).

    Returns:
        A DataFrame indexed by group id with columns ``start`` (minimum of the
        group), ``end`` (maximum of the group), ``entity`` (array of the unique
        labels in the group) and ``word`` (the group's words joined by spaces).
        An empty DataFrame with those columns is returned when ``entities`` is
        empty.
    """
    if not entities:
        # pd.DataFrame([]) has no 'entity' column, so the code below would
        # raise KeyError for documents without any detected entity.
        return pd.DataFrame(columns=['start', 'end', 'entity', 'word'])

    df = pd.DataFrame(entities)
    # Drop the first two characters of each label
    # (presumably a "B-"/"I-" style prefix -- confirm against the model's labels).
    df['entity'] = df['entity'].apply(lambda label: label[2:])
    # Consecutive rows carrying the same label share one group id.
    df['group'] = df['entity'].apply(grp_gen.getgroup)

    img_tagging = df.groupby(by='group').agg({
        # String aliases give the same result as the min/max builtins without
        # the FutureWarning recent pandas emits for builtin callables.
        'start': 'min',
        'end': 'max',
        'entity': np.unique,
        'word': lambda words: " ".join(words),
    })
    return img_tagging


def transform_entity_type(entities):
    """Replace each record's ``entity`` with its first element, in place.

    ``entities_to_df`` stores the label as an array of unique values; this
    unwraps it to the single label. The same list is returned.
    """
    for record in entities:
        record['entity'] = record['entity'][0]
    return entities


def highlight_text(fileObj):
    """Extract the text of an uploaded PDF and tag its named entities.

    Args:
        fileObj: raw bytes of the uploaded PDF.

    Returns:
        A dict with the extracted ``text`` and a list of ``entities`` records
        (``start``, ``end``, ``entity``, ``word``), the shape passed to
        ``gr.HighlightedText``.
    """
    if not fileObj:
        # Nothing uploaded (None or empty bytes): there is no PDF to parse.
        return {"text": "", "entities": []}

    text = extract_text(BytesIO(fileObj))
    grouped = entities_to_df(nlp(text))
    entities = transform_entity_type(grouped.to_dict('records'))
    return {"text": text, "entities": entities}


examples = ['Beiersdorf sees slower sales this year after bumper 2022 By Reuters.pdf']

gr.Interface(
    fn=highlight_text,
    inputs=gr.inputs.File(file_count="single", type="bytes"),
    outputs=gr.HighlightedText(),
    examples=examples,
).launch()