# Hugging Face Space (page status when captured: Sleeping)
from io import StringIO, BytesIO | |
import gradio as gr | |
from pdfminer.high_level import extract_text | |
from transformers import pipeline | |
import pandas as pd | |
import numpy as np | |
# Module-level "ner" pipeline; the same checkpoint id is passed for both the
# model and the tokenizer. Built once when the module is loaded.
nlp = pipeline(
    "ner",
    tokenizer="carblacac/ner-investing",
    model="carblacac/ner-investing",
)
class Group:
    """Hand out a running id that advances whenever the incoming text changes.

    Consecutive calls with the same text share one id; a call whose text
    differs from the previous call's text bumps the id by one first.
    """

    def __init__(self):
        self.id = 0     # id returned by the most recent call (0 before any change)
        self.text = ''  # text passed to the most recent call that changed the id

    def getgroup(self, text):
        """Return the id for *text*, incrementing it if *text* is new."""
        if self.text != text:
            self.id += 1
            self.text = text
        return self.id


# Shared module-level instance: its counter keeps running across every call
# made through it.
grp_gen = Group()
def entities_to_df(entities):
    """Merge consecutive token entities that share a label into one span each.

    Args:
        entities: Per-token records with ``entity``, ``start``, ``end`` and
            ``word`` fields (anything ``pd.DataFrame`` accepts). The first
            two characters of each ``entity`` value are dropped before the
            labels are compared.

    Returns:
        A DataFrame indexed by ``group`` (1-based run number within this
        call) with the columns ``start`` (minimum of the run), ``end``
        (maximum of the run), ``entity`` (one-element object array holding
        the run's label) and ``word`` (the run's words joined by a space).
        Empty input yields an empty frame with the same columns.
    """
    columns = ['start', 'end', 'entity', 'word']
    df = pd.DataFrame(entities)
    if df.empty:
        # No tokens were tagged: there is no 'entity' column to work with.
        return pd.DataFrame(columns=columns, index=pd.Index([], name='group'))

    df['entity'] = df['entity'].str[2:]

    # A new run starts at the first row and wherever the label differs from
    # the previous row's. Computing this locally keeps the result independent
    # of any earlier call.
    labels = df['entity'].to_numpy(dtype=object)
    starts_run = np.concatenate(([True], labels[1:] != labels[:-1]))
    df['group'] = np.cumsum(starts_run)

    spans = df.groupby(by='group').agg({
        'start': 'min',
        'end': 'max',
        'entity': 'first',
        'word': lambda words: " ".join(words),
    })
    # Every row of a run carries the same label; wrap it in a one-element
    # array because consumers of this frame read the label as `entity[0]`.
    spans['entity'] = spans['entity'].map(
        lambda label: np.array([label], dtype=object)
    )
    return spans
def transform_entity_type(entities):
    """Replace each record's ``entity`` value with its first element.

    The records are modified in place and the same container that was
    passed in is returned.
    """
    for record in entities:
        record.update(entity=record['entity'][0])
    return entities
def highlight_text(fileObj):
    """Extract the text of an uploaded PDF and tag the entities found in it.

    Args:
        fileObj: Raw bytes of the PDF; they are wrapped in a ``BytesIO``
            stream before being handed to ``extract_text``.

    Returns:
        A dict with ``"text"`` (the extracted text) and ``"entities"`` (a
        list of merged span records produced by ``entities_to_df`` and
        ``transform_entity_type``; empty when nothing was tagged).
    """
    text = extract_text(BytesIO(fileObj))
    entities = nlp(text)
    if not entities:
        # Nothing was tagged, so there are no spans to merge or highlight.
        return {"text": text, "entities": []}

    spans = entities_to_df(entities).to_dict('records')
    return {"text": text, "entities": transform_entity_type(spans)}
# File name(s) passed to the interface as its `examples`.
examples = ['Beiersdorf sees slower sales this year after bumper 2022 By Reuters.pdf']

# Build the interface around `highlight_text` and start serving it.
gr.Interface(
    fn=highlight_text,
    # `gr.File` is the top-level component, matching `gr.HighlightedText`
    # below; it replaces the legacy `gr.inputs.File`. type="binary" hands the
    # upload to `highlight_text` as bytes (the older spelling was "bytes").
    inputs=gr.File(file_count="single", type="binary"),
    outputs=gr.HighlightedText(),
    examples=examples,
).launch()