ner-investing / app.py
carblacac's picture
add example to app
7442c2a
from io import StringIO, BytesIO
import gradio as gr
from pdfminer.high_level import extract_text
from transformers import pipeline
import pandas as pd
import numpy as np
# Token-classification ("ner") pipeline, built once when the module is
# imported; model weights and tokenizer both come from the same checkpoint.
nlp = pipeline(
    "ner",
    model="carblacac/ner-investing",
    tokenizer="carblacac/ner-investing",
)
class Group:
    """Hand out a running integer id for runs of identical consecutive labels.

    The counter starts at 0 and the remembered label starts as the empty
    string, so an empty-string label seen first keeps id 0.
    """

    def __init__(self):
        # Last label seen, and the id currently associated with it.
        self.text = ''
        self.id = 0

    def getgroup(self, text):
        """Return the id for ``text``, advancing it when the label changes.

        A label equal to the previously seen one reuses the current id; any
        other label increments the id and becomes the remembered label.
        """
        if self.text == text:
            return self.id
        # Label changed: remember it and move on to the next id.
        self.text = text
        self.id += 1
        return self.id
# Shared module-level Group instance; its counter and remembered label
# persist for the lifetime of the process.
grp_gen = Group()
def entities_to_df(entities):
    """Merge consecutive predictions that share a label into single spans.

    Args:
        entities: List of prediction dicts, each providing ``entity``
            (a label whose first two characters are dropped -- presumably a
            ``B-``/``I-`` style tag prefix, confirm against the model),
            ``start``, ``end`` and ``word``.

    Returns:
        A DataFrame indexed by ``group`` with columns ``start`` (minimum of
        the run), ``end`` (maximum of the run), ``entity`` (``np.unique`` of
        the run's labels) and ``word`` (the run's words joined by spaces).
        Group ids are numbered from 1 within each call. An empty input yields
        an empty frame with the same columns instead of raising ``KeyError``.
    """
    columns = ['start', 'end', 'entity', 'word']
    df = pd.DataFrame(entities)
    if df.empty:
        # Nothing was predicted: there is no 'entity' column to work on, so
        # hand back an empty frame with the shape callers expect.
        return pd.DataFrame(columns=columns, index=pd.Index([], name='group'))

    # Drop the two-character prefix so rows of one span compare equal.
    df['entity'] = df['entity'].apply(lambda x: x[2:])
    # A new group starts wherever the label differs from the previous row.
    # Computed per call, so no state is shared between invocations.
    df['group'] = df['entity'].ne(df['entity'].shift()).cumsum()

    group_tag = df.groupby(by='group')
    img_tagging = group_tag.agg({
        # String names avoid pandas' deprecation of builtin min/max here.
        'start': 'min',
        'end': 'max',
        # NOTE(review): every row in a group has the same label, so this is a
        # single value; whether pandas surfaces it as a scalar or a 1-element
        # array may depend on the pandas version -- confirm before changing.
        'entity': np.unique,
        'word': lambda x: " ".join(x),
    })
    return img_tagging
def transform_entity_type(entities):
    """Replace each record's ``entity`` value with its element at index 0.

    The dicts are modified in place and the same list object is returned.

    NOTE(review): if ``entity`` is a plain string this keeps only its first
    character -- confirm the upstream value is a sequence of labels.
    """
    for record in entities:
        label = record['entity']
        record['entity'] = label[0]
    return entities
def highlight_text(fileObj):
    """Extract text from an uploaded PDF and tag the entities found in it.

    Args:
        fileObj: Raw bytes of the uploaded file; wrapped in a ``BytesIO`` and
            handed to pdfminer's ``extract_text``.

    Returns:
        A dict with ``"text"`` (the extracted text) and ``"entities"`` (the
        merged entity records), which is what the Interface passes to its
        ``gr.HighlightedText`` output.
    """
    pdf_stream = BytesIO(fileObj)
    text = extract_text(pdf_stream)
    # Token-level predictions -> merged spans -> list of plain dicts.
    merged = entities_to_df(nlp(text))
    spans = transform_entity_type(merged.to_dict('records'))
    return {"text": text, "entities": spans}
# Sample document offered as a ready-made example in the UI.
examples = ['Beiersdorf sees slower sales this year after bumper 2022 By Reuters.pdf']

# Single-file upload delivered to highlight_text as bytes; the result is
# rendered as highlighted text. The app starts serving as soon as this runs.
demo = gr.Interface(
    fn=highlight_text,
    inputs=gr.inputs.File(file_count="single", type="bytes"),
    outputs=gr.HighlightedText(),
    examples=examples,
)
demo.launch()