Update functions.py
functions.py CHANGED (+3, -314)
@@ -9,7 +9,7 @@ import plotly_express as px
 import nltk
 import plotly.graph_objects as go
 from optimum.onnxruntime import ORTModelForSequenceClassification
-from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification,
+from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
 from sentence_transformers import SentenceTransformer, CrossEncoder, util
 import streamlit as st
 import en_core_web_lg
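
Note on this hunk: the old import ended in a bare trailing comma, which Python rejects at parse time ("SyntaxError: trailing comma not allowed without surrounding parentheses"); the new line completes the list with AutoModelForSeq2SeqLM. If the list keeps growing, the parenthesized form is the idiomatic way to wrap it (a sketch with the same names as the new line; not part of this commit):

from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForSeq2SeqLM,  # a trailing comma IS legal inside parentheses
)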
@@ -73,18 +73,15 @@ def load_models():
     '''Load and cache all the models to be used'''
     q_model = ORTModelForSequenceClassification.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
     ner_model = AutoModelForTokenClassification.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
-    kg_model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")
-    kg_tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
     q_tokenizer = AutoTokenizer.from_pretrained("nickmuchi/quantized-optimum-finbert-tone")
     ner_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large-finetuned-conll03-english")
-    emb_tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-xl')
     sent_pipe = pipeline("text-classification",model=q_model, tokenizer=q_tokenizer)
     sum_pipe = pipeline("summarization",model="philschmid/flan-t5-base-samsum",clean_up_tokenization_spaces=True)
     ner_pipe = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer, grouped_entities=True)
     cross_encoder = CrossEncoder('cross-encoder/mmarco-mMiniLMv2-L12-H384-v1') #cross-encoder/ms-marco-MiniLM-L-12-v2
     sbert = SentenceTransformer('all-MiniLM-L6-v2')
 
-    return sent_pipe, sum_pipe, ner_pipe, cross_encoder,
+    return sent_pipe, sum_pipe, ner_pipe, cross_encoder, sbert
 
 @st.cache_resource
 def get_spacy():
@@ -93,7 +90,7 @@ def get_spacy():
 
 nlp = get_spacy()
 
-sent_pipe, sum_pipe, ner_pipe, cross_encoder,
+sent_pipe, sum_pipe, ner_pipe, cross_encoder, sbert = load_models()
 
 @st.cache_data
 def get_yt_audio(url):
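
Taken together, the two hunks above fix a tuple-arity mismatch: load_models() built sbert but the return stopped at cross_encoder, while the module-level unpack expects five values. A minimal sketch of the contract (stand-in objects instead of the real pipelines; the @st.cache_resource decorator is an assumption, since load_models' own decorator sits outside these hunks):

import streamlit as st

@st.cache_resource  # assumed: cached once per process, like get_spacy above
def load_models():
    # stand-ins for the real pipelines, just to show the tuple shape
    sent_pipe, sum_pipe, ner_pipe, cross_encoder, sbert = (object() for _ in range(5))
    return sent_pipe, sum_pipe, ner_pipe, cross_encoder, sbert

# The unpack must name exactly as many targets as the function returns;
# with the old four-element return this line raised
# "ValueError: not enough values to unpack (expected 5, got 4)".
sent_pipe, sum_pipe, ner_pipe, cross_encoder, sbert = load_models()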
@@ -696,317 +693,9 @@ def fin_ext(text):
 
 ## Knowledge Graphs code
 
-@st.cache_data
-def extract_relations_from_model_output(text):
-    relations = []
-    relation, subject, relation, object_ = '', '', '', ''
-    text = text.strip()
-    current = 'x'
-    text_replaced = text.replace("<s>", "").replace("<pad>", "").replace("</s>", "")
-    for token in text_replaced.split():
-        if token == "<triplet>":
-            current = 't'
-            if relation != '':
-                relations.append({
-                    'head': subject.strip(),
-                    'type': relation.strip(),
-                    'tail': object_.strip()
-                })
-                relation = ''
-            subject = ''
-        elif token == "<subj>":
-            current = 's'
-            if relation != '':
-                relations.append({
-                    'head': subject.strip(),
-                    'type': relation.strip(),
-                    'tail': object_.strip()
-                })
-            object_ = ''
-        elif token == "<obj>":
-            current = 'o'
-            relation = ''
-        else:
-            if current == 't':
-                subject += ' ' + token
-            elif current == 's':
-                object_ += ' ' + token
-            elif current == 'o':
-                relation += ' ' + token
-    if subject != '' and relation != '' and object_ != '':
-        relations.append({
-            'head': subject.strip(),
-            'type': relation.strip(),
-            'tail': object_.strip()
-        })
-    return relations
-
-def from_text_to_kb(text, model, tokenizer, article_url, span_length=128, article_title=None,
-                    article_publish_date=None, verbose=False):
-    # tokenize whole text
-    inputs = tokenizer([text], return_tensors="pt")
-
-    # compute span boundaries
-    num_tokens = len(inputs["input_ids"][0])
-    if verbose:
-        print(f"Input has {num_tokens} tokens")
-    num_spans = math.ceil(num_tokens / span_length)
-    if verbose:
-        print(f"Input has {num_spans} spans")
-    overlap = math.ceil((num_spans * span_length - num_tokens) /
-                        max(num_spans - 1, 1))
-    spans_boundaries = []
-    start = 0
-    for i in range(num_spans):
-        spans_boundaries.append([start + span_length * i,
-                                 start + span_length * (i + 1)])
-        start -= overlap
-    if verbose:
-        print(f"Span boundaries are {spans_boundaries}")
-
-    # transform input with spans
-    tensor_ids = [inputs["input_ids"][0][boundary[0]:boundary[1]]
-                  for boundary in spans_boundaries]
-    tensor_masks = [inputs["attention_mask"][0][boundary[0]:boundary[1]]
-                    for boundary in spans_boundaries]
-    inputs = {
-        "input_ids": torch.stack(tensor_ids),
-        "attention_mask": torch.stack(tensor_masks)
-    }
-
-    # generate relations
-    num_return_sequences = 3
-    gen_kwargs = {
-        "max_length": 256,
-        "length_penalty": 0,
-        "num_beams": 3,
-        "num_return_sequences": num_return_sequences
-    }
-    generated_tokens = model.generate(
-        **inputs,
-        **gen_kwargs,
-    )
-
-    # decode relations
-    decoded_preds = tokenizer.batch_decode(generated_tokens,
-                                           skip_special_tokens=False)
-
-    # create kb
-    kb = KB()
-    i = 0
-    for sentence_pred in decoded_preds:
-        current_span_index = i // num_return_sequences
-        relations = extract_relations_from_model_output(sentence_pred)
-        for relation in relations:
-            relation["meta"] = {
-                article_url: {
-                    "spans": [spans_boundaries[current_span_index]]
-                }
-            }
-            kb.add_relation(relation, article_title, article_publish_date)
-        i += 1
-
-    return kb
-
 def get_article(url):
     article = Article(url)
     article.download()
     article.parse()
     return article
 
-def from_url_to_kb(url, model, tokenizer):
-    article = get_article(url)
-    config = {
-        "article_title": article.title,
-        "article_publish_date": article.publish_date
-    }
-    kb = from_text_to_kb(article.text, model, tokenizer, article.url, **config)
-    return kb
-
-def get_news_links(query, lang="en", region="US", pages=1):
-    googlenews = GoogleNews(lang=lang, region=region)
-    googlenews.search(query)
-    all_urls = []
-    for page in range(pages):
-        googlenews.get_page(page)
-        all_urls += googlenews.get_links()
-    return list(set(all_urls))
-
-def from_urls_to_kb(urls, model, tokenizer, verbose=False):
-    kb = KB()
-    if verbose:
-        print(f"{len(urls)} links to visit")
-    for url in urls:
-        if verbose:
-            print(f"Visiting {url}...")
-        try:
-            kb_url = from_url_to_kb(url, model, tokenizer)
-            kb.merge_with_kb(kb_url)
-        except ArticleException:
-            if verbose:
-                print(f" Couldn't download article at url {url}")
-    return kb
-
-def save_network_html(kb, filename="network.html"):
-    # create network
-    net = Network(directed=True, width="700px", height="700px")
-
-    # nodes
-    color_entity = "#00FF00"
-    for e in kb.entities:
-        net.add_node(e, shape="circle", color=color_entity)
-
-    # edges
-    for r in kb.relations:
-        net.add_edge(r["head"], r["tail"],
-                     title=r["type"], label=r["type"])
-
-    # save network
-    net.repulsion(
-        node_distance=200,
-        central_gravity=0.2,
-        spring_length=200,
-        spring_strength=0.05,
-        damping=0.09
-    )
-    net.set_edge_smooth('dynamic')
-    net.show(filename)
-
-def save_kb(kb, filename):
-    with open(filename, "wb") as f:
-        pickle.dump(kb, f)
-
-class CustomUnpickler(pickle.Unpickler):
-    def find_class(self, module, name):
-        if name == 'KB':
-            return KB
-        return super().find_class(module, name)
-
-def load_kb(filename):
-    res = None
-    with open(filename, "rb") as f:
-        res = CustomUnpickler(f).load()
-    return res
-
-class KB():
-    def __init__(self):
-        self.entities = {} # { entity_title: {...} }
-        self.relations = [] # [ head: entity_title, type: ..., tail: entity_title,
-                            #   meta: { article_url: { spans: [...] } } ]
-        self.sources = {} # { article_url: {...} }
-
-    def merge_with_kb(self, kb2):
-        for r in kb2.relations:
-            article_url = list(r["meta"].keys())[0]
-            source_data = kb2.sources[article_url]
-            self.add_relation(r, source_data["article_title"],
-                              source_data["article_publish_date"])
-
-    def are_relations_equal(self, r1, r2):
-        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])
-
-    def exists_relation(self, r1):
-        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)
-
-    def merge_relations(self, r2):
-        r1 = [r for r in self.relations
-              if self.are_relations_equal(r2, r)][0]
-
-        # if different article
-        article_url = list(r2["meta"].keys())[0]
-        if article_url not in r1["meta"]:
-            r1["meta"][article_url] = r2["meta"][article_url]
-
-        # if existing article
-        else:
-            spans_to_add = [span for span in r2["meta"][article_url]["spans"]
-                            if span not in r1["meta"][article_url]["spans"]]
-            r1["meta"][article_url]["spans"] += spans_to_add
-
-    def get_wikipedia_data(self, candidate_entity):
-        try:
-            page = wikipedia.page(candidate_entity, auto_suggest=False)
-            entity_data = {
-                "title": page.title,
-                "url": page.url,
-                "summary": page.summary
-            }
-            return entity_data
-        except:
-            return None
-
-    def add_entity(self, e):
-        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}
-
-    def add_relation(self, r, article_title, article_publish_date):
-        # check on wikipedia
-        candidate_entities = [r["head"], r["tail"]]
-        entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]
-
-        # if one entity does not exist, stop
-        if any(ent is None for ent in entities):
-            return
-
-        # manage new entities
-        for e in entities:
-            self.add_entity(e)
-
-        # rename relation entities with their wikipedia titles
-        r["head"] = entities[0]["title"]
-        r["tail"] = entities[1]["title"]
-
-        # add source if not in kb
-        article_url = list(r["meta"].keys())[0]
-        if article_url not in self.sources:
-            self.sources[article_url] = {
-                "article_title": article_title,
-                "article_publish_date": article_publish_date
-            }
-
-        # manage new relation
-        if not self.exists_relation(r):
-            self.relations.append(r)
-        else:
-            self.merge_relations(r)
-
-    def get_textual_representation(self):
-        res = ""
-        res += "### Entities\n"
-        for e in self.entities.items():
-            # shorten summary
-            e_temp = (e[0], {k:(v[:100] + "..." if k == "summary" else v) for k,v in e[1].items()})
-            res += f"- {e_temp}\n"
-        res += "\n"
-        res += "### Relations\n"
-        for r in self.relations:
-            res += f"- {r}\n"
-        res += "\n"
-        res += "### Sources\n"
-        for s in self.sources.items():
-            res += f"- {s}\n"
-        return res
-
-def save_network_html(kb, filename="network.html"):
-    # create network
-    net = Network(directed=True, width="700px", height="700px", bgcolor="#eeeeee")
-
-    # nodes
-    color_entity = "#00FF00"
-    for e in kb.entities:
-        net.add_node(e, shape="circle", color=color_entity)
-
-    # edges
-    for r in kb.relations:
-        net.add_edge(r["head"], r["tail"],
-                     title=r["type"], label=r["type"])
-
-    # save network
-    net.repulsion(
-        node_distance=200,
-        central_gravity=0.2,
-        spring_length=200,
-        spring_strength=0.05,
-        damping=0.09
-    )
-    net.set_edge_smooth('dynamic')
-    net.show(filename)