Spaces:

pierreguillou
/

extracao_das_palavras_frases_chave_em_portugues

Runtime error

App Files Files Community

pierreguillou committed on Dec 17, 2022

Commit

543c8b7

1 Parent(s): b5cb1eb

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -64

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import os
 import gradio as gr
 import wget
 from ftlangdetect import detect
@@ -28,10 +29,8 @@ kw_model = {
 # os.system("python -m spacy download pt_core_news_lg")
 # download stop words in Portuguese
-#import nltk
-#nltk.download('stopwords')
-#from nltk.corpus import stopwords
-#stop_words = set(stopwords.words('portuguese'))
 # Part-of-Speech Tagging for Portuguese (https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/Multilingual/Portuguese/03-POS-Keywords-Portuguese.html)
 pos_pattern='<CONJ.*>*<ADP.*>*<ADV.*>*<NUM.*>*<ADJ.*>*<N.*>+'
@@ -40,10 +39,64 @@ pos_pattern='<CONJ.*>*<ADP.*>*<ADV.*>*<NUM.*>*<ADJ.*>*<N.*>+'
 vectorizer = KeyphraseCountVectorizer(spacy_pipeline='pt_core_news_lg', stop_words=None, pos_pattern=pos_pattern, lowercase=False)
 # function principal (keywords)
-def get_kw_html(model_id, doc, top_n, diversity):
   # detect lang
-  res = detect(text=doc, low_memory=False)
   lang = res["lang"]
   score = res["score"]
@@ -63,68 +116,24 @@ def get_kw_html(model_id, doc, top_n, diversity):
   else:
-    # keywords
-    def get_kw(kw_model=kw_model[model_id], doc=doc, top_n=top_n, diversity=diversity):
-      keywords = kw_model.extract_keywords(doc,
-                                          vectorizer=vectorizer,
-                                          use_mmr=True, diversity=diversity,
-                                          top_n=top_n,
-                                          )
-      keywords_json = {item[0]:item[1] for item in keywords}
-      return keywords, keywords_json
-    # highlight
-    def get_html(keywords, doc=doc):
-      # ordering of lists (from longest keywords to shortest ones)
-      list3 = [keyword[0] for keyword in keywords]
-      list2 = [len(item.split()) for item in list3]
-      list1 = list(range(len(list2)))
-      list2, list1 = (list(t) for t in zip(*sorted(zip(list2, list1))))
-      list1 = list1[::-1]
-      keywords_list = [list3[idx] for idx in list1]
-      # converting doc to html format
-      html_doc = doc
-      for idx,keyword in enumerate(keywords_list):
-        if sum([True if keyword in item else False for item in keywords_list[:idx]]) == 0:
-          if keyword not in '<span style="color: black; background-color: yellow; padding:2px">' and keyword not in '</span>':
-            html_doc = html_doc.replace(keyword, '<span style="color: black; background-color: yellow; padding:2px">' + keyword + '</span>')
-      html_doc = '<p style="font-size:150%; line-height:120%">' + html_doc + '</p>'
-      return html_doc
-    # function to clean text of document
-    doc = clean(doc,
-        fix_unicode=True,               # fix various unicode errors
-        to_ascii=False,                  # transliterate to closest ASCII representation
-        lower=False,                    # lowercase text
-        no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
-        no_urls=False,                  # replace all URLs with a special token
-        no_emails=False,                # replace all email addresses with a special token
-        no_phone_numbers=False,         # replace all phone numbers with a special token
-        no_numbers=False,               # replace all numbers with a special token
-        no_digits=False,                # replace all digits with a special token
-        no_currency_symbols=False,      # replace all currency symbols with a special token
-        no_punct=False,                 # remove punctuations
-        replace_with_punct="",          # instead of removing punctuations you may replace them
-        replace_with_url="<URL>",
-        replace_with_email="<EMAIL>",
-        replace_with_phone_number="<PHONE>",
-        replace_with_number="<NUMBER>",
-        replace_with_digit="0",
-        replace_with_currency_symbol="<CUR>",
-        lang="pt"                       # set to 'de' for German special handling
-    )
     # get keywords and highlighted text
     keywords, keywords_json = get_kw()
     html_doc = get_html(keywords)
-    label = f"A palavra/frase chave com a maior probabilidade é: {keywords[0]}"
   return label, keywords_json, html_doc
 title = "Extração das key palavras/frases em português"
 description = '<p>(17/12/2022) Forneça seu próprio documento em português e o APP vai fazer a extração das palavras/frases chave com as maiores probabilidades de similardide ao texto.\
 <br />Segundo você, qual é o melhor modelo?</p>\
 <p>Este aplicativo usa os modelos seguintes:\
@@ -134,6 +143,7 @@ description = '<p>(17/12/2022) Forneça seu próprio documento em português e o
 <br />- <a href="https://github.com/TimSchopf/KeyphraseVectorizers#keyphrasevectorizers">KeyphraseVectorizers</a> para definir o vetorizador que extrai palavras/frases chave com padrões de parte do texto de um documento.\
 <br />- <a href="https://maartengr.github.io/KeyBERT/index.html">KeyBERT</a> para calcular as similaridades entre as palavras/frases chave e o texto do documento.</p>'
 doc_original = """
 As contas de pelo menos seis jornalistas norte-americanos que cobrem tecnologia foram suspensas pelo Twitter na noite desta quinta-feira (15). Os profissionais escrevem sobre o tema para diversos veículos de comunicação dos Estados Unidos, como os jornais 'The New York Times' e 'Washington Post'.
@@ -152,8 +162,9 @@ examples = [
     [doc_original.strip()],
 ]
 interface_0 = gr.Interface(
-    fn=partial(get_kw_html, 0),
     inputs=[
         gr.Textbox(lines=15, label="Texto do documento"),
         gr.Slider(1, 20, value=5, label="Número das palavras/frases chave a procurar (padrão: 5)"),
@@ -167,7 +178,7 @@ interface_0 = gr.Interface(
 )
 interface_1 = gr.Interface(
-    fn=partial(get_kw_html, 1),
     inputs=[
         gr.Textbox(lines=15, label="Texto do documento"),
         gr.Slider(1, 20, value=5, label="Número das palavras/frases chave a procurar (padrão: 5)"),
@@ -181,7 +192,7 @@ interface_1 = gr.Interface(
 )
 interface_2 = gr.Interface(
-    fn=partial(get_kw_html, 2),
     inputs=[
         gr.Textbox(lines=15, label="Texto do documento"),
         gr.Slider(1, 20, value=5, label="Número das palavras/frases chave a procurar (padrão: 5)"),
@@ -194,6 +205,7 @@ interface_2 = gr.Interface(
         ]
 )
 demo = gr.Parallel(interface_0, interface_1, interface_2,
                    title=title,
                    description=description,

 import os
+import subprocess
 import gradio as gr
 import wget
 from ftlangdetect import detect
 # os.system("python -m spacy download pt_core_news_lg")
 # download stop words in Portuguese
 # Runs the helper script stopwords.py in a child process and captures what it
 # prints on stdout as text.
+output = subprocess.run(["python", "stopwords.py"], capture_output=True, text=True)
 # NOTE(review): eval() executes whatever the child process printed. The return
 # code is not checked, so if the script fails and prints nothing, eval("")
 # raises SyntaxError on this line. Presumably stopwords.py prints a literal
 # collection of words (verify against that script); if so, ast.literal_eval
 # would parse it without executing arbitrary code.
 # NOTE(review): the vectorizer defined below is built with stop_words=None, so
 # stop_words does not appear to be used in the visible part of this file.
+stop_words = eval(output.stdout)
 # Part-of-Speech Tagging for Portuguese (https://melaniewalsh.github.io/Intro-Cultural-Analytics/05-Text-Analysis/Multilingual/Portuguese/03-POS-Keywords-Portuguese.html)
 pos_pattern='<CONJ.*>*<ADP.*>*<ADV.*>*<NUM.*>*<ADJ.*>*<N.*>+'
 vectorizer = KeyphraseCountVectorizer(spacy_pipeline='pt_core_news_lg', stop_words=None, pos_pattern=pos_pattern, lowercase=False)
 # function principal (keywords)
+def get_kw_html(doc, top_n, diversity, model_id):
+  # keywords: extract the top keywords/keyphrases with the model selected by model_id
   # All four defaults are evaluated once, when this def statement runs. In the
   # visible code that is before the clean(...) call further down reassigns doc,
   # and get_kw() is later called without arguments, so extraction runs on the
   # uncleaned input text. NOTE(review): confirm this is intended.
+  def get_kw(kw_model=kw_model[model_id], doc=doc, top_n=top_n, diversity=diversity):
     # Uses the module-level vectorizer; use_mmr/diversity/top_n are passed
     # straight through to extract_keywords.
+    keywords = kw_model.extract_keywords(doc,
+                                         vectorizer=vectorizer,
+                                         use_mmr=True, diversity=diversity,
+                                         top_n=top_n,
+                                         )
     # Each item is indexed as item[0] -> item[1]; presumably (keyphrase, score)
     # pairs - verify against the KeyBERT documentation.
+    keywords_json = {item[0]:item[1] for item in keywords}
     # Returns both the raw list and the dict form built from it.
+    return keywords, keywords_json
+  # highlight: build an HTML version of the document with the keywords marked in yellow
   # keywords is the list returned by get_kw; only element [0] of each entry is used.
   # doc is bound as a default when this def statement runs. In the visible code
   # that is before the clean(...) call further down reassigns doc, so the text
   # highlighted here is the uncleaned input. NOTE(review): confirm this is intended.
+  def get_html(keywords, doc=doc):
+    # ordering of lists (from longest keywords to shortest ones)
     # "Longest" is measured in whitespace-separated words, not in characters.
     # list3 = keyword strings, list2 = their word counts, list1 = their positions.
+    list3 = [keyword[0] for keyword in keywords]
+    list2 = [len(item.split()) for item in list3]
+    list1 = list(range(len(list2)))
     # Sort (word_count, position) pairs ascending, then reverse the positions
     # below to obtain descending word count.
     # NOTE(review): with an empty keywords list, zip(*...) yields nothing and
     # this two-target unpacking raises ValueError.
+    list2, list1 = (list(t) for t in zip(*sorted(zip(list2, list1))))
+    list1 = list1[::-1]
+    keywords_list = [list3[idx] for idx in list1]
+    # converting doc to html format
+    html_doc = doc
+    for idx,keyword in enumerate(keywords_list):
       # Skip a keyword that is a substring of any keyword earlier in the
       # ordered list.
+      if sum([True if keyword in item else False for item in keywords_list[:idx]]) == 0:
         # Skip keywords that are substrings of the span markup itself (for
         # example "color"); presumably to avoid rewriting tags inserted by an
         # earlier iteration.
+        if keyword not in '<span style="color: black; background-color: yellow; padding:2px">' and keyword not in '</span>':
           # str.replace wraps every occurrence of the keyword. No HTML escaping
           # is applied to the document text in this function.
+          html_doc = html_doc.replace(keyword, '<span style="color: black; background-color: yellow; padding:2px">' + keyword + '</span>')
     # Wrap the whole result in a single enlarged paragraph.
+    html_doc = '<p style="font-size:150%; line-height:120%">' + html_doc + '</p>'
+    return html_doc
+  # function to clean text of document
+  doc = clean(doc,
+              fix_unicode=True,               # fix various unicode errors
+              to_ascii=False,                  # transliterate to closest ASCII representation
+              lower=False,                    # lowercase text
+              no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
+              no_urls=False,                  # replace all URLs with a special token
+              no_emails=False,                # replace all email addresses with a special token
+              no_phone_numbers=False,         # replace all phone numbers with a special token
+              no_numbers=False,               # replace all numbers with a special token
+              no_digits=False,                # replace all digits with a special token
+              no_currency_symbols=False,      # replace all currency symbols with a special token
+              no_punct=False,                 # remove punctuations
+              replace_with_punct="",          # instead of removing punctuations you may replace them
+              replace_with_url="<URL>",
+              replace_with_email="<EMAIL>",
+              replace_with_phone_number="<PHONE>",
+              replace_with_number="<NUMBER>",
+              replace_with_digit="0",
+              replace_with_currency_symbol="<CUR>",
+              lang="pt"                       # set to 'de' for German special handling
+              )
   # detect lang
+  res = detect(text=str(doc), low_memory=False)
   lang = res["lang"]
   score = res["score"]
   else:
     # get keywords and highlighted text
     keywords, keywords_json = get_kw()
     html_doc = get_html(keywords)
+    label = f"A palavra/frase chave com a maior probabilidade é: [ {keywords[0][0]} ]"
   return label, keywords_json, html_doc
+def get_kw_html_0(doc, top_n, diversity, model_id=0):
   # Entry point passed as fn to interface_0: forwards all arguments to
   # get_kw_html, with model_id defaulting to 0.
+  return get_kw_html(doc, top_n, diversity, model_id)
+def get_kw_html_1(doc, top_n, diversity, model_id=1):
   # Entry point passed as fn to interface_1: same forwarding, model_id defaults to 1.
+  return get_kw_html(doc, top_n, diversity, model_id)
+def get_kw_html_2(doc, top_n, diversity, model_id=2):
   # Entry point passed as fn to interface_2: same forwarding, model_id defaults to 2.
+  return get_kw_html(doc, top_n, diversity, model_id)
 title = "Extração das key palavras/frases em português"
 description = '<p>(17/12/2022) Forneça seu próprio documento em português e o APP vai fazer a extração das palavras/frases chave com as maiores probabilidades de similardide ao texto.\
 <br />Segundo você, qual é o melhor modelo?</p>\
 <p>Este aplicativo usa os modelos seguintes:\
 <br />- <a href="https://github.com/TimSchopf/KeyphraseVectorizers#keyphrasevectorizers">KeyphraseVectorizers</a> para definir o vetorizador que extrai palavras/frases chave com padrões de parte do texto de um documento.\
 <br />- <a href="https://maartengr.github.io/KeyBERT/index.html">KeyBERT</a> para calcular as similaridades entre as palavras/frases chave e o texto do documento.</p>'
+# examples
 doc_original = """
 As contas de pelo menos seis jornalistas norte-americanos que cobrem tecnologia foram suspensas pelo Twitter na noite desta quinta-feira (15). Os profissionais escrevem sobre o tema para diversos veículos de comunicação dos Estados Unidos, como os jornais 'The New York Times' e 'Washington Post'.
     [doc_original.strip()],
 ]
+# interfaces
 interface_0 = gr.Interface(
+    fn=get_kw_html_0,
     inputs=[
         gr.Textbox(lines=15, label="Texto do documento"),
         gr.Slider(1, 20, value=5, label="Número das palavras/frases chave a procurar (padrão: 5)"),
 )
 interface_1 = gr.Interface(
+    fn=get_kw_html_1,
     inputs=[
         gr.Textbox(lines=15, label="Texto do documento"),
         gr.Slider(1, 20, value=5, label="Número das palavras/frases chave a procurar (padrão: 5)"),
 )
 interface_2 = gr.Interface(
+    fn=get_kw_html_2,
     inputs=[
         gr.Textbox(lines=15, label="Texto do documento"),
         gr.Slider(1, 20, value=5, label="Número das palavras/frases chave a procurar (padrão: 5)"),
         ]
 )
+# app
 demo = gr.Parallel(interface_0, interface_1, interface_2,
                    title=title,
                    description=description,