Spaces:

unb-lamfo-nlp-mcti
/

NLP-W2V-CNN-Multi

Runtime error

App Files Community

chap0lin committed on Dec 8, 2022

Commit

8034507

•

1 Parent(s): 0078548

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -36

app.py CHANGED Viewed

@@ -93,34 +93,45 @@ def remove_stopwords(text, is_lower_case=False, stopwords=None):
   return filtered_text
-def pre_process():
-  opo_texto_sem_caracteres_especiais = (remove_accented_chars(sentence))
-  sentenceExpanded = contractions.fix(opo_texto_sem_caracteres_especiais)
-  sentenceWithoutPunctuation = remove_special_characters(sentenceExpanded , remove_digits=True)
-  sentenceLowered = sentenceWithoutPunctuation.lower()
-  sentenceLemmatized = spacy_lemmatize_text(sentenceLowered)
-  sentenceLemStopped = remove_stopwords(sentenceLemmatized, is_lower_case=False)
-  return nltk.word_tokenize(sentenceLemStopped)
 def classify(df, new_column = True):
   sentencesMCTIList_xp8 = df['opo_pre_tkn']
-  print("Dados da planilha adquiridos")
   formatted_sentences = []
   for sentence in sentencesMCTIList_xp8:
     formatted_sentences.append(json.loads(sentence.replace("'",'"')))
-  # del sentencesMCTIList_xp8
-  print(sentencesMCTIList_xp8[0])
-  print("##########################")
-  print(formatted_sentences[0][0])
-  print("Transformado em W2V")
   words = list(reloaded_w2v_model.wv.vocab)
   item_shape = np.shape(reloaded_w2v_model.wv[words[0]])
-  # print(formatted_sentences)
   MCTIinput_vector = []
   for sentence in formatted_sentences:
@@ -132,12 +143,10 @@ def classify(df, new_column = True):
         aux_vector.append(np.zeros(item_shape))
     MCTIinput_vector.append(aux_vector)
   del formatted_sentences
-  print("Convertido W2V")
   MCTIinput_padded = pad_sequences(MCTIinput_vector, maxlen=2726, padding='pre')
   del MCTIinput_vector
-  print("Sentenças com Padding")
-  print(len(MCTIinput_padded))
-  print(len(MCTIinput_padded[0]))
   predictions = reconstructed_model_CNN.predict(MCTIinput_padded)
   del MCTIinput_padded
   print(predictions)
@@ -148,6 +157,9 @@ def classify(df, new_column = True):
   del predictions
   df['classification'] = cleaned_up_predictions
   return df
 def gen_output(data):
@@ -166,27 +178,18 @@ def app(operacao, resultado, dados):
   data = pd.read_excel(dados)
   print("Dados Carregados!")
-  # boxes = {'Color': ['Green','Green','Green','Blue','Blue','Red','Red','Red'],
-  #          'Shape': ['Rectangle','Rectangle','Square','Rectangle','Square','Square','Square','Rectangle'],
-  #          'Price': [10,15,5,5,10,15,15,5]
-  #         }
-  # df = pd.DataFrame(boxes, columns= ['Color','Shape','Price'])
-  # data.to_excel("output.xlsx")
-  # return "output.xlsx"
   if operacao == "Pré-processamento + Classificação" :
-    pre_process()
-    classify(resultado == "Nova Coluna")
-    output = gen_output()
     return output
   elif operacao == "Apenas Pré-processamento" :
-    pre_process()
-    output = gen_output()
     return output
   elif operacao == "Apenas Classificação" :
-    print("Apenas Classificação Selecionado!")
     df = classify(data, resultado == "Nova Coluna")
     output = gen_output(df)

   return filtered_text
def pre_process(df, max_tokens=4000):
  """Add a tokenized ``opo_pre_tkn`` column built from the raw text columns.

  For every row the text to process is chosen from ``opo_texto`` and
  ``opo_texto_ele``:

  * if ``opo_texto_ele`` is missing or equal to ``opo_texto``, only
    ``opo_texto`` is used;
  * otherwise, if ``opo_texto`` has fewer than ``max_tokens`` NLTK tokens,
    the two texts are joined with ``". "``;
  * otherwise only ``opo_texto`` is used;
  * if ``opo_texto`` itself is missing, ``opo_texto_ele`` is used instead
    (or an empty string when both are missing).

  The chosen text is then passed, in order, through the cleaning helpers
  defined elsewhere in this file and finally tokenized with
  ``nltk.word_tokenize``.

  Args:
    df: DataFrame with ``opo_texto`` and ``opo_texto_ele`` columns. It is
      modified in place: the ``opo_pre_tkn`` column is added/overwritten.
    max_tokens: Token-count limit below which the two text columns are
      concatenated. Defaults to 4000, the value previously hard-coded.

  Returns:
    The same ``df`` object, with one list of tokens per row in
    ``opo_pre_tkn``.
  """
  merged_texts = []
  # Iterate over the values in lockstep instead of ``series[i]``: integer
  # subscripts on a Series are label lookups, which fail or misalign when the
  # DataFrame index is not the default 0..n-1 range.
  for text, extra in zip(df['opo_texto'], df['opo_texto_ele']):
    if pd.isna(text):
      # A missing main text would crash the tokenizer and the cleaning
      # helpers below; fall back to the other column or to an empty string.
      merged_texts.append("" if pd.isna(extra) else extra)
    elif pd.isna(extra) or text == extra:
      merged_texts.append(text)
    elif len(nltk.word_tokenize(text)) < max_tokens:
      merged_texts.append(text + ". " + extra)
    else:
      merged_texts.append(text)

  tokenized_rows = []
  for text in merged_texts:
    # The order of the steps is preserved from the original pipeline.
    text = remove_accented_chars(text)
    text = contractions.fix(text)
    text = remove_special_characters(text, remove_digits=True)
    text = text.lower()
    text = spacy_lemmatize_text(text)
    text = remove_stopwords(text, is_lower_case=False)
    tokenized_rows.append(nltk.word_tokenize(text))

  # NOTE(review): each cell stored here is a Python list, while classify() in
  # this file calls ``.replace`` on every ``opo_pre_tkn`` cell (i.e. it
  # expects strings). Confirm how the two are meant to interoperate.
  df['opo_pre_tkn'] = tokenized_rows
  return df
 def classify(df, new_column = True):
   sentencesMCTIList_xp8 = df['opo_pre_tkn']
   formatted_sentences = []
   for sentence in sentencesMCTIList_xp8:
     formatted_sentences.append(json.loads(sentence.replace("'",'"')))
+  del sentencesMCTIList_xp8
   words = list(reloaded_w2v_model.wv.vocab)
   item_shape = np.shape(reloaded_w2v_model.wv[words[0]])
   MCTIinput_vector = []
   for sentence in formatted_sentences:
         aux_vector.append(np.zeros(item_shape))
     MCTIinput_vector.append(aux_vector)
   del formatted_sentences
   MCTIinput_padded = pad_sequences(MCTIinput_vector, maxlen=2726, padding='pre')
   del MCTIinput_vector
   predictions = reconstructed_model_CNN.predict(MCTIinput_padded)
   del MCTIinput_padded
   print(predictions)
   del predictions
   df['classification'] = cleaned_up_predictions
+  if not new_column:
+    df = df.loc[df['classification'] == 1]
   return df
 def gen_output(data):
   data = pd.read_excel(dados)
   print("Dados Carregados!")
   if operacao == "Pré-processamento + Classificação" :
+    df = pre_process(data)
+    df = classify(df, resultado == "Nova Coluna")
+    output = gen_output(df)
     return output
   elif operacao == "Apenas Pré-processamento" :
+    df = pre_process(data)
+    output = gen_output(df)
     return output
   elif operacao == "Apenas Classificação" :
     df = classify(data, resultado == "Nova Coluna")
     output = gen_output(df)