MARITESS / app.py
MarMont's picture
allow examples again
be5bb8c
raw
history blame
18.9 kB
import pandas as pd
import tweepy
import re
import emoji
import spacy
import gensim
import json
import string
from spacy.tokenizer import Tokenizer
from gensim.parsing.preprocessing import STOPWORDS as SW
from wordcloud import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from pprint import pprint
import numpy as np
import tqdm
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
import torch
from transformers import T5ForConditionalGeneration,T5Tokenizer
from googletrans import Translator
from bertopic import BERTopic
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer
from operator import itemgetter
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
global df
bearer_token = 'AAAAAAAAAAAAAAAAAAAAACEigwEAAAAACoP8KHJYLOKCL4OyB9LEPV00VB0%3DmyeDROUvw4uipHwvbPPfnTuY0M9ORrLuXrMvcByqZhwo3SUc4F'
client = tweepy.Client(bearer_token=bearer_token)
nlp = spacy.load('en_core_web_lg')
print('hi')
def scrape(keywords):
query = keywords + ' (lang:en OR lang:tl) -is:retweet'
max_results = 100
tweet_fields=['geo', 'id', 'lang', 'created_at']
expansions=['geo.place_id']
place_fields = ['contained_within', 'country', 'country_code', 'full_name', 'geo', 'id', 'name', 'place_type']
response = client.search_recent_tweets(
query=query,
max_results=max_results,
tweet_fields=tweet_fields,
expansions=expansions,
place_fields=place_fields
)
tweets = []
for x in response[0]:
tweets.append(str(x))
place_data = str(response[1])
df = pd.DataFrame(tweets, columns=['tweet'])
return df, place_data
def get_example(dataset):
df = pd.read_csv(dataset + '.csv')
return df
def give_emoji_free_text(text):
"""
Removes emoji's from tweets
Accepts:
Text (tweets)
Returns:
Text (emoji free tweets)
"""
emoji_list = [c for c in text if c in emoji.EMOJI_DATA]
clean_text = ' '.join([str for str in text.split() if not any(i in str for i in emoji_list)])
return clean_text
def url_free_text(text):
'''
Cleans text from urls
'''
text = re.sub(r'http\S+', '', text)
return text
def get_lemmas(text):
'''Used to lemmatize the processed tweets'''
lemmas = []
doc = nlp(text)
for token in doc:
if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
lemmas.append(token.lemma_)
return lemmas
# Tokenizer function
def tokenize(text):
"""
Parses a string into a list of semantic units (words)
Args:
text (str): The string that the function will tokenize.
Returns:
list: tokens parsed out
"""
# Removing url's
pattern = r"http\S+"
tokens = re.sub(pattern, "", text) # https://www.youtube.com/watch?v=O2onA4r5UaY
tokens = re.sub('[^a-zA-Z 0-9]', '', text)
tokens = re.sub('[%s]' % re.escape(string.punctuation), '', text) # Remove punctuation
tokens = re.sub('\w*\d\w*', '', text) # Remove words containing numbers
# tokens = re.sub('@*!*$*', '', text) # Remove @ ! $
tokens = tokens.strip(',') # TESTING THIS LINE
tokens = tokens.strip('?') # TESTING THIS LINE
tokens = tokens.strip('!') # TESTING THIS LINE
tokens = tokens.strip("'") # TESTING THIS LINE
tokens = tokens.strip(".") # TESTING THIS LINE
tokens = tokens.lower().split() # Make text lowercase and split it
return tokens
def split_corpus(corpus, n):
for i in range(0, len(corpus), n):
corpus_split = corpus
yield corpus_split[i:i + n]
def compute_coherence_values_base_lda(dictionary, corpus, texts, limit, coherence, start=2, step=1):
print('compute coherence values base lda')
coherence_values = []
model_list = []
for num_topics in range(start, limit, step):
model = gensim.models.ldamodel.LdaModel(corpus=corpus,
num_topics=num_topics,
random_state=100,
chunksize=200,
passes=10,
per_word_topics=True,
id2word=id2word)
model_list.append(model)
coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence=coherence)
coherence_values.append(coherencemodel.get_coherence())
return coherence_values
def compute_coherence_values2(corpus, dictionary, k, a, b):
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
id2word=id2word,
num_topics=num_topics,
random_state=100,
chunksize=200,
passes=10,
alpha=a,
eta=b,
per_word_topics=True)
coherence_model_lda = CoherenceModel(model=lda_model,
texts=df['lemma_tokens'],
dictionary=id2word,
coherence='c_v')
return coherence_model_lda.get_coherence()
def assignMaxTopic(l):
maxTopic = max(l,key=itemgetter(1))[0]
return maxTopic
def assignTopic(l):
topics = []
for x in l:
topics.append(x[0])
def get_topic_value(row, i):
if len(row) == 1:
return row[0][1]
else:
try:
return row[i][1]
except Exception as e:
print(e)
def cleaning(df):
df.rename(columns = {'tweet':'original_tweets'}, inplace = True)
# Apply the function above and get tweets free of emoji's
call_emoji_free = lambda x: give_emoji_free_text(x)
# Apply `call_emoji_free` which calls the function to remove all emoji's
df['emoji_free_tweets'] = df['original_tweets'].apply(call_emoji_free)
#Create a new column with url free tweets
df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)
f = open('stopwords-tl.json')
tlStopwords = json.loads(f.read())
stopwords = set(STOPWORDS)
stopwords.update(tlStopwords)
stopwords.update(['na', 'sa', 'ko', 'ako', 'ng', 'mga', 'ba', 'ka', 'yung', 'lang', 'di', 'mo', 'kasi'])
# Tokenizer
tokenizer = Tokenizer(nlp.vocab)
# Custom stopwords
custom_stopwords = ['hi','\n','\n\n', '&', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want', 'like', '$', '@']
# Customize stop words by adding to the default list
STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)
# ALL_STOP_WORDS = spacy + gensim + wordcloud
ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)
tokens = []
STOP_WORDS.update(stopwords)
for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
doc_tokens = []
for token in doc:
if token.text.lower() not in STOP_WORDS:
doc_tokens.append(token.text.lower())
tokens.append(doc_tokens)
# Makes tokens column
df['tokens'] = tokens
# Make tokens a string again
df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]
df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)
# Make lemmas a string again
df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]
# Apply tokenizer
df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)
return df
def full_lda(df):
print('cleaning')
print('base model setup')
# Create a id2word dictionary
global id2word
id2word = Dictionary(df['lemma_tokens'])
# Filtering Extremes
id2word.filter_extremes(no_below=2, no_above=.99)
# Creating a corpus object
global corpus
corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]
global corpus_og
corpus_og = [id2word.doc2bow(d) for d in df['lemma_tokens']]
corpus_split = corpus
print('split corpus')
split_corpus(corpus_split, 5)
print('after split corpus')
print(corpus_split)
global coherence
coherence = 'c_v'
coherence_averages = [0] * 8
for i in range(5):
print('coherence averages ' + str(i))
training_corpus = corpus_split
training_corpus.remove(training_corpus[i])
# print(training_corpus[i])
coherence_values = compute_coherence_values_base_lda(dictionary=id2word,
corpus=training_corpus,
texts=df['lemma_tokens'],
start=2,
limit=10,
step=1,
coherence='c_v')
# print(coherence_values + str(i))
for j in range(len(coherence_values)):
coherence_averages[j] += coherence_values[j]
coherence_averages = [x / 5 for x in coherence_averages]
if coherence == 'c_v':
k_max = max(coherence_averages)
else:
k_max = min(coherence_averages, key=abs)
global num_topics
num_topics = coherence_averages.index(k_max) + 2
print('hyperparameter opt')
grid = {}
grid['Validation_Set'] = {}
min_topics = 1
max_topics = 10
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
alpha = [0.05, 0.1, 0.5, 1, 5, 10]
# alpha.append('symmetric')
# alpha.append('asymmetric')
beta = [0.05, 0.1, 0.5, 1, 5, 10]
# beta.append('symmetric')
num_of_docs = len(corpus_og)
corpus_sets = [gensim.utils.ClippedCorpus(corpus_og, int(num_of_docs*0.75)),
corpus_og]
corpus_title = ['75% Corpus', '100% Corpus']
model_results = {'Validation_Set': [],
'Alpha': [],
'Beta': [],
'Coherence': []
}
if 1 == 1:
pbar = tqdm.tqdm(total=540)
for i in range(len(corpus_sets)):
for a in alpha:
for b in beta:
cv = compute_coherence_values2(corpus=corpus_sets[i],
dictionary=id2word,
k=num_topics,
a=a,
b=b)
model_results['Validation_Set'].append(corpus_title[i])
model_results['Alpha'].append(a)
model_results['Beta'].append(b)
model_results['Coherence'].append(cv)
pbar.update(1)
pd.DataFrame(model_results).to_csv('lda_tuning_results_new.csv', index=False)
pbar.close()
params_df = pd.read_csv('lda_tuning_results_new.csv')
params_df = params_df[params_df.Validation_Set == '75% Corpus']
params_df.reset_index(inplace=True)
params_df = params_df.replace(np.inf, -np.inf)
max_params = params_df.loc[params_df['Coherence'].idxmax()]
max_coherence = max_params['Coherence']
max_alpha = max_params['Alpha']
max_beta = max_params['Beta']
max_validation_set = max_params['Validation_Set']
global lda_model_final
lda_model_final = gensim.models.ldamodel.LdaModel(corpus=corpus_og,
id2word=id2word,
num_topics=num_topics,
random_state=100,
chunksize=200,
passes=10,
alpha=max_alpha,
eta=max_beta,
per_word_topics=True)
coherence_model_lda = CoherenceModel(model=lda_model_final, texts=df['lemma_tokens'], dictionary=id2word,
coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
lda_topics = lda_model_final.show_topics(num_words=10)
print('assign topics')
topics = []
filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]
for topic in lda_topics:
topics.append(preprocess_string(topic[1], filters))
df['topic'] = [sorted(lda_model_final[corpus_og][text][0]) for text in range(len(df['original_tweets']))]
df = df[df['topic'].map(lambda d: len(d)) > 0]
df['max_topic'] = df['topic'].map(lambda row: assignMaxTopic(row))
global topic_clusters
topic_clusters = []
for i in range(num_topics):
topic_clusters.append(df[df['max_topic'].isin(([i]))])
topic_clusters[i] = topic_clusters[i]['original_tweets'].tolist()
print('rep topics')
global top_tweets
top_tweets = []
for i in range(len(topic_clusters)):
tweets = df.loc[df['max_topic'] == i]
tweets['topic'] = tweets['topic'].apply(lambda x: get_topic_value(x, i))
# tweets['topic'] = [row[i][1] for row in tweets['topic']]
tweets_sorted = tweets.sort_values('topic', ascending=False)
tweets_sorted.drop_duplicates(subset=['original_tweets'])
rep_tweets = tweets_sorted['original_tweets']
rep_tweets = [*set(rep_tweets)]
top_tweets.append(rep_tweets[:5])
# print('Topic ', i)
# print(rep_tweets[:5])
return top_tweets
def topic_summarization(topic_groups):
tokenizer = AutoTokenizer.from_pretrained("Michau/t5-base-en-generate-headline")
model = AutoModelForSeq2SeqLM.from_pretrained("Michau/t5-base-en-generate-headline")
translator = Translator()
headlines = []
for i in range(len(topic_groups)):
tweets = " ".join(topic_groups[i])
# print(tweets)
out = translator.translate(tweets, dest='en')
text = out.text
# print(tweets)
max_len = 256
encoding = tokenizer.encode_plus(text, return_tensors = "pt")
input_ids = encoding["input_ids"]
attention_masks = encoding["attention_mask"]
beam_outputs = model.generate(
input_ids = input_ids,
attention_mask = attention_masks,
max_length = 64,
num_beams = 3,
early_stopping = True,
)
result = tokenizer.decode(beam_outputs[0])
print(result)
headlines.append("Topic " + str(i) + " " + result)
return headlines
def compute_coherence_value_bertopic(topic_model):
topic_words = [[words for words, _ in topic_model.get_topic(topic)] for topic in range(len(set(topics))-1)]
coherence_model = CoherenceModel(topics=topic_words,
texts=df['lemma_tokens'],
corpus=corpus,
dictionary=id2word,
coherence=coherence)
coherence_score = coherence_model.get_coherence()
return coherence_score
def base_bertopic(df):
df['lemma_tokens_string'] = df['lemma_tokens'].apply(lambda x: ' '.join(x))
global id2word
id2word = Dictionary(df['lemma_tokens'])
global corpus
corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]
global umap_model
umap_model = UMAP(n_neighbors=15,
n_components=5,
min_dist=0.0,
metric='cosine',
random_state=100)
base_topic_model = BERTopic(umap_model=umap_model, language="english", calculate_probabilities=True)
topics, probabilities = base_topic_model.fit_transform(df['lemma_tokens_string'])
try:
print(compute_coherence_value_bertopic(base_topic_model))
except:
print('huh')
print(base_topic_model.get_topic_info())
print('Unable to generate meaningful topics (Base BERTopic model)')
def optimized_bertopic(df):
vectorizer_model = CountVectorizer(max_features=1_000, stop_words="english")
optimized_topic_model = BERTopic(umap_model=umap_model,
language="multilingual",
n_gram_range=(1, 3),
vectorizer_model=vectorizer_model,
calculate_probabilities=True)
topics, probabilities = optimized_topic_model.fit_transform(df['lemma_tokens_string'])
try:
print(compute_coherence_value_bertopic(optimized_topic_model))
except:
print('huh optimized')
print(optimized_topic_model.get_topic_info())
print('Unable to generate meaningful topics, base BERTopic model if possible')
rep_docs = optimized_topic_model.representative_docs_
global top_tweets
top_tweets = []
for topic in rep_docs:
if topic == -1:
print('test')
continue
topic_docs = rep_docs.get(topic)
tweets = []
for doc in topic_docs:
index = df.isin([doc]).any(axis=1).idxmax()
# print(index)
tweets.append(df.loc[index, 'original_tweets'])
# print(tweets)
top_tweets.append(tweets)
return top_tweets
global examples
examples = [["katip,katipunan", "LDA"],
["katip,katipunan", "BERTopic"],
["bgc,bonifacio global city", "LDA"],
["bgc,bonifacio global city", "BERTopic"],
["pobla,poblacion", "LDA"],
["pobla,poblacion", "BERTopic"],
["cubao","LDA"],
["cubao","BERTopic"],
["taft", "LDA"],
["taft", "BERTopic"]
]
def main(dataset, model
# progress=gr.Progress(track_tqdm=True)
):
global df
examples = [ "katip,katipunan",
"bgc,bonifacio global city",
"pobla,poblacion",
"cubao",
"taft"
]
keyword_list = dataset.split(',')
if len(keyword_list) > 1:
keywords = '(' + ' OR '.join(keyword_list) + ')'
else:
keywords = keyword_list[0]
# if dataset in examples:
# df = get_example(keywords)
# place_data = 'test'
# else:
print(dataset)
df, place_data = scrape(keywords)
print(place_data)
print(df)
if model == 'LDA':
df = cleaning(df)
print('doing lda')
top_tweets = full_lda(df)
print('done lda')
# place_data = 'test'
else:
df = cleaning(df)
base_bertopic(df)
top_tweets = optimized_bertopic(df)
print('doing topic summarization')
headlines = topic_summarization(top_tweets)
headlines = '\n'.join(str(h) for h in headlines)
# print(headlines)
top_tweets = '\n\n'.join (str(tweet) for tweet in top_tweets)
return place_data, headlines, top_tweets
iface = gr.Interface(fn=main,
inputs=["text",
gr.Dropdown(["LDA",
"BERTopic"],
label="Model")
],
# examples=examples,
outputs=["text",
"text",
"text"],
enable_queue=True,
debug=True,
)
iface.launch(debug=True, enable_queue=True, share=True)