import json
import os
import re
import string
from operator import itemgetter

import emoji
import gensim
import gradio as gr
import numpy as np
import pandas as pd
import spacy
import tqdm
import tweepy

from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import (STOPWORDS as SW, preprocess_string,
                                          strip_punctuation, strip_numeric)
from wordcloud import STOPWORDS

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from googletrans import Translator

from bertopic import BERTopic
from umap import UMAP
from sklearn.feature_extraction.text import CountVectorizer

# Read the Twitter API bearer token from the environment instead of hard-coding
# the credential in the source.
bearer_token = os.environ.get('TWITTER_BEARER_TOKEN')
client = tweepy.Client(bearer_token=bearer_token)

nlp = spacy.load('en_core_web_lg')

def scrape(keywords):
    """Search recent tweets for the given keywords and return them with any place data."""
    query = keywords + ' (lang:en OR lang:tl) -is:retweet'
    max_results = 100
    tweet_fields = ['geo', 'id', 'lang', 'created_at']
    expansions = ['geo.place_id']
    place_fields = ['contained_within', 'country', 'country_code', 'full_name',
                    'geo', 'id', 'name', 'place_type']

    response = client.search_recent_tweets(
        query=query,
        max_results=max_results,
        tweet_fields=tweet_fields,
        expansions=expansions,
        place_fields=place_fields
    )

    # response.data holds the tweets; response.includes holds the expanded place objects
    tweets = [str(tweet) for tweet in (response.data or [])]
    place_data = str(response.includes)

    df = pd.DataFrame(tweets, columns=['tweet'])

    return df, place_data
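# A minimal usage sketch (assumes TWITTER_BEARER_TOKEN is set and the query has
# recent matches); the keyword string mirrors the one main() builds below:
#
#   df, place_data = scrape('(katip OR katipunan)')
#   print(df.head())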

def get_example(dataset):
    """Load a previously scraped dataset from a local CSV file."""
    df = pd.read_csv(dataset + '.csv')
    return df

def give_emoji_free_text(text):
    """
    Removes emojis from tweets
    Accepts:
        Text (tweets)
    Returns:
        Text (emoji-free tweets)
    """
    emoji_list = [c for c in text if c in emoji.EMOJI_DATA]
    clean_text = ' '.join([word for word in text.split()
                           if not any(char in word for char in emoji_list)])
    return clean_text


def url_free_text(text):
    '''
    Cleans text from urls
    '''
    text = re.sub(r'http\S+', '', text)
    return text

def get_lemmas(text):
    '''Used to lemmatize the processed tweets'''
    lemmas = []

    doc = nlp(text)

    # Keep lemmas of tokens that are not stopwords, punctuation, or pronouns
    for token in doc:
        if (not token.is_stop) and (not token.is_punct) and (token.pos_ != 'PRON'):
            lemmas.append(token.lemma_)

    return lemmas

def tokenize(text):
    """
    Parses a string into a list of semantic units (words)
    Args:
        text (str): The string that the function will tokenize.
    Returns:
        list: tokens parsed out
    """
    # Strip URLs, non-alphanumeric characters, punctuation, and word-digit mixes,
    # applying each substitution to the output of the previous one.
    tokens = re.sub(r"http\S+", "", text)
    tokens = re.sub(r'[^a-zA-Z 0-9]', '', tokens)
    tokens = re.sub('[%s]' % re.escape(string.punctuation), '', tokens)
    tokens = re.sub(r'\w*\d\w*', '', tokens)

    tokens = tokens.strip(",?!'.")

    tokens = tokens.lower().split()

    return tokens

def split_corpus(corpus, n):
    """Yield successive chunks of n documents from the corpus."""
    for i in range(0, len(corpus), n):
        yield corpus[i:i + n]
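# For example, list(split_corpus(corpus, 5)) turns a 12-document corpus into
# chunks of 5, 5, and 2 documents; full_lda() below uses it to build folds for
# cross-validated coherence scoring.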

def compute_coherence_values_base_lda(dictionary, corpus, texts, limit, coherence, start=2, step=1):
    """Train a base LDA model for each topic count in [start, limit) and return the coherence scores."""
    print('compute coherence values base lda')
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                num_topics=num_topics,
                                                random_state=100,
                                                chunksize=200,
                                                passes=10,
                                                per_word_topics=True,
                                                id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence=coherence)
        coherence_values.append(coherencemodel.get_coherence())

    return coherence_values

def compute_coherence_values2(corpus, dictionary, k, a, b):
    """Train an LDA model with k topics and the given alpha/eta, and return its c_v coherence."""
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=dictionary,
                                                num_topics=k,
                                                random_state=100,
                                                chunksize=200,
                                                passes=10,
                                                alpha=a,
                                                eta=b,
                                                per_word_topics=True)
    # texts come from the module-level df populated in main()
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=df['lemma_tokens'],
                                         dictionary=dictionary,
                                         coherence='c_v')

    return coherence_model_lda.get_coherence()

def assignMaxTopic(l):
    """Return the topic id with the highest probability in a (topic, probability) list."""
    maxTopic = max(l, key=itemgetter(1))[0]
    return maxTopic


def assignTopic(l):
    """Return the list of topic ids in a (topic, probability) list."""
    topics = []
    for x in l:
        topics.append(x[0])
    return topics


def get_topic_value(row, i):
    """Return the probability assigned to topic i in a row of (topic, probability) pairs."""
    if len(row) == 1:
        return row[0][1]
    else:
        try:
            return row[i][1]
        except Exception as e:
            print(e)
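# These helpers operate on per-tweet topic distributions, i.e. lists of
# (topic_id, probability) pairs. Illustrative values only:
#
#   assignMaxTopic([(0, 0.12), (1, 0.70), (2, 0.18)])       # -> 1
#   get_topic_value([(0, 0.12), (1, 0.70), (2, 0.18)], 1)   # -> 0.70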

def cleaning(df):
    """Clean the scraped tweets: strip emojis and URLs, remove stopwords, and lemmatize."""
    df.rename(columns={'tweet': 'original_tweets'}, inplace=True)

    # Remove emojis, then URLs
    df['emoji_free_tweets'] = df['original_tweets'].apply(give_emoji_free_text)
    df['url_free_tweets'] = df['emoji_free_tweets'].apply(url_free_text)

    # Load Tagalog stopwords and extend them with common informal variants
    with open('stopwords-tl.json') as f:
        tlStopwords = json.loads(f.read())
    stopwords = set(STOPWORDS)
    stopwords.update(tlStopwords)
    stopwords.update(['na', 'sa', 'ko', 'ako', 'ng', 'mga', 'ba', 'ka', 'yung', 'lang', 'di', 'mo', 'kasi'])

    tokenizer = Tokenizer(nlp.vocab)

    custom_stopwords = ['hi', '\n', '\n\n', '&', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im',
                        'want', 'like', '$', '@']

    # Combine the spaCy, gensim, wordcloud, and Tagalog stopword sets
    STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)
    ALL_STOP_WORDS = STOP_WORDS.union(SW).union(stopwords)

    tokens = []
    for doc in tokenizer.pipe(df['url_free_tweets'], batch_size=500):
        doc_tokens = []
        for token in doc:
            if token.text.lower() not in ALL_STOP_WORDS:
                doc_tokens.append(token.text.lower())
        tokens.append(doc_tokens)

    df['tokens'] = tokens

    # Rejoin the tokens so they can be lemmatized, then tokenize the lemmas again
    df['tokens_back_to_text'] = [' '.join(map(str, l)) for l in df['tokens']]
    df['lemmas'] = df['tokens_back_to_text'].apply(get_lemmas)
    df['lemmas_back_to_text'] = [' '.join(map(str, l)) for l in df['lemmas']]
    df['lemma_tokens'] = df['lemmas_back_to_text'].apply(tokenize)

    return df
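# After cleaning(), the DataFrame carries the intermediate and final text columns
# used downstream:
#
#   original_tweets    raw tweet text as scraped
#   emoji_free_tweets  tweet text with emojis removed
#   url_free_tweets    tweet text with emojis and URLs removed
#   tokens             lowercased, stopword-filtered tokens
#   lemma_tokens       tokenized lemmas; the column both the LDA and BERTopic pipelines consume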

def full_lda(df):
    print('base model setup')

    # Build the dictionary and bag-of-words corpus from the lemmatized tweets
    global id2word
    id2word = Dictionary(df['lemma_tokens'])
    id2word.filter_extremes(no_below=2, no_above=.99)

    global corpus
    corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]
    global corpus_og
    corpus_og = [id2word.doc2bow(d) for d in df['lemma_tokens']]

    # Split the corpus into (roughly) five folds for cross-validated coherence scoring
    print('split corpus')
    fold_size = max(1, len(corpus) // 5)
    corpus_split = list(split_corpus(corpus, fold_size))
    n_folds = len(corpus_split)

    global coherence
    coherence = 'c_v'

    # Average coherence over the folds for each candidate topic count (2 to 9)
    coherence_averages = [0] * 8
    for i in range(n_folds):
        print('coherence averages ' + str(i))
        # Train on every fold except the i-th one
        training_corpus = [doc for j, fold in enumerate(corpus_split) if j != i for doc in fold]

        coherence_values = compute_coherence_values_base_lda(dictionary=id2word,
                                                             corpus=training_corpus,
                                                             texts=df['lemma_tokens'],
                                                             start=2,
                                                             limit=10,
                                                             step=1,
                                                             coherence='c_v')

        for j in range(len(coherence_values)):
            coherence_averages[j] += coherence_values[j]

    coherence_averages = [x / n_folds for x in coherence_averages]

    # For c_v the best model maximizes coherence; otherwise take the value closest to zero
    if coherence == 'c_v':
        k_max = max(coherence_averages)
    else:
        k_max = min(coherence_averages, key=abs)

    global num_topics
    num_topics = coherence_averages.index(k_max) + 2

    # Hyperparameter optimization: grid-search alpha and eta on a 75% and a 100% corpus
    print('hyperparameter opt')
    alpha = [0.05, 0.1, 0.5, 1, 5, 10]
    beta = [0.05, 0.1, 0.5, 1, 5, 10]

    num_of_docs = len(corpus_og)
    corpus_sets = [gensim.utils.ClippedCorpus(corpus_og, int(num_of_docs * 0.75)),
                   corpus_og]
    corpus_title = ['75% Corpus', '100% Corpus']
    model_results = {'Validation_Set': [],
                     'Alpha': [],
                     'Beta': [],
                     'Coherence': []
                     }

    # One progress-bar step per (corpus, alpha, beta) combination
    pbar = tqdm.tqdm(total=len(corpus_sets) * len(alpha) * len(beta))
    for i in range(len(corpus_sets)):
        for a in alpha:
            for b in beta:
                cv = compute_coherence_values2(corpus=corpus_sets[i],
                                               dictionary=id2word,
                                               k=num_topics,
                                               a=a,
                                               b=b)
                model_results['Validation_Set'].append(corpus_title[i])
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)

                pbar.update(1)
    pd.DataFrame(model_results).to_csv('lda_tuning_results_new.csv', index=False)
    pbar.close()

    # Pick the alpha/eta pair with the highest coherence on the 75% corpus
    params_df = pd.read_csv('lda_tuning_results_new.csv')
    params_df = params_df[params_df.Validation_Set == '75% Corpus']
    params_df.reset_index(inplace=True)
    params_df = params_df.replace(np.inf, -np.inf)
    max_params = params_df.loc[params_df['Coherence'].idxmax()]
    max_alpha = max_params['Alpha']
    max_beta = max_params['Beta']

    # Train the final LDA model with the tuned hyperparameters
    global lda_model_final
    lda_model_final = gensim.models.ldamodel.LdaModel(corpus=corpus_og,
                                                      id2word=id2word,
                                                      num_topics=num_topics,
                                                      random_state=100,
                                                      chunksize=200,
                                                      passes=10,
                                                      alpha=max_alpha,
                                                      eta=max_beta,
                                                      per_word_topics=True)

    coherence_model_lda = CoherenceModel(model=lda_model_final, texts=df['lemma_tokens'],
                                         dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('final model coherence: ' + str(coherence_lda))

    lda_topics = lda_model_final.show_topics(num_words=10)

    print('assign topics')
    topics = []
    filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]
    for topic in lda_topics:
        topics.append(preprocess_string(topic[1], filters))

    # Assign each tweet its topic distribution and its dominant topic
    transformed = lda_model_final[corpus_og]
    df['topic'] = [sorted(transformed[text][0]) for text in range(len(df['original_tweets']))]

    df = df[df['topic'].map(lambda d: len(d)) > 0]
    df['max_topic'] = df['topic'].map(lambda row: assignMaxTopic(row))

    global topic_clusters
    topic_clusters = []
    for i in range(num_topics):
        topic_clusters.append(df[df['max_topic'].isin(([i]))])
        topic_clusters[i] = topic_clusters[i]['original_tweets'].tolist()

    # For each topic, keep the five tweets with the highest topic probability
    print('rep topics')
    global top_tweets
    top_tweets = []
    for i in range(len(topic_clusters)):
        tweets = df.loc[df['max_topic'] == i].copy()
        tweets['topic'] = tweets['topic'].apply(lambda x: get_topic_value(x, i))

        tweets_sorted = tweets.sort_values('topic', ascending=False)
        tweets_sorted = tweets_sorted.drop_duplicates(subset=['original_tweets'])
        rep_tweets = tweets_sorted['original_tweets'].tolist()
        top_tweets.append(rep_tweets[:5])

    return top_tweets
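# full_lda() returns one list of up to five representative tweets per topic,
# e.g. [['tweet about topic 0', ...], ['tweet about topic 1', ...], ...]
# (shape only; the strings are whatever was scraped). topic_summarization()
# below joins each group and turns it into a headline.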

def topic_summarization(topic_groups):
    """Translate each topic's representative tweets to English and generate a headline for them."""
    tokenizer = AutoTokenizer.from_pretrained("Michau/t5-base-en-generate-headline")
    model = AutoModelForSeq2SeqLM.from_pretrained("Michau/t5-base-en-generate-headline")
    translator = Translator()

    headlines = []
    for i in range(len(topic_groups)):
        tweets = " ".join(topic_groups[i])

        # Translate the (possibly Tagalog) tweets to English before summarizing
        out = translator.translate(tweets, dest='en')
        text = out.text

        max_len = 256
        encoding = tokenizer.encode_plus(text, max_length=max_len, truncation=True, return_tensors="pt")
        input_ids = encoding["input_ids"]
        attention_masks = encoding["attention_mask"]

        beam_outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_masks,
            max_length=64,
            num_beams=3,
            early_stopping=True,
        )

        result = tokenizer.decode(beam_outputs[0], skip_special_tokens=True)
        print(result)
        headlines.append("Topic " + str(i) + " " + result)

    return headlines

def compute_coherence_value_bertopic(topic_model):
    """Compute the c_v coherence of a fitted BERTopic model over the lemmatized tweets."""
    # Collect the top words of every topic except the outlier topic (-1)
    topic_words = [[word for word, _ in topic_model.get_topic(topic)]
                   for topic in topic_model.get_topics() if topic != -1]
    coherence_model = CoherenceModel(topics=topic_words,
                                     texts=df['lemma_tokens'],
                                     corpus=corpus,
                                     dictionary=id2word,
                                     coherence='c_v')
    coherence_score = coherence_model.get_coherence()

    return coherence_score

def base_bertopic(df):
    df['lemma_tokens_string'] = df['lemma_tokens'].apply(lambda x: ' '.join(x))
    global id2word
    id2word = Dictionary(df['lemma_tokens'])
    global corpus
    corpus = [id2word.doc2bow(d) for d in df['lemma_tokens']]

    # Fix the UMAP random state so BERTopic runs are reproducible
    global umap_model
    umap_model = UMAP(n_neighbors=15,
                      n_components=5,
                      min_dist=0.0,
                      metric='cosine',
                      random_state=100)

    base_topic_model = BERTopic(umap_model=umap_model, language="english", calculate_probabilities=True)

    topics, probabilities = base_topic_model.fit_transform(df['lemma_tokens_string'])

    try:
        print(compute_coherence_value_bertopic(base_topic_model))
    except Exception:
        print(base_topic_model.get_topic_info())
        print('Unable to generate meaningful topics (base BERTopic model)')

def optimized_bertopic(df):
    vectorizer_model = CountVectorizer(max_features=1_000, stop_words="english")
    optimized_topic_model = BERTopic(umap_model=umap_model,
                                     language="multilingual",
                                     n_gram_range=(1, 3),
                                     vectorizer_model=vectorizer_model,
                                     calculate_probabilities=True)

    topics, probabilities = optimized_topic_model.fit_transform(df['lemma_tokens_string'])

    try:
        print(compute_coherence_value_bertopic(optimized_topic_model))
    except Exception:
        print(optimized_topic_model.get_topic_info())
        print('Unable to generate meaningful topics; falling back to the base BERTopic model if possible')

    # Map each topic's representative documents back to the original tweets
    rep_docs = optimized_topic_model.representative_docs_

    global top_tweets
    top_tweets = []

    for topic in rep_docs:
        # Skip the outlier topic
        if topic == -1:
            continue
        topic_docs = rep_docs.get(topic)

        tweets = []
        for doc in topic_docs:
            index = df.isin([doc]).any(axis=1).idxmax()
            tweets.append(df.loc[index, 'original_tweets'])

        top_tweets.append(tweets)
    return top_tweets

examples = [["katip,katipunan", "LDA"],
            ["katip,katipunan", "BERTopic"],
            ["bgc,bonifacio global city", "LDA"],
            ["bgc,bonifacio global city", "BERTopic"],
            ["pobla,poblacion", "LDA"],
            ["pobla,poblacion", "BERTopic"],
            ["cubao", "LDA"],
            ["cubao", "BERTopic"],
            ["taft", "LDA"],
            ["taft", "BERTopic"]
            ]

def main(dataset, model):
    global df

    # Build the Twitter search query from the comma-separated keywords
    keyword_list = dataset.split(',')
    if len(keyword_list) > 1:
        keywords = '(' + ' OR '.join(keyword_list) + ')'
    else:
        keywords = keyword_list[0]

    print(dataset)
    df, place_data = scrape(keywords)

    print(place_data)
    print(df)

    df = cleaning(df)

    if model == 'LDA':
        print('doing lda')
        top_tweets = full_lda(df)
        print('done lda')
    else:
        base_bertopic(df)
        top_tweets = optimized_bertopic(df)

    print('doing topic summarization')
    headlines = topic_summarization(top_tweets)
    headlines = '\n'.join(str(h) for h in headlines)

    top_tweets = '\n\n'.join(str(tweet) for tweet in top_tweets)

    return place_data, headlines, top_tweets

iface = gr.Interface(fn=main,
                     inputs=["text",
                             gr.Dropdown(["LDA",
                                          "BERTopic"],
                                         label="Model")
                             ],
                     outputs=["text",
                              "text",
                              "text"],
                     examples=examples)

iface.launch(debug=True, enable_queue=True, share=True)