from tabula import read_pdf
from bs4 import BeautifulSoup
import requests
from llama_cpp import Llama
from bertopic.representation import KeyBERTInspired, LlamaCPP
from sentence_transformers import SentenceTransformer
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
import PIL.Image
import numpy as np
import datamapplot
import re
def get_links():
    # Read the bookmark tables from the PDF file
    dfs = read_pdf("Artificial_Intelligence_Bookmarks_AwesomeList.pdf", pages="all")
    # The first table stores URLs in an unnamed column; the remaining tables use a 'Url' column
    links = dfs[0]['Unnamed: 2'].to_list()
    for i in range(len(dfs) - 1):
        links.extend(dfs[i + 1]['Url'].to_list())
    return links
#--------------------------------------
# Text processing
def remove_tags(html):
    # Parse the HTML content
    soup = BeautifulSoup(html, "html.parser")
    # Drop style and script blocks entirely
    for data in soup(['style', 'script']):
        data.decompose()
    # Return the visible text content
    return ' '.join(soup.stripped_strings)
def remove_emoji(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # box drawing, shapes & misc symbols
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # variation selector
        u"\u3030"
        "]+", re.UNICODE)
    return re.sub(emoj, '', data)
#-------------------------------------
def get_page(link):
    try:
        x = requests.get(link)
        raw_html = x.text
        # Keep a short plain-text excerpt of the page
        clean_text = remove_tags(raw_html)[:1050]
        clean_text = remove_emoji(clean_text)
        return clean_text
    except Exception:
        # Log links that could not be fetched or parsed and skip them
        print(link)
        return None
def get_documents(links):
    pre_processed_text = [get_page(link) for link in links]
    # Drop pages that failed to download
    pre_processed_text = [text for text in pre_processed_text if text is not None]
    # Keep only pages with enough text to describe a topic (at least 1000 characters)
    pre_processed_text = [text for text in pre_processed_text if len(text) > 999]
    return pre_processed_text
#----------------------------------------
def get_topics(docs):
    # Use llama.cpp to load in a quantized LLM
    llm = Llama(model_path="openhermes-2.5-mistral-7b.Q4_K_M.gguf", n_gpu_layers=-1, n_ctx=4096, stop=["Q:", "\n"])
    prompt = """ Q:
    I have a topic that contains the following documents:
    [DOCUMENTS]
    The topic is described by the following keywords: '[KEYWORDS]'.
    Based on the above information, can you give a short label of the topic of at most 5 words?
    A:
    """
    representation_model = {
        "KeyBERT": KeyBERTInspired(),
        "LLM": LlamaCPP(llm, prompt=prompt),
    }
    # Pre-calculate embeddings
    embedding_model = SentenceTransformer("BAAI/bge-small-en")
    embeddings = embedding_model.encode(docs, show_progress_bar=True)
    # Pre-reduce embeddings for visualization purposes
    reduced_embeddings = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine', random_state=42).fit_transform(embeddings)
    # Define sub-models
    umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
    hdbscan_model = HDBSCAN(min_cluster_size=2, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
    topic_model = BERTopic(
        # Sub-models
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        representation_model=representation_model,
        # Hyperparameters
        top_n_words=10,
        verbose=True
    )
    # Train model
    topics, probs = topic_model.fit_transform(docs, embeddings)
    # Also return the per-document topics and the 2D embeddings so they can be plotted later
    return topic_model, topics, reduced_embeddings
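# Hedged usage note: once fitted, the discovered topics and their KeyBERT/LLM
# labels can be inspected with BERTopic's built-in helpers, e.g.
#   topic_model, topics, reduced_embeddings = get_topics(docs)
#   print(topic_model.get_topic_info().head())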
#-------------------------------
# Visualize topics
def get_figure(topic_model, topics, reduced_embeddings):
    # Prepare logo
    bertopic_logo_response = requests.get(
        "https://raw.githubusercontent.com/MaartenGr/BERTopic/master/images/logo.png",
        stream=True,
        headers={'User-Agent': 'My User Agent 1.0'}
    )
    bertopic_logo = np.asarray(PIL.Image.open(bertopic_logo_response.raw))
    # Create a label for each document
    llm_labels = [re.sub(r'\W+', ' ', label[0][0].split("\n")[0].replace('"', '')) for label in topic_model.get_topics(full=True)["LLM"].values()]
    llm_labels = [label if label else "Unlabelled" for label in llm_labels]
    all_labels = [llm_labels[topic + topic_model._outliers] if topic != -1 else "Unlabelled" for topic in topics]
    # Run the visualization
    fig = datamapplot.create_plot(
        reduced_embeddings,
        all_labels,
        label_font_size=11,
        title="ArXiv - BERTopic",
        sub_title="Topics labeled with `openhermes-2.5-mistral-7b`",
        label_wrap_width=20,
        use_medoids=True,
        logo=bertopic_logo,
        logo_width=0.16
    )
    return fig
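#----------------------------------------
# Minimal end-to-end sketch (an assumption, not part of the original file): it
# assumes the bookmarks PDF and the GGUF model referenced above sit in the
# working directory, and that create_plot() returns a matplotlib Figure, as the
# code above treats it. The output filename is illustrative.
if __name__ == "__main__":
    links = get_links()
    docs = get_documents(links)
    topic_model, topics, reduced_embeddings = get_topics(docs)
    fig = get_figure(topic_model, topics, reduced_embeddings)
    fig.savefig("topic_map.png", bbox_inches="tight")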