import requests
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import numpy as np
import threading
import gradio as gr
import re
from bs4 import BeautifulSoup
from markdown import markdown
import nltk
from nltk.tokenize import sent_tokenize
import string
import unicodedata
# Fetch the NLTK 'punkt' resource at import time; presumably this is the
# tokenizer data that sent_tokenize (used in get_sum) relies on — TODO confirm.
nltk.download('punkt')
# Id of the next post to request from the API; advanced by one after every
# successful fetch in get_proposals / update_proposals.
POST_ID = 0
# Value sent as the `proposalType` query parameter of the API request.
REFERENDUM_TYPE = "referendums_v2"
# NOTE(review): not referenced anywhere in the visible part of this file.
VOTE_TYPE = "ReferendumV2" # "Motion", "Fellowship", "Referendum", "ReferendumV2", "DemocracyProposal"
# Interval handed to threading.Timer between background proposal refreshes,
# in seconds (1800 s = 30 minutes).
UPDATE_INTERVAL = 1800
def _to_host_array(x):
    """Return *x* as a NumPy array, copying tensor-like objects to host memory first."""
    # Duck-typed on purpose: objects exposing detach()/cpu()/numpy() (e.g. the
    # tensors produced by encode(..., convert_to_tensor=True)) cannot be fed
    # to NumPy directly when they live on an accelerator or require grad.
    if all(callable(getattr(x, attr, None)) for attr in ("detach", "cpu", "numpy")):
        return x.detach().cpu().numpy()
    return np.asarray(x)


def dot_product(u, v):
    """Return the dot product of two vectors.

    Args:
        u: First operand; any array-like, or a tensor-like object that
            exposes ``detach()``, ``cpu()`` and ``numpy()``.
        v: Second operand, same kinds as *u*.

    Returns:
        The result of ``np.dot`` on the two operands (a NumPy scalar for
        1-D inputs).
    """
    return np.dot(_to_host_array(u), _to_host_array(v))
# Compiled once at import time instead of on every call.
_MARKDOWN_LINK_RE = re.compile(r'\[.*?\]\(.*?\)')
_URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')


def markdn_2_str(text):
    """Convert Markdown source into a single line of plain text.

    The text is rendered to HTML, the text nodes are extracted and joined
    with single spaces, then any remaining ``[label](target)`` link syntax
    and bare http(s) URLs are replaced by a space and newlines are turned
    into spaces.

    Args:
        text: Markdown source. ``None`` or an empty string is accepted.

    Returns:
        The cleaned text; ``""`` when *text* is ``None`` or empty.
    """
    # Content fetched from the API may be missing; treat that as empty text
    # rather than letting the Markdown renderer fail on None.
    if not text:
        return ""
    html = markdown(text)
    soup = BeautifulSoup(html, features="html.parser")
    # find_all is the current spelling; findAll is only a legacy alias.
    clean_text = ' '.join(soup.find_all(string=True))
    clean_text = _MARKDOWN_LINK_RE.sub(' ', clean_text)  # remove markdown style links
    clean_text = _URL_RE.sub(' ', clean_text)  # remove regular links
    return clean_text.replace('\n', ' ')  # remove \n
_DIGIT_RE = re.compile(r'(\d)')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _amount_near_keyword(sentence, key_word):
    """Return the first digit-bearing token at or next to a *key_word* token, else None."""
    # Tokens containing a digit are kept verbatim (so "3,333" survives);
    # all other tokens lose their punctuation and are dropped if that
    # leaves nothing behind.
    tokens = [
        tok if _DIGIT_RE.search(tok) else tok.translate(_PUNCTUATION_TABLE)
        for tok in sentence.split()
    ]
    tokens = [tok for tok in tokens if tok]
    for idx, tok in enumerate(tokens):
        if key_word not in tok:
            continue
        # Clamp the window start at 0: with the keyword as the first token a
        # start of -1 would wrap around to the end of the list and usually
        # produce an empty window, hiding the token that follows the keyword.
        for candidate in tokens[max(idx - 1, 0):idx + 2]:
            if _DIGIT_RE.search(candidate):
                return candidate
    return None


def get_sum(prop):
    """Extract the requested KSM amount from a proposal's Markdown text.

    Every sentence of the cleaned proposal text is embedded and ranked by
    dot-product similarity against an embedding of a few example "requested
    amount" phrases. The best-ranked sentence that mentions "KSM" is then
    scanned for a token containing a digit directly at or beside the "KSM"
    token.

    Args:
        prop: Proposal content as Markdown. ``None`` or empty is accepted.

    Returns:
        The matching token as a string (punctuation inside it is kept, e.g.
        ``"3,333"``), or ``None`` when there is no input, no sentence
        mentions "KSM", or no numeric token sits next to it.
    """
    key_word = "KSM"
    # Missing content cannot be normalised or tokenised; report "no amount".
    if not prop:
        return None
    search_phrases = [
        "requests a total of 1500 KSM",
        "requests a total of 7450 $ (17 KSM)",
        "Requested amount: 3,333 KSM",
        "Requested funding 78,804 USD // 2770 KSM",
        "Requested KSM: 598"
    ]
    # NOTE(review): the phrases are concatenated without a separator, which
    # fuses "KSM" with the next phrase's first word. Kept as-is so rankings
    # do not change; confirm whether a space separator was intended.
    ref = model.encode("".join(search_phrases), convert_to_tensor=True)
    prop = markdn_2_str(unicodedata.normalize("NFKD", prop))
    sentences = sent_tokenize(prop)
    # Negated so that an ascending argsort yields most-similar-first.
    similarities = [
        -dot_product(model.encode(s, convert_to_tensor=True), ref)
        for s in sentences
    ]
    ranked = np.argsort(similarities)
    sent = next((sentences[i] for i in ranked if key_word in sentences[i]), None)
    if sent is None:
        return None
    return _amount_near_keyword(sent, key_word)
# Seconds to wait for the API before abandoning a single request.
_REQUEST_TIMEOUT = 30


def get_proposals():
    """Fetch posts from the Polkassembly API, starting at ``POST_ID``.

    Posts are requested one id at a time. Each successful response adds a
    row ``[content, status, ksm]`` to the module-level ``df`` under the post
    id and advances ``POST_ID``. Fetching stops at the first non-OK
    response, network error or undecodable body, after which ``event`` is
    set.

    Missing content is stored as an empty string so later string handling
    of the ``content`` column does not hit ``None``.
    """
    global POST_ID
    while True:
        try:
            rn = requests.post(
                f"https://api.polkassembly.io/api/v1/posts/on-chain-post?proposalType={REFERENDUM_TYPE}&postId={POST_ID}",
                headers={"x-network": "kusama"},
                # Without a timeout a stalled connection would block forever.
                timeout=_REQUEST_TIMEOUT)
            if not rn.ok:
                break
            proposal_data = rn.json()
        except (requests.RequestException, ValueError) as exc:
            # Best effort: keep what was fetched so far and try again on the
            # next run instead of letting the error escape.
            print(f"Fetching post {POST_ID} failed: {exc}")
            break
        content = proposal_data.get("content") or ""
        df.loc[POST_ID] = [content, proposal_data.get("status"), get_sum(content)]
        POST_ID += 1
    event.set()
def get_embeddings():
    """Embed the content of every row in ``df`` and store it in ``df_emb``.

    Rows are written to ``df_emb`` under their positional index, one
    single-element row per proposal.
    """
    global df_emb
    for position, raw_content in enumerate(df['content']):
        plain_text = markdn_2_str(raw_content)
        df_emb.loc[position] = [model.encode(plain_text)]
def update_proposals():
    """Fetch any posts published since the previous run.

    The work is identical to ``get_proposals`` (which resumes from the
    module-level ``POST_ID`` and sets ``event`` when it stops), so this
    simply delegates to it instead of keeping a second copy of the loop.
    """
    get_proposals()
def update_embeddings():
    """Keep ``df_emb`` in step with ``df``; runs forever.

    Blocks until ``event`` is set, then embeds every row of ``df`` that has
    no counterpart in ``df_emb`` yet (rows are matched by position) and goes
    back to waiting.
    """
    while True:
        event.wait()
        # Clear before looking at the frames, not after: a set() that arrives
        # while this iteration is working must survive so the loop wakes up
        # again. Clearing afterwards could erase that signal, and never
        # clearing when the lengths differ without new rows would busy-spin.
        event.clear()
        print(POST_ID)
        print(len(df))
        for i in range(len(df_emb), len(df)):
            df_emb.loc[i] = [model.encode(markdn_2_str(df.iloc[i]['content']))]
def run_periodically():
    """Refresh the proposals now and schedule the next refresh.

    Calls ``update_proposals`` and then arms a ``threading.Timer`` that
    invokes this function again after ``UPDATE_INTERVAL`` seconds.
    """
    try:
        update_proposals()
    finally:
        # Re-arm even when this round failed, otherwise a single error would
        # end the periodic updates for good. No ``args`` are passed: this
        # function takes none, and supplying one makes the timer callback
        # raise TypeError on its first firing.
        threading.Timer(UPDATE_INTERVAL, run_periodically).start()
def compare_proposals(prop, count):
    """Return the stored proposals most similar to *prop*, formatted as text.

    The query is cleaned and embedded, scored against every embedding in
    ``df_emb`` by dot product, and the ``count`` best-scoring rows of ``df``
    are rendered as a header line (requested KSM, status, row id) followed
    by the proposal content.

    Args:
        prop: Markdown text of the proposal to compare.
        count: Maximum number of matches to return. Coerced to ``int`` in
            case the caller supplies a float (NOTE(review): the UI widget
            feeding this is not visible here).

    Returns:
        The formatted matches joined into one string; ``""`` when nothing is
        stored yet or ``count`` is not positive.
    """
    count = max(int(count), 0)
    if count == 0 or len(df_emb) == 0:
        return ""
    query_emb = model.encode(markdn_2_str(prop))
    # One matrix-vector product instead of a Python-level call per row.
    # Assumes every stored embedding is a 1-D vector of the same length as
    # the query embedding — they all come from the same model.encode call.
    emb_matrix = np.stack(df_emb['content'].to_list())
    scores = emb_matrix @ query_emb
    # Negate so the ascending argsort lists the highest scores first.
    best_match = np.argsort(-scores)[:count]
    sections = []
    for position in best_match:
        row = df.iloc[position]
        header = "Total KSM requested: {sum}, status: {status}, ID: {id}\n\n ".format(
            sum=row['ksm'], status=row['status'], id=position)
        # Content may be missing for some posts; avoid ``str + None``.
        sections.append(header + (row['content'] or ""))
    return "\n ".join(sections)
if __name__ == '__main__':
event = threading.Event()
model = SentenceTransformer('sentence-transformers/msmarco-bert-base-dot-v5')
df = pd.DataFrame(columns=['content', 'status', 'ksm'])
df_emb = pd.DataFrame(columns=['content'])
get_proposals()
get_embeddings()
POST_ID = len(df)
update_thread = threading.Thread(target=run_periodically) # background proposals update
upd_emb_thread = threading.Thread(target=update_embeddings) # background embeddings update
update_thread.start()
upd_emb_thread.start()
with gr.Blocks() as demo:
gr.Markdown("