Spaces:

Jongmo
/

Prove_KCL

Sleeping

App Files Files Community

Prove_KCL / Prove_llm.py

Jongmo

Upload 10 files

49664ed verified 5 months ago

raw

history blame contribute delete

4.02 kB

	import pandas as pd
	import numpy as np
	import sqlite3, torch, json, re, os, torch, itertools, html2text
	from ast import literal_eval as leval
	from tqdm.auto import tqdm
	from utils.verbalisation_module import VerbModule
	from utils.sentence_retrieval_module import SentenceRetrievalModule
	from utils.textual_entailment_module import TextualEntailmentModule
	from importlib import reload
	import llm_load
	from html.parser import HTMLParser
	from sentence_transformers import SentenceTransformer
	from sklearn.metrics.pairwise import cosine_similarity
	from tqdm import tqdm
	import gradio as gr


	def verbalisation(claim_df):
	    """Attach verbalised-sentence columns to ``claim_df`` and return it.

	    One ``subject``/``predicate``/``object`` dict is built per row from the
	    ``entity_label``, ``property_label`` and ``object_label`` columns, and the
	    whole batch is passed to ``VerbModule.verbalise_triples``.

	    Three columns are written onto ``claim_df`` in place:

	    * ``verbalisation`` -- the output of ``verbalise_triples``.
	    * ``verbalisation_unks_replaced`` -- ``replace_unks_on_sentence`` applied
	      to each verbalisation.
	    * ``verbalisation_unks_replaced_then_dropped`` -- the same call with
	      ``empty_after=True``.

	    Returns:
	        The same ``claim_df`` object, now carrying the three extra columns.
	    """
	    verbaliser = VerbModule()

	    def _replace_unks_then_drop(sentence):
	        # Same helper as the plain replacement, with empty_after switched on.
	        return verbaliser.replace_unks_on_sentence(sentence, empty_after=True)

	    triple_batch = [
	        {
	            'subject': record['entity_label'],
	            'predicate': record['property_label'],
	            'object': record['object_label'],
	        }
	        for _, record in claim_df.iterrows()
	    ]

	    claim_df['verbalisation'] = verbaliser.verbalise_triples(triple_batch)

	    # Both derived columns start from the raw verbalisation, not from each other.
	    raw_sentences = claim_df['verbalisation']
	    claim_df['verbalisation_unks_replaced'] = raw_sentences.apply(verbaliser.replace_unks_on_sentence)
	    claim_df['verbalisation_unks_replaced_then_dropped'] = raw_sentences.apply(_replace_unks_then_drop)
	    return claim_df

	def RelevantSentenceSelection(verbalised_claims_df_final, reference_text_df, update_progress):
	    """Select evidence text for each verbalised claim and judge its support.

	    The claims are left-joined to ``reference_text_df`` on ``reference_id`` to
	    pick up each reference's ``url`` and ``html``. For every joined row the
	    HTML is converted to plain text with ``html2text`` (links ignored), the
	    LLM is asked for the sentences most relevant to the claim's
	    ``verbalisation``, and a second LLM call classifies whether that evidence
	    supports the claim.

	    Args:
	        verbalised_claims_df_final: DataFrame with at least ``reference_id``
	            and ``verbalisation`` columns.
	        reference_text_df: DataFrame with ``reference_id``, ``url`` and
	            ``html`` columns.
	        update_progress: Callable invoked as
	            ``update_progress(idx, total_rows)`` after each row, where
	            ``idx`` is the zero-based row position.

	    Returns:
	        A DataFrame with one row per joined row and the columns
	        ``verbalisation``, ``verification``, ``evidence_set``, ``url`` and
	        ``filtered_html``.
	    """
	    join_df = pd.merge(
	        verbalised_claims_df_final,
	        reference_text_df[['reference_id', 'url', 'html']],
	        on='reference_id',
	        how='left',
	    )
	    # NOTE(review): 4096 is presumably a context/sequence length -- confirm
	    # against llm_load.llmLoad.
	    tokenizer, model = llm_load.llmLoad(4096)
	    h = html2text.HTML2Text()
	    h.ignore_links = True

	    # Both prompts are loop-invariant, so they are built once up front.
	    selection_instruct = "Find the most relevant sentences from the filtered HTML document based on the given target sentence. If there are no directly related sentences, try to find sentences that provide context or background information related to the target sentence. Only answer 'nothing' if there is absolutely no relevant information in the document. Do not include any HTML tags or markup in your answer."
	    # NOTE(review): this prompt contains typos ("It not", "asnwer"). It is
	    # kept byte-for-byte because changing model input may change model
	    # output; fix deliberately and re-evaluate if desired.
	    verification_instruct = "Determine whether the target sentence is supported by the given evidence or not. If so, answer 'supportive'. It not, answer 'No supports'. Or, you can't determine with the given evidence, then asnwer 'Not enough information'"
	    total_steps = len(join_df)

	    filtered_htmls = []
	    answers = []
	    verifications = []
	    for idx, (html, verb) in enumerate(zip(join_df['html'], join_df['verbalisation'])):
	        # Start from an empty placeholder so that every row contributes exactly
	        # one entry to filtered_htmls, even when the conversion fails (e.g. a
	        # claim with no matching reference, whose html is missing after the
	        # left join). Without this the result columns end up with different
	        # lengths and the DataFrame construction below fails.
	        filtered_html = ''
	        try:
	            filtered_html = h.handle(html)
	            question = f"target sentence:'{verb}', filtered HTML document:{filtered_html}"
	            answer = llm_load.llmQuestion(tokenizer, model, selection_instruct, question, output_size=128)
	        except Exception:
	            # Best-effort: a failing row is labelled and processing continues.
	            # Exception (rather than a bare except) lets KeyboardInterrupt and
	            # SystemExit propagate.
	            answer = 'Malformed html'
	        filtered_htmls.append(filtered_html)
	        answers.append(answer)

	        # The verification step runs for every row, including failed ones,
	        # using whatever evidence text was recorded above.
	        question = f"target sentence:'{verb}', evidence:{answer}"
	        verification = llm_load.llmQuestion(tokenizer, model, verification_instruct, question, output_size=64)
	        verifications.append(verification)

	        update_progress(idx, total_steps)  # Update progress

	    return pd.DataFrame({
	        'verbalisation': join_df['verbalisation'],
	        'verification': verifications,
	        'evidence_set': answers,
	        'url': join_df['url'],
	        'filtered_html': filtered_htmls,
	    })



	if __name__ == '__main__':
	    # Script entry point: load the claims and reference HTML for one Wikidata
	    # entity, verbalise the claims, then run evidence selection/verification.
	    target_QID = 'Q42'

	    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
	    try:
	        # The entity id is bound as a query parameter instead of being
	        # interpolated into the SQL text.
	        claim_df = pd.read_sql_query(
	            "SELECT * FROM claim_text WHERE entity_id = ?",
	            conn,
	            params=(target_QID,),
	        )
	        reference_text_df = pd.read_sql_query(
	            "SELECT * FROM html_text WHERE entity_id = ?",
	            conn,
	            params=(target_QID,),
	        )
	    finally:
	        # Only SELECTs are issued, so there is nothing to commit; the
	        # connection is closed even if a query raises.
	        conn.close()

	    verbalised_claims_df_final = verbalisation(claim_df)

	    # gr.Progress takes a boolean ``track_tqdm`` flag, not a step count, so it
	    # is constructed without arguments; the fraction done is passed per call.
	    progress = gr.Progress()  # Create progress bar

	    def update_progress(curr_step, total_steps):
	        # curr_step is zero-based, hence the +1 to report completed steps.
	        progress((curr_step + 1) / total_steps)

	    result = RelevantSentenceSelection(verbalised_claims_df_final, reference_text_df, update_progress)