Jongmo committed on
Commit
953a835
·
verified ·
1 Parent(s): a7e6d7b

Upload 9 files

Browse files
Files changed (9) hide show
  1. Prove_lite.py +271 -0
  2. Prove_llm.py +84 -0
  3. SimpleUI_lite.py +122 -0
  4. SimpleUI_llm.py +136 -0
  5. UI_tester.py +52 -0
  6. Wikidata_Text_Parser.py +933 -0
  7. app.py +118 -59
  8. llm_load copy.py +188 -0
  9. llm_load.py +188 -0
Prove_lite.py ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import sqlite3, torch, json, re, os, torch, itertools, nltk
4
+ from ast import literal_eval as leval
5
+ from tqdm.auto import tqdm
6
+ from utils.verbalisation_module import VerbModule
7
+ from utils.sentence_retrieval_module import SentenceRetrievalModule
8
+ from utils.textual_entailment_module import TextualEntailmentModule
9
+ from importlib import reload
10
+ from html.parser import HTMLParser
11
+ from sentence_transformers import SentenceTransformer
12
+ from sklearn.metrics.pairwise import cosine_similarity
13
+ from tqdm import tqdm
14
+ import gradio as gr
15
+ from bs4 import BeautifulSoup
16
+ from cleantext import clean
17
+
18
+
19
def verbalisation(claim_df):
    """Add verbalised sentences for every claim triple to ``claim_df``.

    Each row's ``entity_label``, ``property_label`` and ``object_label`` are
    packed into a subject/predicate/object dict and the whole batch is handed
    to ``VerbModule.verbalise_triples``. Three columns are then written onto
    ``claim_df`` in place: ``verbalisation``, ``verbalisation_unks_replaced``
    and ``verbalisation_unks_replaced_then_dropped``.

    Returns the same (mutated) DataFrame.
    """
    verbaliser = VerbModule()
    triples = [
        {
            'subject': row['entity_label'],
            'predicate': row['property_label'],
            'object': row['object_label'],
        }
        for _, row in claim_df.iterrows()
    ]

    claim_df['verbalisation'] = verbaliser.verbalise_triples(triples)
    claim_df['verbalisation_unks_replaced'] = claim_df['verbalisation'].apply(
        verbaliser.replace_unks_on_sentence
    )
    claim_df['verbalisation_unks_replaced_then_dropped'] = claim_df['verbalisation'].apply(
        lambda sentence: verbaliser.replace_unks_on_sentence(sentence, empty_after=True)
    )
    return claim_df
35
+
36
def setencesSpliter(verbalised_claims_df_final, reference_text_df, update_progress):
    """Turn each reference's HTML into sentences and two-sentence windows.

    The claims are left-joined to ``reference_text_df`` on ``reference_id``;
    the HTML of every joined row is stripped to plain text, split into
    sentences with ``nltk.sent_tokenize`` and additionally grouped into
    sliding windows of two consecutive sentences.

    Parameters:
        verbalised_claims_df_final: claims frame with at least
            ``reference_id`` and ``verbalisation`` columns.
        reference_text_df: frame with ``reference_id``, ``url`` and ``html``.
        update_progress: accepted for interface compatibility; it is not
            called by this function.

    Returns:
        DataFrame with columns ``reference_id``, ``verbalisation``, ``url``,
        ``nlp_sentences`` and ``nlp_sentences_slide_2``.
    """
    join_df = pd.merge(
        verbalised_claims_df_final,
        reference_text_df[['reference_id', 'url', 'html']],
        on='reference_id',
        how='left',
    )
    SS_df = join_df[['reference_id', 'url', 'verbalisation', 'html']].copy()

    def clean_html(html_content):
        # The left join above yields NaN (a float) for claims whose reference
        # has no HTML row; BeautifulSoup cannot parse that, so treat any
        # non-string value as an empty document instead of crashing.
        if not isinstance(html_content, str):
            return ""
        soup = BeautifulSoup(html_content, 'html.parser')
        text = soup.get_text(separator=' ', strip=True)
        cleaned_text = clean(text,
            fix_unicode=True,
            to_ascii=True,
            lower=False,
            no_line_breaks=False,
            no_urls=True,
            no_emails=True,
            no_phone_numbers=True,
            no_numbers=False,
            no_digits=False,
            no_currency_symbols=True,
            no_punct=False,
            replace_with_url="",
            replace_with_email="",
            replace_with_phone_number="",
            replace_with_number="",
            replace_with_digit="",
            replace_with_currency_symbol="")
        return cleaned_text

    def split_into_sentences(text):
        return nltk.sent_tokenize(text)

    def slide_sentences(sentences, window_size=2):
        # Fewer sentences than the window: return them joined as one window.
        if len(sentences) < window_size:
            return [" ".join(sentences)]
        return [
            " ".join(sentences[i:i + window_size])
            for i in range(len(sentences) - window_size + 1)
        ]

    SS_df['html2text'] = SS_df['html'].apply(clean_html)
    SS_df['nlp_sentences'] = SS_df['html2text'].apply(split_into_sentences)
    SS_df['nlp_sentences_slide_2'] = SS_df['nlp_sentences'].apply(slide_sentences)

    return SS_df[['reference_id', 'verbalisation', 'url', 'nlp_sentences', 'nlp_sentences_slide_2']]
74
+
75
def evidenceSelection(splited_sentences_from_html, BATCH_SIZE, N_TOP_SENTENCES):
    """Score candidate sentences against each claim and keep the top ones.

    Works on a copy of the input in which ``verbalisation`` is renamed to
    ``final_verbalisation``. For both ``nlp_sentences`` and
    ``nlp_sentences_slide_2`` every (claim, sentence) pair is scored with
    ``SentenceRetrievalModule.score_sentence_pairs`` in batches of
    ``BATCH_SIZE``. Three columns of ranked evidence are then added:
    ``nlp_sentences_TOP_N``, ``nlp_sentences_slide_2_TOP_N`` and the merged,
    overlap-filtered ``nlp_sentences_all_TOP_N``. Each evidence entry is a
    dict with ``sentence`` (cut to 1024 characters plus ``'...'``), ``score``
    and ``sentence_id``.

    Returns the augmented copy.
    """
    sr_module = SentenceRetrievalModule(max_len=512)
    sentence_relevance_df = splited_sentences_from_html.copy()
    sentence_relevance_df.rename(columns={'verbalisation': 'final_verbalisation'}, inplace=True)

    def chunks(items, size):
        size = max(1, size)
        return [items[start:start + size] for start in range(0, len(items), size)]

    def compute_scores(column_name):
        score_column = f'{column_name}_scores'
        per_row_scores = []
        for _, row in tqdm(sentence_relevance_df.iterrows(), total=sentence_relevance_df.shape[0]):
            claim = row['final_verbalisation']
            row_scores = []
            for batch in chunks(row[column_name], BATCH_SIZE):
                row_scores += sr_module.score_sentence_pairs(
                    [(claim, sentence) for sentence in batch]
                )
            per_row_scores.append(row_scores)
        sentence_relevance_df[score_column] = pd.Series(per_row_scores)
        # Every sentence must have received exactly one score.
        assert all(sentence_relevance_df.apply(
            lambda r: len(r[column_name]) == len(r[score_column]), axis=1))

    compute_scores('nlp_sentences')
    compute_scores('nlp_sentences_slide_2')

    def get_top_n_sentences(row, column_name, n):
        scored = [
            {'sentence': sentence, 'score': score, 'sentence_id': f"{row.name}_{position}"}
            for position, (sentence, score)
            in enumerate(zip(row[column_name], row[f'{column_name}_scores']))
        ]
        scored.sort(key=lambda entry: entry['score'], reverse=True)
        return scored[:n]

    def filter_overlaps(sentences):
        kept = []
        for evidence in sentences:
            sentence_id = evidence['sentence_id']
            if ';' in sentence_id:
                start_id, end_id = sentence_id.split(';')
                probe_ids = (start_id, end_id)
            else:
                probe_ids = (sentence_id,)
            clashes = any(
                probe in kept_entry['sentence_id'].split(';')
                for kept_entry in kept
                for probe in probe_ids
            )
            if not clashes:
                kept.append(evidence)
        return kept

    def limit_sentence_length(sentence, max_length):
        return sentence[:max_length] + '...' if len(sentence) > max_length else sentence

    def shorten(entries):
        # Rebuild each entry with its sentence capped at 1024 characters.
        return [
            {
                'sentence': limit_sentence_length(entry['sentence'], 1024),
                'score': entry['score'],
                'sentence_id': entry['sentence_id'],
            }
            for entry in entries
        ]

    nlp_sentences_TOP_N = []
    nlp_sentences_slide_2_TOP_N = []
    nlp_sentences_all_TOP_N = []

    for _, row in tqdm(sentence_relevance_df.iterrows(), total=sentence_relevance_df.shape[0]):
        top_n = shorten(get_top_n_sentences(row, 'nlp_sentences', N_TOP_SENTENCES))
        nlp_sentences_TOP_N.append(top_n)

        top_n_slide_2 = shorten(get_top_n_sentences(row, 'nlp_sentences_slide_2', N_TOP_SENTENCES))
        nlp_sentences_slide_2_TOP_N.append(top_n_slide_2)

        merged = sorted(top_n + top_n_slide_2, key=lambda entry: entry['score'], reverse=True)
        filtered_sentences = shorten(filter_overlaps(merged))
        nlp_sentences_all_TOP_N.append(filtered_sentences[:N_TOP_SENTENCES])

    sentence_relevance_df['nlp_sentences_TOP_N'] = pd.Series(nlp_sentences_TOP_N)
    sentence_relevance_df['nlp_sentences_slide_2_TOP_N'] = pd.Series(nlp_sentences_slide_2_TOP_N)
    sentence_relevance_df['nlp_sentences_all_TOP_N'] = pd.Series(nlp_sentences_all_TOP_N)

    return sentence_relevance_df
142
+
143
def textEntailment(evidence_df, SCORE_THRESHOLD):
    """Run textual entailment of each claim against its selected evidence.

    For each of the evidence sets ``TOP_N``, ``slide_2_TOP_N`` and
    ``all_TOP_N`` (read from the ``nlp_sentences_<key>`` columns) the claim in
    ``final_verbalisation`` is scored against every evidence sentence with
    ``TextualEntailmentModule``. Evidence whose relevance ``score`` is not
    above ``SCORE_THRESHOLD`` is excluded from the weighted sum and from the
    ``get_label_malon`` aggregation. Rows with no evidence get empty lists,
    a ``[0, 0, 0]`` sum and the label ``'NOT ENOUGH INFO'``.

    Returns a copy of ``evidence_df`` with six result columns per evidence
    set. Any per-row failure is printed and re-raised.
    """
    textual_entailment_df = evidence_df.copy()
    te_module = TextualEntailmentModule()

    keys = ['TOP_N', 'slide_2_TOP_N', 'all_TOP_N']
    # Order of this list is also the order in which columns are written back.
    result_fields = ['evidence_TE_prob', 'evidence_TE_prob_weighted', 'evidence_TE_labels',
                     'claim_TE_prob_weighted_sum', 'claim_TE_label_weighted_sum', 'claim_TE_label_malon']
    te_columns = {f'{field}_{key}': [] for field in result_fields for key in keys}

    def process_row(row):
        claim = row['final_verbalisation']
        results = {}
        for key in keys:
            evidence = row[f'nlp_sentences_{key}']
            evidence_size = len(evidence)
            if evidence_size == 0:
                results[key] = {
                    'evidence_TE_prob': [],
                    'evidence_TE_labels': [],
                    'evidence_TE_prob_weighted': [],
                    'claim_TE_prob_weighted_sum': [0, 0, 0],
                    'claim_TE_label_weighted_sum': 'NOT ENOUGH INFO',
                    'claim_TE_label_malon': 'NOT ENOUGH INFO'
                }
                continue

            evidence_TE_prob = te_module.get_batch_scores(
                claims=[claim] * evidence_size,
                evidence=[e['sentence'] for e in evidence]
            )
            evidence_TE_labels = [te_module.get_label_from_scores(s) for s in evidence_TE_prob]

            # Only evidence scoring above the threshold contributes to the claim verdicts.
            relevant = [
                (probs, ev) for probs, ev in zip(evidence_TE_prob, evidence)
                if ev['score'] > SCORE_THRESHOLD
            ]
            evidence_TE_prob_weighted = [probs * ev['score'] for probs, ev in relevant]

            if evidence_TE_prob_weighted:
                claim_TE_prob_weighted_sum = np.sum(evidence_TE_prob_weighted, axis=0)
                claim_TE_label_weighted_sum = te_module.get_label_from_scores(claim_TE_prob_weighted_sum)
            else:
                claim_TE_prob_weighted_sum = [0, 0, 0]
                claim_TE_label_weighted_sum = 'NOT ENOUGH INFO'

            claim_TE_label_malon = te_module.get_label_malon([probs for probs, _ in relevant])

            results[key] = {
                'evidence_TE_prob': evidence_TE_prob,
                'evidence_TE_labels': evidence_TE_labels,
                'evidence_TE_prob_weighted': evidence_TE_prob_weighted,
                'claim_TE_prob_weighted_sum': claim_TE_prob_weighted_sum,
                'claim_TE_label_weighted_sum': claim_TE_label_weighted_sum,
                'claim_TE_label_malon': claim_TE_label_malon
            }
        return results

    for i, row in tqdm(textual_entailment_df.iterrows(), total=textual_entailment_df.shape[0]):
        try:
            result_sets = process_row(row)
            for key in keys:
                for field, value in result_sets[key].items():
                    te_columns[f'{field}_{key}'].append(value)
        except Exception as e:
            print(f"Error processing row {i}: {e}")
            print(row)
            raise

    for key in keys:
        for field in result_fields:
            column = f'{field}_{key}'
            textual_entailment_df[column] = pd.Series(te_columns[column])

    return textual_entailment_df
219
+
220
def TableMaking(verbalised_claims_df_final, result):
    """Render the entailment results as one HTML document.

    Both frames are aligned on ``reference_id`` (the inputs themselves are
    left untouched) and, per claim/reference row, a table with the columns
    ``sentence``, ``TextEntailment``, ``Entailment_score`` and
    ``Relevance_score`` is built. Tables are grouped under a heading per
    triple (``entity_label, property_label, object_label``), each preceded by
    a heading with the reference URL.

    All four table columns are taken from the same evidence set
    (``all_TOP_N``), so that every label and entailment score sits next to
    the sentence it was computed for. Rows without evidence produce an empty
    table instead of raising.

    Parameters:
        verbalised_claims_df_final: claims frame with ``reference_id``,
            ``entity_label``, ``property_label`` and ``object_label``.
        result: frame with ``reference_id``, ``url`` and the
            ``nlp_sentences_all_TOP_N`` / ``evidence_TE_labels_all_TOP_N`` /
            ``evidence_TE_prob_all_TOP_N`` columns.

    Returns:
        The HTML document as a string.
    """
    # Local import: the stdlib module is only needed to escape scraped text.
    from html import escape

    evidence_key = 'all_TOP_N'
    table_columns = ['sentence', 'TextEntailment', 'Entailment_score', 'Relevance_score']

    # set_index without inplace=True so the caller's frames keep their columns.
    claims = verbalised_claims_df_final.set_index('reference_id')
    evidence = result.set_index('reference_id')
    results = pd.concat([claims, evidence], axis=1)
    results['triple'] = results[['entity_label', 'property_label', 'object_label']].apply(
        lambda x: ', '.join(x), axis=1)

    # triple -> [(url, table), ...]; dicts keep first-seen order of the triples.
    tables_by_triple = {}
    for _, row in results.iterrows():
        sentences = row[f'nlp_sentences_{evidence_key}']
        labels = row[f'evidence_TE_labels_{evidence_key}']
        probs = row[f'evidence_TE_prob_{evidence_key}']
        # np.max(..., axis=1) is undefined for an empty list, so guard it.
        entailment_scores = np.max(probs, axis=1) if len(probs) else []
        aResult = pd.DataFrame(
            {
                'sentence': [s['sentence'] for s in sentences],
                'TextEntailment': list(labels),
                'Entailment_score': entailment_scores,
                'Relevance_score': [s['score'] for s in sentences],
            },
            columns=table_columns,
        )
        tables_by_triple.setdefault(row['triple'], []).append((row['url'], aResult))

    parts = ['<html><head><style>table {border-collapse: collapse; width: 100%;} th, td {border: 1px solid black; padding: 8px; text-align: left;} th {background-color: #f2f2f2;}</style></head><body>']
    for triple, references in tables_by_triple.items():
        # Labels and URLs come from external data, so escape them;
        # DataFrame.to_html escapes cell contents itself by default.
        parts.append(f'<h3>Triple: {escape(str(triple))}</h3>')
        for url, table in references:
            parts.append(f'<h3>Reference: {escape(str(url))}</h3>')
            parts.append(table.to_html(index=False))
    parts.append('</body></html>')
    return ''.join(parts)
249
+
250
if __name__ == '__main__':
    # Stand-alone demo run of the full pipeline for one hard-coded entity.
    target_QID = 'Q245247'
    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
    try:
        # Bound parameters instead of f-string SQL; the connection is only
        # read from, so it is closed as soon as both frames are loaded.
        claim_df = pd.read_sql_query(
            "SELECT * FROM claim_text WHERE entity_id = ?", conn, params=(target_QID,))
        reference_text_df = pd.read_sql_query(
            "SELECT * FROM html_text WHERE entity_id = ?", conn, params=(target_QID,))
    finally:
        conn.close()

    verbalised_claims_df_final = verbalisation(claim_df)
    progress = gr.Progress(len(verbalised_claims_df_final))  # Create progress bar for Gradio

    def update_progress(curr_step, total_steps):
        progress((curr_step + 1) / total_steps)

    splited_sentences_from_html = setencesSpliter(verbalised_claims_df_final, reference_text_df, update_progress)

    BATCH_SIZE = 512
    N_TOP_SENTENCES = 5
    SCORE_THRESHOLD = 0.6
    evidence_df = evidenceSelection(splited_sentences_from_html, BATCH_SIZE, N_TOP_SENTENCES)
    result = textEntailment(evidence_df, SCORE_THRESHOLD)
    display_df = TableMaking(verbalised_claims_df_final, result)
Prove_llm.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import sqlite3, torch, json, re, os, torch, itertools, html2text
4
+ from ast import literal_eval as leval
5
+ from tqdm.auto import tqdm
6
+ from utils.verbalisation_module import VerbModule
7
+ from utils.sentence_retrieval_module import SentenceRetrievalModule
8
+ from utils.textual_entailment_module import TextualEntailmentModule
9
+ from importlib import reload
10
+ import llm_load
11
+ from html.parser import HTMLParser
12
+ from sentence_transformers import SentenceTransformer
13
+ from sklearn.metrics.pairwise import cosine_similarity
14
+ from tqdm import tqdm
15
+ import gradio as gr
16
+
17
+
18
def verbalisation(claim_df):
    """Verbalise every claim triple and store the sentences on ``claim_df``.

    Builds one subject/predicate/object dict per row from ``entity_label``,
    ``property_label`` and ``object_label``, verbalises the batch with
    ``VerbModule.verbalise_triples`` and writes the columns ``verbalisation``,
    ``verbalisation_unks_replaced`` and
    ``verbalisation_unks_replaced_then_dropped`` in place.

    Returns the same (mutated) DataFrame.
    """
    verbaliser = VerbModule()
    triples = [
        {
            'subject': row['entity_label'],
            'predicate': row['property_label'],
            'object': row['object_label'],
        }
        for _, row in claim_df.iterrows()
    ]

    claim_df['verbalisation'] = verbaliser.verbalise_triples(triples)
    claim_df['verbalisation_unks_replaced'] = claim_df['verbalisation'].apply(
        verbaliser.replace_unks_on_sentence
    )
    claim_df['verbalisation_unks_replaced_then_dropped'] = claim_df['verbalisation'].apply(
        lambda sentence: verbaliser.replace_unks_on_sentence(sentence, empty_after=True)
    )
    return claim_df
34
+
35
def RelevantSentenceSelection(verbalised_claims_df_final, reference_text_df, update_progress):
    """Ask the LLM for supporting evidence and a verdict for every claim.

    Claims are left-joined to their reference HTML on ``reference_id``. For
    each row the HTML is converted to text with ``html2text``, the LLM is
    asked for the sentences most relevant to the verbalised claim, and a
    second prompt asks whether that evidence supports the claim.
    ``update_progress(idx, total)`` is called after every row.

    If converting or querying fails for a row, its evidence is recorded as
    ``'Malformed html'`` and the verification prompt is still issued.

    Returns:
        DataFrame with columns ``verbalisation``, ``verification``,
        ``evidence_set``, ``url`` and ``filtered_html`` (one row per joined
        claim).
    """
    join_df = pd.merge(
        verbalised_claims_df_final,
        reference_text_df[['reference_id', 'url', 'html']],
        on='reference_id',
        how='left',
    )
    tokenizer, model = llm_load.llmLoad(4096)
    h = html2text.HTML2Text()
    h.ignore_links = True

    filtered_htmls = []
    answers = []
    verifications = []
    for idx, (html, verb) in enumerate(zip(join_df['html'], join_df['verbalisation'])):
        # Default to an empty document so the three result lists always grow
        # in lock-step, even when the conversion itself raises (e.g. NaN HTML
        # from the left join); otherwise the final DataFrame cannot be built.
        filtered_html = ''
        try:
            filtered_html = h.handle(html)
            instruct = "Find the most relevant sentences from the filtered HTML document based on the given target sentence. If there are no directly related sentences, try to find sentences that provide context or background information related to the target sentence. Only answer 'nothing' if there is absolutely no relevant information in the document. Do not include any HTML tags or markup in your answer."
            question = f"target sentence:'{verb}', filtered HTML document:{filtered_html}"
            answer = llm_load.llmQuestion(tokenizer, model, instruct, question, output_size=128)
        except Exception:
            # Best effort per row; Exception (not a bare except) so that
            # KeyboardInterrupt / SystemExit still stop the run.
            answer = 'Malformed html'
        filtered_htmls.append(filtered_html)
        answers.append(answer)

        instruct = "Determine whether the target sentence is supported by the given evidence or not. If so, answer 'supportive'. It not, answer 'No supports'. Or, you can't determine with the given evidence, then asnwer 'Not enough information'"
        question = f"target sentence:'{verb}', evidence:{answers[-1]}"
        verification = llm_load.llmQuestion(tokenizer, model, instruct, question, output_size=64)
        verifications.append(verification)

        update_progress(idx, len(join_df))  # Update progress

    return pd.DataFrame({
        'verbalisation': join_df['verbalisation'],
        'verification': verifications,
        'evidence_set': answers,
        'url': join_df['url'],
        'filtered_html': filtered_htmls,
    })
63
+
64
+
65
+
66
if __name__ == '__main__':
    # Stand-alone demo run of the LLM pipeline for one hard-coded entity.
    target_QID = 'Q42'
    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
    try:
        # Bound parameters instead of f-string SQL; the connection is only
        # read from, so it is closed as soon as both frames are loaded.
        claim_df = pd.read_sql_query(
            "SELECT * FROM claim_text WHERE entity_id = ?", conn, params=(target_QID,))
        reference_text_df = pd.read_sql_query(
            "SELECT * FROM html_text WHERE entity_id = ?", conn, params=(target_QID,))
    finally:
        conn.close()

    verbalised_claims_df_final = verbalisation(claim_df)

    progress = gr.Progress(len(verbalised_claims_df_final))  # Create progress bar

    def update_progress(curr_step, total_steps):
        progress((curr_step + 1) / total_steps)

    result = RelevantSentenceSelection(verbalised_claims_df_final, reference_text_df, update_progress)
SimpleUI_lite.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import Wikidata_Text_Parser as wtr
3
+ import sqlite3
4
+ import Prove_lite as prv
5
+ import pandas as pd
6
+ import numpy as np
7
+ import os
8
+
9
def wtr_process(qid):
    """Parse the Wikidata claims and references of ``qid`` if not yet stored.

    When ``claim_text`` has no row for the entity, the parser pipeline from
    ``Wikidata_Text_Parser`` is run and its ``claim_text`` / ``html_text``
    outputs replace the tables of the same name. The claims joined to their
    reference URLs are then read back.

    Parameters:
        qid: entity id typed into the UI (untrusted user input).

    Returns:
        DataFrame with ``entity_label``, ``property_label``, ``object_label``
        and ``url``; on any failure a one-cell DataFrame with an ``Error``
        column holding the exception message.
    """
    conn = None
    try:
        conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
        target_QID = qid

        # Existence check done in SQL with a bound parameter rather than
        # loading the whole table into pandas.
        already_parsed = conn.execute(
            "SELECT 1 FROM claim_text WHERE entity_id = ? LIMIT 1", (target_QID,)
        ).fetchone() is not None

        if not already_parsed:
            wtr.claimParser(target_QID)  # save results in .db
            filtered_df = wtr.propertyFiltering(target_QID)  # update db and return dataframe after filtering
            url_set = wtr.urlParser(target_QID)  # from ref table in .db
            html_set = wtr.htmlParser(url_set, target_QID)  # Original html docs collection
            claim_text = wtr.claim2text(html_set)  # Claims generation
            html_text = wtr.html2text(html_set)
            claim_text = claim_text.astype(str)
            html_text = html_text.astype(str)
            claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
            html_text.to_sql('html_text', conn, if_exists='replace', index=False)
            conn.commit()

        # The QID comes from a text box, so it is bound, never interpolated.
        query = """
            SELECT
                claim_text.entity_label,
                claim_text.property_label,
                claim_text.object_label,
                html_text.url
            FROM claim_text
            INNER JOIN html_text ON claim_text.reference_id = html_text.reference_id
            WHERE claim_text.entity_id = ?
        """
        result_df = pd.read_sql_query(query, conn, params=(target_QID,))
        conn.commit()
        return result_df

    except Exception as e:
        # UI boundary: report the failure in the output table.
        return pd.DataFrame({'Error': [str(e)]})

    finally:
        # Close on success and on failure alike.
        if conn is not None:
            conn.close()
50
+
51
+
52
def prv_process(qid):
    """Verify the stored claims of ``qid`` and return the result as HTML.

    Loads the entity's rows from ``claim_text`` and ``html_text`` and runs
    the ``Prove_lite`` stages in order: verbalisation, sentence splitting,
    evidence selection (batch size 512, top 5 sentences), textual entailment
    (score threshold 0) and HTML table rendering.

    Parameters:
        qid: entity id typed into the UI (untrusted user input).

    Returns:
        The HTML string produced by ``prv.TableMaking``.
    """
    target_QID = qid
    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
    try:
        # Bound parameters: the QID must never be interpolated into the SQL text.
        claim_df = pd.read_sql_query(
            "SELECT * FROM claim_text WHERE entity_id = ?", conn, params=(target_QID,))
        reference_text_df = pd.read_sql_query(
            "SELECT * FROM html_text WHERE entity_id = ?", conn, params=(target_QID,))

        verbalised_claims_df_final = prv.verbalisation(claim_df)

        progress = gr.Progress(len(verbalised_claims_df_final))  # Create progress bar for Gradio

        def update_progress(curr_step, total_steps):
            progress((curr_step + 1) / total_steps)

        splited_sentences_from_html = prv.setencesSpliter(verbalised_claims_df_final, reference_text_df, update_progress)

        BATCH_SIZE = 512
        N_TOP_SENTENCES = 5
        SCORE_THRESHOLD = 0
        evidence_df = prv.evidenceSelection(splited_sentences_from_html, BATCH_SIZE, N_TOP_SENTENCES)
        result = prv.textEntailment(evidence_df, SCORE_THRESHOLD)
        display_df = prv.TableMaking(verbalised_claims_df_final, result)
        conn.commit()
        return display_df
    finally:
        # Release the connection even when a pipeline stage raises.
        conn.close()
78
+
79
+
80
+
81
# Gradio layout: one QID text box feeding two buttons (parse, then verify).
with gr.Blocks() as demo:
    print("gradio started!")
    gr.Markdown(
        """
        # Prove
        This is a tool for verifying the reference quality of Wikidata claims related to the target entity item.
        """
    )
    inp = gr.Textbox(label="Input QID", placeholder="Input QID (i.e. Q245247)")
    out = gr.Dataframe(
        label="Parsing result (not presenting parsed HTMLs)",
        headers=["entity_label", "property_label", "object_label", "url"],
    )
    run_button_1 = gr.Button("Start parsing")
    run_button_1.click(fn=wtr_process, inputs=inp, outputs=out)

    gr.Markdown(
        """
        Pre-trained language models-based text entailment.
        """
    )
    out_2 = gr.HTML(label="Results")
    run_button_2 = gr.Button("Start processing")
    run_button_2.click(fn=prv_process, inputs=inp, outputs=out_2)


if __name__ == "__main__":
    # DB initialising: when no database file exists yet, create it and seed
    # it by running the parser pipeline once for a fixed entity.
    if not os.path.isfile('wikidata_claims_refs_parsed.db'):
        conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
        target_QID = 'Q115305900'
        wtr.claimParser(target_QID)  # save results in .db
        filtered_df = wtr.propertyFiltering(target_QID)  # update db and return dataframe after filtering
        url_set = wtr.urlParser(target_QID)  # from ref table in .db
        html_set = wtr.htmlParser(url_set, target_QID)  # Original html docs collection
        claim_text = wtr.claim2text(html_set).astype(str)  # Claims generation
        html_text = wtr.html2text(html_set).astype(str)
        claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
        html_text.to_sql('html_text', conn, if_exists='replace', index=False)
        conn.commit()
        conn.close()
    demo.launch(share=True)
SimpleUI_llm.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import Wikidata_Text_Parser as wtr
3
+ import sqlite3
4
+ import CodeArchive.Prove_llm as prv
5
+ import pandas as pd
6
+
7
def wtr_process(qid):
    """Parse the Wikidata claims and references of ``qid`` if not yet stored.

    The entity counts as already parsed when the ``claims`` table exists and
    holds a row for it. Otherwise the ``Wikidata_Text_Parser`` pipeline is
    run (reporting progress to Gradio) and its ``claim_text`` / ``html_text``
    outputs replace the tables of the same name. The claims joined to their
    reference URLs are then read back.

    Parameters:
        qid: entity id typed into the UI (untrusted user input).

    Returns:
        DataFrame with ``entity_label``, ``property_label``, ``object_label``
        and ``url``; on any failure a one-cell DataFrame with an ``Error``
        column holding the exception message.
    """
    conn = None
    try:
        conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
        target_QID = qid

        cursor = conn.cursor()

        cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='claims'")
        table_exists = cursor.fetchone()

        result = None
        if table_exists:
            cursor.execute("SELECT entity_id FROM claims WHERE entity_id=?", (target_QID,))
            result = cursor.fetchone()

        if result is not None and result[0] == target_QID:
            print(result)
            print(f"{target_QID} already exists in the 'claims' table. Skipping execution.")
        else:
            progress = gr.Progress(0)
            progress(0.00, desc="Wikidata claims parsing...")
            wtr.claimParser(target_QID)  # save results in .db
            filtered_df = wtr.propertyFiltering(target_QID)  # update db and return dataframe after filtering
            progress(0.25, desc="URL and HTML parsing...")
            url_set = wtr.urlParser()  # from ref table in .db
            # NOTE(review): the two original branches called htmlParser with
            # different arguments (with the QID when the 'claims' table
            # already existed, without it otherwise). That difference is kept
            # as-is; confirm against htmlParser's signature which is intended.
            if table_exists:
                html_set = wtr.htmlParser(url_set, qid)  # Original html docs collection
            else:
                html_set = wtr.htmlParser(url_set)  # Original html docs collection
            progress(0.50, desc="claim2Text...")
            claim_text = wtr.claim2text(html_set)  # Claims generation
            progress(0.74, desc="html2Text...")
            html_text = wtr.html2text(html_set)
            claim_text = claim_text.astype(str)
            html_text = html_text.astype(str)
            claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
            html_text.to_sql('html_text', conn, if_exists='replace', index=False)
            progress(1, desc="completed...")

        # The QID comes from a text box, so it is bound, never interpolated.
        query = """
            SELECT
                claim_text.entity_label,
                claim_text.property_label,
                claim_text.object_label,
                html_text.url
            FROM claim_text
            INNER JOIN html_text ON claim_text.reference_id = html_text.reference_id
            WHERE claim_text.entity_id = ?
        """
        result_df = pd.read_sql_query(query, conn, params=(target_QID,))
        conn.commit()
        return result_df

    except Exception as e:
        # UI boundary: report the failure in the output table.
        return pd.DataFrame({'Error': [str(e)]})

    finally:
        # Close on success and on failure alike.
        if conn is not None:
            conn.close()
81
+
82
+
83
def prv_process(qid):
    """Run the LLM-based verification for the stored claims of ``qid``.

    Loads the entity's claims and the HTML rows of their references, then
    verbalises the claims and hands both to
    ``prv.RelevantSentenceSelection``, reporting per-row progress to Gradio.

    Parameters:
        qid: entity id typed into the UI (untrusted user input).

    Returns:
        The DataFrame returned by ``prv.RelevantSentenceSelection``.
    """
    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
    try:
        # Bound parameters: the QID must never be interpolated into the SQL text.
        reference_query = """
            SELECT html_text.*
            FROM html_text
            INNER JOIN claim_text ON html_text.reference_id = claim_text.reference_id
            WHERE claim_text.entity_id = ?
        """
        reference_text_df = pd.read_sql_query(reference_query, conn, params=(qid,))
        claim_df = pd.read_sql_query(
            "SELECT * FROM claim_text WHERE entity_id = ?", conn, params=(qid,))

        verbalised_claims_df_final = prv.verbalisation(claim_df)
        progress = gr.Progress(len(verbalised_claims_df_final))  # Create progress bar

        def update_progress(curr_step, total_steps):
            progress((curr_step + 1) / total_steps)

        return prv.RelevantSentenceSelection(verbalised_claims_df_final, reference_text_df, update_progress)
    finally:
        # Release the connection even when verification raises.
        conn.close()
106
+
107
+
108
+
109
# Gradio layout: one QID text box feeding two buttons (parse, then verify).
with gr.Blocks() as demo:
    print("gradio started!")
    gr.Markdown(
        """
        # Reference Quality Verification Tool
        This is a tool for verifying the reference quality of Wikidata claims related to the target entity item.
        Parsing could take 3~5 mins depending on the number of references.
        """
    )
    inp = gr.Textbox(label="Input QID", placeholder="Input QID (i.e. Q42)")
    out = gr.Dataframe(
        label="Parsing result (not presenting parsed HTMLs)",
        headers=["entity_label", "property_label", "object_label", "url"],
    )
    run_button_1 = gr.Button("Start parsing")
    run_button_1.click(fn=wtr_process, inputs=inp, outputs=out)

    gr.Markdown(
        """
        LLM-based HTML parsing and verification !
        """
    )
    out_2 = gr.DataFrame(label="LLM-based verificaiton result")

    run_button_2 = gr.Button("Start processing")
    run_button_2.click(fn=prv_process, inputs=inp, outputs=out_2)


if __name__ == "__main__":
    demo.launch(share=True)
UI_tester.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import Wikidata_Text_Parser as wtr
3
+ import sqlite3
4
+
5
def process_input(qid):
    """Run the parser pipeline for ``qid`` and store its text tables.

    Calls the ``Wikidata_Text_Parser`` stages in order while reporting
    progress to Gradio, then replaces the ``claim_text`` and ``html_text``
    tables in ``wikidata_claims_refs_parsed.db`` with the string-cast
    results.

    Returns:
        A one-line summary with the number of HTML documents collected.
    """
    progress = gr.Progress(0)

    wtr.claimParser(qid)

    progress(0.20, desc="Filtering properties...")
    filtered_df = wtr.propertyFiltering(qid)

    progress(0.40, desc="Parsing URLs...")
    url_set = wtr.urlParser()

    progress(0.60, desc="Parsing HTML...")
    html_set = wtr.htmlParser(url_set)

    progress(0.80, desc="Generating claim text...")
    claim_text = wtr.claim2text(html_set)  # Claims generation

    # This step extracts the HTML text; the label used to repeat the
    # previous step's "Generating claim text..." message.
    progress(1, desc="Generating HTML text...")
    html_text = wtr.html2text(html_set)

    claim_text = claim_text.astype(str)
    html_text = html_text.astype(str)

    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
    try:
        claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
        html_text.to_sql('html_text', conn, if_exists='replace', index=False)
        conn.commit()
    finally:
        # Close the connection even if one of the writes fails.
        conn.close()
    return f"{html_text.shape[0]} HTMl documents collection via references of {qid}"
33
+
34
# Minimal Gradio layout for exercising the parser: QID in, summary text out.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Reference Quality Verification Tool
        This is a tool for verifying the reference quality of Wikidata claims related to the target entity item.

        Parsing could take 3~5 mins depending on the number of references.
        """
    )

    inp = gr.Textbox(label="Input QID", placeholder="Input QID (i.e. Q42)")
    out = gr.Textbox(label="Parsing result")
    run_button = gr.Button("Start parsing")
    run_button.click(fn=process_input, inputs=inp, outputs=out)


if __name__ == "__main__":
    demo.launch()
Wikidata_Text_Parser.py ADDED
@@ -0,0 +1,933 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from tqdm import tqdm
3
+ import pandas as pd
4
+ import os, sqlite3, traceback, ast, requests, fasttext, re, time, string, spacy, pysbd
5
+ from requests.exceptions import ReadTimeout, TooManyRedirects, ConnectionError, ConnectTimeout, InvalidSchema, InvalidURL
6
+ from qwikidata.linked_data_interface import get_entity_dict_from_api
7
+ from datetime import datetime
8
+ import utils.wikidata_utils as wdutils
9
+ from importlib import reload
10
+ from urllib.parse import urlparse, unquote
11
+ from urllib import parse
12
+ from bs4 import BeautifulSoup
13
+ from IPython.display import clear_output
14
+ from os.path import exists
15
+ from pathlib import Path
16
+ from nltk.tokenize import sent_tokenize
17
+ from sentence_splitter import SentenceSplitter, split_text_into_sentences
18
+ import nltk
19
+ nltk.download('punkt')
20
+
21
class DatabaseExtractor():
    """Store Wikidata claims and their references in a SQLite database.

    Three tables are maintained:

    * ``claims``      - one row per claim (statement) of an entity;
    * ``refs``        - one row per snak of a reference;
    * ``claims_refs`` - the claim/reference pairs.

    Inserts are not committed one by one; call :meth:`finish_extraction`
    once a batch of entities has been extracted.
    """

    def __init__(self, dbname='wikidata_claims_refs_parsed.db'):
        """Open (or create) the database at *dbname* and ensure the tables exist."""
        self.dbname = dbname
        self.prepare_extraction()

    def finish_extraction(self):
        """Commit every pending insert."""
        self.db.commit()

    def close(self):
        """Close the underlying connection; uncommitted inserts are discarded."""
        self.db.close()

    def prepare_extraction(self):
        """Connect to the database and create the three tables if missing."""
        self.db = sqlite3.connect(self.dbname)
        self.cursor = self.db.cursor()

        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS claims(
                entity_id TEXT,
                claim_id TEXT,
                rank TEXT,
                property_id TEXT,
                datatype TEXT,
                datavalue TEXT,
                PRIMARY KEY (
                    claim_id
                )
        )''')

        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS claims_refs(
                claim_id TEXT,
                reference_id TEXT,
                PRIMARY KEY (
                    claim_id,
                    reference_id
                )
        )''')

        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS refs(
                reference_id TEXT,
                reference_property_id TEXT,
                reference_index TEXT,
                reference_datatype TEXT,
                reference_value TEXT,
                PRIMARY KEY (
                    reference_id,
                    reference_property_id,
                    reference_index
                )
        )''')
        self.db.commit()

    def extract_claim(self, entity_id, claim):
        """Insert one claim of *entity_id* into the ``claims`` table.

        The data value is stored as ``str(datavalue)`` for ``value`` snaks and
        as the snak type itself (e.g. ``'novalue'``) otherwise.

        Raises:
            sqlite3.IntegrityError: the claim id is already stored with
                different content. Re-inserting an identical row is ignored.
        """
        mainsnak = claim['mainsnak']
        if mainsnak['snaktype'] == 'value':
            value = str(mainsnak['datavalue'])
        else:
            value = mainsnak['snaktype']
        row = (
            entity_id, claim['id'], claim['rank'],
            mainsnak['property'], mainsnak['datatype'], value,
        )
        try:
            self.cursor.execute(
                '''INSERT INTO claims(entity_id, claim_id, rank, property_id, datatype, datavalue)
                   VALUES(?,?,?,?,?,?)''',
                row,
            )
        except UnicodeEncodeError:
            # Show the offending row before propagating the error.
            print(*row)
            raise
        except sqlite3.IntegrityError as err:
            # Primary-key clash: harmless only if the stored row is identical.
            self.cursor.execute('SELECT * FROM claims WHERE claim_id=?', (claim['id'],))
            if self.cursor.fetchone() != row:
                print(err, claim['id'])
                traceback.print_exc()
                raise

    def extract_reference(self, ref):
        """Insert every snak of reference *ref* into the ``refs`` table.

        Raises:
            sqlite3.IntegrityError: a snak clashes with a stored row and no
                stored row of the same reference/property carries the same
                datatype and value.
        """
        for snaks in ref['snaks'].values():
            for i, snak in enumerate(snaks):
                if snak['snaktype'] == 'value':
                    value = str(snak['datavalue'])
                else:
                    value = snak['snaktype']
                row = (ref['hash'], snak['property'], str(i), snak['datatype'], value)
                try:
                    self.cursor.execute(
                        '''INSERT INTO refs(reference_id, reference_property_id, reference_index,
                                            reference_datatype, reference_value)
                           VALUES(?,?,?,?,?)''',
                        row,
                    )
                except sqlite3.IntegrityError as err:
                    # The index is deliberately left out of the comparison:
                    # snaks tend to come shuffled from the API and sorting
                    # them takes too long.
                    self.cursor.execute(
                        '''SELECT reference_id, reference_property_id, reference_datatype, reference_value
                           FROM refs
                           WHERE reference_id = ?
                           AND reference_property_id = ?''',
                        (ref['hash'], snak['property']),
                    )
                    stored = self.cursor.fetchall()
                    if (ref['hash'], snak['property'], snak['datatype'], value) not in stored:
                        print(err, ref['hash'], snak['property'], i)
                        print('trying to insert:', row)
                        traceback.print_exc()
                        raise

    def extract_claim_reference(self, claim, ref):
        """Record that *claim* is backed by *ref* in ``claims_refs``."""
        try:
            self.cursor.execute(
                'INSERT INTO claims_refs(claim_id, reference_id) VALUES(?,?)',
                (claim['id'], ref['hash']),
            )
        except sqlite3.IntegrityError:
            # The pair is already stored; there is nothing to compare.
            pass

    def extract_entity(self, e):
        """Extract every claim of entity dict *e* together with its references."""
        for claims_of_property in e['claims'].values():
            for claim in claims_of_property:
                self.extract_claim(e['id'], claim)
                for ref in claim.get('references', ()):
                    self.extract_claim_reference(claim, ref)
                    self.extract_reference(ref)
165
+
166
def claimParser(QID):
    """Fetch entity *QID* from the Wikidata API and store its claims locally.

    The claims and references are written through ``DatabaseExtractor`` into
    its default database file. Inserts are committed only when the whole
    entity was processed; the connection is closed in every case so the
    database file is not left open when the fetch or the extraction fails.
    """
    entity_id = QID
    print('Setting up database ...')
    extractor = DatabaseExtractor()
    try:
        print('Fetching entity from API ...')
        entity = get_entity_dict_from_api(entity_id)

        if entity:
            print(f'Parsing entity: {entity_id}')
            extractor.extract_entity(entity)
        else:
            print(f'Failed to fetch entity: {entity_id}')

        extractor.finish_extraction()
    finally:
        # Closing without a commit discards whatever was inserted before a failure.
        extractor.db.close()
181
+
182
def propertyFiltering(QID):
    """Filter the stored claims of entity *QID* down to verbalisable ones.

    Claims are read from the local SQLite database and dropped when they are
    deprecated, have a datatype that cannot be verbalised, use one of the
    ontological/maintenance properties listed below, or have no concrete
    value (``somevalue``/``novalue``). The surviving rows REPLACE the whole
    ``claims`` table and are returned as a DataFrame.

    An entity without stored claims yields an empty DataFrame instead of
    raising.
    """
    reload(wdutils)  # kept from the original; this function does not use wdutils itself
    DB_PATH = 'wikidata_claims_refs_parsed.db'
    claims_columns = ['entity_id', 'claim_id', 'rank', 'property_id', 'datatype', 'datavalue']

    properties_to_remove = {
        'general': [
            'P31',    # - instance of
            'P279',   # - subclass of
            'P373',   # - commons category
            'P910',   # - Topic's main category
            'P7561',  # - category for the interior of the item
            'P5008',  # - on focus list of Wikimedia project
            'P2670',  # - has parts of the class
            'P1740',  # - category for films shot at this location
            'P1612',  # - Commons Institution page
            'P8989',  # - category for the view of the item
            'P2959',  # - permanent duplicated item
            'P7867',  # - category for maps
            'P935',   # - Commons gallery
            'P1472',  # - Commons Creator page
            'P8596',  # category for the exterior of the item
            'P5105',  # Deutsche Bahn station category
            'P8933',  # category for the view from the item
            'P642',   # of
            'P3876',  # category for alumni of educational institution
            'P1791',  # category of people buried here
            'P7084',  # related category
            'P1465',  # category for people who died here
            'P1687',  # Wikidata property
            'P6104',  # maintained by WikiProject
            'P4195',  # category for employees of the organization
            'P1792',  # category of associated people
            'P5869',  # model item
            'P1659',  # see also
            'P1464',  # category for people born here
            'P2354',  # has list
            'P1424',  # topic's main template
            'P7782',  # category for ship name
            'P179',   # part of the series
            'P7888',  # merged into
            'P6365',  # member category
            'P8464',  # content partnership category
            'P360',   # is a list of
            'P805',   # statement is subject of
            'P8703',  # entry in abbreviations table
            'P1456',  # list of monuments
            'P1012',  # including
            'P1151',  # topic's main Wikimedia portal
            'P2490',  # page at OSTIS Belarus Wiki
            'P593',   # HomoloGene ID
            'P8744',  # economy of topic
            'P2614',  # World Heritage criteria
            'P2184',  # history of topic
            'P9241',  # demographics of topic
            'P487',   # Unicode character
            'P1754',  # category related to list
            'P2559',  # Wikidata usage instructions
            'P2517',  # category for recipients of this award
            'P971',   # category combines topics
            'P6112',  # category for members of a team
            'P4224',  # category contains
            'P301',   # category's main topic
            'P1753',  # list related to category
            'P1423',  # template has topic
            'P1204',  # Wikimedia portal's main topic
            'P3921',  # Wikidata SPARQL query equivalent
            'P1963',  # properties for this type
            'P5125',  # Wikimedia outline
            'P3176',  # uses property
            'P8952',  # inappropriate property for this type
            'P2306',  # property
            'P5193',  # Wikidata property example for forms
            'P5977',  # Wikidata property example for senses
        ],
        'specific': {}
    }

    # Remove external_ids, commonsMedia (e.g. photos), globe-coordinates, urls
    bad_datatypes = ['commonsMedia', 'external-id', 'globe-coordinate', 'url', 'wikibase-form',
                     'geo-shape', 'math', 'musical-notation', 'tabular-data', 'wikibase-sense']

    # (banner, percentage label, predicate selecting the rows to KEEP)
    stages = [
        ('- Removing deprecated',
         ' - Percentage of deprecated:',
         lambda df: df['rank'] != 'deprecated'),
        ('- Removing bad datatypes',
         ' - Percentage of bad datatypes:',
         lambda df: ~df['datatype'].isin(bad_datatypes)),
        ('- Removing bad properties',
         ' - Percentage of ontology (non-domain) properties:',
         lambda df: ~df['property_id'].isin(properties_to_remove['general'])),
        ('- Removing somevalue/novalue',
         ' - Percentage of somevalue/novalue:',
         lambda df: ~df['datavalue'].isin(['somevalue', 'novalue'])),
    ]

    db = sqlite3.connect(DB_PATH)
    try:
        cursor = db.cursor()
        print('Removing the', len(properties_to_remove['general']),
              'properties deemed as ontological or unverbalisable')

        # Bound parameter: QID is caller-supplied and must not be spliced into the SQL.
        cursor.execute('select * from claims where entity_id = ?', (QID,))
        # Passing the column names here keeps an empty result usable.
        theme_df = pd.DataFrame(cursor.fetchall(), columns=claims_columns)

        original_theme_df_size = theme_df.shape[0]
        last_stage_theme_df_size = original_theme_df_size

        def percentage(count):
            """Share of the original row count, guarding the empty case."""
            if original_theme_df_size == 0:
                return 0.0
            return round(count / original_theme_df_size * 100, 2)

        for banner, label, keep in stages:
            print(banner)
            theme_df = theme_df[keep(theme_df)].reset_index(drop=True)
            print(label, percentage(last_stage_theme_df_size - theme_df.shape[0]), '%')
            last_stage_theme_df_size = theme_df.shape[0]

        print(
            'After all removals, we keep',
            percentage(last_stage_theme_df_size),
        )
        # NOTE: this replaces the entire claims table with the filtered rows.
        theme_df.to_sql('claims', db, if_exists='replace', index=False)
        db.commit()
    finally:
        db.close()

    return theme_df
340
+
341
def get_object_label_given_datatype(row):
    """Return a readable label for the object (value) of a claim row.

    Args:
        row: mapping with ``'datatype'`` and ``'datavalue'``; the latter is
            the ``str()`` of the Wikidata datavalue dict.

    Returns:
        * ``wikibase-item``: whatever ``CachedWikidataAPI.get_label(id, True)``
          returns (used elsewhere in this file as a ``(label, language)`` pair);
        * every other supported datatype: a ``(text, language)`` tuple;
        * ``None`` for a time precision that is not handled.

    Raises:
        ValueError: unsupported datatype or unparsable timestamp.
        AssertionError: a time value carries before/after uncertainty.
    """
    full_format = '%Y-%m-%dT00:00:%SZ'
    year_format = '%Y-00-00T00:00:%SZ'

    def get_api():
        # Built on demand: only item labels and unit labels need the API.
        api = wdutils.CachedWikidataAPI()
        api.languages = ['en']
        return api

    def ordinal(number):
        """Return *number* with its English ordinal suffix (11th, 22nd, ...)."""
        if 11 <= number % 100 <= 13:
            suffix = 'th'
        else:
            suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(number % 10, 'th')
        return str(number) + suffix

    def turn_to_century_or_millennium(y, mode):
        """Name the century (mode 'C') or millennium (mode 'M') of year *y*."""
        if mode == 'C':
            div, mode_name = 100, 'century'
        elif mode == 'M':
            div, mode_name = 1000, 'millennium'
        else:
            raise ValueError('Use mode = C for century and M for millennium')
        group, remainder = divmod(int(y), div)
        if remainder != 0:
            group += 1
        return ' '.join([ordinal(group), mode_name])

    def parse_time(text, partial_format):
        """Parse *text*, first with zeroed-out fields, then as a full date."""
        try:
            return datetime.strptime(text, partial_format)
        except ValueError:
            return datetime.strptime(text, full_format)

    dt = row['datatype']
    dv = row['datavalue']

    dt_types = ['wikibase-item', 'monolingualtext', 'quantity', 'time', 'string']
    if dt not in dt_types:
        print(dt)
        raise ValueError(f'Unsupported datatype: {dt}')

    value = ast.literal_eval(dv)['value']

    if dt == 'wikibase-item':
        return get_api().get_label(value['id'], True)

    if dt == 'monolingualtext':
        return (value['text'], value['language'])

    if dt == 'quantity':
        amount, unit = value['amount'], value['unit']
        if amount.startswith('+'):
            amount = amount[1:]
        if str(unit) == '1':  # '1' marks a unit-less quantity
            return (str(amount), 'en')
        unit_label = get_api().get_label(unit.split('/')[-1], True)
        return (' '.join([amount, unit_label[0]]), unit_label[1])

    if dt == 'string':
        return (value, 'en')

    # dt == 'time'
    time = value['time']
    precision = value['precision']
    if not (value['after'] == 0 and value['before'] == 0):
        raise AssertionError('time values with before/after uncertainty are not supported')

    sufix = 'BC' if time[0] == '-' else ''
    time = time[1:]  # drop the leading sign

    if precision == 11:  # date
        return (datetime.strptime(time, full_format).strftime('%d/%m/%Y') + sufix, 'en')
    if precision == 10:  # month
        return (parse_time(time, '%Y-%m-00T00:00:%SZ').strftime('%B of %Y') + sufix, 'en')
    if precision == 9:  # year
        return (parse_time(time, year_format).strftime('%Y') + sufix, 'en')
    if precision == 8:  # decade
        return (parse_time(time, year_format).strftime('%Y')[:-1] + '0s' + sufix, 'en')
    if precision in (7, 6):  # century / millennium
        year = parse_time(time, year_format).strftime('%Y')
        mode = 'C' if precision == 7 else 'M'
        return (turn_to_century_or_millennium(year, mode=mode) + sufix, 'en')

    # Deep-time precisions: the year has more digits than strptime's %Y
    # accepts, so it is read straight from the text before the first '-'.
    large_units = {
        4: (1e5, 'hundred thousand years'),
        3: (1e6, 'million years'),
        0: (1e9, 'billion years'),
    }
    if precision in large_units:
        scale, unit_name = large_units[precision]
        years = int(time.split('-', 1)[0])
        return (str(round(years / scale, 1)) + ' ' + unit_name + sufix, 'en')

    # Remaining precisions were never handled; callers receive None.
    return None
458
+
459
def get_object_desc_given_datatype(row):
    """Return a description for the object (value) of a claim row.

    Only Wikidata items and the units of quantities have a description; it
    is obtained from ``CachedWikidataAPI.get_desc``. Every other supported
    datatype, and unit-less quantities, yield ``('no-desc', 'none')``.

    Args:
        row: mapping with ``'datatype'`` and ``'datavalue'``; the latter is
            the ``str()`` of the Wikidata datavalue dict.

    Raises:
        ValueError: unsupported datatype.
    """
    def get_api():
        # Built on demand: most datatypes never query the API.
        api = wdutils.CachedWikidataAPI()
        api.languages = ['en']
        return api

    dt = row['datatype']
    dv = row['datavalue']

    dt_types = ['wikibase-item', 'monolingualtext', 'quantity', 'time', 'string']
    if dt not in dt_types:
        print(dt)
        raise ValueError(f'Unsupported datatype: {dt}')

    if dt == 'wikibase-item':
        return get_api().get_desc(ast.literal_eval(dv)['value']['id'])

    if dt == 'quantity':
        unit = ast.literal_eval(dv)['value']['unit']
        if str(unit) != '1':  # '1' marks a unit-less quantity
            return get_api().get_desc(unit.split('/')[-1])

    return ('no-desc', 'none')
492
+
493
def get_object_alias_given_datatype(row):
    """Return alternative surface forms for the object (value) of a claim row.

    * Wikidata items and quantity units: the result of
      ``CachedWikidataAPI.get_alias``.
    * Day-precision dates: ``([three spellings of the date], 'en')``.
    * Everything else: ``('no-alias', 'none')``.

    Args:
        row: mapping with ``'datatype'`` and ``'datavalue'``; the latter is
            the ``str()`` of the Wikidata datavalue dict.

    Raises:
        ValueError: unsupported datatype or unparsable date.
        AssertionError: a time value carries before/after uncertainty.
    """
    def get_api():
        # Built on demand: most datatypes never query the API.
        api = wdutils.CachedWikidataAPI()
        api.languages = ['en']
        return api

    no_alias = ('no-alias', 'none')

    dt = row['datatype']
    dv = row['datavalue']

    dt_types = ['wikibase-item', 'monolingualtext', 'quantity', 'time', 'string']
    if dt not in dt_types:
        print(dt)
        raise ValueError(f'Unsupported datatype: {dt}')

    if dt == 'wikibase-item':
        return get_api().get_alias(ast.literal_eval(dv)['value']['id'])

    if dt == 'quantity':
        unit = ast.literal_eval(dv)['value']['unit']
        if str(unit) == '1':  # '1' marks a unit-less quantity
            return no_alias
        return get_api().get_alias(unit.split('/')[-1])

    if dt == 'time':
        value = ast.literal_eval(dv)['value']
        time = value['time']
        precision = value['precision']
        if not (value['after'] == 0 and value['before'] == 0):
            raise AssertionError('time values with before/after uncertainty are not supported')

        if precision != 11:  # only full dates get alternative spellings
            return no_alias

        sufix = 'BC' if time[0] == '-' else ''
        parsed = datetime.strptime(time[1:], '%Y-%m-%dT00:00:%SZ')
        # The day is formatted by hand: strftime's '%-d' (no zero padding)
        # is a platform extension and is not available everywhere.
        return ([
            f"{parsed.day} of {parsed.strftime('%B, %Y')}" + sufix,
            parsed.strftime('%d/%m/%Y (dd/mm/yyyy)') + sufix,
            f"{parsed.strftime('%b')} {parsed.day}, {parsed.strftime('%Y')}" + sufix,
        ], 'en')

    # monolingualtext and string
    return no_alias
542
+
543
def textualAugmentation(filtered_df):
    """Add label, description and alias columns for subject, predicate and object.

    The columns are added to *filtered_df* in place and the same frame is
    returned. The share of rows whose label starts with ``'no-label'`` is
    printed for each of the three roles.
    """
    api = wdutils.CachedWikidataAPI()
    api.languages = ['en']

    def augment_from_id(prefix):
        # Look up label/description/alias for every id in '<prefix>_id'.
        ids = filtered_df[prefix + '_id']
        filtered_df[prefix + '_label'] = ids.apply(lambda item_id: api.get_label(item_id, True))
        filtered_df[prefix + '_desc'] = ids.apply(lambda item_id: api.get_desc(item_id))
        filtered_df[prefix + '_alias'] = ids.apply(lambda item_id: api.get_alias(item_id))

    def missing_label_share(column):
        # Percentage of rows whose label tuple starts with 'no-label'.
        unlabeled = filtered_df[filtered_df[column].apply(lambda label: label[0] == 'no-label')]
        return unlabeled.shape[0] / filtered_df.shape[0] * 100

    augment_from_id('entity')

    print(' - Predicate augmentation...')
    augment_from_id('property')

    print(' - Object augmentation...')
    object_getters = (
        ('object_label', get_object_label_given_datatype),
        ('object_desc', get_object_desc_given_datatype),
        ('object_alias', get_object_alias_given_datatype),
    )
    for column, getter in object_getters:
        filtered_df[column] = filtered_df.apply(getter, axis=1)

    report = (
        (' - No subject label %:', 'entity_label'),
        (' - No predicate label %:', 'property_label'),
        (' - No object label %:', 'object_label'),
    )
    for message, column in report:
        print(message, missing_label_share(column), '%')

    return filtered_df
572
+
573
def urlParser(target_QID):
    """Collect the reference URLs stored in the local database.

    Every ``refs`` row of datatype ``url`` is turned into a URL. Placeholder
    values (``somevalue``/``novalue``) and rows without a formatter URL are
    dropped, and only references left with exactly one URL are returned.

    Args:
        target_QID: currently unused; every reference in the database is
            considered regardless of the entity it belongs to.

    Returns:
        DataFrame with columns ``reference_id``, ``reference_property_id``,
        ``reference_datatype`` and ``url``.
    """
    Wd_API = wdutils.CachedWikidataAPI()
    Wd_API.languages = ['en']
    refs_columns = ['reference_id', 'reference_property_id', 'reference_index',
                    'reference_datatype', 'reference_value']
    refs_by_datatype = 'select * from refs where reference_datatype = ?;'

    def reference_value_to_string(reference_value):
        """Decode a stored reference value; placeholders pass through unchanged."""
        if reference_value in ['novalue', 'somevalue']:
            return reference_value
        reference_value = ast.literal_eval(reference_value)
        if reference_value['type'] != 'string':
            raise AssertionError('expected a string-typed reference value')
        return reference_value['value']

    def get_formatter_url(entity_id):
        """Return the formatter URL (P1630) of a property, or 'no_formatter_url'."""
        sparql_query = '''
            SELECT ?item ?itemLabel
            WHERE
            {
                wd:$1 wdt:P1630 ?item.
                SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
            }
        '''.replace('$1', entity_id)
        sparql_results = None  # defined up front so the handler below can always print it
        try:
            sparql_results = Wd_API.query_sparql_endpoint(sparql_query)
            bindings = sparql_results['results']['bindings']
            if len(bindings) > 0:
                return bindings[0]['item']['value']
            return 'no_formatter_url'
        except Exception:
            print(entity_id)
            print(sparql_results)
            raise

    db = sqlite3.connect('wikidata_claims_refs_parsed.db')
    try:
        cursor = db.cursor()
        cursor.execute(refs_by_datatype, ('url',))
        # Naming the columns here keeps an empty result usable.
        url_df = pd.DataFrame(cursor.fetchall(), columns=refs_columns)
        # NOTE(review): this selects the "url" rows a second time. The frame is
        # handled as external ids below, so "external-id" may have been meant;
        # confirm before changing, since it alters which references survive.
        cursor.execute(refs_by_datatype, ('url',))
        ext_id_df = pd.DataFrame(cursor.fetchall(), columns=refs_columns)
    finally:
        db.close()

    url_df['url'] = url_df['reference_value'].apply(reference_value_to_string)

    ext_id_df['ext_id'] = ext_id_df['reference_value'].apply(reference_value_to_string)
    # One SPARQL lookup per distinct property instead of one per row.
    formatter_urls = {
        property_id: get_formatter_url(property_id)
        for property_id in ext_id_df['reference_property_id'].unique()
    }
    ext_id_df['formatter_url'] = ext_id_df['reference_property_id'].map(formatter_urls)
    ext_id_df['url'] = [
        formatter_url.replace('$1', ext_id)
        for formatter_url, ext_id in zip(ext_id_df['formatter_url'], ext_id_df['ext_id'])
    ]

    columns_for_join = ['reference_id', 'reference_property_id', 'reference_index',
                        'reference_datatype', 'url']
    all_url_df = pd.concat([url_df[columns_for_join], ext_id_df[columns_for_join]])
    all_url_df = all_url_df.sort_values(['reference_id', 'reference_index'])
    # drop those with url = 'no_formatter_url'
    all_url_df = all_url_df[all_url_df['url'] != 'no_formatter_url'].reset_index(drop=True)
    # drop those with url = somevalue and novalue
    all_url_df = all_url_df[~all_url_df['url'].isin(['somevalue', 'novalue'])]

    # Keep only the references that ended up with exactly one URL.
    reference_id_counts = all_url_df['reference_id'].value_counts()
    reference_id_counts_equal_1 = reference_id_counts[reference_id_counts == 1].index.tolist()
    all_url_df_eq1 = all_url_df[all_url_df['reference_id'].isin(reference_id_counts_equal_1)]
    all_url_df_eq1 = all_url_df_eq1.reset_index(drop=True).drop('reference_index', axis=1)
    return all_url_df_eq1
635
+
636
def htmlParser(url_set, qid):
    """Download the page behind every URL of *url_set*.

    An ``html`` column is added to *url_set* in place. It holds the response
    text for HTTP 200, the status code for any other status, ``'TimeOut'``
    when the request timed out and ``None`` when another error occurred.
    A copy of the frame with an extra ``entity_id`` column set to *qid* is
    returned.
    """
    pages = url_set
    pages['html'] = None

    for idx, record in pages.iterrows():
        print(idx, record.url)
        try:
            response = requests.get(record.url, timeout=10)
            if response.status_code != 200:
                print(f"not response, {response.status_code}")
                pages.loc[idx, 'html'] = response.status_code
            else:
                pages.loc[idx, 'html'] = response.text
        except requests.exceptions.Timeout:
            print("Timeout occurred while fetching the URL:", record.url)
            pages.loc[idx, 'html'] = 'TimeOut'
        except Exception as e:
            # Best effort: report the failure and move on to the next URL.
            print("An error occurred:", str(e))

    pages_with_entity = pages.copy()
    pages_with_entity['entity_id'] = qid
    return pages_with_entity
661
+
662
def claim2text(html_set):
    """Build a table of the claims backed by the references in *html_set*.

    For every ``reference_id`` of *html_set* the linked claims are read from
    the local SQLite database. Each claim is then augmented with label, alias
    and description (plus their language) for its subject, predicate and
    object. Rows whose object label language is ``'none'`` are dropped.

    Args:
        html_set: DataFrame with a ``reference_id`` column.

    Returns:
        DataFrame with one row per (reference, claim) pair.

    Raises:
        AssertionError: some reference is only linked to claims of an
            unsupported datatype.
    """
    from tqdm.auto import tqdm
    tqdm.pandas()  # registers Series.progress_apply

    Wd_API = wdutils.CachedWikidataAPI()
    Wd_API.languages = ['en']
    claims_columns = ['entity_id', 'claim_id', 'rank', 'property_id', 'datatype', 'datavalue']
    BAD_DATATYPES = ['external-id', 'commonsMedia', 'url', 'globe-coordinate',
                     'wikibase-lexeme', 'wikibase-property']

    claim_data = []
    db = sqlite3.connect('wikidata_claims_refs_parsed.db')
    try:
        cursor = db.cursor()
        for reference_id in html_set.reference_id:
            # Bound parameters: ids are data and must not be spliced into the SQL.
            cursor.execute('select claim_id from claims_refs where reference_id=?', (reference_id,))
            claim_ids = [claim_row[0] for claim_row in cursor.fetchall()]
            for claim_id in claim_ids:
                cursor.execute('select * from claims where claim_id=?', (claim_id,))
                claim_data.extend((reference_id,) + tuple(claim) for claim in cursor.fetchall())
    finally:
        db.close()

    claim_df = pd.DataFrame(claim_data, columns=['reference_id'] + claims_columns)

    supported = ~claim_df.datatype.isin(BAD_DATATYPES)
    if claim_df[supported].reference_id.unique().shape != claim_df.reference_id.unique().shape:
        raise AssertionError('a reference is only linked to claims of an unsupported datatype')

    print(claim_df.reference_id.unique().shape[0])
    claim_df = claim_df[supported].reset_index(drop=True)

    # Subject and predicate: each getter yields a (text, language) pair.
    getters = (
        ('label', Wd_API.get_label),
        ('alias', Wd_API.get_alias),
        ('desc', Wd_API.get_desc),
    )
    for field, getter in getters:
        for prefix in ('entity', 'property'):
            claim_df[[f'{prefix}_{field}', f'{prefix}_{field}_lan']] = pd.DataFrame(
                claim_df[f'{prefix}_id'].progress_apply(getter, non_language_set=True).tolist()
            )

    # Object: the value depends on the datatype of the claim.
    object_getters = (
        ('object_label', get_object_label_given_datatype),
        ('object_alias', get_object_alias_given_datatype),
        ('object_desc', get_object_desc_given_datatype),
    )
    for column, getter in object_getters:
        claim_df[column] = claim_df.apply(getter, axis=1)

    # Split the (text, language) pairs; non-tuple values get an empty language.
    for column, _ in object_getters:
        pairs = [item if isinstance(item, tuple) else (item, '') for item in claim_df[column]]
        claim_df[column] = [pair[0] for pair in pairs]
        claim_df[column + '_lan'] = [pair[1] for pair in pairs]

    # Removing bad object labels
    claim_df = claim_df[claim_df['object_label_lan'] != 'none'].reset_index(drop=True)
    return claim_df
746
+
747
+ def html2text(html_set):
748
+ reference_html_df = html_set
749
+ _RE_COMBINE_WHITESPACE = re.compile(r"\s+")
750
+ ft_model = fasttext.load_model('base/lid.176.ftz')
751
+ def predict_language(text, k=20):
752
+ ls, scores = ft_model.predict(text, k=k) # top 20 matching languages
753
+ ls = [l.replace('__label__','') for l in ls]
754
+ return list(zip(ls,scores))
755
+ def get_url_language(html):
756
+ try:
757
+ soup = BeautifulSoup(html, "lxml")
758
+ [s.decompose() for s in soup("script")] # remove <script> elements
759
+ if soup.body == None:
760
+ return ('no body', None)
761
+ body_text = _RE_COMBINE_WHITESPACE.sub(" ", soup.body.get_text(' ')).strip()
762
+ return predict_language(body_text, k=1)[0]
763
+ except Exception:
764
+ raise
765
+ def get_text_p_tags(soup):
766
+ p_tags = soup.find_all('p')
767
+ text = [p.getText().strip() for p in p_tags if p.getText()]
768
+ return '\n'.join(text)
769
+ def clean_text_line_by_line(text, join=True, ch_join = ' ', verb=True):
770
+ # text = soup.body.get_text()
771
+ # break into lines and remove leading and trailing space on each
772
+ lines = list(text.splitlines())
773
+ lines = (line.strip() for line in lines)
774
+ # for each line, lets correct double spaces into single space
775
+ lines = (re.sub(r' {2,}', ' ', line) for line in lines)
776
+ # for each line, lets correct punctuation spaced to the left
777
+ lines = (re.sub(r' ([.,:;!?\\-])', r'\1', line) for line in lines)
778
+ # put periods if missing
779
+ lines = [line+'.' if line and line[-1] not in string.punctuation else line for i, line in enumerate(lines)]
780
+
781
+ if verb:
782
+ for i, line in enumerate(lines):
783
+ print(i,line)
784
+ # drop blank lines
785
+ if join:
786
+ return ch_join.join([line for line in lines if line])
787
+ else:
788
+ return [line for line in lines if line]
789
+
790
+ def apply_manual_rules(text):
791
+ # RULE: A line ending with a ':' followed by whitespaces and a newline is likely a continuing line and should be joined
792
+ #text = re.sub(
793
+ # r':\s*\n',
794
+ # r': ',
795
+ # text
796
+ #)
797
+ # RULE: Remove [1] reference numbers
798
+ text = re.sub(r'\[[0-9]+\]', '', text)
799
+ return text
800
def retrieve_text_from_html(html, soup_parser='lxml', verb=True, join=True):
    """Extract the cleaned text of an HTML document's ``<body>``.

    Args:
        html: Raw HTML. Anything that is not a ``str`` containing
            ``'DOCTYPE html'`` is rejected with the ``'No body'`` placeholder.
        soup_parser: Parser name handed to ``BeautifulSoup``.
        verb: Forwarded to ``clean_text_line_by_line``.
        join: When true the result is a single string; otherwise it is a
            list of lines.

    Returns:
        The extracted text, or a placeholder (``'No body'`` /
        ``'No extractable text'``). The placeholder always has the same
        type as a regular result: a ``str`` when ``join`` is true, a
        one-element ``list`` otherwise.
    """
    def _placeholder(message):
        # Callers that ask for a list go on to ' '.join() the result; a bare
        # string here would be joined character by character.
        return message if join else [message]

    if not isinstance(html, str) or 'DOCTYPE html' not in html:
        return _placeholder('No body')

    soup = BeautifulSoup(html, soup_parser)
    for tag in soup(["script", "style"]):
        tag.decompose()
    if soup.body is None:
        return _placeholder('No body')

    # Drop <strong> wrappers but keep their text in place.
    for strong in soup.body.find_all('strong'):
        strong.unwrap()

    # Flatten each paragraph to one whitespace-normalised string.
    for p in soup.body.find_all('p'):
        p.string = _RE_COMBINE_WHITESPACE.sub(" ", p.get_text('')).strip()

    # Boilerplate-removal options that were evaluated and left disabled:
    #   1. jusText
    #   2. boilerpy3
    #   3. reading only "text tags" such as <p> (fails on sites without a
    #      clear paragraph structure, e.g. artgallery.yale)
    #   4. none (current choice)
    # Decomposing "bad" classes (warningbox, metadata, references, navbox,
    # toc, catlinks) and display:none elements was also tried and disabled.

    # Read from <body> only, not the whole soup, to avoid titles.
    text = soup.body.get_text(' ').strip()

    # Post-processing.
    text = apply_manual_rules(text)
    text = clean_text_line_by_line(text, ch_join=' ', verb=verb, join=join)

    if not text:
        return _placeholder('No extractable text')
    return text
862
+ i=0
863
+ print(i)
864
+ print(reference_html_df.url.iloc[i])
865
+
866
+ reference_html_df['extracted_sentences'] = reference_html_df.html.progress_apply(retrieve_text_from_html, join=False, verb=False)
867
+
868
+ join_ch = ' '
869
+ reference_html_df['extracted_text'] = reference_html_df.extracted_sentences.apply(lambda x : join_ch.join(x))
870
+
871
+ splitter = SentenceSplitter(language='en')
872
+
873
+ seg = pysbd.Segmenter(language="en", clean=False)
874
+
875
+ if not spacy.util.is_package("en_core_web_lg"):
876
+ os.system("python -m spacy download en_core_web_lg")
877
+
878
+ nlp = spacy.load("en_core_web_lg")
879
+
880
+ text = reference_html_df.loc[0,'extracted_text']
881
+
882
+ # OPTION 1
883
+ # This gets some things wrong, such as Smt.=Shrimati ending a sentence, or any
884
+ # initials like P. N. Nampoothiri or Lt. Col.
885
+ #sents = sent_tokenize(text)
886
+
887
+ # OPTION 2
888
+ # Also breaks titles and initials like above, but additionally gets parenthesis wrong, like
889
+ # Amma Maharani [break](queen mother) [break] of Travancore.
890
+ #sents = seg.segment(text)
891
+
892
+ # OPTION 3
893
+ # Same as above plus new ones, like breaking contractions (like m. for married)
894
+ #sents = splitter.split(text)
895
+
896
+ # OPTION 4
897
+ # By far the best option, makes way less of the mistakes above, but not none. So let's adopt a strategy so ease this.
898
+ sents = [s for s in nlp(text).sents]
899
+
900
+
901
+ reference_html_df['nlp_sentences'] = reference_html_df.extracted_text.progress_apply(lambda x : [str(s) for s in nlp(x).sents])
902
+ reference_html_df['nlp_sentences_slide_2'] = reference_html_df['nlp_sentences'].progress_apply(
903
+ lambda x : [' '.join([a,b]) for a,b in list(zip(x,x[1:]+['']))]
904
+ )
905
+
906
+ assert type(reference_html_df.loc[0,'nlp_sentences']) == list
907
+ assert type(reference_html_df.loc[0,'nlp_sentences'][0]) == str
908
+ assert type(reference_html_df.loc[0,'nlp_sentences_slide_2']) == list
909
+ assert type(reference_html_df.loc[0,'nlp_sentences_slide_2'][0]) == str
910
+ return reference_html_df
911
+
912
if __name__ == '__main__':
    # Run the whole parsing pipeline for a single entity and store the
    # resulting claim/HTML text tables in the local SQLite database.
    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
    try:
        target_QID = 'Q3621696'
        claimParser(target_QID)  # save results in .db
        filtered_df = propertyFiltering(target_QID)  # update db and return dataframe after filtering
        url_set = urlParser(target_QID)  # from ref table in .db
        html_set = htmlParser(url_set, target_QID)  # Original html docs collection
        try:
            claim_text = claim2text(html_set)  # Claims generation
            html_text = html2text(html_set)
            claim_text = claim_text.astype(str)
            html_text = html_text.astype(str)
            claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
            html_text.to_sql('html_text', conn, if_exists='replace', index=False)
        except Exception as e:
            # Best-effort step: keep going, but report the real cause instead
            # of hiding every failure behind one fixed message.
            print(f"No accessible html documents ({type(e).__name__}: {e})")

        conn.commit()
    finally:
        # Release the database handle even when an earlier stage raised.
        conn.close()
    #augmented_df = textualAugmentation(filtered_df) #textual information augmentation including label, desc, and alias
933
+
app.py CHANGED
@@ -1,63 +1,122 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
-
9
-
10
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Stream a chat completion, yielding the reply accumulated so far.

    Args:
        message: The new user message.
        history: Earlier ``(user, assistant)`` turns; empty entries are skipped.
        system_message: Content of the leading system message.
        max_tokens: Forwarded to ``client.chat_completion``.
        temperature: Forwarded to ``client.chat_completion``.
        top_p: Forwarded to ``client.chat_completion``.

    Yields:
        The growing response string, once per streamed chunk.
    """
    messages = [{"role": "system", "content": system_message}]

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    response = ""

    # Named `chunk` so the loop does not shadow the `message` parameter.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        if not chunk.choices:
            continue
        # A streamed chunk may carry no text; `+= None` would raise TypeError.
        token = chunk.choices[0].delta.content or ""

        response += token
        yield response
41
-
42
- """
43
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
44
- """
45
- demo = gr.ChatInterface(
46
- respond,
47
- additional_inputs=[
48
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
49
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
50
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
51
- gr.Slider(
52
- minimum=0.1,
53
- maximum=1.0,
54
- value=0.95,
55
- step=0.05,
56
- label="Top-p (nucleus sampling)",
57
- ),
58
- ],
59
- )
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  if __name__ == "__main__":
63
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ import Wikidata_Text_Parser as wtr
3
+ import sqlite3
4
+ import Prove_lite as prv
5
+ import pandas as pd
6
+ import numpy as np
7
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
def wtr_process(qid):
    """Parse the claims of entity *qid* (unless already stored) and list them.

    Args:
        qid: Wikidata entity id, e.g. ``'Q245247'``.

    Returns:
        A DataFrame with the columns ``entity_label``, ``property_label``,
        ``object_label`` and ``url`` for the entity. If anything fails, a
        one-row DataFrame with a single ``Error`` column holding the
        exception message is returned instead.
    """
    target_QID = qid
    try:
        conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
        try:
            # The id is passed as a bound parameter, never interpolated into
            # the SQL text. Asking SQLite for one matching row also avoids
            # loading the whole table just to test membership.
            already_parsed = pd.read_sql_query(
                "SELECT 1 FROM claim_text WHERE entity_id = ? LIMIT 1",
                conn,
                params=(target_QID,),
            )
            if already_parsed.empty:
                wtr.claimParser(target_QID)  # save results in .db
                filtered_df = wtr.propertyFiltering(target_QID)  # update db and return dataframe after filtering
                url_set = wtr.urlParser(target_QID)  # from ref table in .db
                html_set = wtr.htmlParser(url_set, target_QID)  # Original html docs collection
                claim_text = wtr.claim2text(html_set)  # Claims generation
                html_text = wtr.html2text(html_set)
                claim_text = claim_text.astype(str)
                html_text = html_text.astype(str)
                # NOTE(review): 'replace' overwrites both tables on every new
                # entity -- confirm that dropping earlier results is intended.
                claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
                html_text.to_sql('html_text', conn, if_exists='replace', index=False)
                conn.commit()

            query = """
                SELECT
                    claim_text.entity_label,
                    claim_text.property_label,
                    claim_text.object_label,
                    html_text.url
                FROM claim_text
                INNER JOIN html_text ON claim_text.reference_id = html_text.reference_id
                WHERE claim_text.entity_id = ?
            """
            return pd.read_sql_query(query, conn, params=(target_QID,))
        finally:
            # Close the handle on the error path as well.
            conn.close()

    except Exception as e:
        return pd.DataFrame({'Error': [str(e)]})
50
+
51
+
52
def prv_process(qid):
    """Run the ``prv`` verification pipeline on the stored rows of entity *qid*.

    Reads the entity's rows from the ``claim_text`` and ``html_text`` tables,
    then passes them through ``prv.verbalisation``, ``prv.setencesSpliter``,
    ``prv.evidenceSelection``, ``prv.textEntailment`` and ``prv.TableMaking``.

    Args:
        qid: Wikidata entity id whose stored claims should be checked.

    Returns:
        Whatever ``prv.TableMaking`` produces for display.
    """
    target_QID = qid
    conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
    try:
        # Bound parameters keep the id out of the SQL text.
        claim_df = pd.read_sql_query(
            "SELECT * FROM claim_text WHERE entity_id = ?",
            conn,
            params=(target_QID,),
        )
        reference_text_df = pd.read_sql_query(
            "SELECT * FROM html_text WHERE entity_id = ?",
            conn,
            params=(target_QID,),
        )
    finally:
        # Only reads happen here, so close the handle before the long
        # pipeline runs; it is then released even if a later stage raises.
        conn.close()

    verbalised_claims_df_final = prv.verbalisation(claim_df)

    # NOTE(review): Gradio documents progress tracking through a
    # `progress=gr.Progress()` default parameter; confirm that an instance
    # created inside the handler is actually attached to the UI.
    progress = gr.Progress(len(verbalised_claims_df_final))  # Create progress bar for Gradio

    def update_progress(curr_step, total_steps):
        progress((curr_step + 1) / total_steps)

    splited_sentences_from_html = prv.setencesSpliter(verbalised_claims_df_final, reference_text_df, update_progress)

    BATCH_SIZE = 512
    N_TOP_SENTENCES = 5
    SCORE_THRESHOLD = 0
    evidence_df = prv.evidenceSelection(splited_sentences_from_html, BATCH_SIZE, N_TOP_SENTENCES)
    result = prv.textEntailment(evidence_df, SCORE_THRESHOLD)
    display_df = prv.TableMaking(verbalised_claims_df_final, result)
    return display_df
78
+
79
+
80
+
81
with gr.Blocks() as demo:
    print("gradio started!")

    # Step 1: parse the claims and references of the entered entity.
    gr.Markdown(
        """
        # Prove
        This is a tool for verifying the reference quality of Wikidata claims related to the target entity item.
        """
    )
    inp = gr.Textbox(label="Input QID", placeholder="Input QID (i.e. Q245247)")
    out = gr.Dataframe(
        label="Parsing result (not presenting parsed HTMLs)",
        headers=["entity_label", "property_label", "object_label", "url"],
    )
    run_button_1 = gr.Button("Start parsing")
    run_button_1.click(fn=wtr_process, inputs=inp, outputs=out)

    # Step 2: run the verification pipeline on the same entity id.
    gr.Markdown(
        """
        Pre-trained language models-based text entailment.
        """
    )
    out_2 = gr.HTML(label="Results")
    run_button_2 = gr.Button("Start processing")
    run_button_2.click(fn=prv_process, inputs=inp, outputs=out_2)
103
+
104
+
105
if __name__ == "__main__":
    # DB initialising: when no database file exists yet, build one by
    # parsing a fixed seed entity so the tables are present before the UI
    # starts querying them.
    if not os.path.isfile('wikidata_claims_refs_parsed.db'):
        conn = sqlite3.connect('wikidata_claims_refs_parsed.db')
        try:
            target_QID = 'Q115305900'
            wtr.claimParser(target_QID)  # save results in .db
            filtered_df = wtr.propertyFiltering(target_QID)  # update db and return dataframe after filtering
            url_set = wtr.urlParser(target_QID)  # from ref table in .db
            html_set = wtr.htmlParser(url_set, target_QID)  # Original html docs collection
            claim_text = wtr.claim2text(html_set)  # Claims generation
            html_text = wtr.html2text(html_set)
            claim_text = claim_text.astype(str)
            html_text = html_text.astype(str)
            claim_text.to_sql('claim_text', conn, if_exists='replace', index=False)
            html_text.to_sql('html_text', conn, if_exists='replace', index=False)
            conn.commit()
        finally:
            # Release the handle even if bootstrapping fails part-way.
            conn.close()
    demo.launch(share=True)
llm_load copy.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import login
2
+ from unsloth import FastLanguageModel
3
+ import torch
4
+ from transformers import TextStreamer
5
+
6
def llmLoad(max_seq_length):
    """Log in to the Hugging Face Hub and load the 4-bit Llama-3 instruct model.

    The access token is read from ``API_key.txt`` in the working directory.

    Args:
        max_seq_length: Forwarded to ``FastLanguageModel.from_pretrained``.

    Returns:
        ``(tokenizer, model)`` -- note this is the reverse of the
        ``(model, tokenizer)`` order that ``from_pretrained`` hands back.
    """
    with open('API_key.txt', 'r') as file:
        token = file.read().strip()
    login(token=token)

    dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

    # Other pre-quantized 4bit models are listed at https://huggingface.co/unsloth
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    return tokenizer, model
33
+
34
def llmQuestion(tokenizer, model, instruct, question, output_size):
    """Ask the model one Alpaca-style question and return its answer text.

    Args:
        tokenizer: Tokenizer as returned by ``llmLoad``.
        model: Model as returned by ``llmLoad``.
        instruct: Text placed in the prompt's "Instruction" slot.
        question: Text placed in the prompt's "Input" slot.
        output_size: Passed to ``model.generate`` as ``max_new_tokens``.

    Returns:
        The decoded output that follows the first ``'### Response:'`` marker.
    """
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

    # One call is enough to switch the model to inference mode.
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                instruct,  # instruction
                question,  # input
                "",  # output - leave this blank for generation!
            )
        ], return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=output_size, use_cache=True)
    # The decoded text contains the prompt too; keep what follows the marker.
    output_text = tokenizer.batch_decode(outputs)[0].split('### Response:')[1]

    return output_text
63
+
64
+ if __name__ == "__main__":
65
+ tokenizer, model = llmLoad(8192)
66
+ sentences = """['\n \n \n\t\t\t\n\t\t\t\n\t\t \n \n \n \n \n \n \n \n \n \n UK News Website of the Year\n \n \n The Telegraph logo\n \n \n \n \n ',
67
+ '\n \n \n \n \n Search Icon\n \n \n \n News \n Sport \n Money \n Travel \n ',
68
+ 'Business \n Health \n Opinion \n General election \n Ukraine \n Royals \n Life & Style \n Culture \n ',
69
+ " Puzzles \n \n \n\t\t(function () {\n\t\t\tdocument.querySelectorAll('.site-header__navigation .e-site-header-button__link').forEach(link => {\n\t\t\t\tlink.addEventListener('click', (e) => {\n",
70
+ '\t\t\t\t\teVar94 = "header-search-icon-mobile";\n\t\t\t\t\teVar95 = link.textContent.trim();\n\t\t\t\t\teVar96 = "img";\n\t\t\t\t\teVar97 = document.title;\n\t\t\t\t\ttmgComponentString = eVar94+";"+eVar95+"_"+eVar96+";"+eVar97;\n',
71
+ '\t\t\t\t\tlocalStorage.setItem("tmgComponentTracking", tmgComponentString);\n\t\t\t\t});\n\t\t\t});\n\t\t})();\n\t\n \n \n \n \n \n \n UK Edition \n \n \n ',
72
+ ' \n \n \n US Edition \n \n \n \n \n \n \n \n Search Icon\n \n \n \n Subscribe now Free for one month',
73
+ ' \n \n \n \n \n \n \n \n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\tLog in\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n \n \n \n \n \n \n \n \n \n \n ',
74
+ ' \n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n Sections\n \n ',
75
+ ' \n \n UK Edition \n \n \n \n \n \n US Edition \n \n \n \n \n \n \n',
76
+ ' News\n \n \n \n \n News home \n UK news \n Politics \n World \n Health news \n Defe',
77
+ 'nce \n Science \n Education \n Environment \n Investigations \n Global Health Security \n \n Sport\n \n \n ',
78
+ " \n \n Sport home \n Football \n Rugby Union \n Cricket \n F1 \n Golf \n Tennis \n Women's Sp",
79
+ 'ort \n Racing \n Cycling \n Boxing \n More... \n \n Money\n \n \n \n \n Money home \n ',
80
+ ' Property \n Tax \n Pensions \n Banking \n Investing \n Net Zero \n Calculators \n Guides \n \n ',
81
+ 'Travel\n \n \n \n \n Travel home \n Europe \n UK \n Worldwide \n City breaks \n Hotels \n ',
82
+ ' Cruise \n Ski \n Advice \n \n Business\n \n \n \n \n Business home \n Alex \n Ec',
83
+ 'onomy \n Companies \n Markets \n Tech \n \n Health\n \n \n \n \n Health home \n Diet \n ',
84
+ ' Fitness \n Conditions \n Wellbeing \n Parenting \n Guides \n Tools \n \n Opinion\n \n \n ',
85
+ ' \n \n Opinion home \n Obituaries \n Letters to the Editor \n Telegraph View \n Our columnists \n Cartoons \n \n ',
86
+ ' General election \n Ukraine\n \n \n \n \n Ukraine home \n Daily podcast \n Daily newsletter \n \n ',
87
+ ' Royals\n \n \n \n \n Royals home \n King Charles III \n Queen Camilla \n Prince William \n Prince',
88
+ 'ss of Wales \n Prince Harry \n Duchess of Sussex \n \n Life & Style\n \n \n \n \n Life & Style home \n ',
89
+ ' Family \n Columnists \n Cookbook \n Food & Drink \n Fashion \n Beauty \n Luxury \n Cars \n Gardening \n ',
90
+ ' Interiors \n Puzzle News \n Recommended \n Tel Mag \n \n Culture\n \n \n \n \n Culture hom',
91
+ 'e \n TV \n Film \n Music \n Books \n Theatre \n Comedy \n Dance \n Opera \n Art \n \n ',
92
+ ' Telegraph Tickets \n \n Puzzles \n \n \n \n \n \n \n \n UK Edition \n \n \n \n ',
93
+ ' \n US Edition \n \n \n \n \n \n \n \n \n Subscribe now Free for one month \n \n \n ',
94
+ ' \n \n \n \n \n \n \n \n Log in\n \n Login icon\n \n \n \n \n ',
95
+ ' \n Follow us on:\n \n \n \n Facebook icon\n \n \n \n Instagram icon\n \n \n ',
96
+ ' \n X icon\n \n \n \n Snapchat icon\n \n \n \n LinkedIn icon\n \n \n \n ',
97
+ ' YouTube icon \n \n \n \n \n \n \n More from The Telegraph\n \n \n Download our app \n Newsletters \n ',
98
+ ' Telegraph Extra \n Recommended \n Financial Solutions \n Events \n Betting \n Dating \n Offers \n Travel offers \n Shop \n ',
99
+ ' Garden shop \n Bookshop \n Tickets \n Puzzles \n Fantasy Football \n Work at The Telegraph \n Telegraph Corporate \n Help and suppo',
100
+ 'rt \n The Chelsea Magazine Company \n Broadband and Mobile Deals \n Voucher codes \n \n See top shops\n \n \n \n ',
101
+ ' \n Samsung \n Nike \n ASOS \n eBay \n Currys \n Wayfair \n TUI \n JD Sports \n Travelodg',
102
+ 'e \n Adidas \n Broadband deals \n Cheap broadband \n Broadband in my area \n Broadband and TV deals \n Mobile deals \n ',
103
+ " SIM-only deals \n \n \n \n \n \n \n \n \n \n \n\t\t(function () {\n\t\t\tdocument.querySelectorAll('.site-header__buttons .e-site-header-button__link').forE",
104
+ 'ach(link => {\n\t\t\t\tlink.addEventListener(\'click\', (e) => {\n\t\t\t\t\teVar94 = "header-search-icon-desktop";\n\t\t\t\t\teVar95 = link.textContent.trim();\n\t\t\t\t\teVar96 = "img";\n\t\t\t\t\teVar97 = document.title;\n\t\t\t\t\ttmg',
105
+ 'ComponentString = eVar94+";"+eVar95+"_"+eVar96+";"+eVar97;\n\t\t\t\t\tlocalStorage.setItem("tmgComponentTracking", tmgComponentString);\n\t\t\t\t});\n\t\t\t});\n\t\t})();\n\t\n \n\t\n\t\t\n\t\t\t\n\t\t Jump to navigation\n \n \n \n',
106
+ " \n \n \n \n \n \n Hitch Hiker's Guide author Douglas Adams dies aged 49\n \n \n \n \n By Andrew Alderson and Daniel Foggo 13 May 2001 • 12:00am \n \n \n ",
107
+ "\n \n \n \n DOUGLAS ADAMS, the thought-provoking author who inspired a generation with his cult science-fiction novel, The Hitch Hiker's Guide to the Galaxy, has died at the age of 4",
108
+ '9 from a heart attack while working out at the gym.\n \n \n \n \n \n \n \n \n \n \n Douglas Adams: inspired a generation with t',
109
+ 'he cult novel, A Hitch Hiker\'s Guide to the Galaxy\n \n \n \n \n \n \n \n Adams\'s age was seven more than his cryptic answer of "42" to the intriguing ques',
110
+ 'tion the comic novel had posed: what is the answer to life, the universe and everything? His book has sold more than 14 million copies worldwide, but Adams became a household name in Britain after it ',
111
+ 'was turned into a BBC television series in the early 1980s.\n \n \n \n \n Adams, 6ft 5in tall and well built, did not have a history of heart problems. However, say friends, he',
112
+ ' had visited the doctor just days ago complaining of a numbness in his arm. He collapsed on Friday while exercising at a gym in Santa Barbara on the west coast of America and never regained consciousn',
113
+ 'ess. He leaves a widow and a six-year-old daughter.\n \n \n \n \n Adams was British but moved with his family to California in 1999, to be involved in a Disney film version of ',
114
+ 'his book: he had previously lived in Islington, north London, for 22 years. A complex man, he was transported from obscurity to fame in 1979 by the instant success of his novel, which became hugely po',
115
+ 'pular with students.\n \n \n \n \n Soon after the book was published, he was invited to sign copies at a small Soho bookshop. On his way there, Adams became convinced he was be',
116
+ 'en caught up in a demonstration, only to discover the crowds were waiting for him.\n \n \n \n \n The book shot to the number one spot in the best-seller list the next day. He s',
117
+ 'aid: "It was like being helicoptered to the top of Mount Everest, or having an orgasm without the foreplay." Adams, however, later suffered from writer\'s block and was so notoriously bad at meeting de',
118
+ "adlines that Sue Freestone, his former publisher, was even known to move into his house to bully him into writing.\n \n \n \n \n Ed Victor, Adams's literary agent for 20 years ",
119
+ 'and a close friend, was devastated by the news yesterday. He said: "I feel as if someone has torn a limb off me. Tragic is an overused word, but this really is a tragic loss.\n \n \n \n ',
120
+ ' \n Mr Victor said: "He was one of the truly original writers and thinkers of our generation who should have had many years ahead of him. He was not only entertaining, but also stimulating an',
121
+ 'd provoking: he was a unique thinker with a huge audience."\n \n \n \n \n Mr Victor said that writer\'s block had been a terrible problem for Adams, who hated spending time alon',
122
+ 'e. He said: "He was once locked in a hotel suite at the Berkeley for two weeks by Sonny Mehta [his former publisher]. When I asked Douglas how it had worked, he said: \'It was simple. I sat at the desk',
123
+ ' and typed and Sonny sat in an armchair and glowered.\' "\n \n \n \n \n Adams was said to have used The Hitch Hiker\'s Guide, which started off as a radio show in the 1970s, to p',
124
+ 'oke fun at those who seek solutions to unanswerable questions. It was intended to highlight the absurdity of attempting to do so.\n \n \n \n \n The novel has since been turned ',
125
+ 'into a play and a computer game, and has spawned four sequels. Adams also set up a website called h2g2, an entertainment guide now run by the BBC, as a spin-off from his book.\n \n \n \n ',
126
+ ' \n In his novel, which deals with the voyages of a suburban earthling, Arthur Dent, Adams describes a race of hyper-intelligent beings, who had reached a point where they were determined to',
127
+ ' understand the purpose of the universe and their own existence.\n \n \n \n \n They built a supercomputer, Deep Thought, and asked it for the answer to the ultimate question of',
128
+ ' life, the universe and everything. The computer worked for several millennia on the answer. Finally, the beings were shocked and disappointed with the computer\'s ridiculous response: "42".\n \n ',
129
+ ' \n \n \n In the book, the Earth is referred to as "mostly harmless", which became a buzz phrase of the 1980s. Adams was born in Cambridge in 1952 and educated at Brentwood School, E',
130
+ "ssex, before returning to Cambridge to study at St John's College.\n \n \n \n \n His early career included work as a radio and television writer and producer. Some of his early",
131
+ " writing was with his friend Graham Chapman, a member of the Monty Python's Flying Circus comedy team.\n \n \n \n \n He later collaborated with Terry Jones, another Python team",
132
+ ' member. Jones was in tears after learning of his friend\'s death yesterday. He told the Telegraph: "Douglas was a total original: he had a beautiful way of thinking and an incisive mind that went stra',
133
+ 'ight to the heart of matters. He had a genius for putting those concepts into words. His books were great works of literature. He was a lovely man, and I loved him."\n \n \n \n \n ',
134
+ ' Senior staff at the BBC, who worked with Adams, were equally sad. Alan Yentob, the corporation\'s director of drama and entertainment, said: "Douglas was a big character who will be hugely missed b',
135
+ 'y a host of friends and millions of fans around the world."\n \n \n \n \n Geoffrey Perkins, the BBC\'s head of comedy and who produced the original radio series of the novel, sa',
136
+ 'id: "I\'ve known Douglas for 25 years. He was absolutely one of the most creative geniuses to ever work in radio comedy."\n \n \n \n \n Adams\'s life was transformed by the publi',
137
+ "cation of The Hitch Hiker's Guide providing him with a wealth he had never imagined. He married Jane Belson, a barrister, in 1991 and they had a daughter, Polly, in 1994.\n \n \n \n ",
138
+ "\n Adams's other bestselling titles include The Restaurant at the End of the Universe; Life, the Universe and Everything and So Long, and Thanks for All the Fish. He was in discussion to turn an",
139
+ "other of his books, Dirk Gently's Holistic Detective Agency, into a film and was working on another novel, which was 12 years late.\n \n \n \n \n \n \n \n \n \n ",
140
+ ' \n Twitter Icon\n \n \n \n Facebook Icon\n \n \n \n WhatsApp Icon\n \n \n \n Email Icon\n ',
141
+ ' \n \n \n \n \n Comment speech bubble\n \n \n \n \n \n \n \n \n \n Advertisement\n \n \n \n\n\tMore stories\n\n\n',
142
+ '\n\n\n\n\n\n\n\t\n\n\n\n\n\n\t\n\t\n\n\n\n\n\n \n \n \n \n \n \n \n \n \n Twitter Icon\n \n \n \n Facebook Icon\n \n \n \n Whats',
143
+ 'App Icon\n \n \n \n Email Icon\n \n \n \n \n \n Comment speech bubble\n \n \n \n \n \n \n \n \n \n \n\n\tMore from The T',
144
+ 'elegraph\n\n\n\n\n\n\n\n\n\n\t\n\n\n\n\n\n\t\n\t\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\n\t\n\n\t\n\n\t\n\n\t\t\n\t\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\t\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\n\n\t\t\t\t\t\n\t\t\t\t\t\tMore stories\n\t\t\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\n\t\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t',
145
+ '\n\n\t\t\n\n\t\n\n\n\n\t\t\n\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n',
146
+ '\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tProtesters charged after blocking coach bound ',
147
+ 'for Bibby Stockholm \n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n',
148
+ '\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tTelegraph Reporters\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 May 2024, 1:53am\n\t\t\t\t\t\t\t\n',
149
+ '\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t',
150
+ '\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t',
151
+ '\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t',
152
+ '\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tCanada police lay charges over murder of Sikh leader and probe Indian ties\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t',
153
+ '\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tOur F',
154
+ 'oreign Staff\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 May 2024, 1:12am\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t',
155
+ '\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t',
156
+ '\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n',
157
+ '\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n',
158
+ '\t\t\t\t\n\n\t\t\t\tKing takes on hundreds of new patronages\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t',
159
+ '\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tVictoria Ward\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 M',
160
+ 'ay 2024, 12:01am\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n',
161
+ '\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t',
162
+ '\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t',
163
+ '\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tLabour’s strategy ‘won’t last’ into a general election, says Cabinet minister\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n',
164
+ '\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t',
165
+ '\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tJack Maidment\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 11:01pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t',
166
+ '\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n',
167
+ '\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t',
168
+ '\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t',
169
+ '\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tLuton waste chance to start great escape in draw with Everton\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t',
170
+ '\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tWill Conroy\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n',
171
+ '\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 10:53pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n',
172
+ '\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t',
173
+ '\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n',
174
+ '\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tSeven things you may have missed in the local elections\n\n\t\t\t',
175
+ '\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t',
176
+ '\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tDominic Penna\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 10:37pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n',
177
+ '\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n',
178
+ '\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t',
179
+ '\n\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\n\n\n\n \n \n \n \n \n \n \n \n \n The Telegraph\n \n \n \n Back to top\n \n \n \n \n \n Follow us on:\n \n \n ',
180
+ ' \n Facebook icon\n \n \n \n Instagram icon\n \n \n \n X icon\n \n \n \n Snapchat icon\n \n \n',
181
+ ' \n LinkedIn icon\n \n \n \n YouTube icon \n \n \n \n \n \n \n \n \n \n Help Centre\n About us\n Telegraph Extra\n ',
182
+ ' Reader Prints\n Branded Content\n Syndication and Commissioning\n Fantasy Sport\n UK Voucher Codes\n Betting Offers\n Tax Strategy\n Broadband and Mobile Deals\n',
183
+ ' The Chelsea Magazine Company\n Newsletters\n Download the Telegraph App\n Privacy\n Terms & Conditions\n Modern Slavery\n Advertising terms\n Guidelines\n ',
184
+ " \n \n © Telegraph Media Group Limited 2024\n \n \n \n \n \n \n\twindow.addEventListener( 'DOMContentLoaded', function() {\n\t\t_satellite.pageBottom();\n\t});\n\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\n\t \n\t\t\n\t\t\t\n\t",
185
+ "\t\t\t\n\t\t\t\n\t\n window.RUM_BASE = '/';\nimport { sampleRUM } from '/.rum/@adobe/helix-rum-js@^1/src/index.js';\nsampleRUM('lazy');\nsampleRUM('cwv');\n\n "]"""
186
+ instruct = "Find relevant sentences from text_dump with given the target sentence"
187
+ question = f"target sentence:'Adam douglas was born in Cambrige', text_dump:{sentences}"
188
+ answer = llmQuestion(tokenizer, model, instruct, question, 8192, 8192)
llm_load.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import login
2
+ from unsloth import FastLanguageModel
3
+ import torch
4
+ from transformers import TextStreamer
5
+
6
def llmLoad(max_seq_length, model_name="unsloth/llama-3-8b-Instruct-bnb-4bit",
            api_key_path="API_key.txt"):
    """Log in to the Hugging Face Hub and load a 4-bit quantized language model.

    Args:
        max_seq_length: Maximum sequence length forwarded to
            ``FastLanguageModel.from_pretrained``.
        model_name: Hub id of the model to load. Defaults to the model this
            function previously hard-coded.
        api_key_path: Path of the text file holding the Hub access token.
            Surrounding whitespace in the file is stripped.

    Returns:
        A ``(tokenizer, model)`` tuple. Note the order: it is the reverse of
        the ``(model, tokenizer)`` pair returned by ``from_pretrained``.

    Raises:
        OSError: If the API key file cannot be opened.
    """
    with open(api_key_path, 'r', encoding='utf-8') as file:
        token = file.read().strip()
    login(token=token)

    dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

    # Other 4bit pre-quantized checkpoints that can be passed as `model_name`
    # (more models at https://huggingface.co/unsloth):
    #   unsloth/mistral-7b-bnb-4bit
    #   unsloth/mistral-7b-instruct-v0.2-bnb-4bit
    #   unsloth/llama-2-7b-bnb-4bit
    #   unsloth/gemma-7b-bnb-4bit
    #   unsloth/gemma-7b-it-bnb-4bit   (Instruct version of Gemma 7b)
    #   unsloth/gemma-2b-bnb-4bit
    #   unsloth/gemma-2b-it-bnb-4bit   (Instruct version of Gemma 2b)
    #   unsloth/llama-3-8b-bnb-4bit
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name,
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    return tokenizer, model
33
+
34
def llmQuestion(tokenizer, model, instruct, question, output_size):
    """Run one Alpaca-style prompt through the model and return its response text.

    Args:
        tokenizer: Tokenizer returned by ``llmLoad``.
        model: Model returned by ``llmLoad``.
        instruct: Text placed in the ``### Instruction:`` slot of the prompt.
        question: Text placed in the ``### Input:`` slot of the prompt.
        output_size: Upper bound on generated tokens (``max_new_tokens``).

    Returns:
        The decoded text that follows the prompt's ``### Response:`` marker,
        up to the next occurrence of that marker (if the model emits one).
        Special tokens are not stripped from the decoded text.
    """
    response_marker = '### Response:'
    # NOTE(review): the scraped source lost its indentation, so the template is
    # reproduced here without leading whitespace on any line -- confirm against
    # the original file.
    alpaca_prompt = (
        "Below is an instruction that describes a task, paired with an input "
        "that provides further context. Write a response that appropriately "
        "completes the request.\n"
        "\n"
        "### Instruction:\n"
        "{}\n"
        "\n"
        "### Input:\n"
        "{}\n"
        "\n"
        "### Response:\n"
        "{}"
    )
    prompt = alpaca_prompt.format(
        instruct,  # instruction
        question,  # input
        "",  # output - leave this blank for generation!
    )

    # One call is enough; the original invoked this twice back to back.
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
    inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=output_size, use_cache=True)
    decoded = tokenizer.batch_decode(outputs)[0]

    # The template's marker is the last one in the prompt, but `instruct` or
    # `question` may contain the marker text too. Index by the number of
    # markers in the prompt so the segment returned is the one after the
    # template's marker (index 1 when the inputs contain none, as before).
    marker_count = prompt.count(response_marker)
    output_text = decoded.split(response_marker)[marker_count]

    return output_text
63
+
64
+ if __name__ == "__main__":
65
+ tokenizer, model = llmLoad(8192)
66
+ sentences = """['\n \n \n\t\t\t\n\t\t\t\n\t\t \n \n \n \n \n \n \n \n \n \n UK News Website of the Year\n \n \n The Telegraph logo\n \n \n \n \n ',
67
+ '\n \n \n \n \n Search Icon\n \n \n \n News \n Sport \n Money \n Travel \n ',
68
+ 'Business \n Health \n Opinion \n General election \n Ukraine \n Royals \n Life & Style \n Culture \n ',
69
+ " Puzzles \n \n \n\t\t(function () {\n\t\t\tdocument.querySelectorAll('.site-header__navigation .e-site-header-button__link').forEach(link => {\n\t\t\t\tlink.addEventListener('click', (e) => {\n",
70
+ '\t\t\t\t\teVar94 = "header-search-icon-mobile";\n\t\t\t\t\teVar95 = link.textContent.trim();\n\t\t\t\t\teVar96 = "img";\n\t\t\t\t\teVar97 = document.title;\n\t\t\t\t\ttmgComponentString = eVar94+";"+eVar95+"_"+eVar96+";"+eVar97;\n',
71
+ '\t\t\t\t\tlocalStorage.setItem("tmgComponentTracking", tmgComponentString);\n\t\t\t\t});\n\t\t\t});\n\t\t})();\n\t\n \n \n \n \n \n \n UK Edition \n \n \n ',
72
+ ' \n \n \n US Edition \n \n \n \n \n \n \n \n Search Icon\n \n \n \n Subscribe now Free for one month',
73
+ ' \n \n \n \n \n \n \n \n\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t\tLog in\n\t\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\n \n \n \n \n \n \n \n \n \n \n ',
74
+ ' \n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n Sections\n \n ',
75
+ ' \n \n UK Edition \n \n \n \n \n \n US Edition \n \n \n \n \n \n \n',
76
+ ' News\n \n \n \n \n News home \n UK news \n Politics \n World \n Health news \n Defe',
77
+ 'nce \n Science \n Education \n Environment \n Investigations \n Global Health Security \n \n Sport\n \n \n ',
78
+ " \n \n Sport home \n Football \n Rugby Union \n Cricket \n F1 \n Golf \n Tennis \n Women's Sp",
79
+ 'ort \n Racing \n Cycling \n Boxing \n More... \n \n Money\n \n \n \n \n Money home \n ',
80
+ ' Property \n Tax \n Pensions \n Banking \n Investing \n Net Zero \n Calculators \n Guides \n \n ',
81
+ 'Travel\n \n \n \n \n Travel home \n Europe \n UK \n Worldwide \n City breaks \n Hotels \n ',
82
+ ' Cruise \n Ski \n Advice \n \n Business\n \n \n \n \n Business home \n Alex \n Ec',
83
+ 'onomy \n Companies \n Markets \n Tech \n \n Health\n \n \n \n \n Health home \n Diet \n ',
84
+ ' Fitness \n Conditions \n Wellbeing \n Parenting \n Guides \n Tools \n \n Opinion\n \n \n ',
85
+ ' \n \n Opinion home \n Obituaries \n Letters to the Editor \n Telegraph View \n Our columnists \n Cartoons \n \n ',
86
+ ' General election \n Ukraine\n \n \n \n \n Ukraine home \n Daily podcast \n Daily newsletter \n \n ',
87
+ ' Royals\n \n \n \n \n Royals home \n King Charles III \n Queen Camilla \n Prince William \n Prince',
88
+ 'ss of Wales \n Prince Harry \n Duchess of Sussex \n \n Life & Style\n \n \n \n \n Life & Style home \n ',
89
+ ' Family \n Columnists \n Cookbook \n Food & Drink \n Fashion \n Beauty \n Luxury \n Cars \n Gardening \n ',
90
+ ' Interiors \n Puzzle News \n Recommended \n Tel Mag \n \n Culture\n \n \n \n \n Culture hom',
91
+ 'e \n TV \n Film \n Music \n Books \n Theatre \n Comedy \n Dance \n Opera \n Art \n \n ',
92
+ ' Telegraph Tickets \n \n Puzzles \n \n \n \n \n \n \n \n UK Edition \n \n \n \n ',
93
+ ' \n US Edition \n \n \n \n \n \n \n \n \n Subscribe now Free for one month \n \n \n ',
94
+ ' \n \n \n \n \n \n \n \n Log in\n \n Login icon\n \n \n \n \n ',
95
+ ' \n Follow us on:\n \n \n \n Facebook icon\n \n \n \n Instagram icon\n \n \n ',
96
+ ' \n X icon\n \n \n \n Snapchat icon\n \n \n \n LinkedIn icon\n \n \n \n ',
97
+ ' YouTube icon \n \n \n \n \n \n \n More from The Telegraph\n \n \n Download our app \n Newsletters \n ',
98
+ ' Telegraph Extra \n Recommended \n Financial Solutions \n Events \n Betting \n Dating \n Offers \n Travel offers \n Shop \n ',
99
+ ' Garden shop \n Bookshop \n Tickets \n Puzzles \n Fantasy Football \n Work at The Telegraph \n Telegraph Corporate \n Help and suppo',
100
+ 'rt \n The Chelsea Magazine Company \n Broadband and Mobile Deals \n Voucher codes \n \n See top shops\n \n \n \n ',
101
+ ' \n Samsung \n Nike \n ASOS \n eBay \n Currys \n Wayfair \n TUI \n JD Sports \n Travelodg',
102
+ 'e \n Adidas \n Broadband deals \n Cheap broadband \n Broadband in my area \n Broadband and TV deals \n Mobile deals \n ',
103
+ " SIM-only deals \n \n \n \n \n \n \n \n \n \n \n\t\t(function () {\n\t\t\tdocument.querySelectorAll('.site-header__buttons .e-site-header-button__link').forE",
104
+ 'ach(link => {\n\t\t\t\tlink.addEventListener(\'click\', (e) => {\n\t\t\t\t\teVar94 = "header-search-icon-desktop";\n\t\t\t\t\teVar95 = link.textContent.trim();\n\t\t\t\t\teVar96 = "img";\n\t\t\t\t\teVar97 = document.title;\n\t\t\t\t\ttmg',
105
+ 'ComponentString = eVar94+";"+eVar95+"_"+eVar96+";"+eVar97;\n\t\t\t\t\tlocalStorage.setItem("tmgComponentTracking", tmgComponentString);\n\t\t\t\t});\n\t\t\t});\n\t\t})();\n\t\n \n\t\n\t\t\n\t\t\t\n\t\t Jump to navigation\n \n \n \n',
106
+ " \n \n \n \n \n \n Hitch Hiker's Guide author Douglas Adams dies aged 49\n \n \n \n \n By Andrew Alderson and Daniel Foggo 13 May 2001 • 12:00am \n \n \n ",
107
+ "\n \n \n \n DOUGLAS ADAMS, the thought-provoking author who inspired a generation with his cult science-fiction novel, The Hitch Hiker's Guide to the Galaxy, has died at the age of 4",
108
+ '9 from a heart attack while working out at the gym.\n \n \n \n \n \n \n \n \n \n \n Douglas Adams: inspired a generation with t',
109
+ 'he cult novel, A Hitch Hiker\'s Guide to the Galaxy\n \n \n \n \n \n \n \n Adams\'s age was seven more than his cryptic answer of "42" to the intriguing ques',
110
+ 'tion the comic novel had posed: what is the answer to life, the universe and everything? His book has sold more than 14 million copies worldwide, but Adams became a household name in Britain after it ',
111
+ 'was turned into a BBC television series in the early 1980s.\n \n \n \n \n Adams, 6ft 5in tall and well built, did not have a history of heart problems. However, say friends, he',
112
+ ' had visited the doctor just days ago complaining of a numbness in his arm. He collapsed on Friday while exercising at a gym in Santa Barbara on the west coast of America and never regained consciousn',
113
+ 'ess. He leaves a widow and a six-year-old daughter.\n \n \n \n \n Adams was British but moved with his family to California in 1999, to be involved in a Disney film version of ',
114
+ 'his book: he had previously lived in Islington, north London, for 22 years. A complex man, he was transported from obscurity to fame in 1979 by the instant success of his novel, which became hugely po',
115
+ 'pular with students.\n \n \n \n \n Soon after the book was published, he was invited to sign copies at a small Soho bookshop. On his way there, Adams became convinced he was be',
116
+ 'en caught up in a demonstration, only to discover the crowds were waiting for him.\n \n \n \n \n The book shot to the number one spot in the best-seller list the next day. He s',
117
+ 'aid: "It was like being helicoptered to the top of Mount Everest, or having an orgasm without the foreplay." Adams, however, later suffered from writer\'s block and was so notoriously bad at meeting de',
118
+ "adlines that Sue Freestone, his former publisher, was even known to move into his house to bully him into writing.\n \n \n \n \n Ed Victor, Adams's literary agent for 20 years ",
119
+ 'and a close friend, was devastated by the news yesterday. He said: "I feel as if someone has torn a limb off me. Tragic is an overused word, but this really is a tragic loss.\n \n \n \n ',
120
+ ' \n Mr Victor said: "He was one of the truly original writers and thinkers of our generation who should have had many years ahead of him. He was not only entertaining, but also stimulating an',
121
+ 'd provoking: he was a unique thinker with a huge audience."\n \n \n \n \n Mr Victor said that writer\'s block had been a terrible problem for Adams, who hated spending time alon',
122
+ 'e. He said: "He was once locked in a hotel suite at the Berkeley for two weeks by Sonny Mehta [his former publisher]. When I asked Douglas how it had worked, he said: \'It was simple. I sat at the desk',
123
+ ' and typed and Sonny sat in an armchair and glowered.\' "\n \n \n \n \n Adams was said to have used The Hitch Hiker\'s Guide, which started off as a radio show in the 1970s, to p',
124
+ 'oke fun at those who seek solutions to unanswerable questions. It was intended to highlight the absurdity of attempting to do so.\n \n \n \n \n The novel has since been turned ',
125
+ 'into a play and a computer game, and has spawned four sequels. Adams also set up a website called h2g2, an entertainment guide now run by the BBC, as a spin-off from his book.\n \n \n \n ',
126
+ ' \n In his novel, which deals with the voyages of a suburban earthling, Arthur Dent, Adams describes a race of hyper-intelligent beings, who had reached a point where they were determined to',
127
+ ' understand the purpose of the universe and their own existence.\n \n \n \n \n They built a supercomputer, Deep Thought, and asked it for the answer to the ultimate question of',
128
+ ' life, the universe and everything. The computer worked for several millennia on the answer. Finally, the beings were shocked and disappointed with the computer\'s ridiculous response: "42".\n \n ',
129
+ ' \n \n \n In the book, the Earth is referred to as "mostly harmless", which became a buzz phrase of the 1980s. Adams was born in Cambridge in 1952 and educated at Brentwood School, E',
130
+ "ssex, before returning to Cambridge to study at St John's College.\n \n \n \n \n His early career included work as a radio and television writer and producer. Some of his early",
131
+ " writing was with his friend Graham Chapman, a member of the Monty Python's Flying Circus comedy team.\n \n \n \n \n He later collaborated with Terry Jones, another Python team",
132
+ ' member. Jones was in tears after learning of his friend\'s death yesterday. He told the Telegraph: "Douglas was a total original: he had a beautiful way of thinking and an incisive mind that went stra',
133
+ 'ight to the heart of matters. He had a genius for putting those concepts into words. His books were great works of literature. He was a lovely man, and I loved him."\n \n \n \n \n ',
134
+ ' Senior staff at the BBC, who worked with Adams, were equally sad. Alan Yentob, the corporation\'s director of drama and entertainment, said: "Douglas was a big character who will be hugely missed b',
135
+ 'y a host of friends and millions of fans around the world."\n \n \n \n \n Geoffrey Perkins, the BBC\'s head of comedy and who produced the original radio series of the novel, sa',
136
+ 'id: "I\'ve known Douglas for 25 years. He was absolutely one of the most creative geniuses to ever work in radio comedy."\n \n \n \n \n Adams\'s life was transformed by the publi',
137
+ "cation of The Hitch Hiker's Guide providing him with a wealth he had never imagined. He married Jane Belson, a barrister, in 1991 and they had a daughter, Polly, in 1994.\n \n \n \n ",
138
+ "\n Adams's other bestselling titles include The Restaurant at the End of the Universe; Life, the Universe and Everything and So Long, and Thanks for All the Fish. He was in discussion to turn an",
139
+ "other of his books, Dirk Gently's Holistic Detective Agency, into a film and was working on another novel, which was 12 years late.\n \n \n \n \n \n \n \n \n \n ",
140
+ ' \n Twitter Icon\n \n \n \n Facebook Icon\n \n \n \n WhatsApp Icon\n \n \n \n Email Icon\n ',
141
+ ' \n \n \n \n \n Comment speech bubble\n \n \n \n \n \n \n \n \n \n Advertisement\n \n \n \n\n\tMore stories\n\n\n',
142
+ '\n\n\n\n\n\n\n\t\n\n\n\n\n\n\t\n\t\n\n\n\n\n\n \n \n \n \n \n \n \n \n \n Twitter Icon\n \n \n \n Facebook Icon\n \n \n \n Whats',
143
+ 'App Icon\n \n \n \n Email Icon\n \n \n \n \n \n Comment speech bubble\n \n \n \n \n \n \n \n \n \n \n\n\tMore from The T',
144
+ 'elegraph\n\n\n\n\n\n\n\n\n\n\t\n\n\n\n\n\n\t\n\t\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\t\n\t\n\n\t\n\n\t\n\n\t\t\n\t\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\t\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\n\n\t\t\t\t\t\n\t\t\t\t\t\tMore stories\n\t\t\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\n\t\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t',
145
+ '\n\n\t\t\n\n\t\n\n\n\n\t\t\n\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n',
146
+ '\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tProtesters charged after blocking coach bound ',
147
+ 'for Bibby Stockholm \n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n',
148
+ '\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tTelegraph Reporters\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 May 2024, 1:53am\n\t\t\t\t\t\t\t\n',
149
+ '\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t',
150
+ '\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t',
151
+ '\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t',
152
+ '\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tCanada police lay charges over murder of Sikh leader and probe Indian ties\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t',
153
+ '\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tOur F',
154
+ 'oreign Staff\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 May 2024, 1:12am\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t',
155
+ '\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t',
156
+ '\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n',
157
+ '\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n',
158
+ '\t\t\t\t\n\n\t\t\t\tKing takes on hundreds of new patronages\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t',
159
+ '\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tVictoria Ward\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t4 M',
160
+ 'ay 2024, 12:01am\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n',
161
+ '\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t',
162
+ '\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t',
163
+ '\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tLabour’s strategy ‘won’t last’ into a general election, says Cabinet minister\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n',
164
+ '\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t',
165
+ '\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tJack Maidment\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 11:01pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t',
166
+ '\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n',
167
+ '\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t',
168
+ '\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t',
169
+ '\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tLuton waste chance to start great escape in draw with Everton\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t',
170
+ '\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tWill Conroy\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n',
171
+ '\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 10:53pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n',
172
+ '\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t',
173
+ '\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\t\n\t\n\t\n\t\n\t\n\n\t\n\n\t\n\t\n\t\n\n\t\n\t\n\n\t\n\t\n\n\t\n\n\t\n\n\t\n\n\t\n\t\n\n\t\n\t\n\t\n',
174
+ '\n\t\n\n\t\n\n\t\n\t\t\n\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\n\t\t\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\tSeven things you may have missed in the local elections\n\n\t\t\t',
175
+ '\t\n\t\t\t\t\n\t\t\t\n\t\t\n\t\n\n\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\n\t\n\t\n\n\t\t',
176
+ '\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\t\t\n\n\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\tDominic Penna\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\n\t\t\t\t\t\t\t\t3 May 2024, 10:37pm\n\t\t\t\t\t\t\t\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n',
177
+ '\n\t\t\t\n\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\n\n\t\n\t\t\n\n\t\t\t\n\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\n\t\t\t\t\n\t\n\t\t\n',
178
+ '\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\t\n\t\t\n\t\t\n\t\n\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\t\n\t\t\t\n\n\t\t\n\t\n\n\n\n\t\t\n\t\n\n\n\t\t\n\t\n\n\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\n\n\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\n\n\t\t\t\t\n\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t\t\t\n\t\t\t\t\t\n\t\t\t\t',
179
+ '\n\n\t\t\t\n\n\t\t\n\n\t\n\n\n\n\n\n\n\n \n \n \n \n \n \n \n \n \n The Telegraph\n \n \n \n Back to top\n \n \n \n \n \n Follow us on:\n \n \n ',
180
+ ' \n Facebook icon\n \n \n \n Instagram icon\n \n \n \n X icon\n \n \n \n Snapchat icon\n \n \n',
181
+ ' \n LinkedIn icon\n \n \n \n YouTube icon \n \n \n \n \n \n \n \n \n \n Help Centre\n About us\n Telegraph Extra\n ',
182
+ ' Reader Prints\n Branded Content\n Syndication and Commissioning\n Fantasy Sport\n UK Voucher Codes\n Betting Offers\n Tax Strategy\n Broadband and Mobile Deals\n',
183
+ ' The Chelsea Magazine Company\n Newsletters\n Download the Telegraph App\n Privacy\n Terms & Conditions\n Modern Slavery\n Advertising terms\n Guidelines\n ',
184
+ " \n \n © Telegraph Media Group Limited 2024\n \n \n \n \n \n \n\twindow.addEventListener( 'DOMContentLoaded', function() {\n\t\t_satellite.pageBottom();\n\t});\n\n\t\t\n\t\t\t\n\t\t\t\t\n\t\t\t\n\t \n\t\t\n\t\t\t\n\t",
185
+ "\t\t\t\n\t\t\t\n\t\n window.RUM_BASE = '/';\nimport { sampleRUM } from '/.rum/@adobe/helix-rum-js@^1/src/index.js';\nsampleRUM('lazy');\nsampleRUM('cwv');\n\n "]"""
186
# Appears to be an ad-hoc trial run: the model is asked to pick, out of a text
# dump, the sentences relevant to one target sentence.
# `instruct` is the instruction prompt handed to llmQuestion below.
# NOTE(review): "with given the target sentence" reads awkwardly; left as-is
# because the wording is part of the prompt the model receives.
+ instruct = "Find relevant sentences from text_dump with given the target sentence"
187
# The question embeds a hard-coded target sentence plus `sentences`
# (presumably the scraped-text literal that ends just above -- confirm).
# NOTE(review): "Cambrige" looks like a misspelling of "Cambridge"; not changed
# here since it is runtime prompt text.
+ question = f"target sentence:'Adam douglas was born in Cambrige', text_dump:{sentences}"
188
# llmQuestion, tokenizer and model are defined outside this excerpt. The two
# 8192 arguments are presumably length/token limits -- TODO confirm against
# llmQuestion's signature.
+ answer = llmQuestion(tokenizer, model, instruct, question, 8192, 8192)