|
import numpy as np |
|
from tqdm import tqdm |
|
import pandas as pd |
|
import os, sqlite3, traceback, ast, requests, fasttext, re, time, string, spacy, pysbd |
|
from requests.exceptions import ReadTimeout, TooManyRedirects, ConnectionError, ConnectTimeout, InvalidSchema, InvalidURL |
|
from qwikidata.linked_data_interface import get_entity_dict_from_api |
|
from datetime import datetime |
|
import utils.wikidata_utils as wdutils |
|
from importlib import reload |
|
from urllib.parse import urlparse, unquote |
|
from urllib import parse |
|
from bs4 import BeautifulSoup |
|
from IPython.display import clear_output |
|
from os.path import exists |
|
from pathlib import Path |
|
from nltk.tokenize import sent_tokenize |
|
from sentence_splitter import SentenceSplitter, split_text_into_sentences |
|
import nltk |
|
nltk.download('punkt') |
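# Pipeline overview (as implemented below):
#   claimParser()        - fetch an entity from the Wikidata API and store its claims/references in SQLite
#   propertyFiltering()  - drop deprecated, ontological, and otherwise unverbalisable claims
#   urlParser()          - resolve reference URLs, including external IDs via P1630 formatter URLs
#   htmlParser()         - fetch the referenced web pages
#   claim2text() / html2text() - turn claims and fetched pages into text for downstream processing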
|
|
|
class DatabaseExtractor:
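    """Parses Wikidata entity dicts and stores claims, references, and their links in a local SQLite database."""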
|
def __init__(self, dbname='wikidata_claims_refs_parsed.db'): |
|
self.dbname = dbname |
|
self.prepare_extraction() |
|
|
|
def finish_extraction(self): |
|
self.db.commit() |
|
|
|
def prepare_extraction(self): |
|
self.db = sqlite3.connect(self.dbname) |
|
self.cursor = self.db.cursor() |
|
|
|
self.cursor.execute(''' |
|
CREATE TABLE IF NOT EXISTS claims( |
|
entity_id TEXT, |
|
claim_id TEXT, |
|
rank TEXT, |
|
property_id TEXT, |
|
datatype TEXT, |
|
datavalue TEXT, |
|
PRIMARY KEY ( |
|
claim_id |
|
) |
|
)''') |
|
|
|
self.cursor.execute(''' |
|
CREATE TABLE IF NOT EXISTS claims_refs( |
|
claim_id TEXT, |
|
reference_id TEXT, |
|
PRIMARY KEY ( |
|
claim_id, |
|
reference_id |
|
) |
|
)''') |
|
|
|
self.cursor.execute(''' |
|
CREATE TABLE IF NOT EXISTS refs( |
|
reference_id TEXT, |
|
reference_property_id TEXT, |
|
reference_index TEXT, |
|
reference_datatype TEXT, |
|
reference_value TEXT, |
|
PRIMARY KEY ( |
|
reference_id, |
|
reference_property_id, |
|
reference_index |
|
) |
|
)''') |
|
self.db.commit() |
|
|
|
def extract_claim(self, entity_id, claim): |
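        """Insert a single claim row; if the claim_id already exists with identical data, the conflict is ignored."""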
|
if claim['mainsnak']['snaktype'] == 'value': |
|
value = str(claim['mainsnak']['datavalue']) |
|
else: |
|
value = claim['mainsnak']['snaktype'] |
|
try: |
|
self.cursor.execute(''' |
|
INSERT INTO claims(entity_id, claim_id, rank, property_id, datatype, datavalue) |
|
VALUES($var,$var,$var,$var,$var,$var)'''.replace('$var','?'), ( |
|
entity_id,claim['id'],claim['rank'], |
|
claim['mainsnak']['property'],claim['mainsnak']['datatype'],value |
|
)) |
|
except UnicodeEncodeError: |
|
print(entity_id,claim['id'],claim['rank'], |
|
claim['mainsnak']['property'],claim['mainsnak']['datatype'],value) |
|
raise |
|
except sqlite3.IntegrityError as err: |
|
|
|
self.cursor.execute( |
|
'''SELECT * |
|
FROM claims |
|
WHERE claim_id=$var |
|
'''.replace('$var','?'), (claim['id'],) |
|
) |
|
conflicted_value = self.cursor.fetchone() |
|
if conflicted_value == (entity_id,claim['id'],claim['rank'], |
|
claim['mainsnak']['property'],claim['mainsnak']['datatype'],value): |
|
pass |
|
else: |
|
print(err, claim['id']) |
|
traceback.print_exc() |
|
raise err |
|
|
|
|
def extract_reference(self, ref): |
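        """Insert each snak of a reference as a row in refs, keyed by (reference_id, property, index)."""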
|
for snaks in ref['snaks'].values(): |
|
for i, snak in enumerate(snaks): |
|
if snak['snaktype'] == 'value': |
|
value = str(snak['datavalue']) |
|
else: |
|
value = snak['snaktype'] |
|
try: |
|
self.cursor.execute(''' |
|
INSERT INTO refs(reference_id, reference_property_id, reference_index, |
|
reference_datatype, reference_value) |
|
VALUES($var,$var,$var,$var,$var)'''.replace('$var','?'), ( |
|
ref['hash'],snak['property'],str(i),snak['datatype'],value |
|
)) |
|
except sqlite3.IntegrityError as err: |
|
|
|
self.cursor.execute( |
|
'''SELECT reference_id, reference_property_id, reference_datatype, reference_value |
|
FROM refs |
|
WHERE reference_id = $var |
|
AND reference_property_id = $var |
|
'''.replace('$var','?'), (ref['hash'],snak['property']) |
|
) |
|
conflicted_values = self.cursor.fetchall() |
|
if (ref['hash'],snak['property'],snak['datatype'],value) in conflicted_values: |
|
pass |
|
else: |
|
print(err, ref['hash'],snak['property'],i) |
|
print('trying to insert:',(ref['hash'],snak['property'],str(i),snak['datatype'],value)) |
|
traceback.print_exc() |
|
raise err |
|
|
|
|
def extract_claim_reference(self, claim, ref): |
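        """Link a claim to a reference hash in claims_refs; duplicate pairs are silently ignored."""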
|
|
try: |
|
self.cursor.execute(''' |
|
INSERT INTO claims_refs(claim_id, reference_id) |
|
VALUES($var,$var)'''.replace('$var','?'), ( |
|
claim['id'],ref['hash'] |
|
)) |
|
except sqlite3.IntegrityError as err: |
|
|
|
pass |
|
|
|
|
def extract_entity(self, e): |
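        """Store all claims of an entity dict, together with their references and claim-reference links."""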
|
        for property_claims in e['claims'].values():
            for claim in property_claims:
|
self.extract_claim(e['id'],claim) |
|
if 'references' in claim: |
|
for ref in claim['references']: |
|
self.extract_claim_reference(claim, ref) |
|
self.extract_reference(ref) |
|
|
|
def claimParser(QID): |
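    """Fetch the entity QID from the Wikidata API and store its claims and references in the local database."""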
|
entity_id = QID |
|
print('Setting up database ...') |
|
extractor = DatabaseExtractor() |
|
|
|
print('Fetching entity from API ...') |
|
entity = get_entity_dict_from_api(entity_id) |
|
|
|
if entity: |
|
print(f'Parsing entity: {entity_id}') |
|
extractor.extract_entity(entity) |
|
else: |
|
print(f'Failed to fetch entity: {entity_id}') |
|
|
|
extractor.finish_extraction() |
|
|
|
def propertyFiltering(QID): |
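    """Filter the stored claims of QID: drop deprecated ranks, unverbalisable datatypes, ontological
    properties, and somevalue/novalue objects, then write the filtered claims back to the claims table."""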
|
reload(wdutils) |
|
DB_PATH = 'wikidata_claims_refs_parsed.db' |
|
claims_columns = ['entity_id','claim_id','rank','property_id','datatype','datavalue'] |
|
|
|
properties_to_remove = { |
|
'general':[ |
|
'P31', |
|
'P279', |
|
'P373', |
|
'P910', |
|
'P7561', |
|
'P5008', |
|
'P2670', |
|
'P1740', |
|
'P1612', |
|
'P8989', |
|
'P2959', |
|
'P7867', |
|
'P935' , |
|
'P1472', |
|
'P8596', |
|
'P5105', |
|
'P8933', |
|
'P642', |
|
'P3876', |
|
'P1791', |
|
'P7084', |
|
'P1465', |
|
'P1687', |
|
'P6104', |
|
'P4195', |
|
'P1792', |
|
'P5869', |
|
'P1659', |
|
'P1464', |
|
'P2354', |
|
'P1424', |
|
'P7782', |
|
'P179', |
|
'P7888', |
|
'P6365', |
|
'P8464', |
|
'P360', |
|
'P805', |
|
'P8703', |
|
'P1456', |
|
'P1012', |
|
'P1151', |
|
'P2490', |
|
'P593', |
|
'P8744', |
|
'P2614', |
|
'P2184', |
|
'P9241', |
|
'P487', |
|
'P1754', |
|
'P2559', |
|
'P2517', |
|
'P971', |
|
'P6112', |
|
'P4224', |
|
'P301', |
|
'P1753', |
|
'P1423', |
|
'P1204', |
|
'P3921', |
|
'P1963', |
|
'P5125', |
|
'P3176', |
|
'P8952', |
|
'P2306', |
|
'P5193', |
|
'P5977', |
|
], |
|
'specific': {} |
|
} |
|
|
|
db = sqlite3.connect(DB_PATH) |
|
cursor = db.cursor() |
|
|
|
sql_query = "select count(*) from claims where property_id in $1;" |
|
sql_query = sql_query.replace('$1', '(' + ','.join([('"' + e + '"') for e in properties_to_remove['general']]) + ')') |
|
cursor.execute(sql_query) |
|
print('Removing the',len(properties_to_remove['general']),'properties deemed as ontological or unverbalisable') |
|
cursor = db.cursor() |
|
|
|
sql_query = "select * from claims where entity_id in $1;" |
|
sql_query = sql_query.replace('$1', '(' + ','.join([('"' + e + '"') for e in [QID]]) + ')') |
|
|
|
cursor.execute(sql_query) |
|
theme_df = pd.DataFrame(cursor.fetchall()) |
|
theme_df.columns = claims_columns |
|
|
|
original_theme_df_size = theme_df.shape[0] |
|
last_stage_theme_df_size = original_theme_df_size |
|
|
|
print('- Removing deprecated') |
|
|
|
|
|
theme_df = theme_df[theme_df['rank'] != 'deprecated'].reset_index(drop=True) |
|
print( |
|
' - Percentage of deprecated:', |
|
round((last_stage_theme_df_size-theme_df.shape[0])/original_theme_df_size*100, 2), '%' |
|
) |
|
last_stage_theme_df_size = theme_df.shape[0] |
|
|
|
print('- Removing bad datatypes') |
|
|
|
|
|
bad_datatypes = ['commonsMedia','external-id','globe-coordinate','url', 'wikibase-form', |
|
'geo-shape', 'math', 'musical-notation', 'tabular-data', 'wikibase-sense'] |
|
theme_df = theme_df[ |
|
theme_df['datatype'].apply( |
|
lambda x : x not in bad_datatypes |
|
) |
|
].reset_index(drop=True) |
|
print( |
|
' - Percentage of bad datatypes:', |
|
round((last_stage_theme_df_size-theme_df.shape[0])/original_theme_df_size*100, 2), '%' |
|
) |
|
last_stage_theme_df_size = theme_df.shape[0] |
|
|
|
print('- Removing bad properties') |
|
|
|
|
|
theme_df = theme_df[ |
|
theme_df['property_id'].apply( |
|
lambda x : (x not in properties_to_remove['general'])) |
|
|
|
].reset_index(drop=True) |
|
print( |
|
' - Percentage of ontology (non-domain) properties:', |
|
round((last_stage_theme_df_size-theme_df.shape[0])/original_theme_df_size*100, 2), '%' |
|
) |
|
last_stage_theme_df_size = theme_df.shape[0] |
|
|
|
print('- Removing somevalue/novalue') |
|
|
|
|
|
theme_df = theme_df[ |
|
theme_df['datavalue'].apply( |
|
lambda x : x not in ['somevalue', 'novalue'] |
|
) |
|
].reset_index(drop=True) |
|
print( |
|
' - Percentage of somevalue/novalue:', |
|
round((last_stage_theme_df_size-theme_df.shape[0])/original_theme_df_size*100, 2), '%' |
|
) |
|
last_stage_theme_df_size = theme_df.shape[0] |
|
|
|
    print(
        'After all removals, we keep',
        round(last_stage_theme_df_size/original_theme_df_size*100, 2),
        '% of the original claims'
    )
|
theme_df.to_sql('claims', db, if_exists='replace', index=False) |
|
|
|
return theme_df |
|
|
|
def get_object_label_given_datatype(row): |
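    """Return an (English label, language) pair for a claim's object, covering items, monolingual text,
    quantities, times, and strings. A fresh CachedWikidataAPI is created on every call."""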
|
Wd_API = wdutils.CachedWikidataAPI() |
|
Wd_API.languages = ['en'] |
|
def turn_to_century_or_millennium(y, mode): |
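        """Convert a year string into e.g. '20th century' (mode='C') or '2nd millennium' (mode='M')."""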
|
y = str(y) |
|
if mode == 'C': |
|
div = 100 |
|
group = int(y.rjust(3, '0')[:-2]) |
|
mode_name = 'century' |
|
elif mode == 'M': |
|
div = 1000 |
|
group = int(y.rjust(4, '0')[:-3]) |
|
            mode_name = 'millennium'
|
else: |
|
raise ValueError('Use mode = C for century and M for millennium') |
|
|
|
if int(y)%div != 0: |
|
group += 1 |
|
group = str(group) |
|
|
|
        if group.endswith(('11', '12', '13')):
            group_suffix = 'th'
        elif group[-1] == '1':
            group_suffix = 'st'
        elif group[-1] == '2':
            group_suffix = 'nd'
        elif group[-1] == '3':
            group_suffix = 'rd'
        else:
            group_suffix = 'th'
|
|
|
return ' '.join([group+group_suffix, mode_name]) |
|
|
|
dt = row['datatype'] |
|
dv = row['datavalue'] |
|
|
|
dt_types = ['wikibase-item', 'monolingualtext', 'quantity', 'time', 'string'] |
|
if dt not in dt_types: |
|
        raise ValueError(f'Unexpected object datatype: {dt}')
|
else: |
|
try: |
|
if dt == dt_types[0]: |
|
return Wd_API.get_label(ast.literal_eval(dv)['value']['id'], True) |
|
elif dt == dt_types[1]: |
|
dv = ast.literal_eval(dv) |
|
return (dv['value']['text'], dv['value']['language']) |
|
elif dt == dt_types[2]: |
|
dv = ast.literal_eval(dv) |
|
amount, unit = dv['value']['amount'], dv['value']['unit'] |
|
if amount[0] == '+': |
|
amount = amount[1:] |
|
if str(unit) == '1': |
|
return (str(amount), 'en') |
|
else: |
|
unit_entity_id = unit.split('/')[-1] |
|
unit = Wd_API.get_label(unit_entity_id, True) |
|
return (' '.join([amount, unit[0]]), unit[1]) |
|
elif dt == dt_types[3]: |
|
dv = ast.literal_eval(dv) |
|
time = dv['value']['time'] |
|
timezone = dv['value']['timezone'] |
|
precision = dv['value']['precision'] |
|
assert dv['value']['after'] == 0 and dv['value']['before'] == 0 |
|
|
|
sufix = 'BC' if time[0] == '-' else '' |
|
time = time[1:] |
|
|
|
if precision == 11: |
|
return (datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%d/%m/%Y') + sufix, 'en') |
|
elif precision == 10: |
|
try: |
|
return (datetime.strptime(time, '%Y-%m-00T00:00:%SZ').strftime("%B of %Y") + sufix, 'en') |
|
except ValueError: |
|
return (datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime("%B of %Y") + sufix, 'en') |
|
elif precision == 9: |
|
try: |
|
return (datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y') + sufix, 'en') |
|
except ValueError: |
|
return (datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%Y') + sufix, 'en') |
|
elif precision == 8: |
|
try: |
|
return (datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y')[:-1] +'0s' + sufix, 'en') |
|
except ValueError: |
|
return (datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%Y')[:-1] +'0s' + sufix, 'en') |
|
elif precision == 7: |
|
try: |
|
parsed_time = datetime.strptime(time, '%Y-00-00T00:00:%SZ') |
|
except ValueError: |
|
parsed_time = datetime.strptime(time, '%Y-%m-%dT00:00:%SZ') |
|
finally: |
|
return (turn_to_century_or_millennium( |
|
parsed_time.strftime('%Y'), mode='C' |
|
) + sufix, 'en') |
|
elif precision == 6: |
|
try: |
|
parsed_time = datetime.strptime(time, '%Y-00-00T00:00:%SZ') |
|
except ValueError: |
|
parsed_time = datetime.strptime(time, '%Y-%m-%dT00:00:%SZ') |
|
finally: |
|
return (turn_to_century_or_millennium( |
|
parsed_time.strftime('%Y'), mode='M' |
|
) + sufix, 'en') |
|
elif precision == 4: |
|
timeint = int(datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y')) |
|
timeint = round(timeint/1e5,1) |
|
                    return (str(timeint) + ' hundred thousand years' + sufix, 'en')
|
elif precision == 3: |
|
timeint = int(datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y')) |
|
timeint = round(timeint/1e6,1) |
|
                    return (str(timeint) + ' million years' + sufix, 'en')
|
elif precision == 0: |
|
timeint = int(datetime.strptime(time, '%Y-00-00T00:00:%SZ').strftime('%Y')) |
|
timeint = round(timeint/1e9,1) |
|
                    return (str(timeint) + ' billion years' + sufix, 'en')
|
elif dt == dt_types[4]: |
|
return (ast.literal_eval(dv)['value'], 'en') |
|
except ValueError as e: |
|
|
|
raise e |
|
|
|
def get_object_desc_given_datatype(row): |
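    """Return an (English description, language) pair for a claim's object (item or quantity unit);
    other datatypes yield ('no-desc', 'none')."""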
|
Wd_API = wdutils.CachedWikidataAPI() |
|
Wd_API.languages = ['en'] |
|
dt = row['datatype'] |
|
dv = row['datavalue'] |
|
|
|
dt_types = ['wikibase-item', 'monolingualtext', 'quantity', 'time', 'string'] |
|
if dt not in dt_types: |
|
        raise ValueError(f'Unexpected object datatype: {dt}')
|
else: |
|
try: |
|
if dt == dt_types[0]: |
|
return Wd_API.get_desc(ast.literal_eval(dv)['value']['id']) |
|
elif dt == dt_types[1]: |
|
return ('no-desc', 'none') |
|
elif dt == dt_types[2]: |
|
dv = ast.literal_eval(dv) |
|
amount, unit = dv['value']['amount'], dv['value']['unit'] |
|
if amount[0] == '+': |
|
amount = amount[1:] |
|
if str(unit) == '1': |
|
return ('no-desc', 'none') |
|
else: |
|
unit_entity_id = unit.split('/')[-1] |
|
return Wd_API.get_desc(unit_entity_id) |
|
elif dt == dt_types[3]: |
|
return ('no-desc', 'none') |
|
elif dt == dt_types[4]: |
|
return ('no-desc', 'none') |
|
except ValueError as e: |
|
|
|
raise e |
|
|
|
def get_object_alias_given_datatype(row): |
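    """Return (aliases, language) for a claim's object; day-precision dates get alternative surface forms,
    datatypes without aliases yield ('no-alias', 'none')."""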
|
Wd_API = wdutils.CachedWikidataAPI() |
|
Wd_API.languages = ['en'] |
|
dt = row['datatype'] |
|
dv = row['datavalue'] |
|
|
|
dt_types = ['wikibase-item', 'monolingualtext', 'quantity', 'time', 'string'] |
|
if dt not in dt_types: |
|
        raise ValueError(f'Unexpected object datatype: {dt}')
|
else: |
|
try: |
|
if dt == dt_types[0]: |
|
return Wd_API.get_alias(ast.literal_eval(dv)['value']['id']) |
|
elif dt == dt_types[1]: |
|
return ('no-alias', 'none') |
|
elif dt == dt_types[2]: |
|
dv = ast.literal_eval(dv) |
|
amount, unit = dv['value']['amount'], dv['value']['unit'] |
|
if amount[0] == '+': |
|
amount = amount[1:] |
|
if str(unit) == '1': |
|
return ('no-alias', 'none') |
|
else: |
|
unit_entity_id = unit.split('/')[-1] |
|
return Wd_API.get_alias(unit_entity_id) |
|
elif dt == dt_types[3]: |
|
dv = ast.literal_eval(dv) |
|
time = dv['value']['time'] |
|
timezone = dv['value']['timezone'] |
|
precision = dv['value']['precision'] |
|
assert dv['value']['after'] == 0 and dv['value']['before'] == 0 |
|
|
|
sufix = 'BC' if time[0] == '-' else '' |
|
time = time[1:] |
|
|
|
if precision == 11: |
|
return ([ |
|
datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%-d of %B, %Y') + sufix, |
|
datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%d/%m/%Y (dd/mm/yyyy)') + sufix, |
|
datetime.strptime(time, '%Y-%m-%dT00:00:%SZ').strftime('%b %-d, %Y') + sufix |
|
], 'en') |
|
else: |
|
return ('no-alias', 'none') |
|
elif dt == dt_types[4]: |
|
return ('no-alias', 'none') |
|
except ValueError as e: |
|
|
|
raise e |
|
|
|
def textualAugmentation(filtered_df): |
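    """Augment the filtered claims with labels, descriptions, and aliases for subject, predicate, and object,
    and report the percentage of rows for which no label could be found."""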
|
|
|
Wd_API = wdutils.CachedWikidataAPI() |
|
Wd_API.languages = ['en'] |
|
|
|
    print(' - Subject augmentation...')
    filtered_df['entity_label'] = filtered_df['entity_id'].apply(lambda x: Wd_API.get_label(x, True))
|
filtered_df['entity_desc'] = filtered_df['entity_id'].apply(lambda x: Wd_API.get_desc(x)) |
|
filtered_df['entity_alias'] = filtered_df['entity_id'].apply(lambda x: Wd_API.get_alias(x)) |
|
|
|
print(' - Predicate augmentation...') |
|
filtered_df['property_label'] = filtered_df['property_id'].apply(lambda x: Wd_API.get_label(x, True)) |
|
filtered_df['property_desc'] = filtered_df['property_id'].apply(lambda x: Wd_API.get_desc(x)) |
|
filtered_df['property_alias'] = filtered_df['property_id'].apply(lambda x: Wd_API.get_alias(x)) |
|
|
|
print(' - Object augmentation...') |
|
filtered_df['object_label'] = filtered_df.apply(get_object_label_given_datatype, axis=1) |
|
filtered_df['object_desc'] = filtered_df.apply(get_object_desc_given_datatype, axis=1) |
|
filtered_df['object_alias'] = filtered_df.apply(get_object_alias_given_datatype, axis=1) |
|
|
|
|
|
no_subject_label_perc = filtered_df[filtered_df['entity_label'].apply(lambda x: x[0] == 'no-label')].shape[0] / filtered_df.shape[0] * 100 |
|
print(' - No subject label %:', no_subject_label_perc, '%') |
|
|
|
no_predicate_label_perc = filtered_df[filtered_df['property_label'].apply(lambda x: x[0] == 'no-label')].shape[0] / filtered_df.shape[0] * 100 |
|
print(' - No predicate label %:', no_predicate_label_perc, '%') |
|
|
|
no_object_label_perc = filtered_df[filtered_df['object_label'].apply(lambda x: x[0] == 'no-label')].shape[0] / filtered_df.shape[0] * 100 |
|
print(' - No object label %:', no_object_label_perc, '%') |
|
return filtered_df |
|
|
|
def urlParser(target_QID): |
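    """Collect reference URLs from the refs table: direct url snaks plus external IDs resolved through their
    property's formatter URL (P1630). Only references that resolve to exactly one URL are kept.
    Note: target_QID is not used for filtering here; the whole refs table is scanned."""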
|
Wd_API = wdutils.CachedWikidataAPI() |
|
Wd_API.languages = ['en'] |
|
db = sqlite3.connect('wikidata_claims_refs_parsed.db') |
|
cursor = db.cursor() |
|
refs_columns = ['reference_id','reference_property_id', 'reference_index', 'reference_datatype', 'reference_value'] |
|
cursor.execute('select * from refs where reference_datatype="url";') |
|
url_df = pd.DataFrame(cursor.fetchall()) |
|
url_df.columns = refs_columns |
|
def reference_value_to_url(reference_value): |
|
if reference_value in ['novalue','somevalue']: |
|
return reference_value |
|
reference_value = ast.literal_eval(reference_value) |
|
assert reference_value['type'] == 'string' |
|
return reference_value['value'] |
|
def reference_value_to_external_id(reference_value): |
|
if reference_value in ['novalue','somevalue']: |
|
return reference_value |
|
reference_value = ast.literal_eval(reference_value) |
|
assert reference_value['type'] == 'string' |
|
return reference_value['value'] |
|
def get_formatter_url(entity_id): |
|
try: |
|
sparql_query = ''' |
|
SELECT ?item ?itemLabel |
|
WHERE |
|
{ |
|
wd:$1 wdt:P1630 ?item. |
|
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". } |
|
} |
|
'''.replace('$1',entity_id) |
|
sparql_results = Wd_API.query_sparql_endpoint(sparql_query) |
|
if len(sparql_results['results']['bindings']) > 0: |
|
return sparql_results['results']['bindings'][0]['item']['value'] |
|
else: |
|
return 'no_formatter_url' |
|
        except Exception:
            print('Failed to retrieve a formatter URL (P1630) for property', entity_id)
            raise
|
url_df['url'] = url_df.reference_value.apply(reference_value_to_url) |
|
    cursor.execute('select * from refs where reference_datatype="external-id";')
|
ext_id_df = pd.DataFrame(cursor.fetchall()) |
|
ext_id_df.columns = refs_columns |
|
ext_id_df['ext_id'] = ext_id_df.reference_value.apply(reference_value_to_external_id) |
|
ext_id_df['formatter_url'] = ext_id_df['reference_property_id'].apply(get_formatter_url) |
|
ext_id_df['url'] = ext_id_df.apply(lambda x : x['formatter_url'].replace('$1', x['ext_id']), axis=1) |
|
columns_for_join = ['reference_id', 'reference_property_id','reference_index','reference_datatype','url'] |
|
url_df_pre_join = url_df[columns_for_join] |
|
ext_id_df_pre_join = ext_id_df[columns_for_join] |
|
all_url_df = pd.concat([url_df_pre_join,ext_id_df_pre_join]) |
|
all_url_df = all_url_df.sort_values(['reference_id','reference_index']) |
|
|
|
all_url_df = all_url_df[all_url_df['url'] != 'no_formatter_url'].reset_index(drop=True) |
|
|
|
all_url_df = all_url_df[~all_url_df['url'].isin(['somevalue','novalue'])] |
|
reference_id_counts = all_url_df.reference_id.value_counts().reset_index() |
|
reference_id_counts.columns = ['reference_id', 'counts'] |
|
reference_id_counts_equal_1 = reference_id_counts[reference_id_counts['counts'] == 1].reference_id.tolist() |
|
all_url_df_eq1 = all_url_df[all_url_df.reference_id.isin(reference_id_counts_equal_1)] |
|
all_url_df_eq1 = all_url_df_eq1.reset_index(drop=True).drop('reference_index', axis=1) |
|
return all_url_df_eq1 |
|
|
|
def htmlParser(url_set, qid): |
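    """Fetch each reference URL and store the raw HTML (or an HTTP status code / 'TimeOut' marker) in an
    'html' column, tagging every row with the entity QID."""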
|
text_reference_sampled_df = url_set |
|
_RE_COMBINE_WHITESPACE = re.compile(r"\s+") |
|
text_reference_sampled_df['html'] = None |
|
for i, row in text_reference_sampled_df.iterrows(): |
|
|
|
print(i, row.url) |
|
try: |
|
response = requests.get(row.url, timeout=10) |
|
if response.status_code == 200: |
|
html = response.text |
|
text_reference_sampled_df.loc[i, 'html'] = html |
|
            else:
                print(f"Request failed with status code {response.status_code}")
                text_reference_sampled_df.loc[i, 'html'] = response.status_code
|
        except requests.exceptions.Timeout:
            print("Timeout occurred while fetching the URL:", row.url)
            text_reference_sampled_df.loc[i, 'html'] = 'TimeOut'
        except Exception as e:
            print("An error occurred:", str(e))
|
text_reference_sampled_df_html = text_reference_sampled_df.copy() |
|
text_reference_sampled_df_html['entity_id'] = qid |
|
return text_reference_sampled_df_html |
|
|
|
def claim2text(html_set): |
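    """Map each fetched reference back to the claims it supports and attach textual labels, aliases, and
    descriptions for subject, predicate, and object, dropping datatypes that cannot be verbalised."""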
|
text_reference_sampled_df_html = html_set |
|
Wd_API = wdutils.CachedWikidataAPI() |
|
Wd_API.languages = ['en'] |
|
db = sqlite3.connect('wikidata_claims_refs_parsed.db') |
|
cursor = db.cursor() |
|
claims_columns = ['entity_id','claim_id','rank','property_id','datatype','datavalue'] |
|
refs_columns = ['reference_id', 'reference_property_id', 'reference_index', 'reference_datatype', 'reference_value'] |
|
|
|
def reference_id_to_claim_id(reference_id): |
|
cursor.execute(f'select claim_id from claims_refs where reference_id="{reference_id}"') |
|
sql_result = cursor.fetchall() |
|
|
|
        claim_ids = np.array(sql_result).reshape(-1)
        return claim_ids
|
|
|
def reference_id_to_claim_data(reference_id): |
|
claim_ids = reference_id_to_claim_id(reference_id) |
|
r = [] |
|
for claim_id in claim_ids: |
|
|
|
cursor.execute(f'select * from claims where claim_id="{claim_id}";') |
|
d = cursor.fetchall() |
|
r = r + d |
|
return r |
|
|
|
claim_data = [] |
|
for reference_id in text_reference_sampled_df_html.reference_id: |
|
data = reference_id_to_claim_data(reference_id) |
|
|
|
data = [(reference_id,) + t for t in data] |
|
claim_data = claim_data + data |
|
|
|
|
|
claim_df = pd.DataFrame(claim_data, columns = ['reference_id'] + claims_columns) |
|
|
|
|
def claim_id_to_claim_url(claim_id): |
|
claim_id_parts = claim_id.split('$') |
|
return f'https://www.wikidata.org/wiki/{claim_id_parts[0]}#{claim_id}' |
|
|
|
BAD_DATATYPES = ['external-id','commonsMedia','url', 'globe-coordinate', 'wikibase-lexeme', 'wikibase-property'] |
|
|
|
    assert (claim_df[~claim_df.datatype.isin(BAD_DATATYPES)].reference_id.unique().shape
            == claim_df.reference_id.unique().shape)
|
|
|
    print('Unique references linked to claims:', claim_df.reference_id.unique().shape[0])
|
claim_df = claim_df[~claim_df.datatype.isin(BAD_DATATYPES)].reset_index(drop=True) |
|
|
|
from tqdm.auto import tqdm |
|
tqdm.pandas() |
|
|
|
claim_df[['entity_label','entity_label_lan']] = pd.DataFrame( |
|
claim_df.entity_id.progress_apply(Wd_API.get_label, non_language_set=True).tolist() |
|
) |
|
claim_df[['property_label','property_label_lan']] = pd.DataFrame( |
|
claim_df.property_id.progress_apply(Wd_API.get_label, non_language_set=True).tolist() |
|
) |
|
|
|
claim_df[['entity_alias','entity_alias_lan']] = pd.DataFrame( |
|
claim_df.entity_id.progress_apply(Wd_API.get_alias, non_language_set=True).tolist() |
|
) |
|
claim_df[['property_alias','property_alias_lan']] = pd.DataFrame( |
|
claim_df.property_id.progress_apply(Wd_API.get_alias, non_language_set=True).tolist() |
|
) |
|
|
|
claim_df[['entity_desc','entity_desc_lan']] = pd.DataFrame( |
|
claim_df.entity_id.progress_apply(Wd_API.get_desc, non_language_set=True).tolist() |
|
) |
|
claim_df[['property_desc','property_desc_lan']] = pd.DataFrame( |
|
claim_df.property_id.progress_apply(Wd_API.get_desc, non_language_set=True).tolist() |
|
) |
|
|
|
claim_df['object_label'] = claim_df.apply(get_object_label_given_datatype, axis=1) |
|
claim_df['object_alias'] = claim_df.apply(get_object_alias_given_datatype, axis=1) |
|
claim_df['object_desc'] = claim_df.apply(get_object_desc_given_datatype, axis=1) |
|
|
|
claim_df['object_label'], claim_df['object_label_lan'] = zip(*claim_df['object_label'].apply(lambda x: x if isinstance(x, tuple) else (x, ''))) |
|
claim_df['object_alias'], claim_df['object_alias_lan'] = zip(*claim_df['object_alias'].apply(lambda x: x if isinstance(x, tuple) else (x, ''))) |
|
claim_df['object_desc'], claim_df['object_desc_lan'] = zip(*claim_df['object_desc'].apply(lambda x: x if isinstance(x, tuple) else (x, ''))) |
|
|
|
|
|
claim_df = claim_df[claim_df['object_label_lan'] != 'none'].reset_index(drop=True) |
|
return claim_df |
|
|
|
def html2text(html_set): |
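    """Extract readable text from each fetched HTML page and segment it into sentences (and two-sentence
    sliding windows) using spaCy's en_core_web_lg model."""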
|
reference_html_df = html_set |
|
_RE_COMBINE_WHITESPACE = re.compile(r"\s+") |
|
    tqdm.pandas()  # progress_apply is used below, so register tqdm's pandas integration here as well
    ft_model = fasttext.load_model('base/lid.176.ftz')
|
def predict_language(text, k=20): |
|
ls, scores = ft_model.predict(text, k=k) |
|
ls = [l.replace('__label__','') for l in ls] |
|
return list(zip(ls,scores)) |
|
def get_url_language(html): |
|
try: |
|
soup = BeautifulSoup(html, "lxml") |
|
[s.decompose() for s in soup("script")] |
|
            if soup.body is None:
|
return ('no body', None) |
|
body_text = _RE_COMBINE_WHITESPACE.sub(" ", soup.body.get_text(' ')).strip() |
|
return predict_language(body_text, k=1)[0] |
|
except Exception: |
|
raise |
|
def get_text_p_tags(soup): |
|
p_tags = soup.find_all('p') |
|
text = [p.getText().strip() for p in p_tags if p.getText()] |
|
return '\n'.join(text) |
|
def clean_text_line_by_line(text, join=True, ch_join = ' ', verb=True): |
|
|
|
|
|
lines = list(text.splitlines()) |
|
lines = (line.strip() for line in lines) |
|
|
|
lines = (re.sub(r' {2,}', ' ', line) for line in lines) |
|
|
|
lines = (re.sub(r' ([.,:;!?\\-])', r'\1', line) for line in lines) |
|
|
|
        lines = [line + '.' if line and line[-1] not in string.punctuation else line for line in lines]
|
|
|
if verb: |
|
for i, line in enumerate(lines): |
|
print(i,line) |
|
|
|
if join: |
|
return ch_join.join([line for line in lines if line]) |
|
else: |
|
return [line for line in lines if line] |
|
|
|
    def apply_manual_rules(text):
        text = re.sub(r'\[[0-9]+\]', '', text)
        return text
|
def retrieve_text_from_html(html, soup_parser = 'lxml', verb=True, join=True): |
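        """Strip scripts and styles from the HTML body, normalise paragraph whitespace, and return the
        cleaned text, either joined into one string or as a list of lines."""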
|
        if not isinstance(html, str) or 'doctype html' not in html.lower():
|
return 'No body' |
|
soup = BeautifulSoup(html, soup_parser) |
|
for script in soup(["script", "style"]): |
|
script.decompose() |
|
        if soup.body is None:
|
return 'No body' |
|
[s.unwrap() for s in soup.body.find_all('strong')] |
|
|
|
        for p in soup.body.find_all('p'):
            p.string = _RE_COMBINE_WHITESPACE.sub(" ", p.get_text('')).strip()

        text = soup.body.get_text(' ').strip()
|
|
|
|
|
text = apply_manual_rules(text) |
|
text = clean_text_line_by_line(text, ch_join = ' ', verb=verb, join=join) |
|
|
|
if not text: |
|
return 'No extractable text' if join else ['No extractable text'] |
|
else: |
|
return text |
|
|
|
|
reference_html_df['extracted_sentences'] = reference_html_df.html.progress_apply(retrieve_text_from_html, join=False, verb=False) |
|
|
|
join_ch = ' ' |
|
reference_html_df['extracted_text'] = reference_html_df.extracted_sentences.apply(lambda x : join_ch.join(x)) |
|
|
|
splitter = SentenceSplitter(language='en') |
|
|
|
seg = pysbd.Segmenter(language="en", clean=False) |
|
|
|
if not spacy.util.is_package("en_core_web_lg"): |
|
os.system("python -m spacy download en_core_web_lg") |
|
|
|
nlp = spacy.load("en_core_web_lg") |
|
|
|
|
|
|
|
|
reference_html_df['nlp_sentences'] = reference_html_df.extracted_text.progress_apply(lambda x : [str(s) for s in nlp(x).sents]) |
|
reference_html_df['nlp_sentences_slide_2'] = reference_html_df['nlp_sentences'].progress_apply( |
|
lambda x : [' '.join([a,b]) for a,b in list(zip(x,x[1:]+['']))] |
|
) |
|
|
|
assert type(reference_html_df.loc[0,'nlp_sentences']) == list |
|
assert type(reference_html_df.loc[0,'nlp_sentences'][0]) == str |
|
assert type(reference_html_df.loc[0,'nlp_sentences_slide_2']) == list |
|
assert type(reference_html_df.loc[0,'nlp_sentences_slide_2'][0]) == str |
|
return reference_html_df |
|
|
|
if __name__ == '__main__': |
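    # End-to-end run for a single example entity: parse claims, filter properties, resolve reference URLs,
    # fetch the referenced pages, and store claim text and page text back into the SQLite database.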
|
conn = sqlite3.connect('wikidata_claims_refs_parsed.db') |
|
target_QID = 'Q3621696' |
|
claimParser(target_QID) |
|
filtered_df = propertyFiltering(target_QID) |
|
url_set = urlParser(target_QID) |
|
html_set = htmlParser(url_set, target_QID) |
|
try: |
|
claim_text = claim2text(html_set) |
|
html_text = html2text(html_set) |
|
claim_text = claim_text.astype(str) |
|
html_text = html_text.astype(str) |
|
claim_text.to_sql('claim_text', conn, if_exists='replace', index=False) |
|
html_text.to_sql('html_text', conn, if_exists='replace', index=False) |
|
except Exception as e: |
|
print(f"No accessible html documents") |
|
|
|
|
|
conn.commit() |
|
conn.close() |
|
|
|
|