# Source: Hugging Face Space "naveed-stockmark/kg_reasoning_demo"
# (page header captured with this file: status "Runtime error", file size 14,540 bytes)


import pandas as pd
from utils import normalize_text
import streamlit as st
from itertools import chain
from collections import Counter

### Data paths
# WIKIPEDIA_PATH = "./kensho_en_wiki_typing_technical.csv"
# WIKIDATA_PATH = "./wikidata_ss_processed.csv"
# REBEL_INFER_PATH = "./rebel_inference_processed_ss.csv"
# ENTITY_LINKING_PATH = "./linking_df_technical_min.csv"

# Numeric id per relation name; used further down to build
# "https://www.wikidata.org/wiki/Property:P<id>" links.
# NOTE(review): material_of shares id 186 with made_from_material —
# presumably because no dedicated inverse property is available; confirm.
relation_to_id = dict(
    uses=2283,
    has_use=366,
    part_of=361,
    has_part=527,
    made_from_material=186,
    material_of=186,
)

# Each relation listed next to the relation that reads the same edge backwards.
_RELATION_PAIRS = (
    ('uses', 'has_use'),
    ('has_part', 'part_of'),
    ('made_from_material', 'material_of'),
)

# Symmetric lookup: relation name -> name of its inverse relation.
inverse_dict = {
    name: inverse
    for forward, backward in _RELATION_PAIRS
    for name, inverse in ((forward, backward), (backward, forward))
}

# Every relation name, in the order used when tabulating per-relation counts.
all_relations = [
    'uses',
    'has_part',
    'has_use',
    'part_of',
    'made_from_material',
    'material_of',
]

# Page heading
st.title("Materials use case search app")

# Free-text query box: pre-filled with "iron" and registered under the widget key "ent".
input_text = st.text_input(
    "Enter the name of a material i.e iron, ceramic, steel, aluminum, plastic, etc and press Enter",
    value="iron",
    key="ent",
)

# Progress note shown before the data-loading steps below run.
st.write("preparing data ...")

# Wikipedia metadata
@st.cache_data(persist="disk")
def get_wiki_df(path="./kensho_en_wiki_typing_technical.csv"):
    """Load the Wikipedia metadata table and derive the entity lookups.

    Args:
        path: CSV file to read. The code requires the columns ``exclude``,
            ``technical``, ``page_id``, ``skpe_id``, ``item_id``,
            ``title_x``, ``en_probs`` and ``Unnamed: 0``.

    Returns:
        A tuple ``(wiki_df, include_skpes, skpe_to_wikidata)``:

        * ``wiki_df`` - the table without the ``Unnamed: 0``, ``en_probs``
          and ``exclude`` columns, and with ``title_x`` renamed to
          ``en_title``. Rows are NOT filtered.
        * ``include_skpes`` - set of ``skpe_id`` values whose ``page_id``
          is not shared with any excluded or non-technical row.
        * ``skpe_to_wikidata`` - dict mapping every ``skpe_id`` to its
          ``item_id``.

    Raises:
        KeyError: if one of the columns to drop is missing (from
            ``DataFrame.drop``).
    """
    wiki_df = pd.read_csv(path)

    # Keep technical articles only: collect the page ids of rows that are
    # flagged for exclusion or marked as non-technical. The explicit
    # ``== True`` / ``== False`` comparisons are elementwise and treat missing
    # values as "no match", so they are kept as-is.
    rejected = (wiki_df.exclude == True) | (wiki_df.technical == False)
    exclude_ids = set(wiki_df[rejected].page_id.to_list())

    # ``isin`` is vectorised and always yields a boolean mask (a per-row
    # ``apply`` returns a non-boolean empty Series when the table is empty).
    accepted = ~wiki_df.page_id.isin(exclude_ids)
    include_skpes = set(wiki_df[accepted].skpe_id.to_list())

    skpe_to_wikidata = dict(zip(wiki_df.skpe_id.to_list(), wiki_df.item_id.to_list()))

    wiki_df = wiki_df.drop(columns=['Unnamed: 0', 'en_probs', 'exclude'])
    wiki_df = wiki_df.rename(columns={'title_x': 'en_title'})

    return wiki_df, include_skpes, skpe_to_wikidata

wiki_df, include_skpes, skpe_to_wikidata = get_wiki_df()

# KG data source 1: Wikidata
# @st.cache_data(persist="disk")
def get_wikidata_df(path="./wikidata_ss_processed.csv"):
    """Load Wikidata relation rows whose two endpoints are both included entities.

    Reads the module-level ``include_skpes`` (set of accepted skpe ids) and
    ``skpe_to_wikidata`` (skpe id -> Wikidata item id).

    Args:
        path: CSV file to read; must provide ``source_skpe`` and
            ``target_skpe`` columns.

    Returns:
        DataFrame restricted to rows where both ``source_skpe`` and
        ``target_skpe`` are in ``include_skpes``, with the skpe columns
        replaced by ``source_wikidata`` / ``target_wikidata`` and a constant
        ``source`` column set to ``'wikidata'``.
    """
    wikidata_df = pd.read_csv(path)

    # Vectorised membership test. Unlike a row-wise ``apply(axis=1)`` it stays
    # a boolean Series even when the file has no rows.
    both_included = (
        wikidata_df.source_skpe.isin(include_skpes)
        & wikidata_df.target_skpe.isin(include_skpes)
    )
    # Copy so the column assignments below write to an independent frame
    # rather than to a slice of the frame read from disk.
    wikidata_df = wikidata_df[both_included].copy()

    # Every remaining id is a key of skpe_to_wikidata: include_skpes is built
    # from the same skpe_id column as the mapping's keys.
    wikidata_df['source_wikidata'] = wikidata_df.source_skpe.map(skpe_to_wikidata)
    wikidata_df['target_wikidata'] = wikidata_df.target_skpe.map(skpe_to_wikidata)
    wikidata_df = wikidata_df.drop(columns=['source_skpe', 'target_skpe'])

    # Provenance tag
    wikidata_df['source'] = 'wikidata'

    return wikidata_df

## wikidata_df = get_wikidata_df()

# @st.cache_data(persist="disk")
def get_rebel_infer_df(path="./rebel_inference_processed_ss.csv"):
    """Load REBEL inference rows whose two endpoints resolve to Wikidata ids.

    Reads the module-level ``skpe_to_wikidata`` mapping (skpe id -> Wikidata
    item id).

    Args:
        path: CSV file to read. The code requires the columns
            ``source_skpe_id``, ``target_skpe_id``, ``instance_id``,
            ``source_text``, ``target_text`` and ``page_skpe_id``; ``source``
            and ``target`` are renamed when present.

    Returns:
        DataFrame with:

        * only rows whose ``source_skpe_id`` and ``target_skpe_id`` are both
          strings and both keys of ``skpe_to_wikidata``;
        * ``source_wikidata`` / ``target_wikidata`` columns added;
        * the helper columns listed above dropped;
        * ``source`` / ``target`` renamed to ``source_en`` / ``target_en``;
        * rows whose two endpoints map to the same Wikidata id removed;
        * a constant ``source`` column set to ``'rebel_wikipedia'``.

    Raises:
        KeyError: if one of the columns to drop is missing (from
            ``DataFrame.drop``).
    """
    rebel_infer_df = pd.read_csv(path)

    known_skpes = list(skpe_to_wikidata)

    def _is_linked(ids):
        # True where the id is a string AND a key of skpe_to_wikidata.
        # ``astype(bool)`` keeps the mask boolean for an empty column too.
        is_text = ids.map(lambda value: isinstance(value, str)).astype(bool)
        return is_text & ids.isin(known_skpes)

    both_linked = (
        _is_linked(rebel_infer_df.source_skpe_id)
        & _is_linked(rebel_infer_df.target_skpe_id)
    )
    # Copy so the column assignments below write to an independent frame.
    rebel_infer_df = rebel_infer_df[both_linked].copy()

    rebel_infer_df['source_wikidata'] = rebel_infer_df.source_skpe_id.map(skpe_to_wikidata)
    rebel_infer_df['target_wikidata'] = rebel_infer_df.target_skpe_id.map(skpe_to_wikidata)

    rebel_infer_df = rebel_infer_df.drop(columns=['instance_id', 'source_text', 'target_text', 'page_skpe_id', 'source_skpe_id', 'target_skpe_id'])
    # Only the mention columns are renamed; the skpe id columns were dropped on
    # the line above, so rename entries for them would never match.
    rebel_infer_df = rebel_infer_df.rename(columns={'source': 'source_en', 'target': 'target_en'})

    # Discard self-loops (both endpoints resolve to the same Wikidata item).
    rebel_infer_df = rebel_infer_df[rebel_infer_df.source_wikidata != rebel_infer_df.target_wikidata]

    # Provenance tag (the original ``source`` column was renamed above).
    rebel_infer_df['source'] = 'rebel_wikipedia'

    return rebel_infer_df

## rebel_infer_df = get_rebel_infer_df()

# Add luke df

# Data source 3: luke inference
# @st.cache_data(persist="disk")
def get_luke_infer_df(path="./luke_fulltext_ss_infer_20240112.csv"):
    """Load the LUKE inference CSV and align its column names with the other sources.

    Args:
        path: CSV file to read; must contain ``page_id``, ``sent_id`` and
            ``model`` columns (they are dropped).

    Returns:
        DataFrame where ``source_mention`` / ``target_mention`` / ``pred``
        are renamed to ``source_en`` / ``target_en`` / ``relation``, the
        three bookkeeping columns are removed, and a constant ``source``
        column holds ``'luke_wikipedia_20240112'``.
    """
    column_aliases = {
        "source_mention": "source_en",
        "target_mention": "target_en",
        "pred": "relation",
    }
    bookkeeping_columns = ["page_id", "sent_id", "model"]

    frame = (
        pd.read_csv(path)
        .rename(columns=column_aliases)
        .drop(columns=bookkeeping_columns)
    )

    # Provenance tag
    frame['source'] = 'luke_wikipedia_20240112'

    return frame

## luke_infer_df = get_luke_infer_df()

# Build instance df
@st.cache_data(persist="disk")
def build_instance_df():
    """Stack the three relation sources into one instance table.

    Returns:
        The concatenation of the Wikidata, REBEL and LUKE frames (in that
        order) with:

        * ``instance_id`` - a copy of each row's index label;
        * ``relation`` - spaces replaced by underscores;
        * ``inv_relation`` - looked up in the module-level ``inverse_dict``;
        * ``score`` - missing values filled with 1.0.

    Raises:
        KeyError: if a normalised relation name is not a key of
            ``inverse_dict``.
    """
    frames = [
        get_wikidata_df(),
        get_rebel_infer_df(),
        get_luke_infer_df(),
    ]
    instance_df = pd.concat(frames)

    # The index is deliberately not reset: each row keeps the label it had in
    # its source frame, and that label becomes its instance_id.
    # NOTE(review): labels can therefore repeat across sources — confirm this
    # matches the ids stored in the KG file.
    instance_df['instance_id'] = instance_df.index.to_list()

    def _underscored(name):
        return name.replace(' ', '_')

    def _inverse_of(name):
        return inverse_dict[name]

    instance_df['relation'] = instance_df.relation.apply(_underscored)
    instance_df['inv_relation'] = instance_df.relation.apply(_inverse_of)

    # Rows without a score get 1.0.
    instance_df['score'] = instance_df.score.fillna(1.0)

    return instance_df

instance_df = build_instance_df()

# Get KG df
@st.cache_data(persist="disk")
def get_kg_df(path="./kg_master_ss_sample_20240215.csv"):
    """Load the knowledge-graph edge table, keeping the relations used for path search.

    Args:
        path: CSV file to read; must provide ``mode_relation``,
            ``source_en``, ``source_wikidata``, ``target_en`` and
            ``target_wikidata`` columns.

    Returns:
        A tuple ``(kg_df, kg_min_df)``:

        * ``kg_df`` - all columns plus ``kg_id`` (the row's index label
          assigned before filtering), restricted to rows whose
          ``mode_relation`` is ``material_of``, ``part_of`` or ``has_use``.
        * ``kg_min_df`` - an independent copy holding only the id, endpoint
          and relation columns.
    """
    kg_df = pd.read_csv(path)
    # Assigned before filtering so each edge keeps its position in the file.
    kg_df['kg_id'] = kg_df.index

    # ``isin`` is vectorised and always returns a boolean mask (a per-row
    # ``apply`` yields a non-boolean empty Series when the file has no rows).
    wanted_relations = ['material_of', 'part_of', 'has_use']
    kg_df = kg_df[kg_df.mode_relation.isin(wanted_relations)]

    kg_min_df = kg_df[['kg_id', 'source_en', 'source_wikidata', 'mode_relation', 'target_en', 'target_wikidata']].copy()
    return kg_df, kg_min_df

kg_df, kg_min_df = get_kg_df()


# Get entity linking df
@st.cache_data(persist="disk")
def get_entity_linking_df(path="./linking_df_technical_min.csv"):
    """Read the entity-linking CSV at ``path`` and return it as a DataFrame, unmodified."""
    return pd.read_csv(path)

st.write("matching input text ...")

linking_df = get_entity_linking_df()


### Start ###

# Normalise the query and look up rows of the linking table whose `text`
# equals it exactly.
text_norm = normalize_text(input_text)
match_df = linking_df[linking_df.text == text_norm]

# Keep only candidates that can be resolved to a Wikidata id.
# `astype(bool)` guarantees a boolean mask even when nothing matched: without
# it the empty result is not boolean, pandas treats it as an (empty) list of
# column labels, and the `skpe_id` access below fails before the "no matches"
# branch can run.
# The per-value dict lookup is kept on purpose: the candidate list is short,
# while skpe_to_wikidata holds an entry for every row of the wiki table.
has_wikidata = match_df.skpe_id.map(lambda skpe: skpe in skpe_to_wikidata).astype(bool)
# Copy so the new column is written to an independent frame, not a slice.
match_df = match_df[has_wikidata].copy()
match_df['wikidata_id'] = match_df.skpe_id.apply(lambda skpe: skpe_to_wikidata[skpe])



# Use-case extraction for the best-matching entity.
#
# Flow: pick the most frequent Wikidata id among the linking matches, find
# what is made from it (material_of edges), extend those edges to has_use
# edges directly (length-2 paths) or through one part_of edge (length-3
# paths), then render the first few paths with per-edge evidence.

# Columns of kg_min_df (besides kg_id) that describe one edge.
_EDGE_FIELDS = ('source_en', 'source_wikidata', 'mode_relation', 'target_en', 'target_wikidata')

# Columns of instance_df shown in the evidence tables.
_INSTANCE_COLUMNS = ['source_en', 'relation', 'target_en', 'text', 'source_wikidata', 'target_wikidata', 'source', 'page_title', 'score', 'section']

# Number of reasoning paths rendered on the page.
_MAX_PATHS_SHOWN = 6


def _edge_rename_map(suffix_to_prefix):
    """Build the column rename map for a merged path frame.

    Args:
        suffix_to_prefix: iterable of ``(merge_suffix, edge_prefix)`` pairs,
            e.g. ``('_x', 'first')``; an empty suffix denotes columns that
            the merge left unsuffixed.

    Returns:
        Dict mapping ``kg_id<suffix>`` to ``<prefix>_relation_id`` and every
        ``<field><suffix>`` to ``<prefix>_<field>``.
    """
    mapping = {}
    for suffix, prefix in suffix_to_prefix:
        mapping[f'kg_id{suffix}'] = f'{prefix}_relation_id'
        for field in _EDGE_FIELDS:
            mapping[f'{field}{suffix}'] = f'{prefix}_{field}'
    return mapping


def _wikidata_url(entity_id):
    """Return the Wikidata item URL for a numeric id.

    The id is cast to int first: path columns that only exist for some path
    lengths become float after concatenation, and ``Q123.0`` is not a valid
    item link.
    """
    return f"https://www.wikidata.org/wiki/Q{int(entity_id)}"


if len(match_df) > 0:

    top_wikidata = match_df.wikidata_id.mode()[0]
    all_wikidata = set(match_df.wikidata_id.to_list())
    wikidata_to_count = dict(match_df.wikidata_id.value_counts())
    # Denominator of the link score, computed once instead of once per row.
    total_links = sum(wikidata_to_count.values())

    # Match list: one link score per candidate entity (its share of the
    # linking rows).
    wiki_match_df = wiki_df[wiki_df.item_id.isin(all_wikidata)].copy()
    wiki_match_df['link_score'] = wiki_match_df['item_id'].apply(lambda item: wikidata_to_count[item] / total_links)
    wiki_match_df = wiki_match_df.sort_values(by='link_score', ascending=False)

    # show similar results (five most viewed candidates)
    st.write(f"Found following matches for the term {input_text}")
    st.write(wiki_match_df.sort_values(by='views', ascending=False)[:5])

    # proceeding with top match
    st.write("Performing use case extraction for the following top match ...")
    st.write(wiki_df[wiki_df.item_id == top_wikidata])

    # Stuff that are made out of input
    start_df = kg_min_df[(kg_min_df.source_wikidata == top_wikidata) & (kg_min_df.mode_relation == 'material_of')].copy()

    if len(start_df) > 0:

        st.write(f"Discovered following entities made out of {input_text}")
        st.write(start_df)

        st.write("Extracting knowledge graph paths ...")

        has_use_edges = kg_min_df[kg_min_df.mode_relation == 'has_use']
        part_of_edges = kg_min_df[kg_min_df.mode_relation == 'part_of']

        ### Length 2 paths: material_of -> has_use
        # Both sides share every column name, so pandas suffixes them _x / _y.
        path_2_df = start_df.merge(
            has_use_edges,
            left_on='target_wikidata',
            right_on='source_wikidata',
            how='inner',
        )
        path_2_df = path_2_df.rename(columns=_edge_rename_map([('_x', 'first'), ('_y', 'second')]))
        path_2_df['path_len'] = 2

        ### Length 3 paths: material_of -> part_of -> has_use
        # The second merge adds the third edge's columns without a suffix,
        # because the left side's columns already carry _x / _y.
        path_3_df = start_df.merge(
            part_of_edges,
            left_on='target_wikidata',
            right_on='source_wikidata',
            how='inner',
        ).merge(
            has_use_edges,
            left_on='target_wikidata_y',
            right_on='source_wikidata',
            how='inner',
        )
        path_3_df = path_3_df.rename(columns=_edge_rename_map([('_x', 'first'), ('_y', 'second'), ('', 'third')]))
        path_3_df['path_len'] = 3

        path_df = pd.concat([path_2_df, path_3_df])

        ### End

        if len(path_df) > 0:

            st.write(f"Found {len(path_df)} knowledge graph paths relevant to use cases of {input_text}")
            st.write("------")

            edge_prefixes = ['first', 'second', 'third']

            # print the first few paths
            shown_paths = path_df.head(_MAX_PATHS_SHOWN).to_dict(orient='records')
            for i, path in enumerate(shown_paths):

                path_prefixes = edge_prefixes[:path['path_len']]
                last_prefix = path_prefixes[-1]

                # The path starts at the material and ends at the use case.
                material = path['first_source_en']
                material_url = _wikidata_url(path['first_source_wikidata'])

                use_case = path[f'{last_prefix}_target_en']
                use_case_url = _wikidata_url(path[f'{last_prefix}_target_wikidata'])

                st.write(f"**Reasoning Path {i+1}:**")

                for k, prefix in enumerate(path_prefixes):

                    source_wikidata = int(path[f'{prefix}_source_wikidata'])
                    target_wikidata = int(path[f'{prefix}_target_wikidata'])

                    source_en = path[f'{prefix}_source_en']
                    target_en = path[f'{prefix}_target_en']
                    relation = path[f'{prefix}_mode_relation']

                    source_url = _wikidata_url(source_wikidata)
                    target_url = _wikidata_url(target_wikidata)
                    relation_url = f"https://www.wikidata.org/wiki/Property:P{int(relation_to_id[relation])}"

                    st.markdown(f"Edge {k+1}: [{source_en}]({source_url}) --[{relation}]({relation_url})--> [{target_en}]({target_url})")

                    with st.expander("Edge Metadata"):

                        rel_id = path[f'{prefix}_relation_id']
                        rel_data = kg_df[kg_df.kg_id == rel_id].to_dict(orient='records')[0]

                        # NOTE(review): eval() executes whatever text the KG
                        # CSV holds in the *_instances columns. Acceptable for
                        # a trusted local file; ast.literal_eval would be the
                        # safer parser if the cells only contain list literals
                        # — confirm before switching.
                        instance_ids = eval(rel_data[f'{relation}_instances'])
                        instances = instance_df.loc[instance_ids]

                        # Instances recorded for the same edge under any other
                        # relation (columns are matched by name substring).
                        other_instance_columns = [
                            column for column in rel_data.keys()
                            if '_instances' in column and relation not in column
                        ]
                        neg_instance_ids = list(chain.from_iterable(
                            eval(rel_data[column]) for column in other_instance_columns
                        ))
                        neg_instances = instance_df.loc[neg_instance_ids]

                        # extra filtering: keep supporting instances whose two
                        # endpoints are exactly this edge's endpoints (in
                        # either direction). The other instances are shown
                        # unfiltered.
                        endpoints = [source_wikidata, target_wikidata]
                        on_this_edge = (
                            instances.source_wikidata.isin(endpoints)
                            & instances.target_wikidata.isin(endpoints)
                        )
                        instances = instances[on_this_edge]

                        st.write(f"**Total Number of Inference Instances:** {int(rel_data['n_evidence'])}")
                        st.write(f"**Number of Instances that support the most frequent relation:** {int(rel_data['n_support'])}")
                        st.write(f"**Support Ratio:** {rel_data['support_ratio']}")
                        st.write(f"**Average Inference Score:** {rel_data['avg_score']}")

                        st.write("Inferences supporting the relation")
                        st.write(instances[_INSTANCE_COLUMNS])

                        st.write("Other inferences involving the same edge")
                        st.write(neg_instances[_INSTANCE_COLUMNS])

                        count_dict = {name: rel_data[f'{name}_support'] for name in all_relations}
                        count_df = pd.DataFrame.from_dict(count_dict, orient='index')
                        count_df.columns = ['count']
                        st.write("Inference distribution for above edge")
                        st.write(count_df)

                st.write("**Conclusion:**")
                st.write(f"[{material}]({material_url}) is useful for [{use_case}]({use_case_url})")
                st.write("------")
        else:
            st.write("Found no knowledge graph paths relevant to use cases")
    else:
        # f-string so the user's query is interpolated into the message.
        st.write(f"Found no entities that are made from {input_text}")

else:
    st.write("no matches")