"""Streamlit app that predicts patentability with a fine-tuned classifier.

The user picks a patent number from a sample of the HUPD dataset, the app
shows that application's abstract and claims, and on "Submit" the fine-tuned
sequence-classification model scores the concatenated text.
"""
import pandas as pd
import streamlit as st
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# finetuned model
language_model_path = "juliaannjose/finetuned_model"

# The UI only reads these columns, so only they are kept in the cached frame.
_UI_COLUMNS = ["patent_number", "abstract", "claims"]


@st.cache_data(show_spinner=False)
def load_patent_data():
    """Load the HUPD sample and return its train + validation rows.

    Streamlit re-executes the script on every widget interaction; caching
    keeps the dataset load and DataFrame conversion from being repeated on
    each rerun.

    Returns:
        A DataFrame with the ``patent_number``, ``abstract`` and ``claims``
        columns of the train split followed by the validation split,
        re-indexed from 0.
    """
    dataset_dict = load_dataset(
        "HUPD/hupd",
        name="sample",
        data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
        icpr_label=None,
        train_filing_start_date="2016-01-01",
        train_filing_end_date="2016-01-21",
        val_filing_start_date="2016-01-22",
        val_filing_end_date="2016-01-31",
    )
    df_train = pd.DataFrame(dataset_dict["train"])
    df_val = pd.DataFrame(dataset_dict["validation"])
    combined = pd.concat([df_train, df_val], ignore_index=True)
    return combined[_UI_COLUMNS]


# load the dataset to
# use the patent number, abstract and claim columns for UI
with st.spinner("Loading..."):
    df = load_patent_data()

# drop down menu with patent numbers
_patent_id = st.selectbox(
    "Select the Patent Number",
    options=df["patent_number"],
)


# display abstract and claim
def get_abs_claim(_pid):
    """Return ``(abstract, claims)`` for the first row matching ``_pid``.

    Args:
        _pid: Patent number to look up in the module-level ``df``.

    Returns:
        Tuple of the ``abstract`` and ``claims`` values of the first row
        whose ``patent_number`` equals ``_pid``.

    Raises:
        IndexError: If no row has that patent number.
    """
    # Filter once and read both columns from the same row.
    row = df.loc[df["patent_number"] == _pid, ["abstract", "claims"]].iloc[0]
    return row["abstract"], row["claims"]


_abstract, _claim = get_abs_claim(_patent_id)
st.title("Abstract:")  # display abstract
st.write(_abstract)
st.title("Claim:")  # display claims
st.write(_claim)


# model and tokenizer initialization
@st.cache_resource
def load_model(language_model_path):
    """Load the tokenizer and classification model from ``language_model_path``.

    Cached as a Streamlit resource so the objects are reused across reruns.

    Args:
        language_model_path: Model identifier or path passed to
            ``from_pretrained``.

    Returns:
        Tuple ``(tokenizer, model)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(language_model_path)
    model = AutoModelForSequenceClassification.from_pretrained(language_model_path)
    return tokenizer, model


tokenizer, model = load_model(language_model_path)

# input to our model
input_text = _abstract + _claim

# get predictions
id2label = {0: "REJECTED", 1: "ACCEPTED"}
# caption shown next to the winning class's probability
_score_caption = {0: "Rejection Score:", 1: "Acceptance Score:"}

# when submit button clicked, run the model and get result
if st.button("Submit"):
    # Tokenize only when a prediction is actually requested, not on every rerun.
    inputs = tokenizer(
        input_text,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    with torch.no_grad():
        outputs = model(**inputs)
    probability = torch.nn.functional.softmax(outputs.logits, dim=1)

    # Single input text, so row 0 holds the class probabilities.
    class_probs = probability[0]
    predicted_class_id = class_probs.argmax().item()
    pred_label = id2label[predicted_class_id]

    st.title("Predicted Patentability")
    # Caption and score are both derived from the predicted class so that the
    # displayed score always agrees with the displayed result, even on a tie.
    st.write(_score_caption[predicted_class_id])
    st.write(class_probs[predicted_class_id].item())
    st.write("Result:", pred_label)