DataRaptor committed
Commit 152844c (parent: 86a60b5)

Upload 5 files

Files changed (5)
  1. app.py +161 -2
  2. fold-0-train.csv +0 -0
  3. infer.py +133 -0
  4. model_weights.pth +3 -0
  5. requirements.txt +6 -0
app.py CHANGED
@@ -1,8 +1,167 @@
+import datetime
+import os
+import pathlib
+import requests
+import zipfile
+import pandas as pd
+import pydeck as pdk
+import geopandas as gpd
 import streamlit as st
-
-st.markdown('Live demo will be available very soon.')
-
-
-
-st.markdown('Source code: https://github.com/dataraptor/PatentMatch/tree/main')
-
+import leafmap.colormaps as cm
+from leafmap.common import hex_to_rgb
+import time
+from infer import USPPPMModel, USPPPMDataset
+import torch
+
+
+@st.cache_resource
+def load_model():
+    # Load the fine-tuned DeBERTa-v3-small regressor once and cache it across reruns.
+    model = USPPPMModel('microsoft/deberta-v3-small')
+    model.load_state_dict(torch.load('model_weights.pth', map_location=torch.device('cpu')))
+    model.eval()
+    ds = USPPPMDataset(model.tokenizer, 133)
+    return model, ds
+
+
+def infer(anchor, target, title):
+    model, ds = load_model()
+    d = {
+        'anchor': anchor,
+        'target': target,
+        'title': title,
+        'label': 0,
+    }
+    x = ds[d][0]
+    with torch.no_grad():
+        y = model(x)
+    return y.cpu().numpy()[0][0]
+
+
+@st.cache_data
+def get_context():
+    # Distinct CPC context titles, used to populate the "Context" dropdown.
+    df = pd.read_csv('./fold-0-train.csv')
+    titles = list(set(df['title'].values))
+    return titles
+
+
+st.set_page_config(
+    page_title="PatentMatch",
+    page_icon="🧊",
+    layout="centered",
+    initial_sidebar_state="expanded",
+)
+
+# Fix sidebar styling
+st.markdown("""
+<style>
+.css-vk3wp9 {
+    background-color: rgb(255 255 255);
+}
+.css-18l0hbk {
+    padding: 0.34rem 1.2rem !important;
+    margin: 0.125rem 2rem;
+}
+.css-nziaof {
+    padding: 0.34rem 1.2rem !important;
+    margin: 0.125rem 2rem;
+    background-color: rgb(181 197 227 / 18%) !important;
+}
+</style>
+""", unsafe_allow_html=True)
+
+hide_st_style = """
+<style>
+#MainMenu {visibility: hidden;}
+footer {visibility: hidden;}
+header {visibility: hidden;}
+</style>
+"""
+st.markdown(hide_st_style, unsafe_allow_html=True)
+
+
+def app():
+    st.title("PatentMatch: Patent Semantic Similarity Matcher")
+
+    st.markdown(
+        """This project develops a Transformer-based NLP model that matches phrases
+        in U.S. patents by their semantic similarity within a specific technical
+        domain context. The trained model achieved a Pearson correlation
+        coefficient of 0.745.
+        [[Source Code]](https://github.com/dataraptor/PatentMatch)
+        """
+    )
+
+    st.markdown('---')
+
+    row1_col1, row1_col2, row1_col3 = st.columns([0.5, 0.4, 0.4])
+    with row1_col1:
+        titles = get_context()
+        context = st.selectbox("Context", titles, titles.index('basic electric elements'))
+    with row1_col2:
+        anchor = st.text_input("Anchor", "deflect light")
+    with row1_col3:
+        target = st.text_input("Target", "bending moment")
+
+    if st.button("Predict Scores", type="primary"):
+        with st.spinner("Predicting scores..."):
+            score = infer(anchor, target, context)
+        st.success("Scores predicted successfully!")
+
+        score += 2.0  # offset applied to the raw model output before display
+        fmt = "{:<.3f}".format(score)
+        st.subheader(f"Similarity Score: {fmt}")
+
+
+app()
+
+
+# Display a footer with links and credits
+st.markdown("---")
+st.markdown("Built by [Shamim Ahamed](https://www.shamimahamed.com/). Data provided by [Kaggle](https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching).")
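For reference, the prediction path wired into the UI above can also be exercised straight from Python. A minimal sketch, assuming model_weights.pth and infer.py (added below in this commit) sit in the working directory; it mirrors load_model() and infer() exactly:

import torch
from infer import USPPPMModel, USPPPMDataset

# Load the fine-tuned DeBERTa-v3-small regressor on CPU.
model = USPPPMModel('microsoft/deberta-v3-small')
model.load_state_dict(torch.load('model_weights.pth', map_location=torch.device('cpu')))
model.eval()

ds = USPPPMDataset(model.tokenizer, 133)
x, _ = ds[{'anchor': 'deflect light', 'target': 'bending moment',
           'title': 'basic electric elements', 'label': 0}]
with torch.no_grad():
    raw = model(x).cpu().numpy()[0][0]  # raw regression output, before the +2.0 display offset
print(f'raw score: {raw:.3f}')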
fold-0-train.csv ADDED
The diff for this file is too large to render. See raw diff
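The app reads only this file's title column (via get_context() above) to populate the "Context" dropdown. A quick way to inspect those values, assuming nothing about the CSV beyond that one column:

import pandas as pd

df = pd.read_csv('fold-0-train.csv')
titles = sorted(set(df['title']))  # distinct CPC context titles offered in the UI
print(len(titles), titles[:3])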
 
infer.py ADDED
@@ -0,0 +1,133 @@
+from torch import nn
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+import torch
+from torch.utils.data import Dataset
+
+
+class MeanPooling(nn.Module):
+    """Masked mean over token embeddings: padding positions are excluded."""
+
+    def __init__(self):
+        super(MeanPooling, self).__init__()
+
+    def forward(self, last_hidden_state, attention_mask):
+        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
+        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
+        sum_mask = input_mask_expanded.sum(1)
+        sum_mask = torch.clamp(sum_mask, min=1e-9)  # guard against division by zero
+        mean_embeddings = sum_embeddings / sum_mask
+        return mean_embeddings
+
+
+class MeanPoolingLayer(nn.Module):
+    """Mean-pool the backbone output, then project to a single similarity score."""
+
+    def __init__(self, input_size, target_size):
+        super(MeanPoolingLayer, self).__init__()
+        self.pool = MeanPooling()
+        self.fc = nn.Linear(input_size, target_size)
+
+    def forward(self, inputs, mask):
+        last_hidden_states = inputs[0]
+        feature = self.pool(last_hidden_states, mask)
+        outputs = self.fc(feature)
+        return outputs
+
+
+def weight_init_normal(module, model):
+    if isinstance(module, nn.Linear):
+        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
+        if module.bias is not None:
+            module.bias.data.zero_()
+    elif isinstance(module, nn.Embedding):
+        module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
+        if module.padding_idx is not None:
+            module.weight.data[module.padding_idx].zero_()
+    elif isinstance(module, nn.LayerNorm):
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
+
+
+class USPPPMModel(nn.Module):
+    def __init__(self, backbone):
+        super(USPPPMModel, self).__init__()
+        self.config = AutoConfig.from_pretrained(backbone, output_hidden_states=True)
+        self.model = AutoModel.from_pretrained(backbone, config=self.config)
+        self.head = MeanPoolingLayer(768, 1)  # 768 = deberta-v3-small hidden size
+        self.tokenizer = AutoTokenizer.from_pretrained(backbone)
+
+    def _init_weights(self, layer):
+        for module in layer.modules():
+            weight_init_normal(module, self)
+
+    def forward(self, inputs):
+        outputs = self.model(**inputs)
+        outputs = self.head(outputs, inputs['attention_mask'])
+        return outputs
+
+
+# CPC section code -> title lookup; the title serves as the "context" of a phrase pair.
+table = """
+A: Human Necessities
+B: Operations and Transport
+C: Chemistry and Metallurgy
+D: Textiles
+E: Fixed Constructions
+F: Mechanical Engineering
+G: Physics
+H: Electricity
+Y: Emerging Cross-Sectional Technologies
+"""
+splits = [i for i in table.split('\n') if i != '']
+table = {e.split(': ')[0]: e.split(': ')[1] for e in splits}
+
+
+class USPPPMDataset(Dataset):
+    def __init__(self, tokenizer, max_length):
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+
+    def __len__(self):
+        return 0  # inference-only: items are built on demand from a dict, not an index
+
+    def __getitem__(self, x):
+        score = x['label']
+        sep = self.tokenizer.sep_token
+        # Pack anchor, target, and context title into a single sequence.
+        s = x['anchor'] + sep + x['target'] + sep + x['title']
+
+        inputs = self.tokenizer(
+            s, add_special_tokens=True,
+            max_length=self.max_length, padding='max_length',
+            truncation=True,
+            return_offsets_mapping=False,
+        )
+        for k, v in inputs.items():
+            inputs[k] = torch.tensor(v, dtype=torch.long).unsqueeze(dim=0)
+        label = torch.tensor(score, dtype=torch.float)
+        return inputs, label
+
+
+if __name__ == '__main__':
+    model = USPPPMModel('microsoft/deberta-v3-small')
+    model.load_state_dict(torch.load('model_weights.pth', map_location=torch.device('cpu')))
+    model.eval()
+
+    ds = USPPPMDataset(model.tokenizer, 133)
+
+    d = {
+        'anchor': 'sprayed',
+        'target': 'thermal sprayed coating',
+        'title': 'building',
+        'label': 0,
+    }
+    inp = ds[d]
+    x = inp[0]
+
+    with torch.no_grad():
+        y = model(x)
+    print('y:', y)
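The modeling-relevant piece above is masked mean pooling: token embeddings are averaged using the attention mask, so padded positions contribute nothing to the sentence vector. A self-contained toy illustration of the same arithmetic (tensor values invented for the example):

import torch

# One sequence, 4 token positions, hidden size 3; the last two positions are padding.
last_hidden_state = torch.tensor([[[1., 1., 1.],
                                   [3., 3., 3.],
                                   [9., 9., 9.],
                                   [7., 7., 7.]]])
attention_mask = torch.tensor([[1, 1, 0, 0]])

mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
summed = (last_hidden_state * mask).sum(1)  # sums only the unmasked tokens
counts = mask.sum(1).clamp(min=1e-9)        # number of real tokens, guarded against /0
print(summed / counts)                      # tensor([[2., 2., 2.]]): padding rows ignored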
model_weights.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0b49ff053c7beac972a85d464305398ab93252901348418af1692e7ca0959dd
+size 565268017
requirements.txt ADDED
@@ -0,0 +1,6 @@
+streamlit==1.21.0
+Pillow
+protobuf
+torchvision==0.15.2
+torch==2.0.1
+numpy