Ryan Kim committed on
Commit
6bec35d
1 Parent(s): e2ba2f3

jupyter notebook now primed for training

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. src/patent_train.ipynb +1245 -0
  3. src/patent_train.py +0 -57
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ **/.DS_Store
src/patent_train.ipynb ADDED
@@ -0,0 +1,1245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {},
6
+ "source": [
7
+ "# Harvard USPTO Dataset Training"
8
+ ]
9
+ },
10
+ {
11
+ "cell_type": "markdown",
12
+ "metadata": {},
13
+ "source": [
14
+ "## Importing Packages"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 3,
20
+ "metadata": {},
21
+ "outputs": [
22
+ {
23
+ "name": "stdout",
24
+ "output_type": "stream",
25
+ "text": [
26
+ "Collecting datasets\n",
27
+ " Downloading datasets-2.11.0-py3-none-any.whl (468 kB)\n",
28
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m468.7/468.7 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
29
+ "\u001b[?25hRequirement already satisfied: tqdm>=4.62.1 in /opt/conda/lib/python3.10/site-packages (from datasets) (4.64.1)\n",
30
+ "Requirement already satisfied: pyarrow>=8.0.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (9.0.0)\n",
31
+ "Requirement already satisfied: fsspec[http]>=2021.11.1 in /opt/conda/lib/python3.10/site-packages (from datasets) (2022.8.2)\n",
32
+ "Collecting aiohttp\n",
33
+ " Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (1.0 MB)\n",
34
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m19.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
35
+ "\u001b[?25hCollecting huggingface-hub<1.0.0,>=0.11.0\n",
36
+ " Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)\n",
37
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.8/199.8 kB\u001b[0m \u001b[31m18.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
38
+ "\u001b[?25hRequirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (from datasets) (1.5.0)\n",
39
+ "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from datasets) (21.3)\n",
40
+ "Collecting responses<0.19\n",
41
+ " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n",
42
+ "Requirement already satisfied: dill<0.3.7,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.3.5.1)\n",
43
+ "Collecting xxhash\n",
44
+ " Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (242 kB)\n",
45
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m242.7/242.7 kB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
46
+ "\u001b[?25hRequirement already satisfied: requests>=2.19.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (2.28.1)\n",
47
+ "Collecting multiprocess\n",
48
+ " Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)\n",
49
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.3/134.3 kB\u001b[0m \u001b[31m11.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
50
+ "\u001b[?25hRequirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from datasets) (6.0)\n",
51
+ "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from datasets) (1.23.3)\n",
52
+ "Collecting multidict<7.0,>=4.5\n",
53
+ " Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (116 kB)\n",
54
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.6/116.6 kB\u001b[0m \u001b[31m12.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
55
+ "\u001b[?25hCollecting yarl<2.0,>=1.0\n",
56
+ " Downloading yarl-1.8.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (257 kB)\n",
57
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m257.3/257.3 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
58
+ "\u001b[?25hCollecting frozenlist>=1.1.1\n",
59
+ " Downloading frozenlist-1.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (148 kB)\n",
60
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m148.1/148.1 kB\u001b[0m \u001b[31m10.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
61
+ "\u001b[?25hRequirement already satisfied: charset-normalizer<4.0,>=2.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (2.1.1)\n",
62
+ "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->datasets) (22.1.0)\n",
63
+ "Collecting aiosignal>=1.1.2\n",
64
+ " Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n",
65
+ "Collecting async-timeout<5.0,>=4.0.0a3\n",
66
+ " Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)\n",
67
+ "Collecting filelock\n",
68
+ " Downloading filelock-3.10.7-py3-none-any.whl (10 kB)\n",
69
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub<1.0.0,>=0.11.0->datasets) (4.4.0)\n",
70
+ "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /opt/conda/lib/python3.10/site-packages (from packaging->datasets) (3.0.9)\n",
71
+ "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (3.4)\n",
72
+ "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (1.26.11)\n",
73
+ "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (2022.9.24)\n",
74
+ "Collecting dill<0.3.7,>=0.3.0\n",
75
+ " Downloading dill-0.3.6-py3-none-any.whl (110 kB)\n",
76
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m110.5/110.5 kB\u001b[0m \u001b[31m9.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
77
+ "\u001b[?25hRequirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2.8.2)\n",
78
+ "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2022.4)\n",
79
+ "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n",
80
+ "Installing collected packages: xxhash, multidict, frozenlist, filelock, dill, async-timeout, yarl, responses, multiprocess, huggingface-hub, aiosignal, aiohttp, datasets\n",
81
+ " Attempting uninstall: dill\n",
82
+ " Found existing installation: dill 0.3.5.1\n",
83
+ " Uninstalling dill-0.3.5.1:\n",
84
+ " Successfully uninstalled dill-0.3.5.1\n",
85
+ "Successfully installed aiohttp-3.8.4 aiosignal-1.3.1 async-timeout-4.0.2 datasets-2.11.0 dill-0.3.6 filelock-3.10.7 frozenlist-1.3.3 huggingface-hub-0.13.3 multidict-6.0.4 multiprocess-0.70.14 responses-0.18.0 xxhash-3.2.0 yarl-1.8.2\n"
86
+ ]
87
+ }
88
+ ],
89
+ "source": [
90
+ "!pip install datasets"
91
+ ]
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": 3,
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": [
99
+ "from datasets import load_dataset\n",
100
+ "import pandas as pd\n",
101
+ "import numpy as np\n",
102
+ "import matplotlib.pyplot as plt"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "markdown",
107
+ "metadata": {},
108
+ "source": [
109
+ "## Loading the Dataset"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "markdown",
114
+ "metadata": {},
115
+ "source": [
116
+ "We first need to load the dataset. We keep only the patent applications filed in January 2016."
117
+ ]
118
+ },
119
+ {
120
+ "cell_type": "code",
121
+ "execution_count": 17,
122
+ "metadata": {},
123
+ "outputs": [
124
+ {
125
+ "name": "stderr",
126
+ "output_type": "stream",
127
+ "text": [
128
+ "Found cached dataset hupd (/home/jovyan/.cache/huggingface/datasets/HUPD___hupd/sample-ba3b43e1cc5c9c76/0.0.0/6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142)\n"
129
+ ]
130
+ },
131
+ {
132
+ "data": {
133
+ "application/vnd.jupyter.widget-view+json": {
134
+ "model_id": "e9c97a02e1834189bcdd4c188ef555d7",
135
+ "version_major": 2,
136
+ "version_minor": 0
137
+ },
138
+ "text/plain": [
139
+ " 0%| | 0/2 [00:00<?, ?it/s]"
140
+ ]
141
+ },
142
+ "metadata": {},
143
+ "output_type": "display_data"
144
+ }
145
+ ],
146
+ "source": [
147
+ "dataset_dict = load_dataset('HUPD/hupd',\n",
148
+ " name='sample',\n",
149
+ " data_files=\"https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather\", \n",
150
+ " icpr_label=None,\n",
151
+ " train_filing_start_date='2016-01-01',\n",
152
+ " train_filing_end_date='2016-01-21',\n",
153
+ " val_filing_start_date='2016-01-22',\n",
154
+ " val_filing_end_date='2016-01-31',\n",
155
+ ")"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "markdown",
160
+ "metadata": {},
161
+ "source": [
162
+ "We print out the dataset to see which splits and features it contains."
163
+ ]
164
+ },
165
+ {
166
+ "cell_type": "code",
167
+ "execution_count": 21,
168
+ "metadata": {},
169
+ "outputs": [
170
+ {
171
+ "name": "stdout",
172
+ "output_type": "stream",
173
+ "text": [
174
+ "DatasetDict({\n",
175
+ " train: Dataset({\n",
176
+ " features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],\n",
177
+ " num_rows: 16153\n",
178
+ " })\n",
179
+ " validation: Dataset({\n",
180
+ " features: ['patent_number', 'decision', 'title', 'abstract', 'claims', 'background', 'summary', 'description', 'cpc_label', 'ipc_label', 'filing_date', 'patent_issue_date', 'date_published', 'examiner_id'],\n",
181
+ " num_rows: 9094\n",
182
+ " })\n",
183
+ "})\n"
184
+ ]
185
+ }
186
+ ],
187
+ "source": [
188
+ "print(dataset_dict)"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "markdown",
193
+ "metadata": {},
194
+ "source": [
195
+ "We split our data into separate training and validation DataFrames."
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": 19,
201
+ "metadata": {},
202
+ "outputs": [],
203
+ "source": [
204
+ "df_train = pd.DataFrame(dataset_dict['train'] )\n",
205
+ "df_val = pd.DataFrame(dataset_dict['validation'] )"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "markdown",
210
+ "metadata": {},
211
+ "source": [
212
+ "We can preview the training data"
213
+ ]
214
+ },
215
+ {
216
+ "cell_type": "code",
217
+ "execution_count": 20,
218
+ "metadata": {},
219
+ "outputs": [
220
+ {
221
+ "data": {
222
+ "text/html": [
223
+ "<div>\n",
224
+ "<style scoped>\n",
225
+ " .dataframe tbody tr th:only-of-type {\n",
226
+ " vertical-align: middle;\n",
227
+ " }\n",
228
+ "\n",
229
+ " .dataframe tbody tr th {\n",
230
+ " vertical-align: top;\n",
231
+ " }\n",
232
+ "\n",
233
+ " .dataframe thead th {\n",
234
+ " text-align: right;\n",
235
+ " }\n",
236
+ "</style>\n",
237
+ "<table border=\"1\" class=\"dataframe\">\n",
238
+ " <thead>\n",
239
+ " <tr style=\"text-align: right;\">\n",
240
+ " <th></th>\n",
241
+ " <th>patent_number</th>\n",
242
+ " <th>decision</th>\n",
243
+ " <th>title</th>\n",
244
+ " <th>abstract</th>\n",
245
+ " <th>claims</th>\n",
246
+ " <th>background</th>\n",
247
+ " <th>summary</th>\n",
248
+ " <th>description</th>\n",
249
+ " <th>cpc_label</th>\n",
250
+ " <th>ipc_label</th>\n",
251
+ " <th>filing_date</th>\n",
252
+ " <th>patent_issue_date</th>\n",
253
+ " <th>date_published</th>\n",
254
+ " <th>examiner_id</th>\n",
255
+ " </tr>\n",
256
+ " </thead>\n",
257
+ " <tbody>\n",
258
+ " <tr>\n",
259
+ " <th>0</th>\n",
260
+ " <td>13261748</td>\n",
261
+ " <td>ACCEPTED</td>\n",
262
+ " <td>MINI-OPTICAL NETWORK TERMINAL (ONT)</td>\n",
263
+ " <td>The present invention relates to passive optic...</td>\n",
264
+ " <td>1. A compact optical network terminal, compris...</td>\n",
265
+ " <td>&lt;SOH&gt; BACKGROUND OF THE INVENTION &lt;EOH&gt;A netwo...</td>\n",
266
+ " <td>&lt;SOH&gt; SUMMARY OF THE INVENTION &lt;EOH&gt;An aspect ...</td>\n",
267
+ " <td>FIELD OF THE INVENTION The present invention r...</td>\n",
268
+ " <td>H04Q110071</td>\n",
269
+ " <td>H04Q1100</td>\n",
270
+ " <td>20160120</td>\n",
271
+ " <td>20170606</td>\n",
272
+ " <td>20160526</td>\n",
273
+ " <td>95191.0</td>\n",
274
+ " </tr>\n",
275
+ " <tr>\n",
276
+ " <th>1</th>\n",
277
+ " <td>13995128</td>\n",
278
+ " <td>ACCEPTED</td>\n",
279
+ " <td>APPARATUS FOR FORMING AND READING AN IDENTIFIC...</td>\n",
280
+ " <td>Embodiments of the invention provide a method ...</td>\n",
281
+ " <td>1. A method comprising: using a first reader t...</td>\n",
282
+ " <td>&lt;SOH&gt; BACKGROUND OF THE INVENTION &lt;EOH&gt;Identif...</td>\n",
283
+ " <td>&lt;SOH&gt; SUMMARY OF THE INVENTION &lt;EOH&gt;In accorda...</td>\n",
284
+ " <td>CROSS-REFERENCE TO RELATED APPLICATIONS The pr...</td>\n",
285
+ " <td>G06K500</td>\n",
286
+ " <td>G06K500</td>\n",
287
+ " <td>20160112</td>\n",
288
+ " <td>20160322</td>\n",
289
+ " <td>20140102</td>\n",
290
+ " <td>59514.0</td>\n",
291
+ " </tr>\n",
292
+ " <tr>\n",
293
+ " <th>2</th>\n",
294
+ " <td>14241799</td>\n",
295
+ " <td>PENDING</td>\n",
296
+ " <td>PORTABLE DRUG DISPENSER</td>\n",
297
+ " <td>A portable drug dispenser includes a chamber f...</td>\n",
298
+ " <td>1. A portable drug dispenser, comprising: a ch...</td>\n",
299
+ " <td></td>\n",
300
+ " <td></td>\n",
301
+ " <td>This application claims priority from U.S. app...</td>\n",
302
+ " <td>A61J70084</td>\n",
303
+ " <td>A61J700</td>\n",
304
+ " <td>20160104</td>\n",
305
+ " <td></td>\n",
306
+ " <td>20171116</td>\n",
307
+ " <td>95928.0</td>\n",
308
+ " </tr>\n",
309
+ " <tr>\n",
310
+ " <th>3</th>\n",
311
+ " <td>14348792</td>\n",
312
+ " <td>ACCEPTED</td>\n",
313
+ " <td>LIQUID-COOLED HEAT EXCHANGER</td>\n",
314
+ " <td>A crystal growth furnace comprising a crucible...</td>\n",
315
+ " <td>1. A crystal growth furnace for growing a crys...</td>\n",
316
+ " <td>&lt;SOH&gt; BACKGROUND OF THE INVENTION &lt;EOH&gt;1. Fiel...</td>\n",
317
+ " <td>&lt;SOH&gt; SUMMARY OF THE INVENTION &lt;EOH&gt;The presen...</td>\n",
318
+ " <td>CROSS-REFERENCE TO RELATED APPLICATIONS The pr...</td>\n",
319
+ " <td>C30B11003</td>\n",
320
+ " <td>C30B1100</td>\n",
321
+ " <td>20160111</td>\n",
322
+ " <td>20180529</td>\n",
323
+ " <td>20160512</td>\n",
324
+ " <td>63013.0</td>\n",
325
+ " </tr>\n",
326
+ " <tr>\n",
327
+ " <th>4</th>\n",
328
+ " <td>14360978</td>\n",
329
+ " <td>REJECTED</td>\n",
330
+ " <td>SOLE MEMBER OF FOOTWEAR</td>\n",
331
+ " <td>A shoe midsole is composed of a base plate (1)...</td>\n",
332
+ " <td>1. A sole member of footwear comprising a base...</td>\n",
333
+ " <td>&lt;SOH&gt; BACKGROUND ART &lt;EOH&gt;When the heel touche...</td>\n",
334
+ " <td>&lt;SOH&gt; BRIEF DESCRIPTION OF THE DRAWINGS &lt;EOH&gt;F...</td>\n",
335
+ " <td>TECHNICAL FIELD The present invention relates ...</td>\n",
336
+ " <td>A43B13181</td>\n",
337
+ " <td>A43B1318</td>\n",
338
+ " <td>20160113</td>\n",
339
+ " <td></td>\n",
340
+ " <td>20160512</td>\n",
341
+ " <td>94490.0</td>\n",
342
+ " </tr>\n",
343
+ " <tr>\n",
344
+ " <th>...</th>\n",
345
+ " <td>...</td>\n",
346
+ " <td>...</td>\n",
347
+ " <td>...</td>\n",
348
+ " <td>...</td>\n",
349
+ " <td>...</td>\n",
350
+ " <td>...</td>\n",
351
+ " <td>...</td>\n",
352
+ " <td>...</td>\n",
353
+ " <td>...</td>\n",
354
+ " <td>...</td>\n",
355
+ " <td>...</td>\n",
356
+ " <td>...</td>\n",
357
+ " <td>...</td>\n",
358
+ " <td>...</td>\n",
359
+ " </tr>\n",
360
+ " <tr>\n",
361
+ " <th>16148</th>\n",
362
+ " <td>15002394</td>\n",
363
+ " <td>ACCEPTED</td>\n",
364
+ " <td>ROBOT HAND CONTROLLING METHOD AND ROBOTICS DEVICE</td>\n",
365
+ " <td>A robot hand controlling method executes calcu...</td>\n",
366
+ " <td>1. A controlling method of a robot hand, the r...</td>\n",
367
+ " <td>&lt;SOH&gt; BACKGROUND OF THE INVENTION &lt;EOH&gt;1. Fiel...</td>\n",
368
+ " <td>&lt;SOH&gt; SUMMARY OF THE INVENTION &lt;EOH&gt;An object ...</td>\n",
369
+ " <td>BACKGROUND OF THE INVENTION 1. Field of the In...</td>\n",
370
+ " <td>B25J91612</td>\n",
371
+ " <td>B25J916</td>\n",
372
+ " <td>20160120</td>\n",
373
+ " <td>20180710</td>\n",
374
+ " <td>20160804</td>\n",
375
+ " <td>66148.0</td>\n",
376
+ " </tr>\n",
377
+ " <tr>\n",
378
+ " <th>16149</th>\n",
379
+ " <td>15002396</td>\n",
380
+ " <td>REJECTED</td>\n",
381
+ " <td>IMMUNOGLOBULIN FUSION PROTEINS AND USES THEREOF</td>\n",
382
+ " <td>A fusion protein is disclosed. The fusion prot...</td>\n",
383
+ " <td>1. A fusion protein comprising an Fc fragment ...</td>\n",
384
+ " <td>&lt;SOH&gt; BACKGROUND OF THE INVENTION &lt;EOH&gt;An immu...</td>\n",
385
+ " <td>&lt;SOH&gt; SUMMARY OF THE INVENTION &lt;EOH&gt;The presen...</td>\n",
386
+ " <td>The present application is a U.S. Nonprovision...</td>\n",
387
+ " <td>C07K14745</td>\n",
388
+ " <td>C07K14745</td>\n",
389
+ " <td>20160120</td>\n",
390
+ " <td></td>\n",
391
+ " <td>20161215</td>\n",
392
+ " <td>95819.0</td>\n",
393
+ " </tr>\n",
394
+ " <tr>\n",
395
+ " <th>16150</th>\n",
396
+ " <td>15330955</td>\n",
397
+ " <td>REJECTED</td>\n",
398
+ " <td>PIPE EXTRACTION TOOL</td>\n",
399
+ " <td>A pipe extraction tool that grips the inside o...</td>\n",
400
+ " <td>1. A pipe extraction tool for extracting a pip...</td>\n",
401
+ " <td>&lt;SOH&gt; BACKGROUND OF THE INVENTION &lt;EOH&gt;1. Fiel...</td>\n",
402
+ " <td>&lt;SOH&gt; BRIEF SUMMARY OF THE INVENTION &lt;EOH&gt;The ...</td>\n",
403
+ " <td>CROSS-REFERENCES TO RELATED APPLICATIONS Not a...</td>\n",
404
+ " <td>B25B2714</td>\n",
405
+ " <td>B25B2714</td>\n",
406
+ " <td>20160120</td>\n",
407
+ " <td></td>\n",
408
+ " <td>20170907</td>\n",
409
+ " <td>95661.0</td>\n",
410
+ " </tr>\n",
411
+ " <tr>\n",
412
+ " <th>16151</th>\n",
413
+ " <td>15330961</td>\n",
414
+ " <td>PENDING</td>\n",
415
+ " <td>Molded parts with thermoplastic cellulose biop...</td>\n",
416
+ " <td>A longitudinal extending body with oriented fi...</td>\n",
417
+ " <td>1. A longitudinal body of a solidified organic...</td>\n",
418
+ " <td>&lt;SOH&gt; BACKGROUND OF INVENTION &lt;EOH&gt;In the medi...</td>\n",
419
+ " <td>&lt;SOH&gt; BRIEF SUMMARY OF THE PRESENT INVENTION &lt;...</td>\n",
420
+ " <td>CROSS REFERENCES Application claims priority o...</td>\n",
421
+ " <td>A61L3106</td>\n",
422
+ " <td>A61L3106</td>\n",
423
+ " <td>20160111</td>\n",
424
+ " <td></td>\n",
425
+ " <td>20171019</td>\n",
426
+ " <td>96956.0</td>\n",
427
+ " </tr>\n",
428
+ " <tr>\n",
429
+ " <th>16152</th>\n",
430
+ " <td>15330968</td>\n",
431
+ " <td>PENDING</td>\n",
432
+ " <td>Transmission method with double directivity</td>\n",
433
+ " <td>A transmission method using a massive MIMO (Mu...</td>\n",
434
+ " <td>1. Transmission method with double directivity...</td>\n",
435
+ " <td>&lt;SOH&gt; BACKGROUND OF THE INVENTION &lt;EOH&gt;</td>\n",
436
+ " <td>&lt;SOH&gt; BRIEF SUMMARY OF THE INVENTION &lt;EOH&gt;The ...</td>\n",
437
+ " <td>BACKGROUND OF THE INVENTION Field of the Inven...</td>\n",
438
+ " <td>H04B7043</td>\n",
439
+ " <td>H04B704</td>\n",
440
+ " <td>20160114</td>\n",
441
+ " <td></td>\n",
442
+ " <td>20180329</td>\n",
443
+ " <td>70883.0</td>\n",
444
+ " </tr>\n",
445
+ " </tbody>\n",
446
+ "</table>\n",
447
+ "<p>16153 rows × 14 columns</p>\n",
448
+ "</div>"
449
+ ],
450
+ "text/plain": [
451
+ " patent_number decision \\\n",
452
+ "0 13261748 ACCEPTED \n",
453
+ "1 13995128 ACCEPTED \n",
454
+ "2 14241799 PENDING \n",
455
+ "3 14348792 ACCEPTED \n",
456
+ "4 14360978 REJECTED \n",
457
+ "... ... ... \n",
458
+ "16148 15002394 ACCEPTED \n",
459
+ "16149 15002396 REJECTED \n",
460
+ "16150 15330955 REJECTED \n",
461
+ "16151 15330961 PENDING \n",
462
+ "16152 15330968 PENDING \n",
463
+ "\n",
464
+ " title \\\n",
465
+ "0 MINI-OPTICAL NETWORK TERMINAL (ONT) \n",
466
+ "1 APPARATUS FOR FORMING AND READING AN IDENTIFIC... \n",
467
+ "2 PORTABLE DRUG DISPENSER \n",
468
+ "3 LIQUID-COOLED HEAT EXCHANGER \n",
469
+ "4 SOLE MEMBER OF FOOTWEAR \n",
470
+ "... ... \n",
471
+ "16148 ROBOT HAND CONTROLLING METHOD AND ROBOTICS DEVICE \n",
472
+ "16149 IMMUNOGLOBULIN FUSION PROTEINS AND USES THEREOF \n",
473
+ "16150 PIPE EXTRACTION TOOL \n",
474
+ "16151 Molded parts with thermoplastic cellulose biop... \n",
475
+ "16152 Transmission method with double directivity \n",
476
+ "\n",
477
+ " abstract \\\n",
478
+ "0 The present invention relates to passive optic... \n",
479
+ "1 Embodiments of the invention provide a method ... \n",
480
+ "2 A portable drug dispenser includes a chamber f... \n",
481
+ "3 A crystal growth furnace comprising a crucible... \n",
482
+ "4 A shoe midsole is composed of a base plate (1)... \n",
483
+ "... ... \n",
484
+ "16148 A robot hand controlling method executes calcu... \n",
485
+ "16149 A fusion protein is disclosed. The fusion prot... \n",
486
+ "16150 A pipe extraction tool that grips the inside o... \n",
487
+ "16151 A longitudinal extending body with oriented fi... \n",
488
+ "16152 A transmission method using a massive MIMO (Mu... \n",
489
+ "\n",
490
+ " claims \\\n",
491
+ "0 1. A compact optical network terminal, compris... \n",
492
+ "1 1. A method comprising: using a first reader t... \n",
493
+ "2 1. A portable drug dispenser, comprising: a ch... \n",
494
+ "3 1. A crystal growth furnace for growing a crys... \n",
495
+ "4 1. A sole member of footwear comprising a base... \n",
496
+ "... ... \n",
497
+ "16148 1. A controlling method of a robot hand, the r... \n",
498
+ "16149 1. A fusion protein comprising an Fc fragment ... \n",
499
+ "16150 1. A pipe extraction tool for extracting a pip... \n",
500
+ "16151 1. A longitudinal body of a solidified organic... \n",
501
+ "16152 1. Transmission method with double directivity... \n",
502
+ "\n",
503
+ " background \\\n",
504
+ "0 <SOH> BACKGROUND OF THE INVENTION <EOH>A netwo... \n",
505
+ "1 <SOH> BACKGROUND OF THE INVENTION <EOH>Identif... \n",
506
+ "2 \n",
507
+ "3 <SOH> BACKGROUND OF THE INVENTION <EOH>1. Fiel... \n",
508
+ "4 <SOH> BACKGROUND ART <EOH>When the heel touche... \n",
509
+ "... ... \n",
510
+ "16148 <SOH> BACKGROUND OF THE INVENTION <EOH>1. Fiel... \n",
511
+ "16149 <SOH> BACKGROUND OF THE INVENTION <EOH>An immu... \n",
512
+ "16150 <SOH> BACKGROUND OF THE INVENTION <EOH>1. Fiel... \n",
513
+ "16151 <SOH> BACKGROUND OF INVENTION <EOH>In the medi... \n",
514
+ "16152 <SOH> BACKGROUND OF THE INVENTION <EOH> \n",
515
+ "\n",
516
+ " summary \\\n",
517
+ "0 <SOH> SUMMARY OF THE INVENTION <EOH>An aspect ... \n",
518
+ "1 <SOH> SUMMARY OF THE INVENTION <EOH>In accorda... \n",
519
+ "2 \n",
520
+ "3 <SOH> SUMMARY OF THE INVENTION <EOH>The presen... \n",
521
+ "4 <SOH> BRIEF DESCRIPTION OF THE DRAWINGS <EOH>F... \n",
522
+ "... ... \n",
523
+ "16148 <SOH> SUMMARY OF THE INVENTION <EOH>An object ... \n",
524
+ "16149 <SOH> SUMMARY OF THE INVENTION <EOH>The presen... \n",
525
+ "16150 <SOH> BRIEF SUMMARY OF THE INVENTION <EOH>The ... \n",
526
+ "16151 <SOH> BRIEF SUMMARY OF THE PRESENT INVENTION <... \n",
527
+ "16152 <SOH> BRIEF SUMMARY OF THE INVENTION <EOH>The ... \n",
528
+ "\n",
529
+ " description cpc_label \\\n",
530
+ "0 FIELD OF THE INVENTION The present invention r... H04Q110071 \n",
531
+ "1 CROSS-REFERENCE TO RELATED APPLICATIONS The pr... G06K500 \n",
532
+ "2 This application claims priority from U.S. app... A61J70084 \n",
533
+ "3 CROSS-REFERENCE TO RELATED APPLICATIONS The pr... C30B11003 \n",
534
+ "4 TECHNICAL FIELD The present invention relates ... A43B13181 \n",
535
+ "... ... ... \n",
536
+ "16148 BACKGROUND OF THE INVENTION 1. Field of the In... B25J91612 \n",
537
+ "16149 The present application is a U.S. Nonprovision... C07K14745 \n",
538
+ "16150 CROSS-REFERENCES TO RELATED APPLICATIONS Not a... B25B2714 \n",
539
+ "16151 CROSS REFERENCES Application claims priority o... A61L3106 \n",
540
+ "16152 BACKGROUND OF THE INVENTION Field of the Inven... H04B7043 \n",
541
+ "\n",
542
+ " ipc_label filing_date patent_issue_date date_published examiner_id \n",
543
+ "0 H04Q1100 20160120 20170606 20160526 95191.0 \n",
544
+ "1 G06K500 20160112 20160322 20140102 59514.0 \n",
545
+ "2 A61J700 20160104 20171116 95928.0 \n",
546
+ "3 C30B1100 20160111 20180529 20160512 63013.0 \n",
547
+ "4 A43B1318 20160113 20160512 94490.0 \n",
548
+ "... ... ... ... ... ... \n",
549
+ "16148 B25J916 20160120 20180710 20160804 66148.0 \n",
550
+ "16149 C07K14745 20160120 20161215 95819.0 \n",
551
+ "16150 B25B2714 20160120 20170907 95661.0 \n",
552
+ "16151 A61L3106 20160111 20171019 96956.0 \n",
553
+ "16152 H04B704 20160114 20180329 70883.0 \n",
554
+ "\n",
555
+ "[16153 rows x 14 columns]"
556
+ ]
557
+ },
558
+ "execution_count": 20,
559
+ "metadata": {},
560
+ "output_type": "execute_result"
561
+ }
562
+ ],
563
+ "source": [
564
+ "df_train"
565
+ ]
566
+ },
567
+ {
568
+ "cell_type": "markdown",
569
+ "metadata": {},
570
+ "source": [
571
+ "## Pre-Processing the Data"
572
+ ]
573
+ },
574
+ {
575
+ "cell_type": "markdown",
576
+ "metadata": {},
577
+ "source": [
578
+ "We are interested in the following columns:\n",
579
+ "- Abstract\n",
580
+ "- Claims\n",
581
+ "- Decision <- our `y`\n",
582
+ "\n",
583
+ "Let's extract these columns from both our training and validation data.\n",
584
+ "\n",
585
+ "Also, note that the \"Decision\" column has three types of values: \"Accepted\", \"Rejected\", and \"Pending\". To remove unnecessary baggage, we will keep only the rows marked \"Accepted\" or \"Rejected\"."
586
+ ]
587
+ },
588
+ {
589
+ "cell_type": "code",
590
+ "execution_count": 28,
591
+ "metadata": {},
592
+ "outputs": [],
593
+ "source": [
594
+ "necessary_columns = [\"abstract\",\"claims\",\"decision\"]\n",
595
+ "output_values = ['ACCEPTED','REJECTED'] "
596
+ ]
597
+ },
598
+ {
599
+ "cell_type": "code",
600
+ "execution_count": 29,
601
+ "metadata": {},
602
+ "outputs": [],
603
+ "source": [
604
+ "trainFeaturesToDrop = [col for col in list(df_train.columns) if col not in necessary_columns]\n",
605
+ "trainDF = df_train.dropna()\n",
606
+ "trainDF.drop(columns=trainFeaturesToDrop, inplace=True)\n",
607
+ "trainDF = trainDF[trainDF['decision'].isin(output_values)]"
608
+ ]
609
+ },
610
+ {
611
+ "cell_type": "code",
612
+ "execution_count": 30,
613
+ "metadata": {},
614
+ "outputs": [
615
+ {
616
+ "data": {
617
+ "text/html": [
618
+ "<div>\n",
619
+ "<style scoped>\n",
620
+ " .dataframe tbody tr th:only-of-type {\n",
621
+ " vertical-align: middle;\n",
622
+ " }\n",
623
+ "\n",
624
+ " .dataframe tbody tr th {\n",
625
+ " vertical-align: top;\n",
626
+ " }\n",
627
+ "\n",
628
+ " .dataframe thead th {\n",
629
+ " text-align: right;\n",
630
+ " }\n",
631
+ "</style>\n",
632
+ "<table border=\"1\" class=\"dataframe\">\n",
633
+ " <thead>\n",
634
+ " <tr style=\"text-align: right;\">\n",
635
+ " <th></th>\n",
636
+ " <th>decision</th>\n",
637
+ " <th>abstract</th>\n",
638
+ " <th>claims</th>\n",
639
+ " </tr>\n",
640
+ " </thead>\n",
641
+ " <tbody>\n",
642
+ " <tr>\n",
643
+ " <th>0</th>\n",
644
+ " <td>ACCEPTED</td>\n",
645
+ " <td>The present invention relates to passive optic...</td>\n",
646
+ " <td>1. A compact optical network terminal, compris...</td>\n",
647
+ " </tr>\n",
648
+ " <tr>\n",
649
+ " <th>1</th>\n",
650
+ " <td>ACCEPTED</td>\n",
651
+ " <td>Embodiments of the invention provide a method ...</td>\n",
652
+ " <td>1. A method comprising: using a first reader t...</td>\n",
653
+ " </tr>\n",
654
+ " <tr>\n",
655
+ " <th>3</th>\n",
656
+ " <td>ACCEPTED</td>\n",
657
+ " <td>A crystal growth furnace comprising a crucible...</td>\n",
658
+ " <td>1. A crystal growth furnace for growing a crys...</td>\n",
659
+ " </tr>\n",
660
+ " <tr>\n",
661
+ " <th>4</th>\n",
662
+ " <td>REJECTED</td>\n",
663
+ " <td>A shoe midsole is composed of a base plate (1)...</td>\n",
664
+ " <td>1. A sole member of footwear comprising a base...</td>\n",
665
+ " </tr>\n",
666
+ " <tr>\n",
667
+ " <th>5</th>\n",
668
+ " <td>ACCEPTED</td>\n",
669
+ " <td>A ratchet tool includes a shaft member, a hand...</td>\n",
670
+ " <td>1. A ratchet tool, comprising a shaft member, ...</td>\n",
671
+ " </tr>\n",
672
+ " <tr>\n",
673
+ " <th>...</th>\n",
674
+ " <td>...</td>\n",
675
+ " <td>...</td>\n",
676
+ " <td>...</td>\n",
677
+ " </tr>\n",
678
+ " <tr>\n",
679
+ " <th>16144</th>\n",
680
+ " <td>ACCEPTED</td>\n",
681
+ " <td>A wavelength tunable laser device, including: ...</td>\n",
682
+ " <td>1. A wavelength tunable laser device, comprisi...</td>\n",
683
+ " </tr>\n",
684
+ " <tr>\n",
685
+ " <th>16145</th>\n",
686
+ " <td>ACCEPTED</td>\n",
687
+ " <td>In one aspect, a method for use in preparing a...</td>\n",
688
+ " <td>1. (canceled) 2. The method of claim 19, where...</td>\n",
689
+ " </tr>\n",
690
+ " <tr>\n",
691
+ " <th>16148</th>\n",
692
+ " <td>ACCEPTED</td>\n",
693
+ " <td>A robot hand controlling method executes calcu...</td>\n",
694
+ " <td>1. A controlling method of a robot hand, the r...</td>\n",
695
+ " </tr>\n",
696
+ " <tr>\n",
697
+ " <th>16149</th>\n",
698
+ " <td>REJECTED</td>\n",
699
+ " <td>A fusion protein is disclosed. The fusion prot...</td>\n",
700
+ " <td>1. A fusion protein comprising an Fc fragment ...</td>\n",
701
+ " </tr>\n",
702
+ " <tr>\n",
703
+ " <th>16150</th>\n",
704
+ " <td>REJECTED</td>\n",
705
+ " <td>A pipe extraction tool that grips the inside o...</td>\n",
706
+ " <td>1. A pipe extraction tool for extracting a pip...</td>\n",
707
+ " </tr>\n",
708
+ " </tbody>\n",
709
+ "</table>\n",
710
+ "<p>8719 rows × 3 columns</p>\n",
711
+ "</div>"
712
+ ],
713
+ "text/plain": [
714
+ " decision abstract \\\n",
715
+ "0 ACCEPTED The present invention relates to passive optic... \n",
716
+ "1 ACCEPTED Embodiments of the invention provide a method ... \n",
717
+ "3 ACCEPTED A crystal growth furnace comprising a crucible... \n",
718
+ "4 REJECTED A shoe midsole is composed of a base plate (1)... \n",
719
+ "5 ACCEPTED A ratchet tool includes a shaft member, a hand... \n",
720
+ "... ... ... \n",
721
+ "16144 ACCEPTED A wavelength tunable laser device, including: ... \n",
722
+ "16145 ACCEPTED In one aspect, a method for use in preparing a... \n",
723
+ "16148 ACCEPTED A robot hand controlling method executes calcu... \n",
724
+ "16149 REJECTED A fusion protein is disclosed. The fusion prot... \n",
725
+ "16150 REJECTED A pipe extraction tool that grips the inside o... \n",
726
+ "\n",
727
+ " claims \n",
728
+ "0 1. A compact optical network terminal, compris... \n",
729
+ "1 1. A method comprising: using a first reader t... \n",
730
+ "3 1. A crystal growth furnace for growing a crys... \n",
731
+ "4 1. A sole member of footwear comprising a base... \n",
732
+ "5 1. A ratchet tool, comprising a shaft member, ... \n",
733
+ "... ... \n",
734
+ "16144 1. A wavelength tunable laser device, comprisi... \n",
735
+ "16145 1. (canceled) 2. The method of claim 19, where... \n",
736
+ "16148 1. A controlling method of a robot hand, the r... \n",
737
+ "16149 1. A fusion protein comprising an Fc fragment ... \n",
738
+ "16150 1. A pipe extraction tool for extracting a pip... \n",
739
+ "\n",
740
+ "[8719 rows x 3 columns]"
741
+ ]
742
+ },
743
+ "execution_count": 30,
744
+ "metadata": {},
745
+ "output_type": "execute_result"
746
+ }
747
+ ],
748
+ "source": [
749
+ "trainDF"
750
+ ]
751
+ },
752
+ {
753
+ "cell_type": "code",
754
+ "execution_count": 31,
755
+ "metadata": {},
756
+ "outputs": [],
757
+ "source": [
758
+ "valFeaturesToDrop = [col for col in list(df_val.columns) if col not in necessary_columns]\n",
759
+ "valDF = df_val.dropna()\n",
760
+ "valDF.drop(columns=valFeaturesToDrop, inplace=True)\n",
761
+ "valDF = valDF[valDF['decision'].isin(output_values)]"
762
+ ]
763
+ },
764
+ {
765
+ "cell_type": "code",
766
+ "execution_count": 32,
767
+ "metadata": {},
768
+ "outputs": [
769
+ {
770
+ "data": {
771
+ "text/html": [
772
+ "<div>\n",
773
+ "<style scoped>\n",
774
+ " .dataframe tbody tr th:only-of-type {\n",
775
+ " vertical-align: middle;\n",
776
+ " }\n",
777
+ "\n",
778
+ " .dataframe tbody tr th {\n",
779
+ " vertical-align: top;\n",
780
+ " }\n",
781
+ "\n",
782
+ " .dataframe thead th {\n",
783
+ " text-align: right;\n",
784
+ " }\n",
785
+ "</style>\n",
786
+ "<table border=\"1\" class=\"dataframe\">\n",
787
+ " <thead>\n",
788
+ " <tr style=\"text-align: right;\">\n",
789
+ " <th></th>\n",
790
+ " <th>decision</th>\n",
791
+ " <th>abstract</th>\n",
792
+ " <th>claims</th>\n",
793
+ " </tr>\n",
794
+ " </thead>\n",
795
+ " <tbody>\n",
796
+ " <tr>\n",
797
+ " <th>0</th>\n",
798
+ " <td>REJECTED</td>\n",
799
+ " <td>Regimen for the treatment of rosacea include t...</td>\n",
800
+ " <td>1. A treatment regimen comprising: cleansing a...</td>\n",
801
+ " </tr>\n",
802
+ " <tr>\n",
803
+ " <th>1</th>\n",
804
+ " <td>ACCEPTED</td>\n",
805
+ " <td>A clamp arrangement includes a pair of bracket...</td>\n",
806
+ " <td>1. A clamp arrangement for supporting a fractu...</td>\n",
807
+ " </tr>\n",
808
+ " <tr>\n",
809
+ " <th>2</th>\n",
810
+ " <td>REJECTED</td>\n",
811
+ " <td>A system and method for device action and conf...</td>\n",
812
+ " <td>1-20. (canceled) 21. A mobile device comprisin...</td>\n",
813
+ " </tr>\n",
814
+ " <tr>\n",
815
+ " <th>4</th>\n",
816
+ " <td>REJECTED</td>\n",
817
+ " <td>Systems and methods for managing datasets prod...</td>\n",
818
+ " <td>1. A method, comprising: executing, by one or ...</td>\n",
819
+ " </tr>\n",
820
+ " <tr>\n",
821
+ " <th>9</th>\n",
822
+ " <td>ACCEPTED</td>\n",
823
+ " <td>A scan driving circuit is provided. The scan d...</td>\n",
824
+ " <td>1. A scan driving circuit for driving a scan l...</td>\n",
825
+ " </tr>\n",
826
+ " <tr>\n",
827
+ " <th>...</th>\n",
828
+ " <td>...</td>\n",
829
+ " <td>...</td>\n",
830
+ " <td>...</td>\n",
831
+ " </tr>\n",
832
+ " <tr>\n",
833
+ " <th>9085</th>\n",
834
+ " <td>REJECTED</td>\n",
835
+ " <td>The non-rigid gate device as described may be ...</td>\n",
836
+ " <td>1; A non-rigid blocking apparatus referred to ...</td>\n",
837
+ " </tr>\n",
838
+ " <tr>\n",
839
+ " <th>9090</th>\n",
840
+ " <td>REJECTED</td>\n",
841
+ " <td>The present invention provides an improved unc...</td>\n",
842
+ " <td>1. A method for rendering a plastic surface am...</td>\n",
843
+ " </tr>\n",
844
+ " <tr>\n",
845
+ " <th>9091</th>\n",
846
+ " <td>ACCEPTED</td>\n",
847
+ " <td>A method for detecting a software-race conditi...</td>\n",
848
+ " <td>1. A method for detecting a software-race cond...</td>\n",
849
+ " </tr>\n",
850
+ " <tr>\n",
851
+ " <th>9092</th>\n",
852
+ " <td>ACCEPTED</td>\n",
853
+ " <td>The present application relates to multi-stage...</td>\n",
854
+ " <td>1. A multi-stage amplitude modulation-based me...</td>\n",
855
+ " </tr>\n",
856
+ " <tr>\n",
857
+ " <th>9093</th>\n",
858
+ " <td>ACCEPTED</td>\n",
859
+ " <td>A paper feeder includes a housing, a driving u...</td>\n",
860
+ " <td>1. A paper feeder, comprising: a housing; a dr...</td>\n",
861
+ " </tr>\n",
862
+ " </tbody>\n",
863
+ "</table>\n",
864
+ "<p>4888 rows × 3 columns</p>\n",
865
+ "</div>"
866
+ ],
867
+ "text/plain": [
868
+ " decision abstract \\\n",
869
+ "0 REJECTED Regimen for the treatment of rosacea include t... \n",
870
+ "1 ACCEPTED A clamp arrangement includes a pair of bracket... \n",
871
+ "2 REJECTED A system and method for device action and conf... \n",
872
+ "4 REJECTED Systems and methods for managing datasets prod... \n",
873
+ "9 ACCEPTED A scan driving circuit is provided. The scan d... \n",
874
+ "... ... ... \n",
875
+ "9085 REJECTED The non-rigid gate device as described may be ... \n",
876
+ "9090 REJECTED The present invention provides an improved unc... \n",
877
+ "9091 ACCEPTED A method for detecting a software-race conditi... \n",
878
+ "9092 ACCEPTED The present application relates to multi-stage... \n",
879
+ "9093 ACCEPTED A paper feeder includes a housing, a driving u... \n",
880
+ "\n",
881
+ " claims \n",
882
+ "0 1. A treatment regimen comprising: cleansing a... \n",
883
+ "1 1. A clamp arrangement for supporting a fractu... \n",
884
+ "2 1-20. (canceled) 21. A mobile device comprisin... \n",
885
+ "4 1. A method, comprising: executing, by one or ... \n",
886
+ "9 1. A scan driving circuit for driving a scan l... \n",
887
+ "... ... \n",
888
+ "9085 1; A non-rigid blocking apparatus referred to ... \n",
889
+ "9090 1. A method for rendering a plastic surface am... \n",
890
+ "9091 1. A method for detecting a software-race cond... \n",
891
+ "9092 1. A multi-stage amplitude modulation-based me... \n",
892
+ "9093 1. A paper feeder, comprising: a housing; a dr... \n",
893
+ "\n",
894
+ "[4888 rows x 3 columns]"
895
+ ]
896
+ },
897
+ "execution_count": 32,
898
+ "metadata": {},
899
+ "output_type": "execute_result"
900
+ }
901
+ ],
902
+ "source": [
903
+ "valDF"
904
+ ]
905
+ },
906
+ {
907
+ "cell_type": "markdown",
908
+ "metadata": {},
909
+ "source": [
910
+ "We need to replace the values in the `decision` column with numerical representations. We will set \"ACCEPTED\" as `1` and \"REJECTED\" as `0`."
911
+ ]
912
+ },
913
+ {
914
+ "cell_type": "code",
915
+ "execution_count": 33,
916
+ "metadata": {},
917
+ "outputs": [],
918
+ "source": [
919
+ "yKey = {\"ACCEPTED\":1,\"REJECTED\":0}"
920
+ ]
921
+ },
922
+ {
923
+ "cell_type": "code",
924
+ "execution_count": 34,
925
+ "metadata": {},
926
+ "outputs": [],
927
+ "source": [
928
+ "trainDF2 = trainDF.replace({\"decision\": yKey})\n",
929
+ "valDF2 = valDF.replace({\"decision\": yKey})"
930
+ ]
931
+ },
932
+ {
933
+ "cell_type": "code",
934
+ "execution_count": 35,
935
+ "metadata": {},
936
+ "outputs": [
937
+ {
938
+ "data": {
939
+ "text/html": [
940
+ "<div>\n",
941
+ "<style scoped>\n",
942
+ " .dataframe tbody tr th:only-of-type {\n",
943
+ " vertical-align: middle;\n",
944
+ " }\n",
945
+ "\n",
946
+ " .dataframe tbody tr th {\n",
947
+ " vertical-align: top;\n",
948
+ " }\n",
949
+ "\n",
950
+ " .dataframe thead th {\n",
951
+ " text-align: right;\n",
952
+ " }\n",
953
+ "</style>\n",
954
+ "<table border=\"1\" class=\"dataframe\">\n",
955
+ " <thead>\n",
956
+ " <tr style=\"text-align: right;\">\n",
957
+ " <th></th>\n",
958
+ " <th>decision</th>\n",
959
+ " <th>abstract</th>\n",
960
+ " <th>claims</th>\n",
961
+ " </tr>\n",
962
+ " </thead>\n",
963
+ " <tbody>\n",
964
+ " <tr>\n",
965
+ " <th>0</th>\n",
966
+ " <td>1</td>\n",
967
+ " <td>The present invention relates to passive optic...</td>\n",
968
+ " <td>1. A compact optical network terminal, compris...</td>\n",
969
+ " </tr>\n",
970
+ " <tr>\n",
971
+ " <th>1</th>\n",
972
+ " <td>1</td>\n",
973
+ " <td>Embodiments of the invention provide a method ...</td>\n",
974
+ " <td>1. A method comprising: using a first reader t...</td>\n",
975
+ " </tr>\n",
976
+ " <tr>\n",
977
+ " <th>3</th>\n",
978
+ " <td>1</td>\n",
979
+ " <td>A crystal growth furnace comprising a crucible...</td>\n",
980
+ " <td>1. A crystal growth furnace for growing a crys...</td>\n",
981
+ " </tr>\n",
982
+ " <tr>\n",
983
+ " <th>4</th>\n",
984
+ " <td>0</td>\n",
985
+ " <td>A shoe midsole is composed of a base plate (1)...</td>\n",
986
+ " <td>1. A sole member of footwear comprising a base...</td>\n",
987
+ " </tr>\n",
988
+ " <tr>\n",
989
+ " <th>5</th>\n",
990
+ " <td>1</td>\n",
991
+ " <td>A ratchet tool includes a shaft member, a hand...</td>\n",
992
+ " <td>1. A ratchet tool, comprising a shaft member, ...</td>\n",
993
+ " </tr>\n",
994
+ " <tr>\n",
995
+ " <th>...</th>\n",
996
+ " <td>...</td>\n",
997
+ " <td>...</td>\n",
998
+ " <td>...</td>\n",
999
+ " </tr>\n",
1000
+ " <tr>\n",
1001
+ " <th>16144</th>\n",
1002
+ " <td>1</td>\n",
1003
+ " <td>A wavelength tunable laser device, including: ...</td>\n",
1004
+ " <td>1. A wavelength tunable laser device, comprisi...</td>\n",
1005
+ " </tr>\n",
1006
+ " <tr>\n",
1007
+ " <th>16145</th>\n",
1008
+ " <td>1</td>\n",
1009
+ " <td>In one aspect, a method for use in preparing a...</td>\n",
1010
+ " <td>1. (canceled) 2. The method of claim 19, where...</td>\n",
1011
+ " </tr>\n",
1012
+ " <tr>\n",
1013
+ " <th>16148</th>\n",
1014
+ " <td>1</td>\n",
1015
+ " <td>A robot hand controlling method executes calcu...</td>\n",
1016
+ " <td>1. A controlling method of a robot hand, the r...</td>\n",
1017
+ " </tr>\n",
1018
+ " <tr>\n",
1019
+ " <th>16149</th>\n",
1020
+ " <td>0</td>\n",
1021
+ " <td>A fusion protein is disclosed. The fusion prot...</td>\n",
1022
+ " <td>1. A fusion protein comprising an Fc fragment ...</td>\n",
1023
+ " </tr>\n",
1024
+ " <tr>\n",
1025
+ " <th>16150</th>\n",
1026
+ " <td>0</td>\n",
1027
+ " <td>A pipe extraction tool that grips the inside o...</td>\n",
1028
+ " <td>1. A pipe extraction tool for extracting a pip...</td>\n",
1029
+ " </tr>\n",
1030
+ " </tbody>\n",
1031
+ "</table>\n",
1032
+ "<p>8719 rows × 3 columns</p>\n",
1033
+ "</div>"
1034
+ ],
1035
+ "text/plain": [
1036
+ " decision abstract \\\n",
1037
+ "0 1 The present invention relates to passive optic... \n",
1038
+ "1 1 Embodiments of the invention provide a method ... \n",
1039
+ "3 1 A crystal growth furnace comprising a crucible... \n",
1040
+ "4 0 A shoe midsole is composed of a base plate (1)... \n",
1041
+ "5 1 A ratchet tool includes a shaft member, a hand... \n",
1042
+ "... ... ... \n",
1043
+ "16144 1 A wavelength tunable laser device, including: ... \n",
1044
+ "16145 1 In one aspect, a method for use in preparing a... \n",
1045
+ "16148 1 A robot hand controlling method executes calcu... \n",
1046
+ "16149 0 A fusion protein is disclosed. The fusion prot... \n",
1047
+ "16150 0 A pipe extraction tool that grips the inside o... \n",
1048
+ "\n",
1049
+ " claims \n",
1050
+ "0 1. A compact optical network terminal, compris... \n",
1051
+ "1 1. A method comprising: using a first reader t... \n",
1052
+ "3 1. A crystal growth furnace for growing a crys... \n",
1053
+ "4 1. A sole member of footwear comprising a base... \n",
1054
+ "5 1. A ratchet tool, comprising a shaft member, ... \n",
1055
+ "... ... \n",
1056
+ "16144 1. A wavelength tunable laser device, comprisi... \n",
1057
+ "16145 1. (canceled) 2. The method of claim 19, where... \n",
1058
+ "16148 1. A controlling method of a robot hand, the r... \n",
1059
+ "16149 1. A fusion protein comprising an Fc fragment ... \n",
1060
+ "16150 1. A pipe extraction tool for extracting a pip... \n",
1061
+ "\n",
1062
+ "[8719 rows x 3 columns]"
1063
+ ]
1064
+ },
1065
+ "execution_count": 35,
1066
+ "metadata": {},
1067
+ "output_type": "execute_result"
1068
+ }
1069
+ ],
1070
+ "source": [
1071
+ "trainDF2"
1072
+ ]
1073
+ },
1074
+ {
1075
+ "cell_type": "code",
1076
+ "execution_count": 36,
1077
+ "metadata": {},
1078
+ "outputs": [
1079
+ {
1080
+ "data": {
1081
+ "text/html": [
1082
+ "<div>\n",
1083
+ "<style scoped>\n",
1084
+ " .dataframe tbody tr th:only-of-type {\n",
1085
+ " vertical-align: middle;\n",
1086
+ " }\n",
1087
+ "\n",
1088
+ " .dataframe tbody tr th {\n",
1089
+ " vertical-align: top;\n",
1090
+ " }\n",
1091
+ "\n",
1092
+ " .dataframe thead th {\n",
1093
+ " text-align: right;\n",
1094
+ " }\n",
1095
+ "</style>\n",
1096
+ "<table border=\"1\" class=\"dataframe\">\n",
1097
+ " <thead>\n",
1098
+ " <tr style=\"text-align: right;\">\n",
1099
+ " <th></th>\n",
1100
+ " <th>decision</th>\n",
1101
+ " <th>abstract</th>\n",
1102
+ " <th>claims</th>\n",
1103
+ " </tr>\n",
1104
+ " </thead>\n",
1105
+ " <tbody>\n",
1106
+ " <tr>\n",
1107
+ " <th>0</th>\n",
1108
+ " <td>0</td>\n",
1109
+ " <td>Regimen for the treatment of rosacea include t...</td>\n",
1110
+ " <td>1. A treatment regimen comprising: cleansing a...</td>\n",
1111
+ " </tr>\n",
1112
+ " <tr>\n",
1113
+ " <th>1</th>\n",
1114
+ " <td>1</td>\n",
1115
+ " <td>A clamp arrangement includes a pair of bracket...</td>\n",
1116
+ " <td>1. A clamp arrangement for supporting a fractu...</td>\n",
1117
+ " </tr>\n",
1118
+ " <tr>\n",
1119
+ " <th>2</th>\n",
1120
+ " <td>0</td>\n",
1121
+ " <td>A system and method for device action and conf...</td>\n",
1122
+ " <td>1-20. (canceled) 21. A mobile device comprisin...</td>\n",
1123
+ " </tr>\n",
1124
+ " <tr>\n",
1125
+ " <th>4</th>\n",
1126
+ " <td>0</td>\n",
1127
+ " <td>Systems and methods for managing datasets prod...</td>\n",
1128
+ " <td>1. A method, comprising: executing, by one or ...</td>\n",
1129
+ " </tr>\n",
1130
+ " <tr>\n",
1131
+ " <th>9</th>\n",
1132
+ " <td>1</td>\n",
1133
+ " <td>A scan driving circuit is provided. The scan d...</td>\n",
1134
+ " <td>1. A scan driving circuit for driving a scan l...</td>\n",
1135
+ " </tr>\n",
1136
+ " <tr>\n",
1137
+ " <th>...</th>\n",
1138
+ " <td>...</td>\n",
1139
+ " <td>...</td>\n",
1140
+ " <td>...</td>\n",
1141
+ " </tr>\n",
1142
+ " <tr>\n",
1143
+ " <th>9085</th>\n",
1144
+ " <td>0</td>\n",
1145
+ " <td>The non-rigid gate device as described may be ...</td>\n",
1146
+ " <td>1; A non-rigid blocking apparatus referred to ...</td>\n",
1147
+ " </tr>\n",
1148
+ " <tr>\n",
1149
+ " <th>9090</th>\n",
1150
+ " <td>0</td>\n",
1151
+ " <td>The present invention provides an improved unc...</td>\n",
1152
+ " <td>1. A method for rendering a plastic surface am...</td>\n",
1153
+ " </tr>\n",
1154
+ " <tr>\n",
1155
+ " <th>9091</th>\n",
1156
+ " <td>1</td>\n",
1157
+ " <td>A method for detecting a software-race conditi...</td>\n",
1158
+ " <td>1. A method for detecting a software-race cond...</td>\n",
1159
+ " </tr>\n",
1160
+ " <tr>\n",
1161
+ " <th>9092</th>\n",
1162
+ " <td>1</td>\n",
1163
+ " <td>The present application relates to multi-stage...</td>\n",
1164
+ " <td>1. A multi-stage amplitude modulation-based me...</td>\n",
1165
+ " </tr>\n",
1166
+ " <tr>\n",
1167
+ " <th>9093</th>\n",
1168
+ " <td>1</td>\n",
1169
+ " <td>A paper feeder includes a housing, a driving u...</td>\n",
1170
+ " <td>1. A paper feeder, comprising: a housing; a dr...</td>\n",
1171
+ " </tr>\n",
1172
+ " </tbody>\n",
1173
+ "</table>\n",
1174
+ "<p>4888 rows × 3 columns</p>\n",
1175
+ "</div>"
1176
+ ],
1177
+ "text/plain": [
1178
+ " decision abstract \\\n",
1179
+ "0 0 Regimen for the treatment of rosacea include t... \n",
1180
+ "1 1 A clamp arrangement includes a pair of bracket... \n",
1181
+ "2 0 A system and method for device action and conf... \n",
1182
+ "4 0 Systems and methods for managing datasets prod... \n",
1183
+ "9 1 A scan driving circuit is provided. The scan d... \n",
1184
+ "... ... ... \n",
1185
+ "9085 0 The non-rigid gate device as described may be ... \n",
1186
+ "9090 0 The present invention provides an improved unc... \n",
1187
+ "9091 1 A method for detecting a software-race conditi... \n",
1188
+ "9092 1 The present application relates to multi-stage... \n",
1189
+ "9093 1 A paper feeder includes a housing, a driving u... \n",
1190
+ "\n",
1191
+ " claims \n",
1192
+ "0 1. A treatment regimen comprising: cleansing a... \n",
1193
+ "1 1. A clamp arrangement for supporting a fractu... \n",
1194
+ "2 1-20. (canceled) 21. A mobile device comprisin... \n",
1195
+ "4 1. A method, comprising: executing, by one or ... \n",
1196
+ "9 1. A scan driving circuit for driving a scan l... \n",
1197
+ "... ... \n",
1198
+ "9085 1; A non-rigid blocking apparatus referred to ... \n",
1199
+ "9090 1. A method for rendering a plastic surface am... \n",
1200
+ "9091 1. A method for detecting a software-race cond... \n",
1201
+ "9092 1. A multi-stage amplitude modulation-based me... \n",
1202
+ "9093 1. A paper feeder, comprising: a housing; a dr... \n",
1203
+ "\n",
1204
+ "[4888 rows x 3 columns]"
1205
+ ]
1206
+ },
1207
+ "execution_count": 36,
1208
+ "metadata": {},
1209
+ "output_type": "execute_result"
1210
+ }
1211
+ ],
1212
+ "source": [
1213
+ "valDF2"
1214
+ ]
1215
+ },
1216
+ {
1217
+ "cell_type": "code",
1218
+ "execution_count": null,
1219
+ "metadata": {},
1220
+ "outputs": [],
1221
+ "source": []
1222
+ }
1223
+ ],
1224
+ "metadata": {
1225
+ "kernelspec": {
1226
+ "display_name": "Python 3 (ipykernel)",
1227
+ "language": "python",
1228
+ "name": "python3"
1229
+ },
1230
+ "language_info": {
1231
+ "codemirror_mode": {
1232
+ "name": "ipython",
1233
+ "version": 3
1234
+ },
1235
+ "file_extension": ".py",
1236
+ "mimetype": "text/x-python",
1237
+ "name": "python",
1238
+ "nbconvert_exporter": "python",
1239
+ "pygments_lexer": "ipython3",
1240
+ "version": "3.10.6"
1241
+ }
1242
+ },
1243
+ "nbformat": 4,
1244
+ "nbformat_minor": 4
1245
+ }
src/patent_train.py DELETED
@@ -1,57 +0,0 @@
1
- from datasets import load_dataset
2
- from transformers import pipeline
3
- from transformers import AutoTokenizer, AutoModelForSequenceClassification
4
-
5
- # Import data for specifically January 2016
6
- dataset_dict = load_dataset(
7
- 'HUPD/hupd',
8
- name='Jan2016 Sample',
9
- data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
10
- icpr_label=None,
11
- train_filing_start_date='2016-01-01',
12
- train_filing_end_date='2016-01-21',
13
- val_filing_start_date='2016-01-22',
14
- val_filing_end_date='2016-01-31',
15
- )
16
-
17
- print(dataset_dict)
18
-
19
- """
20
- data_fields = {
21
- "application_number": "...",
22
- "publication_number": "...",
23
- "title": "...",
24
- "decision": "...",
25
- "date_produced": "...",
26
- "date_published": "...",
27
- "main_cpc_label": "...",
28
- "cpc_labels": ["...", "...", "..."],
29
- "main_ipcr_label": "...",
30
- "ipcr_labels": ["...", "...", "..."],
31
- "patent_number": "...",
32
- "filing_date": "...",
33
- "patent_issue_date": "...",
34
- "abandon_date": "...",
35
- "uspc_class": "...",
36
- "uspc_subclass": "...",
37
- "examiner_id": "...",
38
- "examiner_name_last": "...",
39
- "examiner_name_first": "...",
40
- "examiner_name_middle": "...",
41
- "inventor_list": [
42
- {
43
- "inventor_name_last": "...",
44
- "inventor_name_first": "...",
45
- "inventor_city": "...",
46
- "inventor_state": "...",
47
- "inventor_country": "..."
48
- }
49
- ],
50
- "abstract": "...",
51
- "claims": "...",
52
- "background": "...",
53
- "summary": "...",
54
- "full_description": "..."
55
- }
56
- """
57
-