Julien Simon committed on
Commit
2eacb8b
1 Parent(s): 090ac3f

Initial version

Browse files
Files changed (1) hide show
  1. code/dataset_prep.ipynb +231 -0
code/dataset_prep.ipynb ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "882ae3ed",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import datasets"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "id": "f6b5f6bf",
17
+ "metadata": {},
18
+ "outputs": [
19
+ {
20
+ "name": "stderr",
21
+ "output_type": "stream",
22
+ "text": [
23
+ "Reusing dataset reuters21578 (/Users/juliensimon/.cache/huggingface/datasets/reuters21578/ModHayes/1.0.0/bd91fac5a25fc818873c02a7281cc276c9b326a9e6a89288fc6ba6967772240f)\n"
24
+ ]
25
+ },
26
+ {
27
+ "data": {
28
+ "application/vnd.jupyter.widget-view+json": {
29
+ "model_id": "d47dfac1e7e54f87bcf922b4616bfc9b",
30
+ "version_major": 2,
31
+ "version_minor": 0
32
+ },
33
+ "text/plain": [
34
+ " 0%| | 0/2 [00:00<?, ?it/s]"
35
+ ]
36
+ },
37
+ "metadata": {},
38
+ "output_type": "display_data"
39
+ }
40
+ ],
41
+ "source": [
42
+ "from datasets import load_dataset\n",
43
+ "\n",
44
+ "dataset = load_dataset(\"reuters21578\", 'ModHayes')"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 3,
50
+ "id": "252eaf4a",
51
+ "metadata": {},
52
+ "outputs": [],
53
+ "source": [
54
+ "dataset.save_to_disk('reuters_original')"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 4,
60
+ "id": "af85b023",
61
+ "metadata": {},
62
+ "outputs": [
63
+ {
64
+ "data": {
65
+ "text/plain": [
66
+ "DatasetDict({\n",
67
+ " test: Dataset({\n",
68
+ " features: ['text', 'text_type', 'topics', 'lewis_split', 'cgis_split', 'old_id', 'new_id', 'places', 'people', 'orgs', 'exchanges', 'date', 'title'],\n",
69
+ " num_rows: 722\n",
70
+ " })\n",
71
+ " train: Dataset({\n",
72
+ " features: ['text', 'text_type', 'topics', 'lewis_split', 'cgis_split', 'old_id', 'new_id', 'places', 'people', 'orgs', 'exchanges', 'date', 'title'],\n",
73
+ " num_rows: 20856\n",
74
+ " })\n",
75
+ "})"
76
+ ]
77
+ },
78
+ "execution_count": 4,
79
+ "metadata": {},
80
+ "output_type": "execute_result"
81
+ }
82
+ ],
83
+ "source": [
84
+ "dataset"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 5,
90
+ "id": "0af0d4b3",
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "dataset = dataset.remove_columns(\n",
95
+ " ['text_type', 'topics', 'lewis_split', 'cgis_split', 'old_id',\n",
96
+ " 'new_id','places', 'people', 'orgs', 'exchanges', 'date'])\n",
97
+ "\n",
98
+ "dataset = dataset.rename_column('title', 'target') "
99
+ ]
100
+ },
101
+ {
102
+ "cell_type": "code",
103
+ "execution_count": 6,
104
+ "id": "3a97954e",
105
+ "metadata": {},
106
+ "outputs": [
107
+ {
108
+ "data": {
109
+ "text/plain": [
110
+ "{'text': 'Standard Oil Co and BP North America\\nInc said they plan to form a venture to manage the money market\\nborrowing and investment activities of both companies.\\n BP North America is a subsidiary of British Petroleum Co\\nPlc &lt;BP>, which also owns a 55 pct interest in Standard Oil.\\n The venture will be called BP/Standard Financial Trading\\nand will be operated by Standard Oil under the oversight of a\\njoint management committee.\\n\\n Reuter\\n',\n",
111
+ " 'target': 'STANDARD OIL &lt;SRD> TO FORM FINANCIAL UNIT'}"
112
+ ]
113
+ },
114
+ "execution_count": 6,
115
+ "metadata": {},
116
+ "output_type": "execute_result"
117
+ }
118
+ ],
119
+ "source": [
120
+ "dataset['train'][1]"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": 7,
126
+ "id": "6947b4a0",
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "def clean(row):\n",
131
+ " row['text'] = row['text'].replace('\\n',' ').replace('\\t',' ')\\\n",
132
+ " .replace(',','').replace('\\'','').replace('\\\"','')\\\n",
133
+ " .replace(' Reuter','').replace(' REUTER','')\n",
134
+ " row['text'] = \" \".join(row['text'].split())\n",
135
+ " row['target'] = row['target'].replace('&lt;','<').replace('&gt;','>')\n",
136
+ " return row"
137
+ ]
138
+ },
139
+ {
140
+ "cell_type": "code",
141
+ "execution_count": 8,
142
+ "id": "deec4da9",
143
+ "metadata": {},
144
+ "outputs": [
145
+ {
146
+ "name": "stderr",
147
+ "output_type": "stream",
148
+ "text": [
149
+ "Loading cached processed dataset at /Users/juliensimon/.cache/huggingface/datasets/reuters21578/ModHayes/1.0.0/bd91fac5a25fc818873c02a7281cc276c9b326a9e6a89288fc6ba6967772240f/cache-80b0dc9c8071ba93.arrow\n",
150
+ "Loading cached processed dataset at /Users/juliensimon/.cache/huggingface/datasets/reuters21578/ModHayes/1.0.0/bd91fac5a25fc818873c02a7281cc276c9b326a9e6a89288fc6ba6967772240f/cache-38a3b39f977c1f02.arrow\n"
151
+ ]
152
+ }
153
+ ],
154
+ "source": [
155
+ "dataset = dataset.map(clean)"
156
+ ]
157
+ },
158
+ {
159
+ "cell_type": "code",
160
+ "execution_count": 9,
161
+ "id": "d6aa8777",
162
+ "metadata": {},
163
+ "outputs": [
164
+ {
165
+ "data": {
166
+ "text/plain": [
167
+ "{'text': 'Standard Oil Co and BP North America Inc said they plan to form a venture to manage the money market borrowing and investment activities of both companies. BP North America is a subsidiary of British Petroleum Co Plc &lt;BP> which also owns a 55 pct interest in Standard Oil. The venture will be called BP/Standard Financial Trading and will be operated by Standard Oil under the oversight of a joint management committee.',\n",
168
+ " 'target': 'STANDARD OIL <SRD> TO FORM FINANCIAL UNIT'}"
169
+ ]
170
+ },
171
+ "execution_count": 9,
172
+ "metadata": {},
173
+ "output_type": "execute_result"
174
+ }
175
+ ],
176
+ "source": [
177
+ "dataset['train'][1]"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 10,
183
+ "id": "b0562fe2",
184
+ "metadata": {},
185
+ "outputs": [],
186
+ "source": [
187
+ "dataset.save_to_disk('reuters_processed')"
188
+ ]
189
+ },
190
+ {
191
+ "cell_type": "code",
192
+ "execution_count": null,
193
+ "id": "d2e6a3e4",
194
+ "metadata": {},
195
+ "outputs": [],
196
+ "source": [
197
+ "dataset['train'].to_csv('reuters_train.csv', index=False, header=True)\n",
198
+ "dataset['test'].to_csv('reuters_test.csv', index=False, header=True)"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": null,
204
+ "id": "d79a04f2",
205
+ "metadata": {},
206
+ "outputs": [],
207
+ "source": []
208
+ }
209
+ ],
210
+ "metadata": {
211
+ "kernelspec": {
212
+ "display_name": "Python 3",
213
+ "language": "python",
214
+ "name": "python3"
215
+ },
216
+ "language_info": {
217
+ "codemirror_mode": {
218
+ "name": "ipython",
219
+ "version": 3
220
+ },
221
+ "file_extension": ".py",
222
+ "mimetype": "text/x-python",
223
+ "name": "python",
224
+ "nbconvert_exporter": "python",
225
+ "pygments_lexer": "ipython3",
226
+ "version": "3.8.8"
227
+ }
228
+ },
229
+ "nbformat": 4,
230
+ "nbformat_minor": 5
231
+ }