keesephillips committed
Commit ae1f24d · verified · 1 Parent(s): ac05f7a

Upload folder using huggingface_hub

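The commit message above says the folder was pushed with the huggingface_hub client. A minimal sketch of how such a commit is typically produced follows; the folder path and repository id are placeholders for illustration, not values taken from this commit.

from huggingface_hub import HfApi

api = HfApi()
# Placeholders: substitute the real local folder and target repository.
api.upload_folder(
    folder_path="./term_project",             # hypothetical local folder holding the notebooks
    repo_id="keesephillips/term_project",     # hypothetical repo id; not stated on this page
    commit_message="Upload folder using huggingface_hub",
)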
notebooks/model.ipynb CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:0e580316d90588119633a0091618c9eba64d964086822c44c4e41c96101c7177
- size 17075128
+ oid sha256:65ea1ca0239919445b4377838a0e614ddf2afb5648287551618d12cf8d46fbfa
+ size 18438
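Only the Git LFS pointer for notebooks/model.ipynb changed here: the tracked blob now resolves to a new sha256 and a much smaller payload (18438 bytes versus 17075128), so the notebook contents themselves are not visible in this diff. A hedged sketch of fetching the updated file with huggingface_hub, assuming the repository id is known, might look like this:

from huggingface_hub import hf_hub_download

# repo_id is a placeholder; revision pins the download to this commit
# (use the full commit hash or a branch name if the short hash does not resolve).
path = hf_hub_download(
    repo_id="<user>/<repo>",
    filename="notebooks/model.ipynb",
    revision="ae1f24d",
)
print(path)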
notebooks/naive.ipynb ADDED
@@ -0,0 +1,155 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from transformers import GPT2Tokenizer, GPT2LMHeadModel\n",
+ "from sklearn.metrics import accuracy_score, recall_score\n",
+ "import numpy as np\n",
+ "from datasets import load_dataset\n",
+ "from PIL import Image, ImageEnhance\n",
+ "import os\n",
+ "import cv2\n",
+ "from sklearn.preprocessing import LabelEncoder\n",
+ "import json\n",
+ "import csv\n",
+ "import re\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def prepare_dataset(ocr_dir, csv_dir, output_file):\n",
+ "    with open(output_file, 'w', encoding='utf-8') as jsonl_file:\n",
+ "        for filename in os.listdir(ocr_dir):\n",
+ "            if filename.endswith('.txt'):\n",
+ "                ocr_path = os.path.join(ocr_dir, filename)\n",
+ "                csv_path = os.path.join(csv_dir, filename)#.replace('.txt', '.csv'))\n",
+ "                print(csv_path)\n",
+ "                # if not os.path.exists(csv_path):\n",
+ "                # print(f\"Warning: Corresponding CSV file not found for {ocr_path}\")\n",
+ "                # continue\n",
+ "\n",
+ "                with open(ocr_path, 'r', encoding='utf-8') as ocr_file:\n",
+ "                    ocr_text = ocr_file.read()\n",
+ "\n",
+ "                with open(csv_path, 'r', encoding='utf-8') as csv_file:\n",
+ "                    csv_text = csv_file.read()\n",
+ "\n",
+ "                json_object = {\n",
+ "                    \"prompt\": ocr_text,\n",
+ "                    \"completion\": csv_text\n",
+ "                }\n",
+ "                jsonl_file.write(json.dumps(json_object) + '\\n')\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Usage\n",
+ "ocr_dir = os.getcwd() + '/../data/processed/annotations'\n",
+ "csv_dir = os.getcwd() + '/../data/processed/hand_labeled_tables/hand_labeled_tables'\n",
+ "output_file = 'dataset.jsonl'\n",
+ "prepare_dataset(ocr_dir, csv_dir, output_file)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load pre-trained GPT model and tokenizer\n",
+ "model_name = 'gpt2'\n",
+ "tokenizer = GPT2Tokenizer.from_pretrained(model_name)\n",
+ "model = GPT2LMHeadModel.from_pretrained(model_name)\n",
+ "\n",
+ "# Ensure the model is in evaluation mode\n",
+ "model.eval()\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def preprocess_text(text):\n",
+ "    # Basic cleaning for OCR text\n",
+ "    text = re.sub(r'\\s+', ' ', text) # Remove extra whitespace\n",
+ "    text = re.sub(r'[^a-zA-Z0-9\\s,.:()%+-]', '', text) # Remove most special characters, but keep some relevant ones\n",
+ "    return text.strip()\n",
+ "\n",
+ "def calculate_loss(model, tokenizer, prompt, true_completion):\n",
+ "    # Combine prompt and completion for full context\n",
+ "    full_text = f\"{prompt} {true_completion}\"\n",
+ "    inputs = tokenizer.encode(full_text, return_tensors='pt', truncation=True, max_length=512)\n",
+ "\n",
+ "    # Calculate loss\n",
+ "    with torch.no_grad():\n",
+ "        outputs = model(inputs, labels=inputs)\n",
+ "\n",
+ "    return outputs.loss.item()\n",
+ "\n",
+ "def evaluate_json_dataset(json_file, model, tokenizer):\n",
+ "    with open(json_file, 'r') as f:\n",
+ "        dataset = [json.loads(line) for line in f]\n",
+ "\n",
+ "    losses = []\n",
+ "\n",
+ "    for item in dataset:\n",
+ "        prompt = preprocess_text(item['prompt'])\n",
+ "        completion = preprocess_text(item['completion'])\n",
+ "\n",
+ "        loss = calculate_loss(model, tokenizer, prompt, completion)\n",
+ "        losses.append(loss)\n",
+ "\n",
+ "    average_loss = np.mean(losses)\n",
+ "\n",
+ "    return average_loss"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "average_loss = evaluate_json_dataset('dataset.jsonl', model, tokenizer)\n",
+ "print(f\"cross-entropy loss: {average_loss:.4f}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "term_project",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.19"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
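The added notebook is a naive baseline: it pairs each OCR text file with its hand-labeled CSV into dataset.jsonl, then scores an off-the-shelf GPT-2 by the average cross-entropy loss over each prompt-plus-completion sequence. As a hedged follow-up (not part of this commit), that average loss is often reported as perplexity, which is just its exponential:

import numpy as np

# Assumes evaluate_json_dataset, model, and tokenizer from the notebook above.
average_loss = evaluate_json_dataset('dataset.jsonl', model, tokenizer)
perplexity = float(np.exp(average_loss))  # approximate corpus perplexity = exp(mean cross-entropy)
print(f"cross-entropy loss: {average_loss:.4f} | perplexity: {perplexity:.2f}")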
notebooks/svm.ipynb CHANGED
The diff for this file is too large to render. See raw diff