keesephillips committed on
Commit b3fc8d0 · verified · 1 Parent(s): fe39ef8

initial commit
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ notebooks/model.ipynb filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,54 @@
- ---
- license: mit
- ---
+ # AIPI Term Project
+ ## Developer: Keese Phillips
+
+ ## About:
+ The purpose of this project is to perform very basic intelligent document processing (IDP) to extract a table from a document image. The source can be a PDF or an image that cannot be mapped directly to a csv file. The steps in this process are table detection, optical character recognition (OCR), table extraction, and conversion to csv format.
+
+ ## How to run the project
+
+ ### If you want to run the full pipeline and train the model from scratch
+ 1. You will need to install all of the necessary packages to run the setup.py script beforehand
+ 2. You will then need to run setup.py to create the data pipeline and train the model
+ 3. You will then need to run the frontend to use the model
+ ```bash
+ pip install -r requirements.txt
+ python setup.py
+ streamlit run main.py
+ ```
+
+ ### If you want to just run the frontend
+ 1. You will need to install all of the necessary packages beforehand
+ 2. You will then need to run the frontend to use the model
+ ```bash
+ pip install -r requirements.txt
+ streamlit run main.py
+ ```
+
+ ## Project Structure
+ > - requirements.txt: list of python libraries to download before running project
+ > - setup.py: script to set up project (get data, train model)
+ > - main.py: main script to run the streamlit user interface
+ > - assets: directory for images used in frontend
+ > - scripts: directory for pipeline scripts or utility scripts
+ >   - make_dataset.py: script to get data
+ >   - build_features.py: script to prepare the dataset for training
+ >   - model.py: script to train model and predict
+ > - models: directory for trained models
+ >   - trained_yolov8.pt: trained YOLOv8 model for table detection
+ >   - gpt: fine-tuned GPT-2 model for converting OCR output to csv
+ > - data: directory for project data
+ >   - raw: directory for raw data
+ >   - processed: directory to store the processed data
+ >   - outputs: directory to store the prepared data
+ > - notebooks: directory to store any exploration notebooks used
+ > - .gitignore: git ignore file
+
+ ## [Data source](https://github.com/ibm-aur-nlp/PubLayNet)
+ The data used to train the model was provided by [IBM](https://developer.ibm.com/exchanges/data/all/publaynet/) and [PubLayNet: largest dataset ever for document layout analysis](https://arxiv.org/abs/1908.07836). As per their dataset description:
+ > PubLayNet is a large dataset of document images, of which the layout is annotated with both bounding boxes and polygonal segmentations. The source of the documents is PubMed Central Open Access Subset (commercial use collection). The annotations are automatically generated by matching the PDF format and the XML format of the articles in the PubMed Central Open Access Subset.
+
+ ## Contributions
+ Brinnae Bent
+ Jon Reifschneider
+ Xu Zhong
+ Jianbin Tang
+ Antonio Jimeno Yepes
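The About section above describes a four-stage pipeline: table detection, OCR, table extraction, and conversion to csv. The commit trains the pieces in scripts/model.py but does not wire them together; the sketch below shows roughly how the detection and OCR stages could be chained, assuming the detector saved as models/trained_yolov8.pt loads through the Ultralytics constructor, that scripts is importable as a package, and that class id 3 is "table" as in config.yaml. The file paths are illustrative, not part of the commit.

```python
# Hedged sketch of the detection + OCR stages described in the README; paths,
# the importability of scripts.model, and the checkpoint loading are assumptions.
from PIL import Image
from ultralytics import YOLO
from scripts.model import improve_ocr_accuracy, ocr_core

detector = YOLO('models/trained_yolov8.pt')            # saved by scripts/model.py
page = Image.open('example_page.jpg')                  # hypothetical input document image
result = detector(page)[0]                             # single image -> single Results object

for box in result.boxes:
    if int(box.cls) == 3:                              # class 3 == "table" in config.yaml
        x1, y1, x2, y2 = map(int, box.xyxy[0])         # pixel coordinates of the detected table
        page.crop((x1, y1, x2, y2)).save('table_crop.jpg')
        print(ocr_core(improve_ocr_accuracy('table_crop.jpg')))
        break
```

The csv conversion step is the GPT-2 model that scripts/model.py fine-tunes; see the sketch after that file below.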
assets/music_notes.png ADDED
assets/trumpet.png ADDED
config.yaml ADDED
@@ -0,0 +1,10 @@
+ path: C:/Users/keese/term_project
+ train: training/images
+ val: validation/images
+
+ names:
+   0: text
+   1: title
+   2: list
+   3: table
+   4: figure
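config.yaml is an Ultralytics data config: path is the dataset root, train/val point at the image directories created by scripts/build_features.py, and names maps the class ids written into the YOLO label files back to layout categories. scripts/model.py passes the file straight to model.train(data='config.yaml'). A small hedged sketch of the class-id mapping (the label line is illustrative):

```python
# Hedged sketch: map a class id from a YOLO label file back to its name via config.yaml.
# Requires PyYAML; the label line below is illustrative, not taken from the dataset.
import yaml

with open('config.yaml') as f:
    cfg = yaml.safe_load(f)

label_line = "3 0.51 0.47 0.82 0.20"   # class x_center y_center width height (normalized)
class_id = int(label_line.split()[0])
print(cfg['names'][class_id])          # -> "table"
```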
hand_labeled_tables.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c72e57db0f0e7b6f770771b8b212e991afda049773d95120d5bae783b110ada8
+ size 2028840
main.py ADDED
@@ -0,0 +1,52 @@
+ """
+ Attribution: https://github.com/AIPI540/AIPI540-Deep-Learning-Applications/
+
+ Jon Reifschneider
+ Brinnae Bent
+ """
+
+ import os
+ import json
+
+ import streamlit as st
+ from PIL import Image
+ import numpy as np
+ import pandas as pd
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import matplotlib.pyplot as plt
+
+
+ if __name__ == '__main__':
+
+     st.header('Spotify Playlists')
+
+     img1, img2 = st.columns(2)
+
+     music_notes = Image.open('assets/music_notes.png')
+     img1.image(music_notes, use_column_width=True)
+
+     trumpet = Image.open('assets/trumpet.png')
+     img2.image(trumpet, use_column_width=True)
+
+     with st.sidebar:
+         playlist_name = st.selectbox(
+             "Playlist Selection",
+             list(set([1, 2]))
+         )
+
+     col1, col2 = st.columns(2)
+     with col1:
+         st.write('Artist')
+     with col2:
+         st.write('Album')
notebooks/model.ipynb ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0e580316d90588119633a0091618c9eba64d964086822c44c4e41c96101c7177
+ size 17075128
notebooks/svm.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
Binary file (9.92 kB).
 
scripts/build_features.py ADDED
@@ -0,0 +1,333 @@
+ import os
+ import json
+ import shutil
+ from pathlib import Path
+
+ import numpy as np
+ import cv2 as cv
+ from PIL import Image, ImageFont, ImageDraw
+
+
+ def get_data_and_annots():
+     """Loads the PubLayNet training annotations and collects up to 10,000 training images."""
+     images = {}
+     with open('data/raw/label/publaynet/train.json') as t:
+         data = json.load(t)
+
+     # The file list of the train directory is the only part of os.walk that is needed
+     for train_images in os.walk('data/raw/train/publaynet/train'):
+         train_imgs = train_images[2]
+
+     for image in data['images']:
+         if image['file_name'] in train_imgs:
+             images[image['id']] = {'file_name': "data/raw/train/publaynet/train/" + image['file_name'], 'annotations': []}
+         if len(images) == 10000:
+             break
+
+     for ann in data['annotations']:
+         if ann['image_id'] in images.keys():
+             images[ann['image_id']]['annotations'].append(ann)
+     return images, data
+
+
+ def markup(samples, image, annotations):
+     '''Draws the segmentation, bounding box, and label of each annotation.'''
+     draw = ImageDraw.Draw(image, 'RGBA')
+     font = ImageFont.load_default()  # A different font can be specified if needed
+     for annotation in annotations:
+         # Draw segmentation
+         draw.polygon(annotation['segmentation'][0],
+                      fill=colors[samples['categories'][annotation['category_id'] - 1]['name']] + (64,))
+         # Draw bbox
+         draw.rectangle(
+             (annotation['bbox'][0],
+              annotation['bbox'][1],
+              annotation['bbox'][0] + annotation['bbox'][2],
+              annotation['bbox'][1] + annotation['bbox'][3]),
+             outline=colors[samples['categories'][annotation['category_id'] - 1]['name']] + (255,),
+             width=2
+         )
+         # Draw label
+         text = samples['categories'][annotation['category_id'] - 1]['name']
+         bbox = draw.textbbox((0, 0), text, font=font)
+         w = bbox[2] - bbox[0]
+         h = bbox[3] - bbox[1]
+
+         if annotation['bbox'][3] < h:
+             draw.rectangle(
+                 (annotation['bbox'][0] + annotation['bbox'][2],
+                  annotation['bbox'][1],
+                  annotation['bbox'][0] + annotation['bbox'][2] + w,
+                  annotation['bbox'][1] + h),
+                 fill=(64, 64, 64, 255)
+             )
+             draw.text(
+                 (annotation['bbox'][0] + annotation['bbox'][2],
+                  annotation['bbox'][1]),
+                 text=text,
+                 fill=(255, 255, 255, 255)
+             )
+         else:
+             draw.rectangle(
+                 (annotation['bbox'][0],
+                  annotation['bbox'][1],
+                  annotation['bbox'][0] + w,
+                  annotation['bbox'][1] + h),
+                 fill=(64, 64, 64, 255)
+             )
+             draw.text(
+                 (annotation['bbox'][0],
+                  annotation['bbox'][1]),
+                 text=text,
+                 fill=(255, 255, 255, 255)
+             )
+     return np.array(image)
+
+
+ def write_file(image_id, inside, filename, content, check_set):
+     """
+     Writes content to a file. If 'inside' is True, appends the content, otherwise overwrites the file.
+
+     Args:
+         image_id (str): The ID of the image.
+         inside (bool): Flag to determine if content should be appended or overwritten.
+         filename (str): The path to the file.
+         content (str): The content to write to the file.
+         check_set (set): A set to keep track of image IDs.
+     """
+     if inside:
+         with open(filename, "a") as file:
+             file.write("\n")
+             file.write(content)
+     else:
+         check_set.add(image_id)
+         with open(filename, "w") as file:
+             file.write(content)
+
+
+ def get_bb_shape(bboxe, img):
+     """
+     Calculates the shape of the bounding box in the image.
+
+     Args:
+         bboxe (list): Bounding box coordinates [x, y, width, height].
+         img (numpy.ndarray): The image array.
+
+     Returns:
+         tuple: The shape (height, width) of the bounding box.
+     """
+     tleft = (bboxe[0], bboxe[1])
+     tright = (bboxe[0] + bboxe[2], bboxe[1])
+     bleft = (bboxe[0], bboxe[1] + bboxe[3])
+     bright = (bboxe[0] + bboxe[2], bboxe[1] + bboxe[3])
+
+     top_left_x = min([tleft[0], tright[0], bleft[0], bright[0]])
+     top_left_y = min([tleft[1], tright[1], bleft[1], bright[1]])
+     bot_right_x = max([tleft[0], tright[0], bleft[0], bright[0]])
+     bot_right_y = max([tleft[1], tright[1], bleft[1], bright[1]])
+
+     image = img[int(top_left_y):int(bot_right_y) + 1, int(top_left_x):int(bot_right_x) + 1]
+
+     return image.shape[:2]
+
+
+ def coco_to_yolo(x1, y1, w, h, image_w, image_h):
+     """
+     Converts COCO format bounding box to YOLO format.
+
+     Args:
+         x1 (float): Top-left x coordinate.
+         y1 (float): Top-left y coordinate.
+         w (float): Width of the bounding box.
+         h (float): Height of the bounding box.
+         image_w (int): Width of the image.
+         image_h (int): Height of the image.
+
+     Returns:
+         list: YOLO format bounding box [x_center, y_center, width, height].
+     """
+     return [((2 * x1 + w) / (2 * image_w)), ((2 * y1 + h) / (2 * image_h)), w / image_w, h / image_h]
+
+
+ def create_directory(path):
+     """
+     Creates a directory, deleting it first if it already exists.
+
+     Args:
+         path (str): The path to the directory.
+     """
+     dirpath = Path(path)
+     if dirpath.exists() and dirpath.is_dir():
+         shutil.rmtree(dirpath)
+     os.mkdir(dirpath)
+
+
+ def generate_yolo_labels(images):
+     """
+     Generates YOLO format labels from the given images and annotations.
+
+     Args:
+         images (dict): Dictionary containing image data and annotations.
+     """
+     check_set = set()
+
+     create_directory(os.getcwd() + '/data/processed/yolo')
+
+     for key in images:
+         image_id = ','.join(map(str, [ann['image_id'] for ann in images[key]['annotations']]))
+         category_id = ''.join(map(str, [ann['category_id'] - 1 for ann in images[key]['annotations']]))
+         bboxes = [ann['bbox'] for ann in images[key]['annotations']]
+         image_path = images[key]['file_name']
+         filename = os.getcwd() + '/data/processed/yolo/' + image_path.split('/')[-1].split(".")[0] + '.txt'
+
+         img = cv.imread(image_path)  # read the image once instead of once per bounding box
+
+         for index, b in enumerate(bboxes):
+             bbox = [b[0], b[1], b[2], b[3]]
+             shape = get_bb_shape(bbox, img)
+             yolo_bbox = coco_to_yolo(bbox[0], bbox[1], shape[1], shape[0], img.shape[1], img.shape[0])
+             content = category_id[index] + ' ' + str(yolo_bbox[0]) + ' ' + str(yolo_bbox[1]) + ' ' + str(yolo_bbox[2]) + ' ' + str(yolo_bbox[3])
+
+             if image_id in check_set:
+                 write_file(image_id, True, filename, content, check_set)
+             else:
+                 write_file(image_id, False, filename, content, check_set)
+
+
+ def delete_additional_images(old_train_path, temp_images_path, yolo_path):
+     """Moves only the images that have a matching YOLO label file into a temporary images directory."""
+     train = next(os.walk(old_train_path), (None, None, []))[2]
+     label = next(os.walk(yolo_path), (None, None, []))[2]
+
+     dirpath = Path(temp_images_path)
+     if dirpath.exists() and dirpath.is_dir():
+         shutil.rmtree(dirpath)
+     os.mkdir(dirpath)
+
+     for img in train:
+         splited = img.split(".")[0]
+         txt = f"{splited}.txt"
+         if txt in label:
+             shutil.move(f"{old_train_path}/{img}", f"{temp_images_path}/{img}")
+     return
+
+
+ def split_data(temp_images_path):
+     """Splits the labeled images into training (80%) and validation (20%) sets."""
+     image = next(os.walk(temp_images_path), (None, None, []))[2]
+     train = image[int(len(image) * .1): int(len(image) * .90)]
+     validation = list(set(image) - set(train))
+
+     create_directory(os.getcwd() + '/data/processed/training')
+     create_directory(os.getcwd() + '/data/processed/validation')
+     create_directory(os.getcwd() + '/data/processed/training/images/')
+     create_directory(os.getcwd() + '/data/processed/validation/images/')
+
+     for train_img in train:
+         shutil.move(f'{temp_images_path}/{train_img}', os.getcwd() + '/data/processed/training/images/')
+
+     for valid_img in validation:
+         shutil.move(f'{temp_images_path}/{valid_img}', os.getcwd() + '/data/processed/validation/images/')
+
+     validation_without_ext = [i.split('.')[0] for i in validation]
+     return validation_without_ext
+
+
+ def get_labels(yolo_path, valid_without_extension):
+     """Distributes the YOLO label files into the training and validation label directories."""
+     create_directory(os.getcwd() + '/data/processed/training/labels')
+     create_directory(os.getcwd() + '/data/processed/validation/labels')
+
+     label = next(os.walk(yolo_path), (None, None, []))[2]
+     for lab in label:
+         split = lab.split(".")[0]
+         if split in valid_without_extension:
+             shutil.move(f"{yolo_path}/{lab}", os.getcwd() + f'/data/processed/validation/labels/{lab}')
+         else:
+             shutil.move(f"{yolo_path}/{lab}", os.getcwd() + f'/data/processed/training/labels/{lab}')
+
+     return
+
+
+ def final_preparation(old_train_path, temp_images_path, yolo_path, images):
+     """Runs the image/label split and cleans up the temporary images directory."""
+     delete_additional_images(old_train_path, temp_images_path, yolo_path)
+     valid_without_extension = split_data(temp_images_path)
+
+     dirpath = Path(temp_images_path)
+     if dirpath.exists() and dirpath.is_dir():
+         shutil.rmtree(dirpath)
+
+     return get_labels(yolo_path, valid_without_extension)
+
+
+ def annotate_tables(directory):
+     """Crops the table regions (class id 3) out of the hand labeled images using their YOLO label files."""
+     dirpath = Path(os.getcwd() + '/data/processed/tables')
+     if dirpath.exists() and dirpath.is_dir():
+         shutil.rmtree(dirpath)
+     os.mkdir(dirpath)
+
+     # Iterate through the directory of hand labeled table images
+     for filename in os.listdir(directory):
+         # Get the full path of the file
+         file_path = os.path.join(directory, filename)
+
+         # Check if it's a file (not a subdirectory)
+         if os.path.isfile(file_path):
+             img_name = filename.split('.')[0]
+
+             if os.path.isfile(os.getcwd() + f'/data/processed/training/images/{img_name}.jpg'):
+                 with open(os.getcwd() + f'/data/processed/training/labels/{img_name}.txt', 'r') as f:
+                     results = f.read()
+                 original_image = Image.open(os.getcwd() + f'/data/processed/training/images/{img_name}.jpg')
+             elif os.path.isfile(os.getcwd() + f'/data/processed/validation/images/{img_name}.jpg'):
+                 with open(os.getcwd() + f'/data/processed/validation/labels/{img_name}.txt', 'r') as f:
+                     results = f.read()
+                 original_image = Image.open(os.getcwd() + f'/data/processed/validation/images/{img_name}.jpg')
+             else:
+                 # Skip files without a matching processed image/label pair
+                 continue
+
+             img_w, img_h = original_image.size
+
+             # Each label line is "class x_center y_center width height" (normalized, see coco_to_yolo)
+             for line in results.splitlines():
+                 if not line.strip():
+                     continue
+                 cls, xc, yc, w, h = line.split()
+
+                 # Check if the labeled object is a table
+                 if int(cls) == 3:
+                     # Convert the normalized YOLO box back to pixel (left, top, right, bottom) coordinates
+                     x1 = int((float(xc) - float(w) / 2) * img_w)
+                     y1 = int((float(yc) - float(h) / 2) * img_h)
+                     x2 = int((float(xc) + float(w) / 2) * img_w)
+                     y2 = int((float(yc) + float(h) / 2) * img_h)
+
+                     # Crop the original image to the table region and save it
+                     table_image = original_image.crop((x1, y1, x2, y2))
+                     table_image.save(os.getcwd() + f'/data/processed/tables/{img_name}.jpg')
+
+                     # Keep only the first table per image
+                     break
+
+
+ if __name__ == '__main__':
+     colors = {'title': (255, 0, 0),
+               'text': (0, 255, 0),
+               'figure': (0, 0, 255),
+               'table': (255, 255, 0),
+               'list': (0, 255, 255)}
+     images, data = get_data_and_annots()
+     generate_yolo_labels(images)
+     final_preparation(os.getcwd() + '/data/raw/train/publaynet/train',
+                       os.getcwd() + '/data/processed/images',
+                       os.getcwd() + '/data/processed/yolo',
+                       images)
+     annotate_tables(os.getcwd() + '/data/processed/hand_labeled_tables/hand_labeled_tables')
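coco_to_yolo above converts PubLayNet's COCO-style [x, y, width, height] boxes into the normalized [x_center, y_center, width, height] values that generate_yolo_labels writes to the label files. A quick worked example with illustrative numbers:

```python
# Worked example of the COCO -> YOLO conversion used in build_features.py (illustrative numbers).
def coco_to_yolo(x1, y1, w, h, image_w, image_h):
    return [(2 * x1 + w) / (2 * image_w), (2 * y1 + h) / (2 * image_h), w / image_w, h / image_h]

# A 300x100 box with top-left corner (50, 400) on a 600x800 page:
print(coco_to_yolo(50, 400, 300, 100, 600, 800))
# -> [0.3333..., 0.5625, 0.5, 0.125]  i.e. x_center, y_center, width, height, all normalized
```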
scripts/make_dataset.py ADDED
@@ -0,0 +1,91 @@
+ import os
+ import urllib.request
+ import tarfile
+ import zipfile
+ import shutil
+ from pathlib import Path
+
+
+ def get_archive(path, url, Set):
+     """Downloads one PubLayNet archive from `url` into `path` as `{Set}.tar`."""
+     os.makedirs(path, exist_ok=True)
+     urllib.request.urlretrieve(url, f"{path}/{Set}.tar")
+
+
+ def extract_tar(tar_file):
+     """Extracts a downloaded archive into data/raw/{tar_file} and removes the archive."""
+     print(f'{os.getcwd()}/data/raw/{tar_file}.tar', end='\r')
+     file = tarfile.open(f'{os.getcwd()}/data/raw/{tar_file}.tar')
+     file.extractall(f'{os.getcwd()}/data/raw/{tar_file}')
+     file.close()
+     os.remove(f'{os.getcwd()}/data/raw/{tar_file}.tar')
+
+
+ def make_dir(target_dir):
+     """Creates a directory, removing it first if it already exists."""
+     if Path(target_dir).exists() and Path(target_dir).is_dir():
+         shutil.rmtree(Path(target_dir))
+     os.makedirs(target_dir, exist_ok=True)
+
+
+ def combine_dirs(source_dirs, target_dir):
+     """Copies every .jpg from the extracted source directories into target_dir, then deletes the sources."""
+     for source_dir in source_dirs:
+         for subdir, dirs, files in os.walk(os.getcwd() + '/data/raw/' + source_dir):
+             for file in files:
+                 filepath = subdir + os.sep + file
+
+                 if filepath.find('.jpg') != -1:
+                     shutil.copy(filepath, target_dir)
+
+         if Path(os.getcwd() + '/data/raw/' + source_dir).exists():
+             shutil.rmtree(Path(os.getcwd() + '/data/raw/' + source_dir))
+
+
+ def unzip_file(zip_file_path, extract_to):
+     """Extracts a zip archive into the given directory, creating it if needed."""
+     os.makedirs(extract_to, exist_ok=True)
+
+     # Open the zip file and extract all contents to the specified directory
+     with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
+         zip_ref.extractall(extract_to)
+
+
+ if __name__ == '__main__':
+     make_dir(os.getcwd() + '/data/raw')
+     make_dir(os.getcwd() + '/data/processed')
+     make_dir(os.getcwd() + '/data/outputs')
+     make_dir(os.getcwd() + '/models')
+
+     get_archive(os.getcwd() + '/data/raw', 'https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/labels.tar.gz', "label")
+     get_archive(os.getcwd() + '/data/raw', 'https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/train-0.tar.gz', "train0")
+     get_archive(os.getcwd() + '/data/raw', 'https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/train-1.tar.gz', "train1")
+     get_archive(os.getcwd() + '/data/raw', 'https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/train-2.tar.gz', "train2")
+     get_archive(os.getcwd() + '/data/raw', 'https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/train-3.tar.gz', "train3")
+     get_archive(os.getcwd() + '/data/raw', 'https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/train-4.tar.gz', "train4")
+     get_archive(os.getcwd() + '/data/raw', 'https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/train-5.tar.gz', "train5")
+     get_archive(os.getcwd() + '/data/raw', 'https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/train-6.tar.gz', "train6")
+     get_archive(os.getcwd() + '/data/raw', 'https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/val.tar.gz', "val")
+     get_archive(os.getcwd() + '/data/raw', 'https://dax-cdn.cdn.appdomain.cloud/dax-publaynet/1.0.0/test.tar.gz', "test")
+
+     extract_tar("train0")
+     extract_tar("train1")
+     extract_tar("train2")
+     extract_tar("train3")
+     extract_tar("train4")
+     extract_tar("train5")
+     extract_tar("train6")
+     extract_tar("label")
+     extract_tar("val")
+     extract_tar("test")
+
+     target_dir = os.getcwd() + '/data/raw/train/publaynet/train/'
+     make_dir(target_dir)
+
+     combine_dirs(['train0', 'train1', 'train2', 'train3', 'train4', 'train5', 'train6'], target_dir)
+     combine_dirs(['val', 'test'], target_dir)
+
+     unzip_file('hand_labeled_tables.zip', os.getcwd() + '/data/processed/hand_labeled_tables')
scripts/model.py ADDED
@@ -0,0 +1,193 @@
+ import os
+ import json
+
+ import numpy as np
+ import pandas as pd
+ import cv2
+ import pytesseract
+ import torch
+ from PIL import Image, ImageEnhance
+ from ultralytics import YOLO
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
+ from datasets import load_dataset
+
+ # Ensure you have installed Tesseract OCR and set the path
+ pytesseract.pytesseract.tesseract_cmd = r'C:/Program Files/Tesseract-OCR/tesseract.exe'  # Update this path for your system
+
+
+ def ocr_core(image):
+     """Runs Tesseract on a preprocessed image and reassembles the words into rough csv-like text."""
+     data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
+     df = pd.DataFrame(data)
+     df = df[df['conf'] != -1]
+     # Estimate the horizontal gap between consecutive words to guess column breaks
+     df['left_diff'] = df.groupby('block_num')['left'].diff().fillna(0).astype(int)
+     df['prev_width'] = df['width'].shift(1).fillna(0).astype(int)
+     df['spacing'] = (df['left_diff'] - df['prev_width']).fillna(0).astype(int)
+     # Start a new line at the first word of each new block, and a new cell after a large gap
+     df['text'] = df.apply(lambda x: '\n' + x['text'] if (x['word_num'] == 1) & (x['block_num'] != 1) else x['text'], axis=1)
+     df['text'] = df.apply(lambda x: ',' + x['text'] if x['spacing'] > 100 else x['text'], axis=1)
+     ocr_text = ""
+     for text in df['text']:
+         ocr_text += text + ' '
+     return ocr_text
+
+
+ def improve_ocr_accuracy(img):
+     """Upscales, contrast-enhances, and thresholds an image before OCR."""
+     # Read image with PIL (for color preservation)
+     img = Image.open(img)
+
+     # Increase image size (can improve accuracy for small text)
+     img = img.resize((img.width * 4, img.height * 4))
+
+     # Increase contrast
+     enhancer = ImageEnhance.Contrast(img)
+     img = enhancer.enhance(2)
+
+     _, thresh = cv2.threshold(np.array(img), 127, 255, cv2.THRESH_BINARY_INV)
+
+     return thresh
+
+
+ def create_ocr_outputs():
+     """OCRs every hand labeled table image and writes the text to data/processed/annotations."""
+     directory_path = os.getcwd() + '/data/processed/hand_labeled_tables/hand_labeled_tables'
+     os.makedirs(os.getcwd() + '/data/processed/annotations', exist_ok=True)
+
+     for root, dirs, files in os.walk(directory_path):
+         # Print the current directory
+         print(f"Current directory: {root}")
+
+         # Print all subdirectories in the current directory
+         print("Subdirectories:")
+         for dir in dirs:
+             print(f"- {dir}")
+
+         # Print all files in the current directory
+         print("Files:")
+         for image_path in files:
+             print(f"- {image_path}")
+             full_path = os.path.join(root, image_path)
+             # Preprocess the image
+             preprocessed_image = improve_ocr_accuracy(full_path)
+
+             ocr_text = ocr_core(preprocessed_image)
+             with open(os.getcwd() + f"/data/processed/annotations/{image_path.split('.')[0]}.txt", 'wb') as f:
+                 f.write(ocr_text.encode('utf-8'))
+
+         print("\n")  # Add a blank line for readability
+
+
+ def prepare_dataset(ocr_dir, csv_dir, output_file):
+     """Pairs each OCR text file with its ground-truth table file and writes prompt/completion JSONL records."""
+     with open(output_file, 'w', encoding='utf-8') as jsonl_file:
+         for filename in os.listdir(ocr_dir):
+             if filename.endswith('.txt'):
+                 ocr_path = os.path.join(ocr_dir, filename)
+                 csv_path = os.path.join(csv_dir, filename)  # ground-truth table text with the same basename
+
+                 if not os.path.exists(csv_path):
+                     print(f"Warning: Corresponding CSV file not found for {ocr_path}")
+                     continue
+
+                 with open(ocr_path, 'r', encoding='utf-8') as ocr_file:
+                     ocr_text = ocr_file.read()
+
+                 with open(csv_path, 'r', encoding='utf-8') as csv_file:
+                     csv_text = csv_file.read()
+
+                 json_object = {
+                     "prompt": ocr_text,
+                     "completion": csv_text
+                 }
+                 jsonl_file.write(json.dumps(json_object) + '\n')
+
+
+ def tokenize_function(examples):
+     # Concatenate prompt (OCR text) and completion (csv) so the model learns to continue one into the other
+     texts = [p + '\n' + c for p, c in zip(examples['prompt'], examples['completion'])]
+     inputs = tokenizer(texts, truncation=True, padding='max_length', max_length=1012)
+
+     # For causal language modeling the labels are the input ids themselves
+     inputs['labels'] = inputs['input_ids'].copy()
+     return inputs
+
+
+ if __name__ == '__main__':
+
+     # Use CUDA if available
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     print(f"Using device: {device}")
+
+     # Load a pretrained YOLOv8 model
+     model = YOLO('yolov8l.pt')
+
+     # Train the model on your custom dataset
+     results = model.train(
+         data='config.yaml',
+         epochs=10,
+         imgsz=640,
+         batch=8,
+         name='yolov8l_custom',
+         device=device
+     )
+
+     # Evaluate the model's performance
+     metrics = model.val()
+     print(metrics.box.map)  # print the mean Average Precision
+     torch.save(model, os.getcwd() + '/models/trained_yolov8.pt')
+
+     create_ocr_outputs()
+
+     # Build the prompt/completion dataset
+     ocr_dir = os.getcwd() + '/data/processed/annotations'
+     csv_dir = os.getcwd() + '/data/processed/hand_labeled_tables'
+     output_file = 'dataset.jsonl'
+     prepare_dataset(ocr_dir, csv_dir, output_file)
+
+     # Load the dataset
+     dataset = load_dataset('json', data_files={'train': 'dataset.jsonl'})
+     dataset = dataset['train'].train_test_split(test_size=0.1)
+
+     # Tokenization
+     model_name = 'gpt2'  # You can choose other models like 'gpt2-medium', 'gpt2-large', etc.
+     tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+
+     # Add a new pad token
+     tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+
+     tokenized_dataset = dataset.map(tokenize_function, batched=True)
+
+     # Load the model
+     model = GPT2LMHeadModel.from_pretrained(model_name)
+
+     # Resize the model embeddings to accommodate the new pad token
+     model.resize_token_embeddings(len(tokenizer))
+
+     training_args = TrainingArguments(
+         output_dir='./results',
+         num_train_epochs=3,
+         per_device_train_batch_size=2,
+         per_device_eval_batch_size=2,
+         warmup_steps=500,
+         weight_decay=0.01,
+         logging_dir='./logs',
+         logging_steps=10,
+         evaluation_strategy="epoch",  # Evaluate at the end of each epoch
+         save_strategy="epoch",  # Save at the end of each epoch
+         load_best_model_at_end=True,  # Load the best model when finished training (based on evaluation)
+         metric_for_best_model="eval_loss",  # Use eval_loss to determine the best model
+     )
+
+     # Trainer
+     trainer = Trainer(
+         model=model,
+         args=training_args,
+         train_dataset=tokenized_dataset['train'],
+         eval_dataset=tokenized_dataset['test'],
+     )
+
+     # Train the model
+     trainer.train()
+
+     # Evaluate the model
+     eval_results = trainer.evaluate()
+     print(f"Evaluation results: {eval_results}")
+
+     # Save the model
+     model.save_pretrained(os.getcwd() + '/models/gpt')
+     tokenizer.save_pretrained(os.getcwd() + '/models/gpt')
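scripts/model.py fine-tunes GPT-2 on the prompt/completion pairs in dataset.jsonl (OCR text as the prompt, the table's csv text as the completion) but stops after saving the checkpoint. A minimal hedged sketch of the generation step, assuming the checkpoint under models/gpt and an OCR text file produced by create_ocr_outputs (the filename is hypothetical):

```python
# Hedged sketch: generate csv text from OCR output with the checkpoint saved under models/gpt.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('models/gpt')
model = GPT2LMHeadModel.from_pretrained('models/gpt')

with open('data/processed/annotations/example.txt', encoding='utf-8') as f:   # hypothetical file
    ocr_text = f.read()

inputs = tokenizer(ocr_text, return_tensors='pt', truncation=True, max_length=1012)
output = model.generate(**inputs, max_new_tokens=256, pad_token_id=tokenizer.pad_token_id)

# Drop the prompt tokens and keep only the generated continuation
csv_text = tokenizer.decode(output[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
print(csv_text)
```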
setup.py ADDED
@@ -0,0 +1,14 @@
+ import subprocess
+ import sys
+
+ script = 'make_dataset.py'
+ command = f'{sys.executable} scripts/{script}'
+ subprocess.run(command, shell=True)
+
+ script = 'build_features.py'
+ command = f'{sys.executable} scripts/{script}'
+ subprocess.run(command, shell=True)
+
+ script = 'model.py'
+ command = f'{sys.executable} scripts/{script}'
+ subprocess.run(command, shell=True)
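Since the three stages are fixed script paths, an equivalent form avoids the shell entirely and stops the pipeline if any stage fails; a small hedged alternative using only the standard library:

```python
# Hedged alternative to setup.py: argument lists avoid shell quoting issues,
# and check=True raises if any stage exits with an error.
import subprocess
import sys

for script in ('make_dataset.py', 'build_features.py', 'model.py'):
    subprocess.run([sys.executable, f'scripts/{script}'], check=True)
```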