File size: 3,743 Bytes
385b1f2
 
 
 
 
 
6dd9bd4
385b1f2
 
fc8e190
385b1f2
6dd9bd4
385b1f2
 
 
 
fc8e190
 
385b1f2
 
 
 
 
fc8e190
 
 
 
385b1f2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc8e190
b1eea1f
fc8e190
 
 
 
 
 
 
 
b1eea1f
fc8e190
 
 
 
 
 
b1eea1f
fc8e190
 
b1eea1f
fc8e190
 
b1eea1f
fc8e190
 
b1eea1f
fc8e190
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import pickle
import re
import subprocess
import sys

import easyocr
import spacy
import streamlit as st

# Function to Load the Spacy tokenizer
@st.cache_resource
def load_nlp():
    """Load the spaCy `en_core_web_sm` pipeline, downloading it if missing.

    Cached by Streamlit (`st.cache_resource`) so the load/download happens
    once per server process.

    Returns:
        The loaded spaCy Language object.

    Raises:
        subprocess.CalledProcessError: if the model download fails.
    """
    try:
        # Fast path: model already installed (warm container / repeat run).
        return spacy.load('en_core_web_sm')
    except OSError:
        # Use the running interpreter, not whatever 'python' happens to be
        # on PATH, and fail loudly if the download itself errors out.
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
            check=True,
        )
        return spacy.load('en_core_web_sm')

# Function to Load the model
@st.cache_resource
def load_tokenizer_model():
    """Unpickle the stopword list, punctuation list and fitted pipeline.

    The three objects were pickled sequentially into a single file, so
    they are read back in that same order.

    Returns:
        tuple: (stopwords, punctuations, model_pipe)
    """
    with open('./models/autoclassifier.pkl', 'rb') as saved:
        # Three consecutive pickle.load calls — order matters.
        parts = [pickle.load(saved) for _ in range(3)]
    return tuple(parts)

# Function to Initialize the OCR Engine
@st.cache_resource
def load_ocr_engine():
    """Build the EasyOCR reader for English text (cached per process)."""
    reader = easyocr.Reader(['en'])
    return reader

# Function to process uploaded images
@st.cache_data
def autoclassifier(images):
    """OCR each uploaded image and report its predicted document class.

    For every Streamlit UploadedFile in `images`, the bytes are written to
    a scratch file on disk (EasyOCR reads from a path), text is extracted,
    digits and low-confidence fragments are filtered out, and the result
    is classified with the module-level `model_pipe` and shown via
    `st.info`. The scratch file is always removed, even on failure.
    """
    # Hoisted loop invariant: same digit-stripping pattern for every image.
    digits = re.compile(r'[0-9]+')

    # Iterate through all uploaded images
    with st.spinner("Processing Images"):
        for image in images:
            # Use only the basename so a crafted upload name cannot
            # escape the working directory.
            filename = os.path.basename(image.name)

            # Write bytes to disk
            with open(filename, 'wb') as f:
                f.write(image.read())

            try:
                # Load image into OCR Engine and extract text
                raw_ocr = ocr_engine.readtext(filename)

                # Extract relevant words from raw OCR
                words = ''
                for (bbox, text, prob) in raw_ocr:
                    # Only keep OCR text with 50% probability or higher
                    if prob > 0.5:
                        # Filter out any digits
                        text = digits.sub('', text)
                        # If we have any characters left, append to string
                        if text != '':
                            words += ' ' + text

                # Pass filtered OCR string to the model
                doc_type = model_pipe.predict([words])

                # Report filename and document class
                st.info(f"filename: '{image.name}', doc_type: '{doc_type[0]}'")
            finally:
                # Delete the scratch image file even if OCR/predict raised
                os.remove(filename)

# --- Page header and project description -------------------------------
st.header('Document Classifier', divider='green')

# Static explanatory copy rendered once per page run.
st.markdown("#### What is OCR?")
st.markdown("OCR stands for Optical Character Recognition, and the technology for it has been around for over 30 years.")
st.markdown("In this project, we leverage the extraction of the text from an image to classify the document. I am using EasyOCR as the OCR Engine, and I do some pre-processing of the raw OCR text to improve the quality of the words used to classify the documents.")
st.markdown("After an investigation I settled on a Random Forest classifier for this project, since it had the best classification accuracy of the different models I investigated.")
st.markdown("This project makes use of the [Real World Documents Collections](https://www.kaggle.com/datasets/shaz13/real-world-documents-collections) found at `Kaggle`")
st.markdown("*This project is based off the tutorial by Animesh Giri [Intelligent Document Classification](https://www.kaggle.com/code/animeshgiri/intelligent-document-classification)*")
st.markdown("*N.B. I created a similar document classifier in my first ML project, but that relied on IBM's Datacap for the OCR Engine. I also used a Support Vector Machine (SVM) classifier library (libsvm) at the time, but it was slow to train. I tried to re-create that document classifier again, using open source tools and modern techniques outlined in the referenced tutorial.*")
st.divider()

# --- Script entry: collect uploads, load resources, classify -----------

# Fetch uploaded images (list of UploadedFile; empty list until the user
# picks files)
images = st.file_uploader(
    "Choose an image to classify",
    type=['png','jpg','jpeg'],
    accept_multiple_files=True
)

# Load the Spacy tokenizer
# NOTE(review): `nlp` is not referenced by the visible code below —
# presumably used by the unpickled pipeline's tokenizer; confirm.
nlp = load_nlp()

# Load the Model. `model_pipe` is read as a global by autoclassifier();
# `stopwords`/`punctuations` are not used in the visible code.
stopwords, punctuations, model_pipe = load_tokenizer_model()

# Initialze the OCR Engine — also read as a global by autoclassifier()
ocr_engine = load_ocr_engine()

# Process and predict document classification for every uploaded image
autoclassifier(images)