File size: 8,576 Bytes
e5b2779
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import streamlit as st
import numpy as np
import pandas as pd
from PIL import Image
import io
import cv2 as cv
import pytesseract

# --- Sidebar tuning controls ------------------------------------------------
# Both values are read as module globals by setCountours(): a contour is kept
# only if its area is above `min_size_of_cell` and below the largest contour
# area divided by `table_contour_factor`.
min_size_of_cell = st.sidebar.slider(
    label='Min. size of Cell', min_value=1, max_value=50000, value=5000)
st.sidebar.write("Adjust this setting so that no text gets selected and not too large that any cell will be missed.")
table_contour_factor = st.sidebar.slider(
    label='Table Contour Factor', min_value=1, max_value=100, value=10)
st.sidebar.write("Adjust this setting so that the border of entire table / image is not selected. Also not too large that any cell will be missed.")

# --- Session-state defaults -------------------------------------------------
# Seed each key only when it is absent so values survive Streamlit reruns.
_session_defaults = {
    'significant_contour_list': [],   # contours accepted as table cells
    'imgray': 0,                      # placeholder until a grayscale image is stored
    'df': pd.DataFrame(),             # extracted table (empty until "Convert")
}
for _key, _initial in _session_defaults.items():
    if _key not in st.session_state:
        st.session_state[_key] = _initial

def remove_newline_char(a):
    """Return cell value *a* with a single trailing newline removed.

    Args:
        a: A cell value. Normally a string, but missing cells (``None`` or a
            float NaN, e.g. the padding pandas inserts for ragged columns)
            are accepted too.

    Returns:
        ``""`` for a missing cell, for the empty string and for the literal
        text ``"NaN"``; the string without its last ``'\\n'`` when it ends
        with one; otherwise *a* unchanged. Non-string, non-missing values are
        returned as they are.
    """
    # Missing values are not subscriptable / not strings: treat them as blank.
    # `a != a` is true only for NaN.
    if a is None or (isinstance(a, float) and a != a):
        return ""
    if not isinstance(a, str):
        return a
    if a == "NaN":
        return ""
    # Only one newline is dropped; any earlier newlines are kept as content.
    if a.endswith('\n'):
        return a[:-1]
    return a

def convert_DF_to_csv(df):
    """Serialise the cells of *df* to CSV text (no header row, no index).

    Every cell is converted with ``str()`` and rows are terminated by
    ``'\\n'``. Fields containing a comma, a double quote or a line break are
    quoted (with embedded quotes doubled) so that such text cannot shift
    columns or split rows. Note that the ``csv`` module also writes a row
    consisting of one empty field as ``""``.

    Args:
        df: The pandas DataFrame to serialise.

    Returns:
        The CSV document as a string; ``""`` when *df* has no rows.
    """
    import csv  # local import: the file's import block does not provide it

    n_rows, n_cols = df.shape
    buffer = io.StringIO()
    writer = csv.writer(buffer, lineterminator='\n')
    for i in range(n_rows):
        writer.writerow([str(df.iloc[i, j]) for j in range(n_cols)])
    return buffer.getvalue()

def runalgo():
    """Arrange the stored cell contours into a table and OCR every cell.

    Reads ``st.session_state.significant_contour_list`` (the cell contours)
    and ``st.session_state.imgray`` (the grayscale image they were found in)
    and stores the resulting DataFrame in ``st.session_state.df``.

    Steps:
        1. Take the bounding rectangle and centre point of every contour.
        2. The header row is every cell whose centre lies within a few pixels
           (vertically) of the top-most centre, ordered left to right.
        3. Each column is its header cell followed by every other cell whose
           centre lines up horizontally with it, ordered top to bottom.
        4. Each cell is cropped from the grayscale image and passed to
           Tesseract; one trailing newline per cell is removed.

    Columns with fewer cells than the longest one are padded with empty
    strings. With no stored contours the result is an empty DataFrame.
    """
    import os  # local import: only needed for the optional Tesseract override

    # Pixel tolerance for treating two cell centres as the same row / column.
    alignment_tolerance = 5
    # Homebrew install location that was needed on Apple-silicon macOS. It is
    # applied only when present so that other machines keep pytesseract's
    # default lookup instead of being pointed at a non-existent binary.
    homebrew_tesseract = "/opt/homebrew/Cellar/tesseract/5.3.0_1/bin/tesseract"

    contours = st.session_state.significant_contour_list
    imgray = st.session_state.imgray

    # Bounding boxes (x, y, w, h) make both ordering and cropping simple.
    rects = [cv.boundingRect(contour) for contour in contours]
    if not rects:
        st.session_state.df = pd.DataFrame()
        return
    centers = [(x + w / 2, y + h / 2) for (x, y, w, h) in rects]

    # Header row: cells level with the top-most centre, sorted by x because
    # the contour order carries no layout information.
    top_y = min(cy for _, cy in centers)
    header_rows_index = sorted(
        (i for i, (_, cy) in enumerate(centers)
         if abs(top_y - cy) <= alignment_tolerance),
        key=lambda i: centers[i][0],
    )

    # Columns: for each header cell, the cells aligned with it in x, sorted
    # by y, with the header cell itself always first.
    table_cells_index = []
    for head in header_rows_index:
        head_x = centers[head][0]
        below = [
            j for j, (cx, _) in enumerate(centers)
            if j != head and abs(cx - head_x) <= alignment_tolerance
        ]
        below.sort(key=lambda j: centers[j][1])
        table_cells_index.append([head] + below)

    if os.path.exists(homebrew_tesseract):
        pytesseract.pytesseract.tesseract_cmd = homebrew_tesseract

    # OCR each cell; the outer list is per column, the inner list per cell.
    table_contents = []
    for column in table_cells_index:
        texts = []
        for cell in column:
            x, y, w, h = rects[cell]
            cropped = imgray[y:y + h, x:x + w]
            texts.append(pytesseract.image_to_string(cropped))
        table_contents.append(texts)

    # The data was gathered column-wise, so transpose into a flat table.
    df = pd.DataFrame(table_contents).transpose()
    # Strip the trailing newline of every cell; cells that are not strings
    # are the padding pandas adds for short columns and become blank.
    for col in df.columns:
        df[col] = df[col].map(
            lambda cell: remove_newline_char(cell) if isinstance(cell, str) else ""
        )
    st.session_state.df = df
        
    

def contour_area(a):
    """Return the area of contour *a* as computed by ``cv.contourArea``."""
    area = cv.contourArea(a)
    return area


def setCountours(img_bytes):
    """Find the table-cell contours in an image and return a preview of them.

    Side effects: stores the grayscale image in ``st.session_state.imgray``
    and the accepted contours (a plain list) in
    ``st.session_state.significant_contour_list``.

    A contour is accepted when its area is greater than the module-level
    ``min_size_of_cell`` and smaller than the largest contour area divided by
    the module-level ``table_contour_factor`` (both come from the sidebar
    sliders).

    Args:
        img_bytes: The image as an array in BGR channel order (it is passed
            to ``cv.cvtColor`` with ``COLOR_BGR2GRAY``); despite the name it
            is not a bytes object.

    Returns:
        A PIL image: a copy of the input, converted to RGB, with the accepted
        contours drawn in green. When nothing is accepted the copy is
        returned without any drawing.
    """
    imgray = cv.cvtColor(img_bytes, cv.COLOR_BGR2GRAY)
    st.session_state.imgray = imgray
    _, thresh = cv.threshold(imgray, 127, 255, cv.THRESH_BINARY)
    contours, _ = cv.findContours(thresh, cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)

    # Area is the metric used to reject text glyphs and other small regions.
    areas = [contour_area(contour) for contour in contours]

    # An inexact crop usually leaves one very large contour around the whole
    # table / image; the upper bound below rejects it. `default=0` keeps an
    # image without any contours from raising.
    max_contour_area = max(areas, default=0)
    upper_bound = max_contour_area / table_contour_factor

    # Kept as a list: contours have differing point counts, so packing them
    # into a single NumPy array fails on current NumPy versions.
    significant_contour_list = [
        contour for contour, area in zip(contours, areas)
        if min_size_of_cell < area < upper_bound
    ]
    st.session_state.significant_contour_list = significant_contour_list

    im_contours_significant = img_bytes.copy()
    if significant_contour_list:
        # Draw every accepted contour in green, 3 px thick.
        im_contours_significant = cv.drawContours(
            im_contours_significant, significant_contour_list, -1, (0, 255, 0), 3)

    img = cv.cvtColor(im_contours_significant, cv.COLOR_BGR2RGB)
    return Image.fromarray(img)

def convertImg(img):
    """Convert *img* to an array with its last (channel) axis reversed.

    *img* is converted to ``'RGB'`` mode first, so the reversed result is in
    BGR order; a fresh copy is returned rather than a view.
    """
    rgb = np.array(img.convert('RGB'))
    bgr_view = rgb[:, :, ::-1]
    return bgr_view.copy()

st.title("Table from Image using opencv")

# Optional sample image shown until the user uploads a file. The path only
# exists on the original author's machine, so a missing file is not an error.
DEFAULT_IMAGE_PATH = '/Users/sarat/Desktop/Projects/Python/Table with data extraction/sports_data.png'

info_placeholder = st.empty()

uploaded_file = st.file_uploader("Upload Image",type=['png'])
if uploaded_file is not None:
    bytes_data = uploaded_file.getvalue()
    image = Image.open(io.BytesIO(bytes_data))
else:
    try:
        image = Image.open(DEFAULT_IMAGE_PATH)
    except OSError:
        # No upload and no usable sample: ask for a file and end this run
        # instead of crashing on the missing path.
        info_placeholder.info("Upload a PNG image of a table to get started.")
        st.stop()

# Contours are computed once, for whichever image was selected above.
image_contoured = setCountours(convertImg(image))

st.sidebar.header("Original Image")
st.sidebar.image(image)

col_b_1, col_b_2 = st.columns(2)

with col_b_1:
    # runalgo runs as a callback and stores its result in st.session_state.df.
    st.button("Convert",on_click=runalgo)

with col_b_2:
    st.download_button('Download CSV', convert_DF_to_csv(st.session_state.df), file_name='data.csv')

col1, col2 = st.columns(2)

with col2:
    # The table is shown only once a conversion has produced rows.
    if st.session_state.df.shape[0] != 0:
        st.header("Data")
        st.dataframe(st.session_state.df)

with col1:
    st.header("Image with Contours")
    st.image(image_contoured)