Sarat Chandra Ventrapragada committed on
Commit
e5b2779
·
1 Parent(s): e69de3e

first commit

Browse files
Files changed (2) hide show
  1. app.py +205 -0
  2. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,205 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ import pandas as pd
4
+ from PIL import Image
5
+ import io
6
+ import cv2 as cv
7
+ import pytesseract
8
+
9
# Sidebar controls that tune which contours are treated as table cells.
# Both values are read as module-level globals by setCountours().
min_size_of_cell = st.sidebar.slider(
    'Min. size of Cell', min_value=1, max_value=50000, value=5000
)
st.sidebar.write("Adjust this setting so that no text gets selected and not too large that any cell will be missed.")
table_contour_factor = st.sidebar.slider(
    'Table Contour Factor', min_value=1, max_value=100, value=10
)
st.sidebar.write("Adjust this setting so that the border of entire table / image is not selected. Also not too large that any cell will be missed.")

# Seed the per-session values shared between setCountours() and runalgo().
# Existing values are left untouched so they survive Streamlit reruns.
for _state_key, _initial_value in (
    ('significant_contour_list', []),
    ('imgray', 0),
    ('df', pd.DataFrame()),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
22
+
23
def remove_newline_char(a):
    """Normalise one OCR cell value, dropping a single trailing newline.

    Args:
        a: The cell value. Normally a string, but may be ``None`` or a float
            NaN when the cell is missing from the table.

    Returns:
        ``""`` for missing values (``None``, float NaN, the literal string
        ``"NaN"``, or the empty string); otherwise the string without its
        final ``"\\n"`` if it had one. Non-string values that are not
        missing are returned unchanged.
    """
    if a is None:
        return ""
    # NaN is the only value that is not equal to itself.
    if isinstance(a, float) and a != a:
        return ""
    if not isinstance(a, str):
        return a
    if a == "" or a == "NaN":
        return ""
    # Only one trailing newline is removed; inner and repeated newlines stay.
    if a.endswith('\n'):
        return a[:-1]
    return a
31
+
32
def convert_DF_to_csv(df):
    """Serialise a DataFrame's cell values to CSV text, without header or index.

    Every cell is converted with ``str()`` and written through ``csv.writer``
    so that cells containing commas, double quotes or line breaks are quoted
    and do not corrupt the row/column structure.

    Args:
        df: The pandas DataFrame to serialise.

    Returns:
        The CSV text, one ``"\\n"``-terminated line per DataFrame row. An
        empty DataFrame yields an empty string.
    """
    import csv

    buffer = io.StringIO()
    writer = csv.writer(buffer, lineterminator="\n")
    for row in df.itertuples(index=False, name=None):
        writer.writerow([str(cell) for cell in row])
    return buffer.getvalue()
43
+
44
def _group_cells_into_columns(centers, tolerance=5):
    """Arrange cell indices into table columns, each ordered top to bottom.

    Args:
        centers: Sequence of ``(center_x, center_y)`` pairs, one per cell.
        tolerance: Maximum pixel difference for two centres to be treated as
            lying on the same row (y) or in the same column (x).

    Returns:
        A list of columns ordered left to right. Each column is a list of
        indices into ``centers``: the header cell first, then every other
        cell whose centre x is within ``tolerance`` of the header's, sorted
        by centre y. Returns ``[]`` when there are no cells.
    """
    if not centers:
        return []

    # The header row is the set of cells level with the top-most centre.
    top_y = min(cy for _, cy in centers)
    header = [idx for idx, (_, cy) in enumerate(centers) if abs(top_y - cy) <= tolerance]
    header.sort(key=lambda idx: centers[idx][0])

    columns = []
    for head in header:
        head_x = centers[head][0]
        members = [
            idx
            for idx, (cx, _) in enumerate(centers)
            if idx != head and abs(cx - head_x) <= tolerance
        ]
        members.sort(key=lambda idx: centers[idx][1])
        columns.append([head] + members)
    return columns


def runalgo():
    """OCR every detected table cell and store the result as a DataFrame.

    Reads ``st.session_state.significant_contour_list`` (the cell contours)
    and ``st.session_state.imgray`` (the grayscale image), orders the cells
    into a grid using their bounding-box centres, runs Tesseract on each
    cell crop, and writes the resulting table to ``st.session_state.df``.
    """
    import os

    cell_contours = st.session_state.significant_contour_list
    imgray = st.session_state.imgray

    # Bounding boxes (x, y, w, h) make cropping and ordering straightforward.
    rects = [cv.boundingRect(contour) for contour in cell_contours]
    centers = [(x + w / 2, y + h / 2) for x, y, w, h in rects]

    # Contours arrive in arbitrary order, so the grid is rebuilt from geometry.
    columns = _group_cells_into_columns(centers)

    # The Homebrew path is only valid on the original author's macOS install.
    # Override pytesseract's command only when that binary actually exists;
    # otherwise keep pytesseract's own default so other machines still work.
    homebrew_tesseract = "/opt/homebrew/Cellar/tesseract/5.3.0_1/bin/tesseract"
    if os.path.isfile(homebrew_tesseract):
        pytesseract.pytesseract.tesseract_cmd = homebrew_tesseract

    # Extract the text of every cell, column by column.
    table_contents = []
    for column in columns:
        texts = []
        for idx in column:
            x, y, w, h = rects[idx]
            cropped = imgray[y:y + h, x:x + w]
            texts.append(pytesseract.image_to_string(cropped))
        table_contents.append(texts)

    # Columns may differ in length when a cell was not detected; pad them
    # with empty strings so the frame never contains None placeholders.
    n_rows = max((len(texts) for texts in table_contents), default=0)
    padded = [texts + [""] * (n_rows - len(texts)) for texts in table_contents]

    # The data is column-wise, so transpose it into a flat table.
    df = pd.DataFrame(padded).transpose()

    # Strip the trailing newline Tesseract leaves on each cell's text.
    for col in df.columns:
        df[col] = df[col].map(remove_newline_char)
    st.session_state.df = df
125
+
126
+
127
+
128
def contour_area(a):
    """Return the area of contour ``a`` as computed by ``cv.contourArea``."""
    area = cv.contourArea(a)
    return area
130
+
131
+
132
def setCountours(img_bytes):
    """Detect table-cell contours in an image and return an annotated preview.

    Side effects: stores the grayscale image in ``st.session_state.imgray``
    and the selected contours (a plain list) in
    ``st.session_state.significant_contour_list``.

    Args:
        img_bytes: The image as a BGR pixel array (despite the name, this is
            an array, not raw bytes; it is passed to ``cv.cvtColor`` with
            ``COLOR_BGR2GRAY``).

    Returns:
        A PIL image: a copy of the input with the selected contours drawn
        in green.
    """
    imgray = cv.cvtColor(img_bytes, cv.COLOR_BGR2GRAY)
    st.session_state.imgray = imgray
    _, thresh = cv.threshold(imgray, 127, 255, cv.THRESH_BINARY)
    # findContours returns 2 values in OpenCV 4 and 3 in OpenCV 3; the
    # contours are the second-to-last element in both.
    contours = cv.findContours(thresh, cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)[-2]

    # Area is the metric used to discard text glyphs and other small regions.
    areas = [contour_area(contour) for contour in contours]

    # An image is rarely cropped exactly to the table, so there is usually a
    # very large contour around the table/image border. Anything at or above
    # max_area / table_contour_factor is treated as such a border and skipped;
    # anything at or below the min_size_of_cell slider value is treated as text.
    largest_area = max(areas, default=0.0)
    upper_bound = largest_area / table_contour_factor
    # Kept as a list: contours have differing point counts, so packing them
    # into one numpy array is not possible in general.
    significant_contour_list = [
        contour
        for contour, area in zip(contours, areas)
        if min_size_of_cell < area < upper_bound
    ]
    st.session_state.significant_contour_list = significant_contour_list

    im_contours_significant = img_bytes.copy()
    if significant_contour_list:
        # The contours are drawn in green, 3 px thick.
        im_contours_significant = cv.drawContours(
            im_contours_significant, significant_contour_list, -1, (0, 255, 0), 3
        )

    img = cv.cvtColor(im_contours_significant, cv.COLOR_BGR2RGB)
    return Image.fromarray(img)
162
+
163
def convertImg(img):
    """Convert an image object into a BGR pixel array.

    ``img`` must provide ``convert('RGB')``; the converted pixels are loaded
    into a numpy array and the last (channel) axis is reversed, turning RGB
    ordering into BGR. A fresh copy is returned rather than a reversed view.
    """
    rgb_pixels = np.array(img.convert('RGB'))
    bgr_view = rgb_pixels[:, :, ::-1]
    return bgr_view.copy()
166
+
167
st.title("Table from Image using opencv")

# Optional sample image. The path only exists on the original author's
# machine, so a missing or unreadable file is tolerated and the app then
# simply waits for an upload instead of crashing at startup.
DEFAULT_IMAGE_PATH = '/Users/sarat/Desktop/Projects/Python/Table with data extraction/sports_data.png'
try:
    image = Image.open(DEFAULT_IMAGE_PATH)
except OSError:
    image = None

info_placeholder = st.empty()

uploaded_file = st.file_uploader("Upload Image",type=['png'])
if uploaded_file is not None:
    # An uploaded file takes precedence over the sample image.
    bytes_data = uploaded_file.getvalue()
    image = Image.open(io.BytesIO(bytes_data))

if image is None:
    info_placeholder.info("Upload a PNG image of a table to get started.")
    st.stop()

# Detect the cell contours once, for whichever image ended up selected.
image_contoured = setCountours(convertImg(image))

st.sidebar.header("Original Image")
st.sidebar.image(image)

col_b_1, col_b_2 = st.columns(2)

with col_b_1:
    # runalgo runs as a callback and stores its result in st.session_state.df.
    st.button("Convert",on_click=runalgo)

with col_b_2:
    st.download_button('Download CSV', convert_DF_to_csv(st.session_state.df), file_name='data.csv')

col1, col2 = st.columns(2)

with col2:
    # The table is shown only once a conversion has produced at least one row.
    if st.session_state.df.shape[0] != 0:
        st.header("Data")
        st.dataframe(st.session_state.df)

with col1:
    st.header("Image with Contours")
    st.image(image_contoured)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ numpy
2
+ pandas
3
+ Pillow
4
+ streamlit
5
+ opencv-python-headless
6
+ pytesseract