Spaces:
Runtime error
Runtime error
Sarat Chandra Ventrapragada
commited on
Commit
·
e5b2779
1
Parent(s):
e69de3e
first commit
Browse files- app.py +205 -0
- requirements.txt +6 -0
app.py
ADDED
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import numpy as np
|
3 |
+
import pandas as pd
|
4 |
+
from PIL import Image
|
5 |
+
import io
|
6 |
+
import cv2 as cv
|
7 |
+
import pytesseract
|
8 |
+
|
9 |
+
# Sidebar controls that tune which contours are treated as table cells.
min_size_of_cell = st.sidebar.slider(
    'Min. size of Cell', min_value=1, max_value=50000, value=5000
)
st.sidebar.write("Adjust this setting so that no text gets selected and not too large that any cell will be missed.")
table_contour_factor = st.sidebar.slider(
    'Table Contour Factor', min_value=1, max_value=100, value=10
)
st.sidebar.write("Adjust this setting so that the border of entire table / image is not selected. Also not too large that any cell will be missed.")

# Seed the per-session values on the first run only; later reruns keep
# whatever the callbacks stored there.
for _state_key, _initial_value in (
    ('significant_contour_list', []),
    ('imgray', 0),
    ('df', pd.DataFrame()),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value
|
22 |
+
|
23 |
+
def remove_newline_char(a):
    """Return cell text *a* without its single trailing newline.

    Missing values (``None`` or a float NaN, e.g. the padding a DataFrame
    gets when its columns have different lengths), the empty string and the
    literal text ``"NaN"`` are all mapped to ``""``. Any other non-string
    value is returned unchanged instead of raising.
    """
    # NaN is the only float that is not equal to itself.
    if a is None or (isinstance(a, float) and a != a):
        return ""
    if a == "" or str(a) == "NaN":
        return ""
    if isinstance(a, str) and a.endswith('\n'):
        return a[:-1]
    return a
|
31 |
+
|
32 |
+
def convert_DF_to_csv(df):
    """Serialise *df* to CSV text.

    One newline-terminated line is written per DataFrame row; no header row
    and no index column are emitted. Every value is converted with ``str``.
    Fields containing a comma, a double quote or a line break are quoted
    (with embedded quotes doubled) so the output stays parseable. The csv
    module also writes a lone empty field as ``""`` to distinguish it from
    an empty row.

    Args:
        df: the pandas DataFrame to export.

    Returns:
        The CSV document as a single string (``""`` for an empty frame).
    """
    import csv  # local import: csv is not among the module-level imports

    buffer = io.StringIO()
    writer = csv.writer(buffer, lineterminator="\n")
    for row in df.itertuples(index=False, name=None):
        writer.writerow([str(value) for value in row])
    return buffer.getvalue()
|
43 |
+
|
44 |
+
def runalgo():
    """Run OCR over every detected table cell and store the table as a DataFrame.

    Reads the cell contours and the grayscale image from ``st.session_state``
    (``significant_contour_list`` and ``imgray``), arranges the cells into
    columns, extracts each cell's text with Tesseract and writes the resulting
    table to ``st.session_state.df``. Takes no arguments and returns nothing;
    it is used as a button callback.
    """
    import os  # local import: only needed for the optional Tesseract path check

    contours = st.session_state.significant_contour_list
    imgray = st.session_state.imgray

    # Bounding boxes (x, y, w, h) make cropping and ordering the cells easy.
    rects = [cv.boundingRect(contour) for contour in contours]
    # Box centres decide which row / column each cell belongs to.
    centers = [(x + w / 2, y + h / 2) for (x, y, w, h) in rects]

    columns = _group_cells_into_columns(centers)

    # The Homebrew binary is needed on macOS M1. Only point pytesseract at it
    # when it actually exists; elsewhere keep pytesseract's own default so the
    # app also works on hosts that have tesseract on PATH.
    homebrew_tesseract = "/opt/homebrew/Cellar/tesseract/5.3.0_1/bin/tesseract"
    if os.path.exists(homebrew_tesseract):
        pytesseract.pytesseract.tesseract_cmd = homebrew_tesseract

    # Extract the text of each cell, column by column.
    table_contents = []
    for column in columns:
        column_texts = []
        for cell in column:
            x, y, w, h = rects[cell]
            cropped = imgray[y:y + h, x:x + w]
            column_texts.append(pytesseract.image_to_string(cropped))
        table_contents.append(column_texts)

    # The data was collected column-wise, so transpose to get a flat table.
    df = pd.DataFrame(table_contents).transpose()
    # Strip the trailing newline from every cell.
    for col in df.columns:
        df[col] = df[col].map(remove_newline_char)
    st.session_state.df = df


def _group_cells_into_columns(centers, tolerance=5):
    """Group cell indices into columns ordered left-to-right, each top-to-bottom.

    The header row is every cell whose centre lies within *tolerance* pixels
    (vertically) of the top-most centre. Each header cell then collects the
    other cells whose centre is within *tolerance* pixels of it horizontally.

    Args:
        centers: list of ``(center_x, center_y)`` tuples, one per cell.
        tolerance: maximum centre offset, in pixels, for two cells to be
            considered aligned.

    Returns:
        A list of columns; each column is a list of indices into *centers*
        starting with its header cell. Empty when *centers* is empty.
    """
    if not centers:
        return []

    top_y = min(center_y for _, center_y in centers)
    header = [
        index
        for index, (_, center_y) in enumerate(centers)
        if abs(top_y - center_y) <= tolerance
    ]
    # Contours come in arbitrary order, so order the header row by x.
    header.sort(key=lambda index: centers[index][0])

    columns = []
    for head in header:
        head_x = centers[head][0]
        below = [
            index
            for index, (center_x, _) in enumerate(centers)
            if index != head and abs(center_x - head_x) <= tolerance
        ]
        below.sort(key=lambda index: centers[index][1])
        columns.append([head] + below)
    return columns
|
125 |
+
|
126 |
+
|
127 |
+
|
128 |
+
def contour_area(a):
    """Return the area of contour *a* as computed by ``cv.contourArea``."""
    area = cv.contourArea(a)
    return area
|
130 |
+
|
131 |
+
|
132 |
+
def setCountours(img_bytes):
    """Find the table-cell contours of an image and return a preview of them.

    The image is converted to grayscale, thresholded and searched for
    contours. A contour is kept when its area is larger than the
    ``min_size_of_cell`` slider value and smaller than the largest contour
    area divided by the ``table_contour_factor`` slider value, which drops
    text-sized regions and the outer table / image border.

    Side effects: stores the grayscale image in ``st.session_state.imgray``
    and the kept contours (as a plain list) in
    ``st.session_state.significant_contour_list``.

    Args:
        img_bytes: the image as a BGR pixel array.

    Returns:
        A PIL image: a copy of the input with the kept contours drawn in green.
    """
    imgray = cv.cvtColor(img_bytes, cv.COLOR_BGR2GRAY)
    st.session_state.imgray = imgray
    _, thresh = cv.threshold(imgray, 127, 255, cv.THRESH_BINARY)
    contours, _ = cv.findContours(thresh, cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)

    # Area is the metric used to eliminate text contours and other small regions.
    areas = [contour_area(contour) for contour in contours]
    # An inexact crop usually leaves one large contour around the table
    # border; default=0 keeps an image with no contours from raising.
    max_contour_area = max(areas, default=0)
    upper_bound = max_contour_area / table_contour_factor

    # Keep a plain list: contours have different point counts, so packing
    # them into a single NumPy array is ragged and fails on current NumPy.
    significant_contour_list = [
        contour
        for contour, area in zip(contours, areas)
        if min_size_of_cell < area < upper_bound
    ]
    st.session_state.significant_contour_list = significant_contour_list

    im_contours_significant = img_bytes.copy()
    if significant_contour_list:
        # The contours are drawn in green, 3 px wide.
        cv.drawContours(im_contours_significant, significant_contour_list, -1, (0, 255, 0), 3)

    img = cv.cvtColor(im_contours_significant, cv.COLOR_BGR2RGB)
    return Image.fromarray(img)
|
162 |
+
|
163 |
+
def convertImg(img):
    """Convert an image object into a BGR pixel array.

    *img* is converted to RGB mode via its ``convert`` method, turned into a
    NumPy array, and the channel axis is reversed. A fresh copy is returned.
    """
    rgb_pixels = np.array(img.convert('RGB'))
    bgr_view = rgb_pixels[..., ::-1]
    return bgr_view.copy()
|
166 |
+
|
167 |
+
st.title("Table from Image using opencv")

# Sample image shown before anything is uploaded. This is an absolute path on
# the author's machine, so a missing file must not crash the app: fall back to
# asking for an upload instead.
try:
    image = Image.open('/Users/sarat/Desktop/Projects/Python/Table with data extraction/sports_data.png')
except FileNotFoundError:
    image = None
image_contoured = setCountours(convertImg(image)) if image is not None else None

info_placeholder = st.empty()

uploaded_file = st.file_uploader("Upload Image",type=['png'])
if uploaded_file is not None:
    bytes_data = uploaded_file.getvalue()
    image = Image.open(io.BytesIO(bytes_data))
    image_contoured = setCountours(convertImg(image))

# Without a sample or an upload there is nothing to display or convert yet.
if image is None:
    info_placeholder.info("Upload a PNG image of a table to get started.")
    st.stop()

st.sidebar.header("Original Image")
st.sidebar.image(image)

col_b_1, col_b_2 = st.columns(2)

with col_b_1:
    # runalgo fills st.session_state.df from the contours detected above.
    st.button("Convert",on_click=runalgo)

with col_b_2:
    st.download_button('Download CSV', convert_DF_to_csv(st.session_state.df), file_name='data.csv')

col1, col2 = st.columns(2)

with col2:
    # Show the table only once a conversion has produced rows.
    if st.session_state.df.shape[0] != 0:
        st.header("Data")
        st.dataframe(st.session_state.df)

with col1:
    st.header("Image with Contours")
    st.image(image_contoured)
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
numpy
|
2 |
+
pandas
|
3 |
+
Pillow
|
4 |
+
# io is part of the Python standard library; nothing to install
|
5 |
+
opencv-python-headless
|
6 |
+
pytesseract
|