Runtime error
Runtime error
Sarat Chandra Ventrapragada
committed on
first commit
Browse files- +205 -0
- requirements.txt +6 -0
@@ -0,0 +1,205 @@
1 |
import streamlit as st
2 |
import numpy as np
3 |
import pandas as pd
4 |
from PIL import Image
5 |
import io
6 |
import cv2 as cv
7 |
import pytesseract
8 |
9 |
# Sidebar tuning controls. Both values are read as module globals by
# setCountours() when it filters contours by area.
min_size_of_cell = st.sidebar.slider(
    'Min. size of Cell', min_value=1, max_value=50000, value=5000
)
st.sidebar.write("Adjust this setting so that no text gets selected and not too large that any cell will be missed.")
table_contour_factor = st.sidebar.slider(
    'Table Contour Factor', min_value=1, max_value=100, value=10
)
st.sidebar.write("Adjust this setting so that the border of entire table / image is not selected. Also not too large that any cell will be missed.")

# Seed the session-state slots used by the rest of the script, but only when
# they are not already present, so existing values are never overwritten.
for _state_key, _make_default in (
    ('significant_contour_list', list),  # contours kept by setCountours()
    ('imgray', int),                     # placeholder 0 until an image is processed
    ('df', pd.DataFrame),                # extracted table, empty until runalgo() runs
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()
22 |
23 |
def remove_newline_char(a):
    """Return cell text ``a`` with one trailing newline removed.

    Args:
        a: The cell value, normally a string of OCR output. ``None`` and
            float NaN are accepted and treated as an empty cell.

    Returns:
        ``""`` for an empty or missing cell (and for the literal text
        ``"NaN"``); otherwise ``a`` without its final ``"\\n"`` if it has
        one, else ``a`` unchanged. Only a single trailing newline is removed.
    """
    # Missing cells are None or float NaN, not text. str(nan) is "nan", so
    # the textual "NaN" comparison below never catches them and indexing
    # them with a[-1] would raise TypeError.
    if a is None or (isinstance(a, float) and a != a):
        return ""
    if a == "" or str(a) == "NaN":
        return ""
    if a[-1] == '\n':
        return a[:-1]
    return a
31 |
32 |
def convert_DF_to_csv(df):
    """Serialise ``df`` to comma-separated text without header or index.

    Each row becomes one line: the ``str()`` of every cell joined by commas
    and terminated by ``"\\n"``. The last cell of a row is written exactly
    once and is not followed by a comma.

    NOTE: values are not quoted or escaped, so a cell that itself contains a
    comma or a newline will produce a malformed row.

    Args:
        df: A pandas DataFrame (accessed via ``df.shape`` and ``df.iloc``).

    Returns:
        The CSV text; ``""`` when the frame has no rows.
    """
    row_count, column_count = df.shape
    lines = [
        ",".join(str(df.iloc[row, col]) for col in range(column_count))
        for row in range(row_count)
    ]
    # Every row, including the last, ends with a newline.
    return "".join(line + "\n" for line in lines)
43 |
44 |
def runalgo():
    """OCR every detected table cell and store the result as a DataFrame.

    Reads ``st.session_state.significant_contour_list`` (cell contours) and
    ``st.session_state.imgray`` (grayscale image), and writes the extracted
    table to ``st.session_state.df``. When there are no contours an empty
    DataFrame is stored.

    Steps:
      1. Take the bounding rectangle ``(x, y, w, h)`` of each contour.
      2. Compute each rectangle's centre.
      3. Treat the cells whose centre y is within the alignment tolerance of
         the smallest centre y as the header row, ordered left to right.
      4. For each header cell, collect the other cells whose centre x is
         within the tolerance of the header's and order them top to bottom.
      5. Run Tesseract on each cell crop and assemble the text column-wise,
         then transpose into a flat table and strip one trailing newline
         from every cell.
    """
    import os  # local import: only needed to probe the Tesseract path below

    # Centres within this many pixels on one axis count as aligned.
    alignment_tolerance = 5

    significant_contour_list = st.session_state.significant_contour_list
    imgray = st.session_state.imgray

    # Bounding boxes make it easy to crop each cell and compare positions.
    significant_contour_rect_details = [
        cv.boundingRect(contour) for contour in significant_contour_list
    ]
    if not significant_contour_rect_details:
        st.session_state.df = pd.DataFrame()
        return

    # Centre of each cell rectangle, used for ordering the cells.
    significant_contour_rect_center = [
        (x + w / 2, y + h / 2)
        for (x, y, w, h) in significant_contour_rect_details
    ]

    # Contour order is arbitrary and the row/column counts are unknown, so
    # the header row is taken to be every cell level with the topmost one,
    # sorted by x.
    min_y = min(center_y for _, center_y in significant_contour_rect_center)
    header_rows_index = [
        index
        for index, (_, center_y) in enumerate(significant_contour_rect_center)
        if abs(min_y - center_y) <= alignment_tolerance
    ]
    header_rows_index.sort(
        key=lambda index: significant_contour_rect_center[index][0]
    )

    # Each column is its header cell followed by the vertically aligned
    # cells beneath it, sorted by y.
    table_cells_index = []
    for header_index in header_rows_index:
        header_x = significant_contour_rect_center[header_index][0]
        column_cells = [
            index
            for index, (center_x, _) in enumerate(significant_contour_rect_center)
            if index != header_index
            and abs(center_x - header_x) <= alignment_tolerance
        ]
        column_cells.sort(
            key=lambda index: significant_contour_rect_center[index][1]
        )
        table_cells_index.append([header_index] + column_cells)

    # The Homebrew path is required on macOS M1. Only apply it where that
    # binary exists, leaving pytesseract's configured command untouched on
    # other hosts.
    homebrew_tesseract = "/opt/homebrew/Cellar/tesseract/5.3.0_1/bin/tesseract"
    if os.path.exists(homebrew_tesseract):
        pytesseract.pytesseract.tesseract_cmd = homebrew_tesseract

    # Extract the text of every cell, one list per column.
    table_contents = []
    for column in table_cells_index:
        column_text = []
        for cell_index in column:
            x, y, w, h = significant_contour_rect_details[cell_index]
            cropped = imgray[y:y + h, x:x + w]
            column_text.append(pytesseract.image_to_string(cropped))
        table_contents.append(column_text)

    # Columns can have different numbers of detected cells. Pad with empty
    # strings so the frame never contains None, which the newline cleanup
    # cannot index.
    row_count = max((len(column) for column in table_contents), default=0)
    table_contents = [
        column + [""] * (row_count - len(column)) for column in table_contents
    ]

    # The data is column-wise, so transpose to obtain a flat table.
    df = pd.DataFrame(table_contents).transpose()
    # Remove the trailing newline from each cell.
    for column_label in df.columns:
        df[column_label] = df[column_label].map(remove_newline_char)
    st.session_state.df = df
125 |
126 |
127 |
128 |
def contour_area(a):
    """Return the area of contour ``a`` as computed by ``cv.contourArea``."""
    area = cv.contourArea(a)
    return area
130 |
131 |
132 |
def setCountours(img_bytes):
    """Find table-cell contours in an image and return a preview of them.

    Side effects: stores the grayscale image in ``st.session_state.imgray``
    and the retained contours (a plain list) in
    ``st.session_state.significant_contour_list``.

    Args:
        img_bytes: Image array converted with ``cv.COLOR_BGR2GRAY``, so it
            is expected in BGR channel order (as produced by ``convertImg``).

    Returns:
        A PIL image: a copy of the input with the retained contours drawn in
        green, converted to RGB.
    """
    imgray = cv.cvtColor(img_bytes, cv.COLOR_BGR2GRAY)
    st.session_state.imgray = imgray
    _, thresh = cv.threshold(imgray, 127, 255, cv.THRESH_BINARY)
    contours, _ = cv.findContours(thresh, cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)

    # Area of every contour, in the same order as ``contours``.
    contour_area_list = [contour_area(contour) for contour in contours]

    # Area is the metric used to discard text contours and other small
    # regions (below the "Min. size of Cell" slider). Because the image is
    # rarely cropped exactly, there is usually at least one large contour
    # around the table border; anything at or above
    # max_area / table_contour_factor is dropped to exclude it.
    # ``default=0`` keeps an image without contours from raising.
    max_contour_area = max(contour_area_list, default=0)
    area_upper_bound = max_contour_area / table_contour_factor
    # Kept as a list: contours have differing point counts, so they cannot
    # be packed into one regular NumPy array.
    significant_contour_list = [
        contour
        for contour, area in zip(contours, contour_area_list)
        if min_size_of_cell < area < area_upper_bound
    ]
    st.session_state.significant_contour_list = significant_contour_list

    im_contours_significant = img_bytes.copy()
    if significant_contour_list:
        # The contours are drawn in green.
        cv.drawContours(
            im_contours_significant, significant_contour_list, -1, (0, 255, 0), 3
        )

    img = cv.cvtColor(im_contours_significant, cv.COLOR_BGR2RGB)
    return Image.fromarray(img)
162 |
163 |
def convertImg(img):
    """Convert ``img`` to a NumPy array with the channel axis reversed.

    ``img.convert('RGB')`` is turned into an array and its last axis is
    flipped (RGB -> BGR order); a copy is returned so the result does not
    share memory with the reversed view.
    """
    rgb_pixels = np.array(img.convert('RGB'))
    bgr_view = rgb_pixels[:, :, ::-1]
    return bgr_view.copy()
166 |
167 |
import os  # used only to check whether the bundled sample image is available

st.title("Table from Image using opencv")

# Sample image from the author's machine. It is only loaded when the file
# exists, so the app still starts on hosts that do not have it.
default_image_path = '/Users/sarat/Desktop/Projects/Python/Table with data extraction/sports_data.png'

image = None
image_contoured = None
if os.path.exists(default_image_path):
    image = Image.open(default_image_path)
    image_contoured = setCountours(convertImg(image))

info_placeholder = st.empty()

uploaded_file = st.file_uploader("Upload Image",type=['png'])
if uploaded_file is not None:
    # An uploaded file replaces the sample image.
    bytes_data = uploaded_file.getvalue()
    image = Image.open(io.BytesIO(bytes_data))
    image_contoured = setCountours(convertImg(image))

if image is None:
    # NOTE(review): hint text added for the "no image yet" state; wording
    # is new, adjust as desired.
    info_placeholder.info("Upload a PNG image of a table to get started.")

st.sidebar.header("Original Image")
if image is not None:
    st.sidebar.image(image)

col_b_1, col_b_2 = st.columns(2)

with col_b_1:
    # NOTE(review): the widget for this column was not visible in the
    # reviewed source. A button that runs the extraction is the assumed
    # intent; confirm the label.
    st.button('Extract Table', on_click=runalgo)

with col_b_2:
    st.download_button('Download CSV', convert_DF_to_csv(st.session_state.df), file_name='data.csv')

col1, col2 = st.columns(2)

with col2:
    # Only show the table once an extraction has produced rows.
    if st.session_state.df.shape[0] != 0:
        st.dataframe(st.session_state.df)

with col1:
    st.header("Image with Contours")
    if image_contoured is not None:
        st.image(image_contoured)
@@ -0,0 +1,6 @@
1 |
2 |
3 |
4 |
5 |
6 |