Sarat Chandra Ventrapragada
first commit
f169836
raw
history blame
8.51 kB
import streamlit as st
import numpy as np
import pandas as pd
from PIL import Image
import io
import cv2 as cv
import pytesseract
min_size_of_cell = st.sidebar.slider('Min. size of Cell', 1, 50000, 5000)
st.sidebar.write("Adjust this setting so that no text gets selected and not too large that any cell will be missed.")
table_contour_factor = st.sidebar.slider('Table Contour Factor', 1, 100, 10)
st.sidebar.write("Adjust this setting so that the border of entire table / image is not selected. Also not too large that any cell will be missed.")
if 'significant_contour_list' not in st.session_state:
st.session_state.significant_contour_list = []
if 'imgray' not in st.session_state:
st.session_state.imgray = 0
if 'df' not in st.session_state:
st.session_state.df = pd.DataFrame()
def remove_newline_char(a):
if a == "":
return ""
if str(a) == "NaN":
return ""
if a[-1] == '\n':
return a[:-1]
return a
def convert_DF_to_csv(df):
s = ""
for i in range(0,df.shape[0]):
for j in range(0,df.shape[1]):
if j == df.shape[1] - 1:
s = s + str(df.iloc[i,j])
else:
s = s + str(df.iloc[i,j]) + ","
s = s + '\n'
return s
def runalgo():
# now for easy of computing and establishing regions for text mining each signifiant contour, their respective bounding rectangular boxes are found.
significant_contour_list = st.session_state.significant_contour_list
significant_contour_rect_details = []
imgray = st.session_state.imgray
for i in range(0,len(significant_contour_list)):
significant_contour_rect_details.append(cv.boundingRect(significant_contour_list[i]))
# the center of each rect for each cell is computed to further easy in sorting and finding the order of cells.
significant_contour_rect_center = []
for i in range(0,len(significant_contour_rect_details)):
significant_contour_rect_center.append((significant_contour_rect_details[i][0] +
significant_contour_rect_details[i][2] / 2,
significant_contour_rect_details[i][1] +
significant_contour_rect_details[i][3] / 2,
i))
# since the order of contours can be different and the exact no. of rows and columns are always unclear
# 1. the contour with least y value is found
# 2. then the header row is figured out by comparing the y value of each cell with the least y value
# 3. still the header row may not be in a correct sequence hence they are ordered by x value to represent the header row of a flat table.
unordered_header_rows = []
min_y = 1000000.0
min_index = 0
for i in range(0,len(significant_contour_rect_center)):
if min_y >= significant_contour_rect_center[i][1]:
min_y = significant_contour_rect_center[i][1]
min_index = i
for i in range(0,len(significant_contour_rect_center)):
if abs(min_y - significant_contour_rect_center[i][1]) <= 5:
unordered_header_rows.append(i)
header_rows_x_values_unordered = []
for i in range(0,len(unordered_header_rows)):
header_rows_x_values_unordered.append(significant_contour_rect_center[unordered_header_rows[i]][0])
header_rows_x_values_index = np.argsort(header_rows_x_values_unordered)
header_rows_index = []
for i in range(0,len(header_rows_x_values_index)):
header_rows_index.append(unordered_header_rows[header_rows_x_values_index[i]])
# now from ordered header row cells the remaining cells that are vertically below are found out and then they are ordered by y value.
table_cells_index = []
for i in header_rows_index:
table_cells_index.append([i])
for i in range(0,len(header_rows_index)):
for j in range(0,len(significant_contour_rect_center)):
if abs(significant_contour_rect_center[j][0] -
significant_contour_rect_center[header_rows_index[i]][0]) <= 5 and j != header_rows_index[i]:
table_cells_index[i].append(j)
for i in range(0,len(header_rows_index)):
a = list(table_cells_index[i][1:])
col_y = []
for j in a:
col_y.append(significant_contour_rect_center[j][1])
col_y_index = np.argsort(col_y)
col_y_index = col_y_index
b = []
for j in col_y_index:
b.append(a[j])
table_cells_index[i] = [header_rows_index[i]] + b
# for ech cell tesseract is used to extract the text and stored in a 2d list.
# pytesseract.pytesseract.tesseract_cmd = "/opt/homebrew/Cellar/tesseract/5.3.0_1/bin/tesseract" #this is must for macOS M1
table_contents = []
for i in range(0,len(table_cells_index)):
a = []
for j in table_cells_index[i]:
y = significant_contour_rect_details[j][1]
h = significant_contour_rect_details[j][3]
x = significant_contour_rect_details[j][0]
w = significant_contour_rect_details[j][2]
cropped = imgray[y:y + h, x:x + w]
text = pytesseract.image_to_string(cropped)
a.append(text)
table_contents.append(a)
df = pd.DataFrame(table_contents)
df = df.transpose() # since the data is column wise we have to apply transpose to convert to a flat table.
# some preprocessing is required like removing new line character at the last for each cell in the dataframe.
for i in range(0,len(df.columns)):
df[i] = df.apply(lambda x: remove_newline_char(x[i]),axis = 1)
st.session_state.df = df
def contour_area(a):
return cv.contourArea(a)
def setCountours(img_bytes):
imgray = cv.cvtColor(img_bytes, cv.COLOR_BGR2GRAY)
st.session_state.imgray = imgray
ret, thresh = cv.threshold(imgray, 127, 255, 0)
contours, hierarchy = cv.findContours(thresh, cv.RETR_TREE, cv.CHAIN_APPROX_SIMPLE)
# creating a list of areas by each contour
contour_area_list = []
for i in range(0,len(contours)):
contour_area_list.append(contour_area(contours[i]))
contour_area_list = np.array(contour_area_list)
# finding only significant_counters -- here the area is used as metric to eliminate text contours and other small regions
significant_contour_list = []
max_contour_area = max(contour_area_list)
for i in range(0,len(contours)):
# here it is assumed that each cell int able be atleast 800 sq. pixels
# there is always a possiblity of non exact crop of image hence there will always be atleast 1 large contour around the table border.
if contour_area_list[i] > min_size_of_cell and contour_area_list[i] < max_contour_area / table_contour_factor:
significant_contour_list.append(contours[i])
significant_contour_list = np.array(significant_contour_list)
st.session_state.significant_contour_list = significant_contour_list
im_contours_significant = img_bytes.copy()
im_contours_significant = cv.drawContours(im_contours_significant, significant_contour_list, -1, (0,255,0), 3) # the contours are set to be visible in green
img = cv.cvtColor(im_contours_significant, cv.COLOR_BGR2RGB)
im_pil = Image.fromarray(img)
return im_pil
def convertImg(img):
nparr = np.array(img.convert('RGB'))
return nparr[:, :, ::-1].copy()
st.title("Table from Image using opencv")
image = Image.open('sports_data.png')
image_contoured = setCountours(convertImg(image))
info_placeholder = st.empty()
# tab1, tab2 = st.tabs(["Data","Contoured Image"])
# upload_image_button = st.button("Upload Image")
uploaded_file = st.file_uploader("Upload Image",type=['png'])
if uploaded_file is not None:
bytes_data = uploaded_file.getvalue()
image = Image.open(io.BytesIO(bytes_data))
image_contoured = setCountours(convertImg(image))
st.sidebar.header("Original Image")
st.sidebar.image(image)
col_b_1, col_b_2 = st.columns(2)
with col_b_1:
st.button("Convert",on_click=runalgo)
with col_b_2:
st.download_button('Download CSV', convert_DF_to_csv(st.session_state.df), file_name='data.csv')
col1, col2 = st.columns(2)
with col2:
if st.session_state.df.shape[0] != 0:
st.header("Data")
st.dataframe(st.session_state.df)
with col1:
st.header("Image with Contours")
st.image(image_contoured)