woo3 committed on
Commit
7727a4e
·
1 Parent(s): 3df763b

Upload cas_match.py

Browse files
Files changed (1) hide show
  1. cas_match.py +265 -0
cas_match.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Sun Jun 4 15:55:12 2023
4
+
5
+ @author: wooji
6
+ """
7
+ import re
8
+ import streamlit as st
9
+ import numpy as np
10
+ import pandas as pd
11
+ import streamlit as st
12
+ import pandas as pd
13
+ from io import StringIO
14
+ import pdfplumber
15
+ import xlrd
16
+ import re
17
+ import pdfplumber
18
+ import pandas as pd
19
+ import time
20
+ import os
21
+ import numpy as np
22
+ #import win32com
23
+ #from win32com.client import Dispatch
24
+ import docx2pdf
25
+ import docx
26
+ #import win32com.client as wc
27
+ #import win32com.client as win32
28
+ import pytesseract
29
+ from PIL import Image
30
+ import os
31
+ from pdf2image import convert_from_path,convert_from_bytes
32
+ from io import BytesIO
33
+
34
+ import openpyxl
35
+ import base64
36
# Page heading rendered at the top of the Streamlit app.
APP_TITLE = "MSDS报告CAS号提取程序"
st.title(APP_TITLE)
37
+ #%%
38
+ from docx import Document
39
+
40
def get_tables(docx_path):
    """Return the text of every table cell in a .docx document as one string.

    Args:
        docx_path: Whatever ``docx.Document`` accepts for the document to
            read; the caller in this file passes a Streamlit uploaded-file
            object.

    Returns:
        str: The cell texts in table, row, cell order, joined with ``';'``.
        An empty string when the document has no tables.
    """
    document = Document(docx_path)
    cell_texts = []
    for table in document.tables:
        for row in table.rows:
            # Walk the cells the row actually exposes instead of indexing by
            # the table-wide column count, so a row can never be over-indexed.
            cell_texts.extend(cell.text for cell in row.cells)
    # The former ``.strip('')`` removed nothing (an empty character set strips
    # no characters), so the joined text is returned unchanged.
    return ';'.join(cell_texts)
54
+
55
+
56
def get_paragraphs(docx_path):
    """Return the text of every body paragraph in a .docx document as one string.

    Args:
        docx_path: Whatever ``docx.Document`` accepts for the document to
            read; the caller in this file passes a Streamlit uploaded-file
            object.

    Returns:
        str: The paragraph texts in document order, joined with ``';'``.
    """
    document = Document(docx_path)
    # The former ``.strip('')`` was a no-op (an empty character set strips
    # nothing), so the joined text is returned as-is.
    return ';'.join(paragraph.text for paragraph in document.paragraphs)
67
+
68
+
69
+
70
+ #%% 函数二、打开pdf文件,输出每一页pdf中的所有文字
71
def openpdf(path):
    """Return the extractable text of every page of a PDF as one string.

    Args:
        path: Whatever ``pdfplumber.open`` accepts for the PDF to read; the
            caller in this file passes a Streamlit uploaded-file object.

    Returns:
        str: The per-page texts in page order, joined with ``';'``. Pages
        that yield no text contribute an empty string.
    """
    with pdfplumber.open(path) as pdf:
        # A page without a text layer (e.g. a scan) may yield None rather than
        # '' from extract_text(), depending on the pdfplumber version
        # (NOTE(review): confirm for the pinned version). Coerce to '' so the
        # join cannot raise TypeError and the caller can still fall back to OCR.
        page_texts = [page.extract_text() or '' for page in pdf.pages]
    return ';'.join(page_texts)
81
+
82
+ #%% 函数三、将目标CAS号,和pdf中的内容进行比对。返回什么?
83
def extract(text, cas):
    """Return every non-overlapping match of the pattern ``cas`` in ``text``.

    Args:
        text: The text to search. ``None`` is treated as "nothing to search".
        cas: A regular-expression string, compiled here with ``re.S`` so that
            ``.`` also matches newlines, or an already compiled pattern, which
            is used as-is with its own flags.

    Returns:
        list: The matches as returned by ``findall``, in order of appearance;
        an empty list when nothing matches or ``text`` is ``None``.
    """
    if text is None:
        return []
    # re.compile() rejects a flags argument together with a compiled pattern,
    # so only compile when the caller handed over a pattern string.
    pattern = cas if hasattr(cas, 'findall') else re.compile(cas, re.S)
    return pattern.findall(text)
87
+ #%%
88
+ # data = pd.DataFrame(columns=['CAS','名称','匹配结果','备注'])
89
# Usage notes shown above the upload widget.
st.write('使用说明')
st.caption('支持解析的格式:.pdf(扫描版或非扫描版均支持)和.docx。可将MSDS文件夹直接拖拽到下方上传区域')

# One caption per column of the exported spreadsheet.
st.write('excel输出内容详解')
for _column_note in (
    '第一列为文件名称,所有上传的文件均会显示在第一列,即便该文件格式不支持提取',
    '第二列为文件中提取的CAS号,若为空则表明未提取到',
    '第三列为化学物质名称,仅支持显示与清单匹配成功的化学物质的名称',
    '第四列为匹配结果,共3种结果:3960种、优评优控、重点管控',
    '第五列为备注,共3种结果:1、不支持该格式文件,请手动查看:说明此类文件不支持解析,请手动查看;2、图片pdf,建议人工复核:说明该pdf为图片,提取正确率较低,视情况可进行人工复核;3、未检测到CAS,请手动检查:说明在该文件中未检测到CAS,请人工确认',
):
    st.caption(_column_note)

st.caption('提取速度:提取一个电子pdf大约耗时4s,一个扫描版pdf大约耗时10~20s。具体速度由pdf的页数决定')
st.divider()

# Multi-file uploader; the selected files are processed further down.
uploaded_file = st.file_uploader(
    "请上传MSDS报告,可直接往里拖拽文件夹",
    accept_multiple_files=True,
)
101
# Column layout of the result table, in output order.
_RESULT_COLUMNS = ['MSDS文件名称', 'CAS', '名称', '匹配结果', '备注']


def _ocr_pdf_text(pdf_bytes):
    """Rasterise a PDF given as raw bytes, OCR each page, return the joined text."""
    page_texts = []
    for page in convert_from_bytes(pdf_bytes):
        # Round-trip through an in-memory JPEG exactly as before, so the image
        # handed to Tesseract is the same as in earlier versions of this tool.
        buf = BytesIO()
        page.save(buf, format="JPEG")
        buf.seek(0)
        page_texts.append(
            pytesseract.image_to_string(Image.open(buf), lang='chi_sim')
        )
    return ';'.join(page_texts)


def _rows_for_upload(upload, cas):
    """Return the result rows (one dict per row) for a single uploaded file."""
    name = upload.name
    lowered = name.lower()  # accept .DOCX / .Pdf etc., not only exact-case suffixes
    is_docx = lowered.endswith('docx')
    is_pdf = lowered.endswith('pdf')
    if not (is_docx or is_pdf):
        return [{'MSDS文件名称': name, '备注': '不支持该格式文件,请手动查看'}]

    remark = None
    if is_docx:
        # Search paragraphs AND tables together. Previously tables were only
        # consulted when the paragraphs contained no CAS number at all, so
        # table entries were dropped as soon as the body text had one hit.
        text = ';'.join((get_paragraphs(upload), get_tables(upload)))
        found = extract(text, cas)
    else:
        found = extract(openpdf(upload), cas)
        if not found:
            # No CAS number in the text layer: fall back to OCR and flag the
            # rows for manual review.
            found = extract(_ocr_pdf_text(upload.getvalue()), cas)
            remark = '图片pdf,建议人工复核'

    if not found:
        return [{'MSDS文件名称': name, '备注': '未检测到CAS,请手动检查'}]

    rows = []
    for cas_number in found:
        row = {'MSDS文件名称': name, 'CAS': cas_number}
        if remark is not None:
            row['备注'] = remark
        rows.append(row)
    return rows


@st.cache_data
def main(uploaded_file):
    """Extract CAS numbers from the uploaded reports and match them to the list.

    Args:
        uploaded_file: The sequence of uploaded file objects returned by the
            Streamlit uploader. Each must expose ``name`` and ``getvalue()``
            and be readable by the docx/PDF helpers of this file.

    Returns:
        pandas.DataFrame: One row per distinct (file, CAS number) pair, plus
        one row for every file that is unsupported or yielded no CAS number.
        Columns: ``MSDS文件名称``, ``CAS``, ``名称``, ``匹配结果``, ``备注``.
        ``名称`` and ``匹配结果`` are filled from sheet ``总表`` of
        ``物质清单.xlsx`` when the CAS number occurs there. Index labels are the
        positions before de-duplication, so they may have gaps.

    Side effects:
        Reads ``物质清单.xlsx`` from the working directory and writes the
        elapsed run time to the page.
    """
    begin = time.time()

    # digits-2digits-1digit, not followed by another digit. The look-ahead
    # replaces the former trailing ``[^0-9]`` character, which had to be sliced
    # off every match and made a CAS number at the very end of the text
    # unmatchable.
    cas = r'[0-9]+-[0-9][0-9]-[0-9](?![0-9])'

    # Collect plain dict rows and build the frame once, instead of growing a
    # DataFrame with one pd.concat per row.
    records = []
    for upload in uploaded_file:
        records.extend(_rows_for_upload(upload, cas))

    # Drop repeated rows (the same CAS number found several times in one file).
    data_output = pd.DataFrame(records, columns=_RESULT_COLUMNS).drop_duplicates()

    # Reference list: CAS number -> substance name and list it belongs to.
    target_data = pd.read_excel('物质清单.xlsx', sheet_name='总表', index_col=0)
    lookup = (
        target_data[['CAS', '名称', '清单']]
        .dropna(subset=['CAS'])
        # The earlier row-by-row loop let the last matching list entry win.
        .drop_duplicates(subset='CAS', keep='last')
        .set_index('CAS')
    )
    # Whole-column assignment via a lookup. The previous chained
    # ``df.loc[row][col] = value`` wrote into an intermediate object, which
    # pandas does not guarantee to propagate back to the frame, and it
    # compared every result row against every list row.
    data_output['匹配结果'] = data_output['CAS'].map(lookup['清单'])
    data_output['名称'] = data_output['CAS'].map(lookup['名称'])

    run_time = time.time() - begin
    st.write('运行耗时:' + str(round(run_time, 2)) + '秒')
    return data_output
208
+
209
+
210
import html

# Nothing uploaded yet: end this script run here. Otherwise process the
# files, show the table and offer it as an Excel download.
if not uploaded_file:
    st.stop()
else:
    data_final = main(uploaded_file)
    # Bare expression on purpose: Streamlit "magic" renders it as a table.
    data_final

    # Build the workbook in memory. The earlier version wrote fixed file names
    # ('resuls.xlsx' / 'results.xlsx') into the working directory, which
    # concurrent sessions would overwrite for each other, and read the result
    # back through a file handle that was never closed.
    excel_buffer = BytesIO()
    data_final.to_excel(excel_buffer, engine='openpyxl')
    b64 = base64.b64encode(excel_buffer.getvalue()).decode('UTF-8')

    excel_name = st.text_input(':blue[请输入本次导入的文件所属企业名称,若为空则导出的excel默认取名为myresult.xlsx]')
    st.warning('建议示例:广西xx企业-原辅料 or 广西xx企业-产品 ------- 输入完请按回车 ', icon="🚨")

    download_name = f'{excel_name}.xlsx' if excel_name else 'myresult.xlsx'
    # The name is typed by the user and ends up inside HTML rendered with
    # unsafe_allow_html=True, so escape it and quote the attribute; unquoted,
    # a name containing a space was cut off and markup could be injected.
    safe_name = html.escape(download_name, quote=True)
    href = f'<a href="data:file/data;base64,{b64}" download="{safe_name}">导出excel</a>'
    st.markdown(href, unsafe_allow_html=True)


st.subheader('!!!单次使用完请刷新页面后再上传新的文件')
234
+
235
+
236
+ # else:
237
+ # excel_name = excel_name + '.xlsx'
238
+ # href = f'<a href="data:file/data;base64,{b64}" download={excel_name}>Download xlsx file</a>'#定义下载链接,默认的下载文件名是myresults.xlsx
239
+ # st.markdown(href, unsafe_allow_html=True)#输出到浏览器
240
+ # wb2.close()
241
+
242
+
243
+
244
+
245
+ ####直接写识别图片的代码
246
+ # stringio = StringIO(uploaded_file[file].getvalue().decode("utf-8"))
247
+ # st.write(stringio) ##这句是对的
248
+ # bytes_data = uploaded_file[file].read()
249
+ # st.write(bytes_data)
250
+ # st.write(uploaded_file[file])
251
+ # st.write(bytes_data)
252
+ # =============================================================================
253
+ # ####
254
+ # stringio = StringIO(uploaded_file[file].getvalue().decode("utf-8"))
255
+ # st.write(stringio)
256
+ # # To read file as string:
257
+ # string_data = stringio.read()
258
+ # st.write(string_data)
259
+ # ###
260
+ # =============================================================================
261
+
262
+
263
+
264
+
265
+