code / 数据获取 /pdf转txt.py
tradequant's picture
Upload 59 files
7ec53ba
# pip3 install "modelscope==1.7.2rc0" -f
# pip install pdfplumber
import glob
import pdfplumber
import re
import os
def check_lines(page, top, buttom):
lines = page.extract_words()[::]
text = ''
last_top = 0
last_check = 0
for each_line in lines:
if top == '' and buttom == '':
if abs(last_top - each_line['top']) <= 2:
text = text + each_line['text']
elif last_check > 0 and not re.search('(?:。|;|\d|报告全文)$', text):
text = text + each_line['text']
else:
text = text + '\n' + each_line['text']
elif top == '':
if each_line['top'] > buttom:
if abs(last_top - each_line['top']) <= 2:
text = text + each_line['text']
elif last_check > 0 and not re.search('(?:。|;|\d|报告全文)$', text):
text = text + each_line['text']
else:
text = text + '\n' + each_line['text']
else:
if each_line['top'] < top and each_line['top'] > buttom:
if abs(last_top - each_line['top']) <= 2:
text = text + each_line['text']
elif last_check > 0 and not re.search('(?:。|;|\d|报告全文)$', text):
text = text + each_line['text']
else:
text = text + '\n' + each_line['text']
last_top = each_line['top']
last_check = each_line['x1'] - page.width * 0.85
return text
def change_pdf_to_txt(name):
pdf = pdfplumber.open(name)
all_text = {}
allrow = 0
for i in range(len(pdf.pages)):
page = pdf.pages[i]
buttom = 0
tables = page.find_tables()
if len(tables) >= 1:
count = len(tables)
for table in tables:
if table.bbox[3] < buttom:
pass
else:
count = count - 1
top = table.bbox[1]
text = check_lines(page, top, buttom)
text_list = text.split('\n')
for _t in range(len(text_list)):
all_text[allrow] = {}
all_text[allrow]['page'] = page
all_text[allrow]['allrow'] = allrow
all_text[allrow]['type'] = 'text'
all_text[allrow]['inside'] = text_list[_t]
allrow = allrow + 1
buttom = table.bbox[3]
new_table = table.extract()
r_count = 0
for r in range(len(new_table)):
row = new_table[r]
if row[0] == None:
r_count = r_count + 1
for c in range(len(row)):
if row[c] != None and row[c] != '' and row[c] != ' ':
if new_table[r - r_count][c] == None:
new_table[r - r_count][c] = row[c]
else:
new_table[r - r_count][c] = new_table[r -
r_count][c] + row[c]
new_table[r][c] = None
else:
r_count = 0
end_table = []
for row in new_table:
if row[0] != None:
cell_list = []
for cell in row:
if cell != None:
cell = cell.replace('\n', '')
else:
cell = ''
cell_list.append(cell)
end_table.append(cell_list)
for row in end_table:
all_text[allrow] = {}
all_text[allrow]['page'] = page
all_text[allrow]['allrow'] = allrow
all_text[allrow]['type'] = 'excel'
all_text[allrow]['inside'] = str(row)
allrow = allrow + 1
if count == 0:
text = check_lines(page, '', buttom)
text_list = text.split('\n')
for _t in range(len(text_list)):
all_text[allrow] = {}
all_text[allrow]['page'] = page
all_text[allrow]['allrow'] = allrow
all_text[allrow]['type'] = 'text'
all_text[allrow]['inside'] = text_list[_t]
allrow = allrow + 1
else:
text = check_lines(page, '', '')
text_list = text.split('\n')
for _t in range(len(text_list)):
all_text[allrow] = {}
all_text[allrow]['page'] = page
all_text[allrow]['allrow'] = allrow
all_text[allrow]['type'] = 'text'
all_text[allrow]['inside'] = text_list[_t]
allrow = allrow + 1
save_path_1 = f'{folder_path}\\' + \
name.split('\\')[-1].replace('.pdf', '.txt')
save_path_2 = f'{folder_path}\\' + \
name.split('\\')[-1].replace('.pdf', '_txt.txt')
for key in all_text.keys():
with open(save_path_1, 'a+', encoding='utf-8') as file:
file.write(str(all_text[key]) + '\n')
with open(save_path_2, 'a+', encoding='utf-8') as file:
file.write(str(all_text[key]['inside']) + '\n')
folder_path = '新建文件夹'
# 获取文件夹内所有文件名称
file_names = glob.glob(folder_path + '/*')
file_names = sorted(file_names, reverse=True)
print(file_names)
# 打印文件名称
name_list = []
for file_name in file_names:
print(file_name)
try:
name_list.append(file_name)
allname = file_name.split('\\')[-1]
date = allname.split('__')[0]
name = allname.split('__')[1]
year = allname.split('__')[4]
change_pdf_to_txt(file_name)
except Exception as e:
print(f"发生bug: {e}")