Spaces:
Running
Running
wangrongsheng
committed on
Commit
•
f64978e
1
Parent(s):
3668f92
Update app.py
Browse files
app.py
CHANGED
@@ -59,14 +59,14 @@ class Paper:
|
|
59 |
self.sl = sl
|
60 |
self.section_names = [] # 段落标题
|
61 |
self.section_texts = {} # 段落内容
|
|
|
62 |
if title == '':
|
63 |
self.pdf = fitz.open(self.path) # pdf文档
|
64 |
self.title = self.get_title()
|
65 |
self.parse_pdf()
|
66 |
else:
|
67 |
self.title = title
|
68 |
-
self.authers = authers
|
69 |
-
self.abs = abs
|
70 |
self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
|
71 |
self.digit_num = [str(d+1) for d in range(10)]
|
72 |
self.first_image = ''
|
@@ -167,12 +167,13 @@ class Paper:
|
|
167 |
text = page.get_text("dict") # 获取页面上的文本信息
|
168 |
blocks = text["blocks"] # 获取文本块列表
|
169 |
for block in blocks: # 遍历每个文本块
|
170 |
-
if block["type"] == 0: # 如果是文字类型
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
|
|
176 |
max_font_sizes.sort()
|
177 |
print("max_font_sizes", max_font_sizes[-10:])
|
178 |
cur_title = ''
|
@@ -180,19 +181,20 @@ class Paper:
|
|
180 |
text = page.get_text("dict") # 获取页面上的文本信息
|
181 |
blocks = text["blocks"] # 获取文本块列表
|
182 |
for block in blocks: # 遍历每个文本块
|
183 |
-
if block["type"] == 0: # 如果是文字类型
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
cur_title
|
194 |
-
|
195 |
-
|
|
|
196 |
# break
|
197 |
title = cur_title.replace('\n', ' ')
|
198 |
return title
|
@@ -232,30 +234,12 @@ class Paper:
|
|
232 |
text = ''
|
233 |
text_list = []
|
234 |
section_dict = {}
|
235 |
-
|
236 |
-
# # 先处理Abstract章节
|
237 |
-
# for page_index, page in enumerate(self.pdf):
|
238 |
-
# cur_text = page.get_text()
|
239 |
-
# # 如果该页面是Abstract章节所在页面
|
240 |
-
# if page_index == list(self.section_page_dict.values())[0]:
|
241 |
-
# abs_str = "Abstract"
|
242 |
-
# # 获取Abstract章节的起始位置
|
243 |
-
# first_index = cur_text.find(abs_str)
|
244 |
-
# # 查找下一个章节的关键词,这里是Introduction
|
245 |
-
# intro_str = "Introduction"
|
246 |
-
# if intro_str in cur_text:
|
247 |
-
# second_index = cur_text.find(intro_str)
|
248 |
-
# elif intro_str.upper() in cur_text:
|
249 |
-
# second_index = cur_text.find(intro_str.upper())
|
250 |
-
# # 将Abstract章节内容加入字典中
|
251 |
-
# section_dict[abs_str] = cur_text[first_index+len(abs_str)+1:second_index].replace('-\n',
|
252 |
-
# '').replace('\n', ' ').split('I.')[0].split("II.")[0]
|
253 |
-
|
254 |
# 再处理其他章节:
|
255 |
text_list = [page.get_text() for page in self.pdf]
|
256 |
for sec_index, sec_name in enumerate(self.section_page_dict):
|
257 |
print(sec_index, sec_name, self.section_page_dict[sec_name])
|
258 |
-
if sec_index <= 0:
|
259 |
continue
|
260 |
else:
|
261 |
# 直接考虑后面的内容:
|
|
|
59 |
self.sl = sl
|
60 |
self.section_names = [] # 段落标题
|
61 |
self.section_texts = {} # 段落内容
|
62 |
+
self.abs = abs
|
63 |
if title == '':
|
64 |
self.pdf = fitz.open(self.path) # pdf文档
|
65 |
self.title = self.get_title()
|
66 |
self.parse_pdf()
|
67 |
else:
|
68 |
self.title = title
|
69 |
+
self.authers = authers
|
|
|
70 |
self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
|
71 |
self.digit_num = [str(d+1) for d in range(10)]
|
72 |
self.first_image = ''
|
|
|
167 |
text = page.get_text("dict") # 获取页面上的文本信息
|
168 |
blocks = text["blocks"] # 获取文本块列表
|
169 |
for block in blocks: # 遍历每个文本块
|
170 |
+
if block["type"] == 0 and len(block['lines']): # 如果是文字类型
|
171 |
+
if len(block["lines"][0]["spans"]):
|
172 |
+
font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小
|
173 |
+
max_font_sizes.append(font_size)
|
174 |
+
if font_size > max_font_size: # 如果字体大小大于当前最大值
|
175 |
+
max_font_size = font_size # 更新最大值
|
176 |
+
max_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串
|
177 |
max_font_sizes.sort()
|
178 |
print("max_font_sizes", max_font_sizes[-10:])
|
179 |
cur_title = ''
|
|
|
181 |
text = page.get_text("dict") # 获取页面上的文本信息
|
182 |
blocks = text["blocks"] # 获取文本块列表
|
183 |
for block in blocks: # 遍历每个文本块
|
184 |
+
if block["type"] == 0 and len(block['lines']): # 如果是文字类型
|
185 |
+
if len(block["lines"][0]["spans"]):
|
186 |
+
cur_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串
|
187 |
+
font_flags = block["lines"][0]["spans"][0]["flags"] # 获取第一行第一段文字的字体特征
|
188 |
+
font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小
|
189 |
+
# print(font_size)
|
190 |
+
if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3:
|
191 |
+
# print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
|
192 |
+
if len(cur_string) > 4 and "arXiv" not in cur_string:
|
193 |
+
# print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
|
194 |
+
if cur_title == '' :
|
195 |
+
cur_title += cur_string
|
196 |
+
else:
|
197 |
+
cur_title += ' ' + cur_string
|
198 |
# break
|
199 |
title = cur_title.replace('\n', ' ')
|
200 |
return title
|
|
|
234 |
text = ''
|
235 |
text_list = []
|
236 |
section_dict = {}
|
237 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
# 再处理其他章节:
|
239 |
text_list = [page.get_text() for page in self.pdf]
|
240 |
for sec_index, sec_name in enumerate(self.section_page_dict):
|
241 |
print(sec_index, sec_name, self.section_page_dict[sec_name])
|
242 |
+
if sec_index <= 0 and self.abs:
|
243 |
continue
|
244 |
else:
|
245 |
# 直接考虑后面的内容:
|