Spaces:
Build error
Build error
Upload app.py
Browse files
app.py
CHANGED
@@ -194,6 +194,23 @@ class Paper:
|
|
194 |
text_list = []
|
195 |
section_dict = {}
|
196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
|
198 |
# 再处理其他章节:
|
199 |
text_list = [page.get_text() for page in self.pdf]
|
@@ -260,27 +277,198 @@ class Reader:
|
|
260 |
self.filter_keys = filter_keys # 用于在摘要中筛选的关键词
|
261 |
self.root_path = root_path
|
262 |
self.file_format = 'md' # or 'txt',如果为图片,则必须为'md'
|
263 |
-
|
264 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
265 |
def validateTitle(self, title):
    r"""Return *title* with path-unsafe characters replaced by underscores.

    Each occurrence of one of ``/ \ : * ? " < > |`` is replaced by ``_``;
    every other character is kept unchanged.

    Args:
        title: The paper title as a string.

    Returns:
        The sanitised title.
    """
    unsafe_chars = '/\\:*?"<>|'
    # One-to-one translation table: every unsafe character maps to "_".
    table = str.maketrans(unsafe_chars, "_" * len(unsafe_chars))
    return title.translate(table)
|
270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
271 |
|
272 |
def summary_with_chat(self, paper_list, key):
|
273 |
htmls = []
|
274 |
for paper_index, paper in enumerate(paper_list):
|
275 |
# 第一步先用title,abs,和introduction进行总结。
|
276 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
277 |
chat_summary_text = self.chat_summary(text=text, key=str(key))
|
278 |
htmls.append(chat_summary_text)
|
279 |
-
|
280 |
-
|
281 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
282 |
htmls.append("\n")
|
283 |
-
|
284 |
md_text = "\n".join(htmls)
|
285 |
|
286 |
return markdown.markdown(md_text)
|
@@ -289,31 +477,25 @@ class Reader:
|
|
289 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
290 |
stop=tenacity.stop_after_attempt(5),
|
291 |
reraise=True)
|
292 |
-
def chat_conclusion(self, text):
|
293 |
-
openai.api_key =
|
294 |
-
self.cur_api += 1
|
295 |
-
self.cur_api = 0 if self.cur_api >= len(self.chat_api_list) - 1 else self.cur_api
|
296 |
response = openai.ChatCompletion.create(
|
297 |
model="gpt-3.5-turbo",
|
298 |
# prompt需要用英语替换,少占用token。
|
299 |
messages=[
|
300 |
-
{"role": "system",
|
301 |
-
|
302 |
-
# chatgpt 角色
|
303 |
-
{"role": "assistant",
|
304 |
-
"content": "This is the <summary> and <conclusion> part of an English literature, where <summary> you have already summarized, but <conclusion> part, I need your help to summarize the following questions:" + text},
|
305 |
-
# 背景知识,可以参考OpenReview的审稿流程
|
306 |
{"role": "user", "content": """
|
307 |
-
8.
|
308 |
-
- (1)
|
309 |
-
- (2)
|
310 |
.......
|
311 |
-
|
312 |
-
8. Conclusion:
|
313 |
-
- (1):xxx
|
314 |
-
- (2)
|
315 |
-
|
316 |
-
|
317 |
"""},
|
318 |
]
|
319 |
)
|
@@ -321,38 +503,32 @@ class Reader:
|
|
321 |
for choice in response.choices:
|
322 |
result += choice.message.content
|
323 |
print("conclusion_result:\n", result)
|
324 |
-
return result
|
325 |
-
|
326 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
327 |
stop=tenacity.stop_after_attempt(5),
|
328 |
reraise=True)
|
329 |
-
def chat_method(self, text):
|
330 |
-
openai.api_key =
|
331 |
-
self.cur_api += 1
|
332 |
-
self.cur_api = 0 if self.cur_api >= len(self.chat_api_list) - 1 else self.cur_api
|
333 |
response = openai.ChatCompletion.create(
|
334 |
model="gpt-3.5-turbo",
|
335 |
messages=[
|
336 |
-
{"role": "system",
|
337 |
-
|
338 |
-
# chatgpt 角色
|
339 |
-
{"role": "assistant",
|
340 |
-
"content": "This is the <summary> and <Method> part of an English document, where <summary> you have summarized, but the <Methods> part, I need your help to read and summarize the following questions." + text},
|
341 |
-
# 背景知识
|
342 |
{"role": "user", "content": """
|
343 |
-
7.
|
344 |
- (1):...
|
345 |
- (2):...
|
346 |
- (3):...
|
347 |
- .......
|
348 |
-
|
349 |
-
7. Methods:
|
350 |
-
- (1):xxx
|
351 |
-
- (2):xxx
|
352 |
-
- (3):xxx
|
353 |
-
.......
|
354 |
-
|
355 |
-
|
356 |
"""},
|
357 |
]
|
358 |
)
|
@@ -365,44 +541,37 @@ class Reader:
|
|
365 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
366 |
stop=tenacity.stop_after_attempt(5),
|
367 |
reraise=True)
|
368 |
-
def chat_summary(self, text):
|
369 |
-
openai.api_key =
|
370 |
-
self.cur_api += 1
|
371 |
-
self.cur_api = 0 if self.cur_api >= len(self.chat_api_list) - 1 else self.cur_api
|
372 |
-
|
373 |
response = openai.ChatCompletion.create(
|
374 |
model="gpt-3.5-turbo",
|
375 |
messages=[
|
376 |
-
{"role": "system",
|
377 |
-
|
378 |
-
# chatgpt 角色
|
379 |
-
{"role": "assistant",
|
380 |
-
"content": "This is the title, author, link, abstract and introduction of an English document. I need your help to read and summarize the following questions: " + text},
|
381 |
-
# 背景知识
|
382 |
{"role": "user", "content": """
|
383 |
-
1.
|
384 |
-
2.
|
385 |
-
3.
|
386 |
-
4.
|
387 |
-
5.
|
388 |
-
6.
|
389 |
-
- (1)
|
390 |
-
- (2)
|
391 |
-
- (3)
|
392 |
-
- (4)
|
393 |
-
|
394 |
-
1. Title: xxx
|
395 |
-
2. Authors: xxx
|
396 |
-
3. Affiliation: xxx
|
397 |
-
4. Keywords: xxx
|
398 |
-
5. Urls: xxx or xxx , xxx
|
399 |
-
6. Summary:
|
400 |
-
- (1):xxx
|
401 |
-
- (2):xxx
|
402 |
-
- (3):xxx
|
403 |
-
- (4):xxx
|
404 |
-
|
405 |
-
|
406 |
"""},
|
407 |
]
|
408 |
)
|
@@ -410,69 +579,8 @@ class Reader:
|
|
410 |
for choice in response.choices:
|
411 |
result += choice.message.content
|
412 |
print("summary_result:\n", result)
|
413 |
-
return result
|
414 |
-
|
415 |
-
stop=tenacity.stop_after_attempt(5),
|
416 |
-
reraise=True)
|
417 |
-
def chat_review(self, text):
|
418 |
-
openai.api_key = self.chat_api_list[self.cur_api]
|
419 |
-
self.cur_api += 1
|
420 |
-
self.cur_api = 0 if self.cur_api >= len(self.chat_api_list) - 1 else self.cur_api
|
421 |
-
|
422 |
-
response = openai.ChatCompletion.create(
|
423 |
-
model="gpt-3.5-turbo",
|
424 |
-
messages=[
|
425 |
-
{"role": "system",
|
426 |
-
"content": "You are a researcher in the field of [" + self.key_word + "] who is good at reviewing papers using concise statements"},
|
427 |
-
# chatgpt 角色
|
428 |
-
{"role": "assistant",
|
429 |
-
"content": "This is the title, author, link, abstract, introduction, method, experiments, and conclusions of an English document. I need your help to read and summarize the following questions: " + text},
|
430 |
-
# 背景知识
|
431 |
-
{"role": "user", "content": """
|
432 |
-
1. Mark the title of the paper (use English)
|
433 |
-
2. list all the authors' names (use English)
|
434 |
-
3. mark the first author's affiliation (use English)
|
435 |
-
4. mark the keywords of this article (use English)
|
436 |
-
5. link to the paper, Github code link (if available, fill in Github:None if not)
|
437 |
-
6. summarize according to the following four points.Be sure to use Chinese answers (proper nouns need to be marked in English)
|
438 |
-
- (1):What is the research background of this article?
|
439 |
-
- (2):What are the past methods? What are the problems with them? Is the approach well motivated?
|
440 |
-
- (3):What is the research methodology proposed in this paper?
|
441 |
-
- (4):On what task and what performance is achieved by the methods in this paper? Can the performance support their goals?
|
442 |
-
Follow the format of the output that follows:
|
443 |
-
1. Title: xxx\n\n
|
444 |
-
2. Summary: \n\n
|
445 |
-
- (1):xxx;\n
|
446 |
-
- (2):xxx;\n
|
447 |
-
- (3):xxx;\n
|
448 |
-
- (4):xxx.\n\n
|
449 |
-
3. Strength: \n\n
|
450 |
-
- (1):xxx;\n
|
451 |
-
- (2):xxx;\n
|
452 |
-
- (3):xxx;\n
|
453 |
-
- (4):xxx.\n\n
|
454 |
-
4. Weakness: \n\n
|
455 |
-
- (1):xxx;\n
|
456 |
-
- (2):xxx;\n
|
457 |
-
- (3):xxx;\n
|
458 |
-
- (4):xxx.\n\n
|
459 |
-
5. Other questions: \n\n
|
460 |
-
- (1):grammars;\n
|
461 |
-
- (2):figures;\n
|
462 |
-
- (3):tables;\n
|
463 |
-
- (4):other detailed questions.\n\n
|
464 |
-
Be sure to use English answers, statements as concise and academic as possible, do not have too much repetitive information, numerical values using the original numbers, be sure to strictly follow the format, the corresponding content output to xxx, in accordance with \n line feed.
|
465 |
-
"""},
|
466 |
-
]
|
467 |
-
)
|
468 |
-
result = ''
|
469 |
-
for choice in response.choices:
|
470 |
-
result += choice.message.content
|
471 |
-
print("Review_result:\n", result)
|
472 |
-
return result
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
def export_to_markdown(self, text, file_name, mode='w'):
|
477 |
# 使用markdown模块的convert方法,将文本转换为html格式
|
478 |
# html = markdown.markdown(text)
|
@@ -487,8 +595,10 @@ class Reader:
|
|
487 |
print(f"Query: {self.query}")
|
488 |
print(f"Sort: {self.sort}")
|
489 |
|
490 |
-
def upload_pdf(key,file):
|
491 |
-
|
|
|
|
|
492 |
# 判断PDF文件
|
493 |
if file and file.name.split(".")[-1].lower() != "pdf":
|
494 |
return '请勿上传非 PDF 文件!'
|
@@ -501,18 +611,20 @@ def upload_pdf(key,file):
|
|
501 |
return sum_info
|
502 |
|
503 |
# 标题
|
504 |
-
title = "
|
505 |
# 描述
|
506 |
description = '''<div align='center'>
|
507 |
|
508 |
-
Use ChatGPT to summary
|
509 |
|
|
|
510 |
|
511 |
</div>
|
512 |
'''
|
513 |
# 创建Gradio界面
|
514 |
ip = [
|
515 |
gradio.inputs.Textbox(label="请输入你的API-key(必填)", default=""),
|
|
|
516 |
gradio.inputs.File(label="请上传论文PDF(必填)")
|
517 |
]
|
518 |
|
|
|
194 |
text_list = []
|
195 |
section_dict = {}
|
196 |
|
197 |
+
# # 先处理Abstract章节
|
198 |
+
# for page_index, page in enumerate(self.pdf):
|
199 |
+
# cur_text = page.get_text()
|
200 |
+
# # 如果该页面是Abstract章节所在页面
|
201 |
+
# if page_index == list(self.section_page_dict.values())[0]:
|
202 |
+
# abs_str = "Abstract"
|
203 |
+
# # 获取Abstract章节的起始位置
|
204 |
+
# first_index = cur_text.find(abs_str)
|
205 |
+
# # 查找下一个章节的关键词,这里是Introduction
|
206 |
+
# intro_str = "Introduction"
|
207 |
+
# if intro_str in cur_text:
|
208 |
+
# second_index = cur_text.find(intro_str)
|
209 |
+
# elif intro_str.upper() in cur_text:
|
210 |
+
# second_index = cur_text.find(intro_str.upper())
|
211 |
+
# # 将Abstract章节内容加入字典中
|
212 |
+
# section_dict[abs_str] = cur_text[first_index+len(abs_str)+1:second_index].replace('-\n',
|
213 |
+
# '').replace('\n', ' ').split('I.')[0].split("II.")[0]
|
214 |
|
215 |
# 再处理其他章节:
|
216 |
text_list = [page.get_text() for page in self.pdf]
|
|
|
277 |
self.filter_keys = filter_keys # 用于在摘要中筛选的关键词
|
278 |
self.root_path = root_path
|
279 |
self.file_format = 'md' # or 'txt',如果为图片,则必须为'md'
|
280 |
+
self.save_image = False
|
281 |
+
if self.save_image:
|
282 |
+
self.gitee_key = self.config.get('Gitee', 'api')
|
283 |
+
else:
|
284 |
+
self.gitee_key = ''
|
285 |
+
|
286 |
+
def get_arxiv(self, max_results=30):
    """Build the arXiv search object for this reader's query.

    Args:
        max_results: Value passed through as the search's ``max_results``.

    Returns:
        An ``arxiv.Search`` built from ``self.query`` and ``self.sort``,
        with descending sort order.
    """
    return arxiv.Search(
        query=self.query,
        max_results=max_results,
        sort_by=self.sort,
        sort_order=arxiv.SortOrder.Descending,
    )
|
293 |
+
|
294 |
+
def filter_arxiv(self, max_results=30):
    """Search arXiv and keep the results whose abstract contains every filter key.

    ``self.filter_keys`` is split on single spaces. A result is kept only
    when each resulting key occurs, case-insensitively, in the result's
    summary (with hyphenated line breaks and newlines flattened first).
    Progress is printed to stdout.

    Args:
        max_results: Forwarded to ``self.get_arxiv``.

    Returns:
        A list of the matching result objects, in search order.
    """
    search = self.get_arxiv(max_results=max_results)
    print("all search:")
    # Materialise the results once so the listing and the filtering share a
    # single pass over search.results() instead of requesting it twice.
    all_results = list(search.results())
    for index, result in enumerate(all_results):
        print(index, result.title, result.updated)

    print("filter_keys:", self.filter_keys)
    # Lower-case the keys once rather than on every comparison.
    wanted_keys = [f_key.lower() for f_key in self.filter_keys.split(" ")]

    filter_results = []
    for result in all_results:
        abs_text = result.summary.replace('-\n', '-').replace('\n', ' ').lower()
        # A paper is a target only if every keyword appears in its abstract.
        if all(f_key in abs_text for f_key in wanted_keys):
            filter_results.append(result)

    print("filter_results:", len(filter_results))
    print("filter_papers:")
    for index, result in enumerate(filter_results):
        print(index, result.title, result.updated)
    return filter_results
|
319 |
+
|
320 |
def validateTitle(self, title):
    r"""Return *title* with path-unsafe characters replaced by underscores.

    Each occurrence of one of ``/ \ : * ? " < > |`` is replaced by ``_``;
    every other character is kept unchanged.

    Args:
        title: The paper title as a string.

    Returns:
        The sanitised title.
    """
    unsafe_chars = '/\\:*?"<>|'
    # One-to-one translation table: every unsafe character maps to "_".
    table = str.maketrans(unsafe_chars, "_" * len(unsafe_chars))
    return title.translate(table)
|
325 |
|
326 |
+
def download_pdf(self, filter_results):
    """Download and parse the PDF of every result into a per-query folder.

    The folder is ``<root_path>pdf_files/<query label>-<YYYY-MM-DD-HH>``,
    where the query label is ``self.query`` with the ``au: ``, ``title: ``
    and ``ti: `` prefixes removed, colons turned into spaces, and truncated
    to 25 characters. A failure on one paper is printed and that paper is
    skipped; the remaining papers are still processed.

    Args:
        filter_results: Iterable of arXiv result objects providing
            ``title``, ``entry_id``, ``summary`` and ``authors``.

    Returns:
        A list of parsed ``Paper`` objects, one per successful download.
    """
    # Create the target folder first.
    date_str = str(datetime.datetime.now())[:13].replace(' ', '-')
    query_label = (self.query.replace('au: ', '')
                   .replace('title: ', '')
                   .replace('ti: ', '')
                   .replace(':', ' ')[:25])
    path = self.root_path + 'pdf_files/' + query_label + '-' + date_str
    try:
        # exist_ok covers the expected "already created this hour" case.
        os.makedirs(path, exist_ok=True)
    except OSError as e:
        # Stay best-effort, but report the problem instead of hiding it;
        # the per-paper handler below will surface any follow-on failure.
        print("makedirs_error:", e)
    print("All_paper:", len(filter_results))

    # Start downloading.
    paper_list = []
    for result in filter_results:
        try:
            pdf_name = self.validateTitle(result.title) + '.pdf'
            self.try_download_pdf(result, path, pdf_name)
            paper_path = os.path.join(path, pdf_name)
            print("paper_path:", paper_path)
            paper = Paper(path=paper_path,
                          url=result.entry_id,
                          title=result.title,
                          abs=result.summary.replace('-\n', '-').replace('\n', ' '),
                          authers=[str(aut) for aut in result.authors],
                          )
            # Download finished; parse the PDF.
            paper.parse_pdf()
            paper_list.append(paper)
        except Exception as e:
            # Deliberately broad: one bad paper must not abort the batch.
            print("download_error:", e)
    return paper_list
|
359 |
+
|
360 |
+
@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
    stop=tenacity.stop_after_attempt(5),
    reraise=True,
)
def try_download_pdf(self, result, path, pdf_name):
    """Download one result's PDF, retrying on failure.

    The call is wrapped by ``tenacity.retry`` with exponential wait
    (min 4, max 10), stopping after 5 attempts and re-raising the last
    error.

    Args:
        result: Result object exposing ``download_pdf``.
        path: Directory passed as the first argument to ``download_pdf``.
        pdf_name: File name passed as ``filename``.
    """
    result.download_pdf(path, filename=pdf_name)
|
365 |
+
|
366 |
+
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                stop=tenacity.stop_after_attempt(5),
                reraise=True)
def upload_gitee(self, image_path, image_name='', ext='png'):
    """Upload an image file to Gitee and return a URL for it.

    The file is base64-encoded and POSTed to the Gitee v5 "contents" API
    under ``<image_name>-<YYYY-MM-DD-HH-MM-SS>.<ext>``. Owner, repo and
    folder come from the ``Gitee`` section of ``self.config``; the token is
    ``self.gitee_key``. The call is retried by ``tenacity`` (up to 5
    attempts, last error re-raised).

    Args:
        image_path: Path of the local image file to upload.
        image_name: Prefix for the remote file name.
        ext: File extension appended to the remote file name.

    Returns:
        ``rep['content']['download_url']`` when the JSON response contains
        a ``content`` entry, otherwise the API URL that was posted to.
    """
    with open(image_path, 'rb') as f:
        base64_content = base64.b64encode(f.read()).decode()

    date_str = str(datetime.datetime.now())[:19].replace(':', '-').replace(' ', '-') + '.' + ext
    path = image_name + '-' + date_str

    # Read each config value once; they feed both the payload and the URL.
    owner = self.config.get('Gitee', 'owner')
    repo = self.config.get('Gitee', 'repo')
    folder = self.config.get('Gitee', 'path')

    payload = {
        "access_token": self.gitee_key,
        "owner": owner,
        "repo": repo,
        "path": folder,
        "content": base64_content,
        "message": "upload image"
    }
    # Owner, repo and folder must match your own Gitee account settings.
    url = 'https://gitee.com/api/v5/repos/' + owner + '/' + repo + '/contents/' + folder + '/' + path
    # A timeout turns a stalled connection into an exception, which the
    # retry decorator can act on; without one the call could block forever.
    rep = requests.post(url, json=payload, timeout=30).json()
    print("rep:", rep)
    if isinstance(rep, dict) and 'content' in rep:
        return rep['content']['download_url']
    # No content entry in the response: fall back to the API URL.
    return url
|
399 |
|
400 |
def summary_with_chat(self, paper_list, key):
    """Summarise each paper in three chat calls and return the result as HTML.

    For every paper:
      1. ``chat_summary`` receives the title, URL, abstract and the first
         parsed section (presumably the introduction - confirm against
         ``Paper.parse_pdf``).
      2. If an image is available and ``self.gitee_key`` is set, the image
         is uploaded and embedded as a Markdown figure.
      3. ``chat_method`` receives the summary plus the first section whose
         name contains "method" or "approach", if there is one.
      4. ``chat_conclusion`` receives both summaries plus the first section
         whose name contains "conclu", if there is one.
    Every prompt is truncated to ``2500 * 4`` characters.

    Args:
        paper_list: Iterable of parsed paper objects exposing ``title``,
            ``url``, ``abs``, ``section_text_dict`` and ``get_image_path()``.
        key: API key; passed to each chat call as ``str(key)``.

    Returns:
        The HTML produced by ``markdown.markdown`` from all collected pieces
        joined with newlines.
    """
    max_token = 2500 * 4  # character budget applied to every prompt
    api_key = str(key)

    def find_section(sections, needles):
        """Return the first section name containing any needle, else ''."""
        for name in sections:
            lowered = name.lower()
            if any(needle in lowered for needle in needles):
                return name
        return ''

    htmls = []
    for paper in paper_list:
        sections = paper.section_text_dict

        # Step 1: summarise from title, abstract and the first section.
        # next(..., '') keeps a paper with no parsed sections from raising
        # IndexError; it is then summarised from title/URL/abstract alone.
        first_section = next(iter(sections.values()), '')
        text = ('Title:' + paper.title
                + 'Url:' + paper.url
                + 'Abstrat:' + paper.abs
                + first_section)
        chat_summary_text = self.chat_summary(text=text[:max_token], key=api_key)
        htmls.append(chat_summary_text)

        # TODO: insert the paper's largest image into the document; the
        # selection could be made smarter.
        first_image, ext = paper.get_image_path()
        if first_image is not None and self.gitee_key != '':
            image_title = self.validateTitle(paper.title)
            image_url = self.upload_gitee(image_path=first_image, image_name=image_title, ext=ext)
            htmls.append("\n")
            htmls.append("![Fig](" + image_url + ")")
            htmls.append("\n")

        # Step 2: summarise the method.
        # TODO: some papers title the method section after the algorithm, so
        # this keyword match can miss it; needs a better strategy.
        method_key = find_section(sections, ('method', 'approach'))
        if method_key != '':
            # TODO: let tenacity decide the truncation automatically.
            text = ("<summary>" + chat_summary_text
                    + "\n <Methods>:\n" + sections[method_key])
            chat_method_text = self.chat_method(text=text[:max_token], key=api_key)
            htmls.append(chat_method_text)
        else:
            chat_method_text = ''
        htmls.append("\n")

        # Step 3: summarise the whole paper and assess it.
        conclusion_key = find_section(sections, ('conclu',))
        text = ("<summary>" + chat_summary_text
                + "\n <Method summary>:\n" + chat_method_text)
        if conclusion_key != '':
            text = text + "\n <Conclusion>:\n" + sections[conclusion_key]
        chat_conclusion_text = self.chat_conclusion(text=text[:max_token], key=api_key)
        htmls.append(chat_conclusion_text)
        htmls.append("\n")

    md_text = "\n".join(htmls)
    return markdown.markdown(md_text)
|
|
|
477 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
478 |
stop=tenacity.stop_after_attempt(5),
|
479 |
reraise=True)
|
480 |
+
def chat_conclusion(self, text, key):
|
481 |
+
openai.api_key = key
|
|
|
|
|
482 |
response = openai.ChatCompletion.create(
|
483 |
model="gpt-3.5-turbo",
|
484 |
# prompt需要用英语替换,少占用token。
|
485 |
messages=[
|
486 |
+
{"role": "system", "content": "你是一个["+self.key_word+"]领域的审稿人,你需要严格评审这篇文章"}, # chatgpt 角色
|
487 |
+
{"role": "assistant", "content": "这是一篇英文文献的<summary>和<conclusion>部分内容,其中<summary>你已经总结好了,但是<conclusion>部分,我需要你帮忙归纳下面问题:"+text}, # 背景知识,可以参考OpenReview的审稿流程
|
|
|
|
|
|
|
|
|
488 |
{"role": "user", "content": """
|
489 |
+
8. 做出如下总结:
|
490 |
+
- (1):这篇工作的意义如何?
|
491 |
+
- (2):从创新点、性能、工作量这三个维度,总结这篇文章的优点和缺点。
|
492 |
.......
|
493 |
+
按照后面的格式输出:
|
494 |
+
8. Conclusion:
|
495 |
+
- (1):xxx;
|
496 |
+
- (2):创新点: xxx; 性能: xxx; 工作量: xxx;
|
497 |
+
|
498 |
+
务必使用中文回答(专有名词需要用英文标注),语句尽量简洁且学术,不要和之前的<summary>内容重复,数值使用原文数字, 务必严格按照格式,将对应内容输出到xxx中,.......代表按照实际需求填写,如果没有可以不用写.
|
499 |
"""},
|
500 |
]
|
501 |
)
|
|
|
503 |
for choice in response.choices:
|
504 |
result += choice.message.content
|
505 |
print("conclusion_result:\n", result)
|
506 |
+
return result
|
507 |
+
|
508 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
509 |
stop=tenacity.stop_after_attempt(5),
|
510 |
reraise=True)
|
511 |
+
def chat_method(self, text, key):
|
512 |
+
openai.api_key = key
|
|
|
|
|
513 |
response = openai.ChatCompletion.create(
|
514 |
model="gpt-3.5-turbo",
|
515 |
messages=[
|
516 |
+
{"role": "system", "content": "你是一个["+self.key_word+"]领域的科研人员,善于使用精炼的语句总结论文"}, # chatgpt 角色
|
517 |
+
{"role": "assistant", "content": "这是一篇英文文献的<summary>和<Method>部分内容,其中<summary>你已经总结好了,但是<Methods>部分,我需要你帮忙阅读并归纳下面问题:"+text}, # 背景知识
|
|
|
|
|
|
|
|
|
518 |
{"role": "user", "content": """
|
519 |
+
7. 详细描述这篇文章的方法思路。比如说它的步骤是:
|
520 |
- (1):...
|
521 |
- (2):...
|
522 |
- (3):...
|
523 |
- .......
|
524 |
+
按照后面的格式输出:
|
525 |
+
7. Methods:
|
526 |
+
- (1):xxx;
|
527 |
+
- (2):xxx;
|
528 |
+
- (3):xxx;
|
529 |
+
.......
|
530 |
+
|
531 |
+
务必使用中文回答(专有名词需要用英文标注),语句尽量简洁且学术,不要和之前的<summary>内容重复,数值使用原文数字, 务必严格按照格式,将对应内容输出到xxx中,按照\n换行,.......代表按照实际需求填写,如果没有可以不用写.
|
532 |
"""},
|
533 |
]
|
534 |
)
|
|
|
541 |
@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
|
542 |
stop=tenacity.stop_after_attempt(5),
|
543 |
reraise=True)
|
544 |
+
def chat_summary(self, text, key):
|
545 |
+
openai.api_key = key
|
|
|
|
|
|
|
546 |
response = openai.ChatCompletion.create(
|
547 |
model="gpt-3.5-turbo",
|
548 |
messages=[
|
549 |
+
{"role": "system", "content": "你是一个["+self.key_word+"]领域的科研人员,善于使用精炼的语句总结论文"}, # chatgpt 角色
|
550 |
+
{"role": "assistant", "content": "这是一篇英文文献的标题,作者,链接,Abstract和Introduction部分内容,我需要你帮忙阅读并归纳下面问题:"+text}, # 背景知识
|
|
|
|
|
|
|
|
|
551 |
{"role": "user", "content": """
|
552 |
+
1. 标记出这篇文献的标题(加上中文翻译)
|
553 |
+
2. 列举所有的作者姓名 (使用英文)
|
554 |
+
3. 标记第一作者的单位(只输出中文翻译)
|
555 |
+
4. 标记出这篇文章的关键词(使用英文)
|
556 |
+
5. 论文链接,Github代码链接(如果有的话,没有的话请填写Github:None)
|
557 |
+
6. 按照下面四个点进行总结:
|
558 |
+
- (1):这篇文章的研究背景是什么?
|
559 |
+
- (2):过去的方法有哪些?它们存在什么问题?本文和过去的研究有哪些本质的区别?Is the approach well motivated?
|
560 |
+
- (3):本文提出的研究方法是什么?
|
561 |
+
- (4):本文方法在什么任务上,取得了什么性能?性能能否支持他们的目标?
|
562 |
+
按照后面的格式输出:
|
563 |
+
1. Title: xxx
|
564 |
+
2. Authors: xxx
|
565 |
+
3. Affiliation: xxx
|
566 |
+
4. Keywords: xxx
|
567 |
+
5. Urls: xxx or xxx , xxx
|
568 |
+
6. Summary:
|
569 |
+
- (1):xxx;
|
570 |
+
- (2):xxx;
|
571 |
+
- (3):xxx;
|
572 |
+
- (4):xxx.
|
573 |
+
|
574 |
+
务必使用中文回答(专有名词需要用英文标注),语句尽量简洁且学术,不要有太多重复的信息,数值使用原文数字, 务必严格按照格式,将对应内容输出到xxx中,按照\n换行.
|
575 |
"""},
|
576 |
]
|
577 |
)
|
|
|
579 |
for choice in response.choices:
|
580 |
result += choice.message.content
|
581 |
print("summary_result:\n", result)
|
582 |
+
return result
|
583 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
584 |
def export_to_markdown(self, text, file_name, mode='w'):
|
585 |
# 使用markdown模块的convert方法,将文本转换为html格式
|
586 |
# html = markdown.markdown(text)
|
|
|
595 |
print(f"Query: {self.query}")
|
596 |
print(f"Sort: {self.sort}")
|
597 |
|
598 |
+
def upload_pdf(key, text, file):
|
599 |
+
# 检查两个输入都不为空
|
600 |
+
if not key or not text or not file:
|
601 |
+
return "两个输入都不能为空,请输入字符并上传 PDF 文件!"
|
602 |
# 判断PDF文件
|
603 |
if file and file.name.split(".")[-1].lower() != "pdf":
|
604 |
return '请勿上传非 PDF 文件!'
|
|
|
611 |
return sum_info
|
612 |
|
613 |
# 标题
|
614 |
+
title = "ChatPaper"
|
615 |
# 描述
|
616 |
description = '''<div align='center'>
|
617 |
|
618 |
+
Use ChatGPT to summary the papers.
|
619 |
|
620 |
+
Star our Github [ChatPaper](https://github.com/kaixindelele/ChatPaper)
|
621 |
|
622 |
</div>
|
623 |
'''
|
624 |
# 创建Gradio界面
|
625 |
ip = [
|
626 |
gradio.inputs.Textbox(label="请输入你的API-key(必填)", default=""),
|
627 |
+
gradio.inputs.Textbox(label="请输入论文大标题索引(用英文逗号隔开,必填)", default="'Abstract,Introduction,Related Work,Background,Preliminary,Problem Formulation,Methods,Methodology,Method,Approach,Approaches,Materials and Methods,Experiment Settings,Experiment,Experimental Results,Evaluation,Experiments,Results,Findings,Data Analysis,Discussion,Results and Discussion,Conclusion,References'"),
|
628 |
gradio.inputs.File(label="请上传论文PDF(必填)")
|
629 |
]
|
630 |
|