czczup ShiwenNi commited on
Commit
be6c855
0 Parent(s):

Duplicate from ShiwenNi/ChatReviewer

Browse files

Co-authored-by: ShiwenNi <ShiwenNi@users.noreply.huggingface.co>

Files changed (5) hide show
  1. .gitattributes +34 -0
  2. README.md +14 -0
  3. app.py +209 -0
  4. get_paper_from_pdf.py +193 -0
  5. requirements.txt +10 -0
.gitattributes ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ChatReviewer
3
+ emoji: 💩
4
+ colorFrom: red
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 3.22.1
8
+ app_file: app.py
9
+ pinned: false
10
+ license: apache-2.0
11
+ duplicated_from: ShiwenNi/ChatReviewer
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import os
3
+ import re
4
+ import jieba
5
+ from io import BytesIO
6
+ import datetime
7
+ import time
8
+ import openai, tenacity
9
+ import argparse
10
+ import configparser
11
+ import json
12
+ import tiktoken
13
+ import PyPDF2
14
+ import gradio
15
+
16
+
17
def contains_chinese(text):
    """Report whether *text* holds at least one CJK Unified Ideograph.

    Only code points in the range U+4E00..U+9FFF are counted; other scripts
    (kana, hangul, punctuation, ...) do not qualify.

    Args:
        text: The string to scan.

    Returns:
        True if any character lies in U+4E00..U+9FFF, otherwise False.
    """
    return any('\u4e00' <= char <= '\u9fff' for char in text)
22
+
23
def insert_sentence(text, sentence, interval):
    """Interleave *sentence* into *text* after every *interval* words.

    The text is processed line by line. A line that contains Chinese
    characters is segmented with ``jieba`` and re-joined without separators;
    any other line is split on whitespace and re-joined with single spaces.
    The word counter restarts on every line.

    Args:
        text: Text to decorate; lines are separated by ``'\\n'``.
        sentence: The string inserted as an extra "word".
        interval: Number of words between two insertions.

    Returns:
        The decorated text with the original line breaks preserved.
    """

    def _decorate(line):
        # Pick the tokenizer and the matching joiner for this line.
        if contains_chinese(line):
            tokens, glue = list(jieba.cut(line)), ''
        else:
            tokens, glue = line.split(), ' '

        pieces = []
        for position, token in enumerate(tokens, start=1):
            pieces.append(token)
            if position % interval == 0:
                pieces.append(sentence)
        return glue.join(pieces)

    return '\n'.join(_decorate(line) for line in text.split('\n'))
48
+
49
+ # 定义Reviewer类
50
class Reviewer:
    """Produce a ChatGPT-written review for a single uploaded paper.

    The paper arrives as raw PDF bytes. Text is extracted starting at the
    first page that mentions "abstract", truncated to fit a fixed token
    budget, and sent to ``gpt-3.5-turbo`` together with the requested review
    format.
    """

    def __init__(self, api, review_format, paper_pdf, language):
        """Store the settings of one review request.

        Args:
            api: OpenAI API key used for the completion request.
            review_format: Text describing the structure the review must follow.
            paper_pdf: Raw bytes of the uploaded PDF.
            language: Language the review must be written in.
        """
        self.api = api
        self.review_format = review_format

        self.language = language
        self.paper_pdf = paper_pdf
        # Total token budget used when deciding how much paper text to send.
        self.max_token_num = 4097
        self.encoding = tiktoken.get_encoding("gpt2")

    def review_by_chatgpt(self, paper_list):
        """Extract the paper text and have it reviewed.

        Args:
            paper_list: Accepted for interface compatibility; it is not read.
                The PDF stored on the instance is the one that is reviewed.

        Returns:
            A ``(review_text, total_tokens_used)`` tuple.
        """
        text = self.extract_chapter(self.paper_pdf)
        chat_review_text, total_token_used = self.chat_review(text=text)
        return chat_review_text, total_token_used

    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
                    stop=tenacity.stop_after_attempt(5),
                    reraise=True)
    def chat_review(self, text):
        """Send *text* to the chat model and return the watermarked review.

        The call is retried up to five times with exponential back-off; the
        last exception is re-raised if every attempt fails.

        Args:
            text: Extracted paper text.

        Returns:
            A ``(review_text, total_tokens_used)`` tuple.
        """
        # Tokens set aside for the system prompt / review format.
        review_prompt_token = 1000
        text_token = len(self.encoding.encode(text))
        # Scale the character count by (available tokens / actual tokens) to
        # estimate how many characters fit; +1 avoids dividing by zero.
        input_text_index = int(len(text) * (self.max_token_num - review_prompt_token) / (text_token + 1))
        input_text = "This is the paper for your review:" + text[:input_text_index]
        messages = [
            {"role": "system", "content": "You are a professional reviewer. Now I will give you a paper. You need to give a complete review opinion according to the following requirements and format:" + self.review_format + " Must be output in {}.".format(self.language)},
            {"role": "user", "content": input_text},
        ]

        # The key is passed per request instead of being written to the
        # process-wide ``openai.api_key``: the app serves several users from
        # one process, and a global would let concurrent requests overwrite
        # each other's key.
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            api_key=self.api,
        )
        result = ''.join(choice.message.content for choice in response.choices)
        result = insert_sentence(result, '**Generated by ChatGPT, no copying allowed!**', 25)
        result += "\n\n⚠伦理声明/Ethics statement:\n--禁止直接复制生成的评论用于任何论文审稿工作!\n--Direct copying of generated comments for any paper review work is prohibited!"
        print("********" * 10)
        print(result)
        print("********" * 10)
        print("prompt_token_used:", response.usage.prompt_tokens)
        print("completion_token_used:", response.usage.completion_tokens)
        print("total_token_used:", response.usage.total_tokens)
        print("response_time:", response.response_ms / 1000.0, 's')

        return result, response.usage.total_tokens

    def extract_chapter(self, pdf_path):
        """Return the text of up to three pages starting at the abstract.

        Extraction begins on the first page whose text contains "abstract"
        (case-insensitive) and covers that page plus the two that follow.
        If no page mentions "abstract", an empty string is returned.

        Args:
            pdf_path: Raw bytes of the PDF (despite the name, not a path).

        Returns:
            The concatenated text of the selected pages.
        """
        file_object = BytesIO(pdf_path)
        pdf_reader = PyPDF2.PdfReader(file_object)
        num_pages = len(pdf_reader.pages)

        # None until the page mentioning the abstract has been seen.
        page_number_start = None
        extracted_text = ""
        for page_number in range(num_pages):
            page_text = pdf_reader.pages[page_number].extract_text()

            if page_number_start is None and 'abstract' in page_text.lower():
                page_number_start = page_number

            if page_number_start is not None:
                extracted_text += page_text
                # Stop once the second page after the starting page is added.
                if page_number_start + 1 < page_number:
                    break
        return extracted_text
130
+
131
def main(api, review_format, paper_pdf, language):
    """Gradio callback: review the uploaded paper and report usage.

    Args:
        api: OpenAI API key entered by the user.
        review_format: Requested review structure.
        paper_pdf: Raw bytes of the uploaded PDF.
        language: Output language of the review.

    Returns:
        A ``(review_text, usage_summary)`` tuple, one value per output box of
        the interface. When a required input is missing, the first value is
        a prompt asking for complete input and the second is empty.
    """
    start_time = time.time()
    if not api or not review_format or not paper_pdf:
        # The interface declares two outputs, so a lone string here would make
        # Gradio fail instead of showing the message.
        return "请输入完整内容!", ""

    reviewer1 = Reviewer(api, review_format, paper_pdf, language)
    comments, total_token_used = reviewer1.review_by_chatgpt(paper_list=paper_pdf)
    time_used = time.time() - start_time
    output2 = "使用token数:" + str(total_token_used) + "\n花费时间:" + str(round(time_used, 2)) + "秒"
    return comments, output2
144
+
145
+
146
+
147
+ ########################################################################################################
148
+ # 标题
149
title = "🤖ChatReviewer🤖"

# Description rendered below the title.
description = '''<div align='left'>

<strong>ChatReviewer是一款基于ChatGPT-3.5的API开发的智能论文分析与建议助手。</strong>其用途如下:

⭐️对论文的优缺点进行快速总结和分析,提高科研人员的文献阅读和理解的效率,紧跟研究前沿。

⭐️对自己的论文进行分析,根据ChatReviewer生成的改进建议进行查漏补缺,进一步提高自己的论文质量。

如果觉得很卡,可以点击右上角的Duplicate this Space,把ChatReviewer复制到你自己的Space中!(🈲:禁止直接复制生成的评论用于任何论文审稿工作!)

本项目的[Github](https://github.com/nishiwen1214/ChatReviewer),欢迎Star和Fork,也欢迎大佬赞助让本项目快速成长!💗([获取Api Key](https://chatgpt.cn.obiscr.com/blog/posts/2023/How-to-get-api-key/))
</div>
'''

# Review template pre-filled in the "format" text box.
_default_review_format = """* Overall Review
Please briefly summarize the main points and contributions of this paper.
xxx
* Paper Strength 
Please provide a list of the strengths of this paper, including but not limited to: innovative and practical methodology, insightful empirical findings or in-depth theoretical analysis, 
well-structured review of relevant literature, and any other factors that may make the paper valuable to readers. (Maximum length: 2,000 characters) 
(1) xxx
(2) xxx
(3) xxx
* Paper Weakness 
Please provide a numbered list of your main concerns regarding this paper (so authors could respond to the concerns individually). 
These may include, but are not limited to: inadequate implementation details for reproducing the study, limited evaluation and ablation studies for the proposed method, 
correctness of the theoretical analysis or experimental results, lack of comparisons or discussions with widely-known baselines in the field, lack of clarity in exposition, 
or any other factors that may impede the reader's understanding or benefit from the paper. Please kindly refrain from providing a general assessment of the paper's novelty without providing detailed explanations. (Maximum length: 2,000 characters) 
(1) xxx
(2) xxx
(3) xxx
* Questions To Authors And Suggestions For Rebuttal 
Please provide a numbered list of specific and clear questions that pertain to the details of the proposed method, evaluation setting, or additional results that would aid in supporting the authors' claims. 
The questions should be formulated in a manner that, after the authors have answered them during the rebuttal, it would enable a more thorough assessment of the paper's quality. (Maximum length: 2,000 characters) 
*Overall score (1-10)
The paper is scored on a scale of 1-10, with 10 being the full mark, and 6 stands for borderline accept. Then give the reason for your rating.
xxx"""

# Build the Gradio input widgets; their order matches the parameters of main().
_api_key_box = gradio.inputs.Textbox(
    label="请输入你的API-key(sk开头的字符串)",
    default="",
    type='password',
)
_review_format_box = gradio.inputs.Textbox(
    lines=5,
    label="请输入特定的分析要求和格式(否则为默认格式)",
    default=_default_review_format,
)
_paper_upload = gradio.inputs.File(label="请上传论文PDF(必填)", type="bytes")
_language_choice = gradio.inputs.Radio(
    choices=["English", "Chinese"],
    default="English",
    label="选择输出语言",
)
inp = [_api_key_box, _review_format_box, _paper_upload, _language_choice]

chat_reviewer_gui = gradio.Interface(
    fn=main,
    inputs=inp,
    outputs=[
        gradio.Textbox(lines=25, label="分析结果"),
        gradio.Textbox(lines=2, label="资源统计"),
    ],
    title=title,
    description=description,
)

# Start server
chat_reviewer_gui.launch(quiet=True, show_api=False)
get_paper_from_pdf.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz, io, os
2
+ from PIL import Image
3
+ from collections import Counter
4
+ import json
5
+ import re
6
+
7
+ class Paper:
8
+ def __init__(self, path, title='', url='', abs='', authors=[]):
9
+ # 初始化函数,根据pdf路径初始化Paper对象
10
+ self.url = url # 文章链接
11
+ self.path = path # pdf路径
12
+ self.section_names = [] # 段落标题
13
+ self.section_texts = {} # 段落内容
14
+ self.abs = abs
15
+ self.title_page = 0
16
+ if title == '':
17
+ self.pdf = fitz.open(self.path) # pdf文档
18
+ self.title = self.get_title()
19
+ self.parse_pdf()
20
+ else:
21
+ self.title = title
22
+ self.authors = authors
23
+ self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
24
+ self.digit_num = [str(d + 1) for d in range(10)]
25
+ self.first_image = ''
26
+
27
+ def parse_pdf(self):
28
+ self.pdf = fitz.open(self.path) # pdf文档
29
+ self.text_list = [page.get_text() for page in self.pdf]
30
+ self.all_text = ' '.join(self.text_list)
31
+ self.extract_section_infomation()
32
+ self.section_texts.update({"title": self.title})
33
+ self.pdf.close()
34
+
35
+ # 定义一个函数,根据字体的大小,识别每个章节名称,并返回一个列表
36
+ def get_chapter_names(self, ):
37
+ # # 打开一个pdf文件
38
+ doc = fitz.open(self.path) # pdf文档
39
+ text_list = [page.get_text() for page in doc]
40
+ all_text = ''
41
+ for text in text_list:
42
+ all_text += text
43
+ # # 创建一个空列表,用于存储章节名称
44
+ chapter_names = []
45
+ for line in all_text.split('\n'):
46
+ line_list = line.split(' ')
47
+ if '.' in line:
48
+ point_split_list = line.split('.')
49
+ space_split_list = line.split(' ')
50
+ if 1 < len(space_split_list) < 5:
51
+ if 1 < len(point_split_list) < 5 and (
52
+ point_split_list[0] in self.roman_num or point_split_list[0] in self.digit_num):
53
+ # print("line:", line)
54
+ chapter_names.append(line)
55
+
56
+ return chapter_names
57
+
58
+ def get_title(self):
59
+ doc = self.pdf # 打开pdf文件
60
+ max_font_size = 0 # 初始化最大字体大小为0
61
+ max_string = "" # 初始化最大字体大小对应的字符串为空
62
+ max_font_sizes = [0]
63
+ for page_index, page in enumerate(doc): # 遍历每一页
64
+ text = page.get_text("dict") # 获取页面上的文本信息
65
+ blocks = text["blocks"] # 获取文本块列表
66
+ for block in blocks: # 遍历每个文本块
67
+ if block["type"] == 0 and len(block['lines']): # 如果是文字类型
68
+ if len(block["lines"][0]["spans"]):
69
+ font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小
70
+ max_font_sizes.append(font_size)
71
+ if font_size > max_font_size: # 如果字体大小大于当前最大值
72
+ max_font_size = font_size # 更新最大值
73
+ max_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串
74
+ max_font_sizes.sort()
75
+ # print("max_font_sizes", max_font_sizes[-10:])
76
+ cur_title = ''
77
+ for page_index, page in enumerate(doc): # 遍历每一页
78
+ text = page.get_text("dict") # 获取页面上的文本信息
79
+ blocks = text["blocks"] # 获取文本块列表
80
+ for block in blocks: # 遍历每个文本块
81
+ if block["type"] == 0 and len(block['lines']): # 如果是文字类型
82
+ if len(block["lines"][0]["spans"]):
83
+ cur_string = block["lines"][0]["spans"][0]["text"] # 更新最大值对应的字符串
84
+ font_flags = block["lines"][0]["spans"][0]["flags"] # 获取第一行第一段文字的字体特征
85
+ font_size = block["lines"][0]["spans"][0]["size"] # 获取第一行第一段文字的字体大小
86
+ # print(font_size)
87
+ if abs(font_size - max_font_sizes[-1]) < 0.3 or abs(font_size - max_font_sizes[-2]) < 0.3:
88
+ # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
89
+ if len(cur_string) > 4 and "arXiv" not in cur_string:
90
+ # print("The string is bold.", max_string, "font_size:", font_size, "font_flags:", font_flags)
91
+ if cur_title == '':
92
+ cur_title += cur_string
93
+ else:
94
+ cur_title += ' ' + cur_string
95
+ self.title_page = page_index
96
+ # break
97
+ title = cur_title.replace('\n', ' ')
98
+ return title
99
+
100
+ def extract_section_infomation(self):
101
+ doc = fitz.open(self.path)
102
+
103
+ # 获取文档中所有字体大小
104
+ font_sizes = []
105
+ for page in doc:
106
+ blocks = page.get_text("dict")["blocks"]
107
+ for block in blocks:
108
+ if 'lines' not in block:
109
+ continue
110
+ lines = block["lines"]
111
+ for line in lines:
112
+ for span in line["spans"]:
113
+ font_sizes.append(span["size"])
114
+ most_common_size, _ = Counter(font_sizes).most_common(1)[0]
115
+
116
+ # 按照最频繁的字体大小确定标题字体大小的阈值
117
+ threshold = most_common_size * 1
118
+
119
+ section_dict = {}
120
+ last_heading = None
121
+ subheadings = []
122
+ heading_font = -1
123
+ # 遍历每一页并查找子标题
124
+ found_abstract = False
125
+ upper_heading = False
126
+ font_heading = False
127
+ for page in doc:
128
+ blocks = page.get_text("dict")["blocks"]
129
+ for block in blocks:
130
+ if not found_abstract:
131
+ try:
132
+ text = json.dumps(block)
133
+ except:
134
+ continue
135
+ if re.search(r"\bAbstract\b", text, re.IGNORECASE):
136
+ found_abstract = True
137
+ last_heading = "Abstract"
138
+ section_dict["Abstract"] = ""
139
+ if found_abstract:
140
+ if 'lines' not in block:
141
+ continue
142
+ lines = block["lines"]
143
+ for line in lines:
144
+ for span in line["spans"]:
145
+ # 如果当前文本是子标题
146
+ if not font_heading and span["text"].isupper() and sum(1 for c in span["text"] if c.isupper() and ('A' <= c <='Z')) > 4: # 针对一些标题大小一样,但是全大写的论文
147
+ upper_heading = True
148
+ heading = span["text"].strip()
149
+ if "References" in heading: # reference 以后的内容不考虑
150
+ self.section_names = subheadings
151
+ self.section_texts = section_dict
152
+ return
153
+ subheadings.append(heading)
154
+ if last_heading is not None:
155
+ section_dict[last_heading] = section_dict[last_heading].strip()
156
+ section_dict[heading] = ""
157
+ last_heading = heading
158
+ if not upper_heading and span["size"] > threshold and re.match( # 正常情况下,通过字体大小判断
159
+ r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*",
160
+ span["text"].strip()):
161
+ font_heading = True
162
+ if heading_font == -1:
163
+ heading_font = span["size"]
164
+ elif heading_font != span["size"]:
165
+ continue
166
+ heading = span["text"].strip()
167
+ if "References" in heading: # reference 以后的内容不考虑
168
+ self.section_names = subheadings
169
+ self.section_texts = section_dict
170
+ return
171
+ subheadings.append(heading)
172
+ if last_heading is not None:
173
+ section_dict[last_heading] = section_dict[last_heading].strip()
174
+ section_dict[heading] = ""
175
+ last_heading = heading
176
+ # 否则将当前文本添加到上一个子标题的文本中
177
+ elif last_heading is not None:
178
+ section_dict[last_heading] += " " + span["text"].strip()
179
+ self.section_names = subheadings
180
+ self.section_texts = section_dict
181
+
182
+
183
def main():
    """Parse ``demo.pdf`` from the working directory as a quick manual check.

    Constructing ``Paper`` without a title already parses the PDF, so no
    separate ``parse_pdf()`` call is made; doing so would only repeat the
    work. The parsed sections are available in ``Paper.section_texts``.
    """
    path = r'demo.pdf'
    Paper(path=path)
190
+
191
+
192
# Run the demo parse only when this file is executed as a script.
if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ PyMuPDF==1.21.1
2
+ jieba
3
+ tiktoken==0.2.0
4
+ tenacity==8.2.2
5
+ pybase64==1.2.3
6
+ Pillow==9.4.0
7
+ openai==0.27.0
8
+ markdown
9
+ gradio==3.20.1
10
+ PyPDF2