ChatReviewer

Running

App Files Files Community

czczup

ShiwenNi committed on May 3, 2023

Commit

be6c855

0 Parent(s):

Duplicate from ShiwenNi/ChatReviewer

Browse files

Co-authored-by: ShiwenNi <ShiwenNi@users.noreply.huggingface.co>

Files changed (5) hide show

.gitattributes +34 -0
README.md +14 -0
app.py +209 -0
get_paper_from_pdf.py +193 -0
requirements.txt +10 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: ChatReviewer
+emoji: 💩
+colorFrom: red
+colorTo: pink
+sdk: gradio
+sdk_version: 3.22.1
+app_file: app.py
+pinned: false
+license: apache-2.0
+duplicated_from: ShiwenNi/ChatReviewer
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,209 @@

+import numpy as np
+import os
+import re
+import jieba
+from io import BytesIO
+import datetime
+import time
+import openai, tenacity
+import argparse
+import configparser
+import json
+import tiktoken
+import PyPDF2
+import gradio
+def contains_chinese(text):
+    """Return True when *text* has at least one character in U+4E00..U+9FFF."""
+    return any('\u4e00' <= char <= '\u9fff' for char in text)
+def insert_sentence(text, sentence, interval):
+    """Insert *sentence* after every *interval* words of each line of *text*.
+    Lines containing Chinese characters are segmented with jieba and re-joined
+    without a separator; all other lines are split on whitespace and re-joined
+    with single spaces. The word counter restarts on every line.
+    """
+    rebuilt_lines = []
+    for raw_line in text.split('\n'):
+        if contains_chinese(raw_line):
+            tokens, joiner = list(jieba.cut(raw_line)), ''
+        else:
+            tokens, joiner = raw_line.split(), ' '
+        pieces = []
+        for position, token in enumerate(tokens, start=1):
+            pieces.append(token)
+            # Every interval-th token is followed by the inserted sentence.
+            if position % interval == 0:
+                pieces.append(sentence)
+        rebuilt_lines.append(joiner.join(pieces))
+    return '\n'.join(rebuilt_lines)
+# Reviewer: builds a ChatGPT review for one uploaded paper PDF
+class Reviewer:
+    """Generate a review of a PDF paper through the OpenAI chat completion API."""
+    # Number of consecutive pages sent to the model, counted from the start page.
+    _PAGES_TO_EXTRACT = 3
+    def __init__(self, api, review_format, paper_pdf, language):
+        """Store the request settings.
+        Args:
+            api: OpenAI API key; assigned to ``openai.api_key`` before each request.
+            review_format: Review requirements/format text appended to the system prompt.
+            paper_pdf: Raw PDF bytes (wrapped in ``BytesIO`` when parsed).
+            language: Output language named in the system prompt.
+        """
+        self.api = api
+        self.review_format = review_format
+        self.language = language
+        self.paper_pdf = paper_pdf
+        # Token budget used when deciding how much of the paper text to send.
+        self.max_token_num = 4097
+        self.encoding = tiktoken.get_encoding("gpt2")
+    def review_by_chatgpt(self, paper_list):
+        """Extract text from the stored PDF and return ``(review_text, total_tokens)``.
+        ``paper_list`` is accepted for interface compatibility but is not read;
+        the PDF passed to the constructor is the one that gets reviewed.
+        """
+        text = self.extract_chapter(self.paper_pdf)
+        chat_review_text, total_token_used = self.chat_review(text=text)
+        return chat_review_text, total_token_used
+    @tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
+                    stop=tenacity.stop_after_attempt(5),
+                    reraise=True)
+    def chat_review(self, text):
+        """Send *text* to gpt-3.5-turbo and return ``(review_text, total_tokens)``.
+        The call is retried up to 5 times with exponential back-off; the last
+        exception is re-raised if every attempt fails.
+        """
+        openai.api_key = self.api   # use the caller-supplied API key
+        # Tokens reserved for the prompt; the rest of the budget goes to paper text.
+        review_prompt_token = 1000
+        text_token = len(self.encoding.encode(text))
+        # Scale the character cut-off by the characters-per-token ratio of this text
+        # (the +1 avoids division by zero for empty text).
+        input_text_index = int(len(text)*(self.max_token_num-review_prompt_token)/(text_token+1))
+        input_text = "This is the paper for your review:" + text[:input_text_index]
+        messages=[
+                {"role": "system", "content": "You are a professional reviewer. Now I will give you a paper. You need to give a complete review opinion according to the following requirements and format:"+ self.review_format +" Must be output in {}.".format(self.language)},
+                {"role": "user", "content": input_text},
+            ]
+        response = openai.ChatCompletion.create(
+            model="gpt-3.5-turbo",
+            messages=messages,
+        )
+        result = ''
+        for choice in response.choices:
+            result += choice.message.content
+        # Watermark the review every 25 words, then append the ethics statement.
+        result = insert_sentence(result, '**Generated by ChatGPT, no copying allowed!**', 25)
+        result += "\n\n⚠伦理声明/Ethics statement：\n--禁止直接复制生成的评论用于任何论文审稿工作！\n--Direct copying of generated comments for any paper review work is prohibited!"
+        print("********"*10)
+        print(result)
+        print("********"*10)
+        print("prompt_token_used:", response.usage.prompt_tokens)
+        print("completion_token_used:", response.usage.completion_tokens)
+        print("total_token_used:", response.usage.total_tokens)
+        print("response_time:", response.response_ms/1000.0, 's')
+        return result, response.usage.total_tokens
+    def extract_chapter(self, pdf_path):
+        """Return the text of up to three consecutive pages of the PDF.
+        Extraction starts at the first page whose text contains "abstract"
+        (case-insensitive). If no page contains it, the leading pages are
+        returned instead, so the model is never sent an empty paper.
+        Args:
+            pdf_path: Raw PDF bytes (despite the name, not a filesystem path).
+        """
+        file_object = BytesIO(pdf_path)
+        pdf_reader = PyPDF2.PdfReader(file_object)
+        start_page = None
+        extracted_parts = []
+        # Text of the leading pages, kept as a fallback while searching.
+        leading_parts = []
+        for page_number, page in enumerate(pdf_reader.pages):
+            # Treat a page with no extractable text as empty.
+            page_text = page.extract_text() or ""
+            if page_number < self._PAGES_TO_EXTRACT:
+                leading_parts.append(page_text)
+            if start_page is None and 'abstract' in page_text.lower():
+                start_page = page_number
+            if start_page is not None:
+                extracted_parts.append(page_text)
+                # Stop once the start page and the two pages after it are collected.
+                if page_number - start_page >= self._PAGES_TO_EXTRACT - 1:
+                    break
+        if start_page is None:
+            return "".join(leading_parts)
+        return "".join(extracted_parts)
+def main(api, review_format, paper_pdf, language):
+    """Gradio callback: review the uploaded PDF and report token/time usage.
+    Args:
+        api: OpenAI API key entered by the user.
+        review_format: Review requirements/format text.
+        paper_pdf: Uploaded PDF content.
+        language: Output language for the review.
+    Returns:
+        A ``(review_text, usage_summary)`` pair, one value per output textbox.
+        When a required input is missing, the first element is the
+        "please complete the form" message and the second is empty.
+    """
+    start_time = time.time()
+    if not api or not review_format or not paper_pdf:
+        # The interface declares two output components, so two values must be
+        # returned on this path as well; a bare string makes Gradio fail.
+        return "请输入完整内容！", ""
+    # Build the reviewer and run the review on the uploaded PDF.
+    reviewer1 = Reviewer(api, review_format, paper_pdf, language)
+    comments, total_token_used = reviewer1.review_by_chatgpt(paper_list=paper_pdf)
+    time_used = time.time() - start_time
+    output2 = "使用token数：" + str(total_token_used) + "\n花费时间：" + str(round(time_used, 2)) + "秒"
+    return comments, output2
+########################################################################################################
+# Page title passed to the Gradio interface
+title = "🤖ChatReviewer🤖"
+# Page description (HTML/Markdown) passed to the Gradio interface
+description = '''<div align='left'>
+<strong>ChatReviewer是一款基于ChatGPT-3.5的API开发的智能论文分析与建议助手。</strong>其用途如下：
+⭐️对论文的优缺点进行快速总结和分析，提高科研人员的文献阅读和理解的效率，紧跟研究前沿。
+⭐️对自己的论文进行分析，根据ChatReviewer生成的改进建议进行查漏补缺，进一步提高自己的论文质量。
+如果觉得很卡，可以点击右上角的Duplicate this Space，把ChatReviewer复制到你自己的Space中！（🈲：禁止直接复制生成的评论用于任何论文审稿工作！）
+本项目的[Github](https://github.com/nishiwen1214/ChatReviewer)，欢迎Star和Fork，也欢迎大佬赞助让本项目快速成长！💗（[获取Api Key](https://chatgpt.cn.obiscr.com/blog/posts/2023/How-to-get-api-key/)）
+</div>
+'''
+# Build the Gradio inputs, in the order main() receives them:
+# API key, review format, PDF bytes, output language.
+inp = [gradio.inputs.Textbox(label="请输入你的API-key(sk开头的字符串)",
+                          default="",
+                          type='password'),
+    # Review requirements/format; the default below is a complete review template.
+    gradio.inputs.Textbox(lines=5,
+        label="请输入特定的分析要求和格式(否则为默认格式)",
+        default="""* Overall Review
+Please briefly summarize the main points and contributions of this paper.
+xxx
+* Paper Strength
+Please provide a list of the strengths of this paper, including but not limited to: innovative and practical methodology, insightful empirical findings or in-depth theoretical analysis,
+well-structured review of relevant literature, and any other factors that may make the paper valuable to readers. (Maximum length: 2,000 characters)
+(1) xxx
+(2) xxx
+(3) xxx
+* Paper Weakness
+Please provide a numbered list of your main concerns regarding this paper (so authors could respond to the concerns individually).
+These may include, but are not limited to: inadequate implementation details for reproducing the study, limited evaluation and ablation studies for the proposed method,
+correctness of the theoretical analysis or experimental results, lack of comparisons or discussions with widely-known baselines in the field, lack of clarity in exposition,
+or any other factors that may impede the reader's understanding or benefit from the paper. Please kindly refrain from providing a general assessment of the paper's novelty without providing detailed explanations. (Maximum length: 2,000 characters)
+(1) xxx
+(2) xxx
+(3) xxx
+* Questions To Authors And Suggestions For Rebuttal
+Please provide a numbered list of specific and clear questions that pertain to the details of the proposed method, evaluation setting, or additional results that would aid in supporting the authors' claims.
+The questions should be formulated in a manner that, after the authors have answered them during the rebuttal, it would enable a more thorough assessment of the paper's quality. (Maximum length: 2,000 characters)
+*Overall score (1-10)
+The paper is scored on a scale of 1-10, with 10 being the full mark, and 6 stands for borderline accept. Then give the reason for your rating.
+xxx"""
+    ),
+    # The uploaded PDF is delivered as raw bytes (type="bytes").
+    gradio.inputs.File(label="请上传论文PDF(必填)",type="bytes"),
+    gradio.inputs.Radio(choices=["English", "Chinese"],
+                        default="English",
+                        label="选择输出语言"),
+]
+# Wire main() to the inputs above and to two output textboxes
+# (the review text and the token/time statistics).
+chat_reviewer_gui = gradio.Interface(fn=main,
+                                 inputs=inp,
+                                 outputs = [gradio.Textbox(lines=25, label="分析结果"), gradio.Textbox(lines=2, label="资源统计")],
+                                 title=title,
+                                 description=description)
+# Start server
+chat_reviewer_gui .launch(quiet=True, show_api=False)

get_paper_from_pdf.py ADDED Viewed

	@@ -0,0 +1,193 @@

+import fitz, io, os
+from PIL import Image
+from collections import Counter
+import json
+import re
+class Paper:
+    """Title, section names and section texts extracted from a paper PDF with PyMuPDF."""
+    def __init__(self, path, title='', url='', abs='', authors=None):
+        """Create a Paper; the PDF is parsed immediately when no title is given.
+        Args:
+            path: Path of the PDF file.
+            title: Known title; when empty, the title is detected from the PDF
+                and the sections are parsed.
+            url: Link to the paper.
+            abs: Abstract text supplied by the caller.
+            authors: List of authors; defaults to a fresh empty list.
+        """
+        self.url = url  # paper link
+        self.path = path  # PDF path
+        self.section_names = []  # section headings
+        self.section_texts = {}  # heading -> section text
+        self.abs = abs
+        self.title_page = 0
+        # A fresh list per instance; a shared mutable default would leak
+        # authors between Paper objects.
+        self.authors = [] if authors is None else authors
+        self.roman_num = ["I", "II", 'III', "IV", "V", "VI", "VII", "VIII", "IIX", "IX", "X"]
+        self.digit_num = [str(d + 1) for d in range(10)]
+        self.first_image = ''
+        if title == '':
+            self.pdf = fitz.open(self.path)  # PDF document
+            try:
+                self.title = self.get_title()
+            finally:
+                # parse_pdf() opens its own handle, so release this one.
+                self.pdf.close()
+            self.parse_pdf()
+        else:
+            self.title = title
+    def parse_pdf(self):
+        """Read all page text and fill ``section_names`` / ``section_texts``."""
+        self.pdf = fitz.open(self.path)  # PDF document
+        try:
+            self.text_list = [page.get_text() for page in self.pdf]
+            self.all_text = ' '.join(self.text_list)
+            self.extract_section_infomation()
+            self.section_texts.update({"title": self.title})
+        finally:
+            self.pdf.close()
+    def get_chapter_names(self, ):
+        """Return lines that look like numbered chapter headings.
+        A line qualifies when it has 2-4 space-separated parts, 2-4
+        dot-separated parts, and the text before the first dot is one of
+        ``roman_num`` or ``digit_num``.
+        """
+        doc = fitz.open(self.path)  # PDF document
+        try:
+            all_text = ''.join(page.get_text() for page in doc)
+        finally:
+            doc.close()
+        chapter_names = []
+        for line in all_text.split('\n'):
+            if '.' in line:
+                point_split_list = line.split('.')
+                space_split_list = line.split(' ')
+                if 1 < len(space_split_list) < 5:
+                    if 1 < len(point_split_list) < 5 and (
+                            point_split_list[0] in self.roman_num or point_split_list[0] in self.digit_num):
+                        chapter_names.append(line)
+        return chapter_names
+    @staticmethod
+    def _iter_leading_spans(doc):
+        """Yield ``(page_index, span)`` for the first span of the first line of each text block."""
+        for page_index, page in enumerate(doc):
+            for block in page.get_text("dict")["blocks"]:
+                if block["type"] == 0 and len(block['lines']):  # text block with lines
+                    if len(block["lines"][0]["spans"]):
+                        yield page_index, block["lines"][0]["spans"][0]
+    def get_title(self):
+        """Build the title from the blocks set in the two largest leading font sizes.
+        Reads ``self.pdf`` (which must be open) and records the page of the
+        last matching block in ``self.title_page``.
+        """
+        doc = self.pdf
+        # Seeded with 0 so that index -2 exists as soon as one span is found.
+        font_sizes = [0]
+        font_sizes.extend(span["size"] for _, span in self._iter_leading_spans(doc))
+        font_sizes.sort()
+        cur_title = ''
+        for page_index, span in self._iter_leading_spans(doc):
+            cur_string = span["text"]
+            font_size = span["size"]
+            # Accept spans within 0.3pt of the largest or second-largest size.
+            if abs(font_size - font_sizes[-1]) < 0.3 or abs(font_size - font_sizes[-2]) < 0.3:
+                # Skip very short strings and the arXiv stamp.
+                if len(cur_string) > 4 and "arXiv" not in cur_string:
+                    if cur_title == '':
+                        cur_title += cur_string
+                    else:
+                        cur_title += ' ' + cur_string
+                    self.title_page = page_index
+        title = cur_title.replace('\n', ' ')
+        return title
+    def extract_section_infomation(self):
+        """Open the PDF, extract headings and section texts, and close the PDF."""
+        doc = fitz.open(self.path)
+        try:
+            self._extract_sections(doc)
+        finally:
+            doc.close()
+    def _extract_sections(self, doc):
+        """Fill ``section_names`` and ``section_texts`` from an open document.
+        Scanning starts at the first block mentioning "Abstract" and stops at
+        a heading containing "References".
+        """
+        # Collect every span font size in the document.
+        font_sizes = []
+        for page in doc:
+            blocks = page.get_text("dict")["blocks"]
+            for block in blocks:
+                if 'lines' not in block:
+                    continue
+                lines = block["lines"]
+                for line in lines:
+                    for span in line["spans"]:
+                        font_sizes.append(span["size"])
+        if not font_sizes:
+            # No text spans at all (e.g. an image-only PDF): nothing to extract.
+            self.section_names = []
+            self.section_texts = {}
+            return
+        most_common_size, _ = Counter(font_sizes).most_common(1)[0]
+        # Spans larger than the most frequent (body) size are heading candidates.
+        threshold = most_common_size * 1
+        section_dict = {}
+        last_heading = None
+        subheadings = []
+        heading_font = -1
+        # Walk every page looking for headings.
+        found_abstract = False
+        upper_heading = False
+        font_heading = False
+        for page in doc:
+            blocks = page.get_text("dict")["blocks"]
+            for block in blocks:
+                if not found_abstract:
+                    try:
+                        text = json.dumps(block)
+                    except (TypeError, ValueError):
+                        # Blocks that cannot be serialised (e.g. ones carrying
+                        # raw bytes) cannot hold the abstract marker.
+                        continue
+                    if re.search(r"\bAbstract\b", text, re.IGNORECASE):
+                        found_abstract = True
+                        last_heading = "Abstract"
+                        section_dict["Abstract"] = ""
+                if found_abstract:
+                    if 'lines' not in block:
+                        continue
+                    lines = block["lines"]
+                    for line in lines:
+                        for span in line["spans"]:
+                            # All-caps heading: for papers whose headings use the body font size.
+                            if not font_heading and span["text"].isupper() and sum(1 for c in span["text"] if c.isupper() and ('A' <= c <='Z')) > 4:
+                                upper_heading = True
+                                heading = span["text"].strip()
+                                if "References" in heading:  # ignore everything from the references on
+                                    self.section_names = subheadings
+                                    self.section_texts = section_dict
+                                    return
+                                subheadings.append(heading)
+                                if last_heading is not None:
+                                    section_dict[last_heading] = section_dict[last_heading].strip()
+                                section_dict[heading] = ""
+                                last_heading = heading
+                            # Normal case: detect headings by font size.
+                            if not upper_heading and span["size"] > threshold and re.match(
+                                    r"[A-Z][a-z]+(?:\s[A-Z][a-z]+)*",
+                                    span["text"].strip()):
+                                font_heading = True
+                                # The first heading fixes the heading font size;
+                                # larger spans of any other size are skipped.
+                                if heading_font == -1:
+                                    heading_font = span["size"]
+                                elif heading_font != span["size"]:
+                                    continue
+                                heading = span["text"].strip()
+                                if "References" in heading:  # ignore everything from the references on
+                                    self.section_names = subheadings
+                                    self.section_texts = section_dict
+                                    return
+                                subheadings.append(heading)
+                                if last_heading is not None:
+                                    section_dict[last_heading] = section_dict[last_heading].strip()
+                                section_dict[heading] = ""
+                                last_heading = heading
+                            # Otherwise append the span text to the current section.
+                            elif last_heading is not None:
+                                section_dict[last_heading] += " " + span["text"].strip()
+        self.section_names = subheadings
+        self.section_texts = section_dict
+def main():
+    """Run the parser once on ``demo.pdf`` in the working directory."""
+    path = r'demo.pdf'
+    # The constructor already parses the PDF when no title is supplied,
+    # so a second explicit parse_pdf() call would only repeat the work.
+    Paper(path=path)
+if __name__ == '__main__':
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+PyMuPDF==1.21.1
+jieba
+tiktoken==0.2.0
+tenacity==8.2.2
+pybase64==1.2.3
+Pillow==9.4.0
+openai==0.27.0
+markdown
+gradio==3.20.1
+PyPDF2