gpt-academic2

Runtime error

App Files Community

qingxu99 committed on Apr 13, 2023

Commit

8ac9b45

•

1 Parent(s): 613be55

改善chatpdf的功能

Browse files

Files changed (5) hide show

crazy_functional.py +0 -7
crazy_functions/crazy_utils.py +168 -0
crazy_functions/批量翻译PDF文档_多线程.py +1 -166
crazy_functions/理解PDF文档内容.py +56 -132
version +2 -2

crazy_functional.py CHANGED Viewed

@@ -76,7 +76,6 @@ def get_crazy_functions():
     from crazy_functions.总结word文档 import 总结word文档
     from crazy_functions.批量翻译PDF文档_多线程 import 批量翻译PDF文档
     from crazy_functions.谷歌检索小助手 import 谷歌检索小助手
-    from crazy_functions.理解PDF文档内容 import 理解PDF文档内容
     from crazy_functions.理解PDF文档内容 import 理解PDF文档内容标准文件输入
     from crazy_functions.Latex全文润色 import Latex中文润色
     from crazy_functions.Latex全文翻译 import Latex中译英
@@ -108,11 +107,6 @@ def get_crazy_functions():
             "Color": "stop",
             "Function": HotReload(总结word文档)
         },
-        # "[测试功能] 理解PDF文档内容（Tk文件选择接口，仅本地）": {
-        #     # HotReload 的意思是热更新，修改函数插件代码后，不需要重启程序，代码直接生效
-        #     "AsButton": False,  # 加入下拉菜单中
-        #     "Function": HotReload(理解PDF文档内容)
-        # },
         "[测试功能] 理解PDF文档内容（通用接口，读取文件输入区）": {
             # HotReload 的意思是热更新，修改函数插件代码后，不需要重启程序，代码直接生效
             "Color": "stop",
@@ -131,7 +125,6 @@ def get_crazy_functions():
             "AsButton": False,  # 加入下拉菜单中
             "Function": HotReload(Latex中文润色)
         },
         "[测试功能] Latex项目全文中译英（输入路径或上传压缩包）": {
             # HotReload 的意思是热更新，修改函数插件代码后，不需要重启程序，代码直接生效
             "Color": "stop",

     from crazy_functions.总结word文档 import 总结word文档
     from crazy_functions.批量翻译PDF文档_多线程 import 批量翻译PDF文档
     from crazy_functions.谷歌检索小助手 import 谷歌检索小助手
     from crazy_functions.理解PDF文档内容 import 理解PDF文档内容标准文件输入
     from crazy_functions.Latex全文润色 import Latex中文润色
     from crazy_functions.Latex全文翻译 import Latex中译英
             "Color": "stop",
             "Function": HotReload(总结word文档)
         },
         "[测试功能] 理解PDF文档内容（通用接口，读取文件输入区）": {
             # HotReload 的意思是热更新，修改函数插件代码后，不需要重启程序，代码直接生效
             "Color": "stop",
             "AsButton": False,  # 加入下拉菜单中
             "Function": HotReload(Latex中文润色)
         },
         "[测试功能] Latex项目全文中译英（输入路径或上传压缩包）": {
             # HotReload 的意思是热更新，修改函数插件代码后，不需要重启程序，代码直接生效
             "Color": "stop",

crazy_functions/crazy_utils.py CHANGED Viewed

@@ -360,3 +360,171 @@ def breakdown_txt_to_satisfy_token_limit_for_pdf(txt, get_token_fn, limit):
             # 这个中文的句号是故意的，作为一个标识而存在
             res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False)
             return [r.replace('。\n', '.') for r in res]

             # 这个中文的句号是故意的，作为一个标识而存在
             res = cut(txt.replace('.', '。\n'), must_break_at_empty_line=False)
             return [r.replace('。\n', '.') for r in res]
+def read_and_clean_pdf_text(fp):
+    """
+    这个函数用于分割pdf，用了很多trick，逻辑较乱，效果奇好
+    **输入参数说明**
+    - `fp`：需要读取和清理文本的pdf文件路径
+    **输出参数说明**
+    - `meta_txt`：清理后的文本内容字符串
+    - `page_one_meta`：第一页清理后的文本内容列表
+    **函数功能**
+    读取pdf文件并清理其中的文本内容，清理规则包括：
+    - 提取所有块元的文本信息，并合并为一个字符串
+    - 去除短块（字符数小于100）并替换为回车符
+    - 清理多余的空行
+    - 合并小写字母开头的段落块并替换为空格
+    - 清除重复的换行
+    - 将每个换行符替换为两个换行符，使每个段落之间有两个换行符分隔
+    """
+    import fitz, copy
+    import re
+    import numpy as np
+    from colorful import print亮黄, print亮绿
+    fc = 0
+    fs = 1
+    fb = 2
+    REMOVE_FOOT_NOTE = True
+    REMOVE_FOOT_FFSIZE_PERCENT = 0.95
+    def primary_ffsize(l):
+        fsize_statiscs = {}
+        for wtf in l['spans']:
+            if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0
+            fsize_statiscs[wtf['size']] += len(wtf['text'])
+        return max(fsize_statiscs, key=fsize_statiscs.get)
+    def ffsize_same(a,b):
+        return abs((a-b)/max(a,b)) < 0.02
+    # file_content = ""
+    with fitz.open(fp) as doc:
+        meta_txt = []
+        meta_font = []
+        meta_line = []
+        meta_span = []
+        for index, page in enumerate(doc):
+            # file_content += page.get_text()
+            text_areas = page.get_text("dict")  # 获取页面上的文本信息
+            for t in text_areas['blocks']:
+                if 'lines' in t:
+                    pf = 998
+                    for l in t['lines']:
+                        txt_line = "".join([wtf['text'] for wtf in l['spans']])
+                        pf = primary_ffsize(l)
+                        meta_line.append([txt_line, pf, l['bbox'], l])
+                        for wtf in l['spans']: # for l in t['lines']:
+                            meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])])
+                    # meta_line.append(["NEW_BLOCK", pf])
+            # 块元提取                           for each word segment with in line                       for each line         cross-line words                          for each block
+            meta_txt.extend([" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
+                '- ', '') for t in text_areas['blocks'] if 'lines' in t])
+            meta_font.extend([np.mean([np.mean([wtf['size'] for wtf in l['spans']])
+                             for l in t['lines']]) for t in text_areas['blocks'] if 'lines' in t])
+            if index == 0:
+                page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
+                    '- ', '') for t in text_areas['blocks'] if 'lines' in t]
+        # 获取正文主字体
+        fsize_statiscs = {}
+        for span in meta_span:
+            if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
+            fsize_statiscs[span[1]] += span[2]
+        main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
+        if REMOVE_FOOT_NOTE:
+            give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
+        # 切分和重新整合
+        mega_sec = []
+        sec = []
+        for index, line in enumerate(meta_line):
+            if index == 0:
+                sec.append(line[fc])
+                continue
+            if REMOVE_FOOT_NOTE:
+                if meta_line[index][fs] <= give_up_fize_threshold:
+                    continue
+            if ffsize_same(meta_line[index][fs], meta_line[index-1][fs]):
+                # 尝试识别段落
+                if meta_line[index][fc].endswith('.') and\
+                    (meta_line[index-1][fc] != 'NEW_BLOCK') and \
+                    (meta_line[index][fb][2] - meta_line[index][fb][0]) < (meta_line[index-1][fb][2] - meta_line[index-1][fb][0]) * 0.7:
+                    sec[-1] += line[fc]
+                    sec[-1] += "\n\n"
+                else:
+                    sec[-1] += " "
+                    sec[-1] += line[fc]
+            else:
+                if (index+1 < len(meta_line)) and \
+                    meta_line[index][fs] > main_fsize:
+                    # 单行 + 字体大
+                    mega_sec.append(copy.deepcopy(sec))
+                    sec = []
+                    sec.append("# " + line[fc])
+                else:
+                    # 尝试识别section
+                    if meta_line[index-1][fs] > meta_line[index][fs]:
+                        sec.append("\n" + line[fc])
+                    else:
+                        sec.append(line[fc])
+        mega_sec.append(copy.deepcopy(sec))
+        finals = []
+        for ms in mega_sec:
+            final = " ".join(ms)
+            final = final.replace('- ', ' ')
+            finals.append(final)
+        meta_txt = finals
+        def 把字符太少的块清除为回车(meta_txt):
+            for index, block_txt in enumerate(meta_txt):
+                if len(block_txt) < 100:
+                    meta_txt[index] = '\n'
+            return meta_txt
+        meta_txt = 把字符太少的块清除为回车(meta_txt)
+        def 清理多余的空行(meta_txt):
+            for index in reversed(range(1, len(meta_txt))):
+                if meta_txt[index] == '\n' and meta_txt[index-1] == '\n':
+                    meta_txt.pop(index)
+            return meta_txt
+        meta_txt = 清理多余的空行(meta_txt)
+        def 合并小写开头的段落块(meta_txt):
+            def starts_with_lowercase_word(s):
+                pattern = r"^[a-z]+"
+                match = re.match(pattern, s)
+                if match:
+                    return True
+                else:
+                    return False
+            for _ in range(100):
+                for index, block_txt in enumerate(meta_txt):
+                    if starts_with_lowercase_word(block_txt):
+                        if meta_txt[index-1] != '\n':
+                            meta_txt[index-1] += ' '
+                        else:
+                            meta_txt[index-1] = ''
+                        meta_txt[index-1] += meta_txt[index]
+                        meta_txt[index] = '\n'
+            return meta_txt
+        meta_txt = 合并小写开头的段落块(meta_txt)
+        meta_txt = 清理多余的空行(meta_txt)
+        meta_txt = '\n'.join(meta_txt)
+        # 清除重复的换行
+        for _ in range(5):
+            meta_txt = meta_txt.replace('\n\n', '\n')
+        # 换行 -> 双换行
+        meta_txt = meta_txt.replace('\n', '\n\n')
+        for f in finals:
+            print亮黄(f)
+            print亮绿('***************************')
+    return meta_txt, page_one_meta

crazy_functions/批量翻译PDF文档_多线程.py CHANGED Viewed

@@ -2,174 +2,9 @@ from toolbox import CatchException, report_execption, write_results_to_file
 from toolbox import update_ui
 from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
 from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
 from colorful import *
-def read_and_clean_pdf_text(fp):
-    """
-    这个函数用于分割pdf，用了很多trick，逻辑较乱，效果奇好，不建议任何人去读这个函数
-    **输入参数说明**
-    - `fp`：需要读取和清理文本的pdf文件路径
-    **输出参数说明**
-    - `meta_txt`：清理后的文本内容字符串
-    - `page_one_meta`：第一页清理后的文本内容列表
-    **函数功能**
-    读取pdf文件并清理其中的文本内容，清理规则包括：
-    - 提取所有块元的文本信息，并合并为一个字符串
-    - 去除短块（字符数小于100）并替换为回车符
-    - 清理多余的空行
-    - 合并小写字母开头的段落块并替换为空格
-    - 清除重复的换行
-    - 将每个换行符替换为两个换行符，使每个段落之间有两个换行符分隔
-    """
-    import fitz, copy
-    import re
-    import numpy as np
-    fc = 0
-    fs = 1
-    fb = 2
-    REMOVE_FOOT_NOTE = True
-    REMOVE_FOOT_FFSIZE_PERCENT = 0.95
-    def primary_ffsize(l):
-        fsize_statiscs = {}
-        for wtf in l['spans']:
-            if wtf['size'] not in fsize_statiscs: fsize_statiscs[wtf['size']] = 0
-            fsize_statiscs[wtf['size']] += len(wtf['text'])
-        return max(fsize_statiscs, key=fsize_statiscs.get)
-    def ffsize_same(a,b):
-        return abs((a-b)/max(a,b)) < 0.02
-    # file_content = ""
-    with fitz.open(fp) as doc:
-        meta_txt = []
-        meta_font = []
-        meta_line = []
-        meta_span = []
-        for index, page in enumerate(doc):
-            # file_content += page.get_text()
-            text_areas = page.get_text("dict")  # 获取页面上的文本信息
-            for t in text_areas['blocks']:
-                if 'lines' in t:
-                    pf = 998
-                    for l in t['lines']:
-                        txt_line = "".join([wtf['text'] for wtf in l['spans']])
-                        pf = primary_ffsize(l)
-                        meta_line.append([txt_line, pf, l['bbox'], l])
-                        for wtf in l['spans']: # for l in t['lines']:
-                            meta_span.append([wtf['text'], wtf['size'], len(wtf['text'])])
-                    # meta_line.append(["NEW_BLOCK", pf])
-            # 块元提取                           for each word segment with in line                       for each line         cross-line words                          for each block
-            meta_txt.extend([" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
-                '- ', '') for t in text_areas['blocks'] if 'lines' in t])
-            meta_font.extend([np.mean([np.mean([wtf['size'] for wtf in l['spans']])
-                             for l in t['lines']]) for t in text_areas['blocks'] if 'lines' in t])
-            if index == 0:
-                page_one_meta = [" ".join(["".join([wtf['text'] for wtf in l['spans']]) for l in t['lines']]).replace(
-                    '- ', '') for t in text_areas['blocks'] if 'lines' in t]
-        # 获取正文主字体
-        fsize_statiscs = {}
-        for span in meta_span:
-            if span[1] not in fsize_statiscs: fsize_statiscs[span[1]] = 0
-            fsize_statiscs[span[1]] += span[2]
-        main_fsize = max(fsize_statiscs, key=fsize_statiscs.get)
-        if REMOVE_FOOT_NOTE:
-            give_up_fize_threshold = main_fsize * REMOVE_FOOT_FFSIZE_PERCENT
-        # 切分和重新整合
-        mega_sec = []
-        sec = []
-        for index, line in enumerate(meta_line):
-            if index == 0:
-                sec.append(line[fc])
-                continue
-            if REMOVE_FOOT_NOTE:
-                if meta_line[index][fs] <= give_up_fize_threshold:
-                    continue
-            if ffsize_same(meta_line[index][fs], meta_line[index-1][fs]):
-                # 尝试识别段落
-                if meta_line[index][fc].endswith('.') and\
-                    (meta_line[index-1][fc] != 'NEW_BLOCK') and \
-                    (meta_line[index][fb][2] - meta_line[index][fb][0]) < (meta_line[index-1][fb][2] - meta_line[index-1][fb][0]) * 0.7:
-                    sec[-1] += line[fc]
-                    sec[-1] += "\n\n"
-                else:
-                    sec[-1] += " "
-                    sec[-1] += line[fc]
-            else:
-                if (index+1 < len(meta_line)) and \
-                    meta_line[index][fs] > main_fsize:
-                    # 单行 + 字体大
-                    mega_sec.append(copy.deepcopy(sec))
-                    sec = []
-                    sec.append("# " + line[fc])
-                else:
-                    # 尝试识别section
-                    if meta_line[index-1][fs] > meta_line[index][fs]:
-                        sec.append("\n" + line[fc])
-                    else:
-                        sec.append(line[fc])
-        mega_sec.append(copy.deepcopy(sec))
-        finals = []
-        for ms in mega_sec:
-            final = " ".join(ms)
-            final = final.replace('- ', ' ')
-            finals.append(final)
-        meta_txt = finals
-        def 把字符太少的块清除为回车(meta_txt):
-            for index, block_txt in enumerate(meta_txt):
-                if len(block_txt) < 100:
-                    meta_txt[index] = '\n'
-            return meta_txt
-        meta_txt = 把字符太少的块清除为回车(meta_txt)
-        def 清理多余的空行(meta_txt):
-            for index in reversed(range(1, len(meta_txt))):
-                if meta_txt[index] == '\n' and meta_txt[index-1] == '\n':
-                    meta_txt.pop(index)
-            return meta_txt
-        meta_txt = 清理多余的空行(meta_txt)
-        def 合并小写开头的段落块(meta_txt):
-            def starts_with_lowercase_word(s):
-                pattern = r"^[a-z]+"
-                match = re.match(pattern, s)
-                if match:
-                    return True
-                else:
-                    return False
-            for _ in range(100):
-                for index, block_txt in enumerate(meta_txt):
-                    if starts_with_lowercase_word(block_txt):
-                        if meta_txt[index-1] != '\n':
-                            meta_txt[index-1] += ' '
-                        else:
-                            meta_txt[index-1] = ''
-                        meta_txt[index-1] += meta_txt[index]
-                        meta_txt[index] = '\n'
-            return meta_txt
-        meta_txt = 合并小写开头的段落块(meta_txt)
-        meta_txt = 清理多余的空行(meta_txt)
-        meta_txt = '\n'.join(meta_txt)
-        # 清除重复的换行
-        for _ in range(5):
-            meta_txt = meta_txt.replace('\n\n', '\n')
-        # 换行 -> 双换行
-        meta_txt = meta_txt.replace('\n', '\n\n')
-        for f in finals:
-            print亮黄(f)
-            print亮绿('***************************')
-    return meta_txt, page_one_meta
 @CatchException
 def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_prompt, web_port):
     import glob

 from toolbox import update_ui
 from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
 from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency
+from .crazy_utils import read_and_clean_pdf_text
 from colorful import *
 @CatchException
 def 批量翻译PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, sys_prompt, web_port):
     import glob

crazy_functions/理解PDF文档内容.py CHANGED Viewed

@@ -1,142 +1,66 @@
 from toolbox import update_ui
 from toolbox import CatchException, report_execption
-import re
-import unicodedata
 from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
 fast_debug = False
-def is_paragraph_break(match):
-    """
-    根据给定的匹配结果来判断换行符是否表示段落分隔。
-    如果换行符前为句子结束标志（句号，感叹号，问号），且下一个字符为大写字母，则换行符更有可能表示段落分隔。
-    也可以根据之前的内容长度来判断段落是否已经足够长。
-    """
-    prev_char, next_char = match.groups()
-    # 句子结束标志
-    sentence_endings = ".!?"
-    # 设定一个最小段落长度阈值
-    min_paragraph_length = 140
-    if prev_char in sentence_endings and next_char.isupper() and len(match.string[:match.start(1)]) > min_paragraph_length:
-        return "\n\n"
-    else:
-        return " "
-def normalize_text(text):
-    """
-    通过把连字（ligatures）等文本特殊符号转换为其基本形式来对文本进行归一化处理。
-    例如，将连字 "fi" 转换为 "f" 和 "i"。
-    """
-    # 对文本进行归一化处理，分解连字
-    normalized_text = unicodedata.normalize("NFKD", text)
-    # 替换其他特殊字符
-    cleaned_text = re.sub(r'[^\x00-\x7F]+', '', normalized_text)
-    return cleaned_text
-def clean_text(raw_text):
-    """
-    对从 PDF 提取出的原始文本进行清洗和格式化处理。
-    1. 对原始文本进行归一化处理。
-    2. 替换跨行的连词，例如 “Espe-\ncially” 转换为 “Especially”。
-    3. 根据 heuristic 规则判断换行符是否是段落分隔，并相应地进行替换。
-    """
-    # 对文本进行归一化处理
-    normalized_text = normalize_text(raw_text)
-    # 替换跨行的连词
-    text = re.sub(r'(\w+-\n\w+)', lambda m: m.group(1).replace('-\n', ''), normalized_text)
-    # 根据前后相邻字符的特点，找到原文本中的换行符
-    newlines = re.compile(r'(\S)\n(\S)')
-    # 根据 heuristic 规则，用空格或段落分隔符替换原换行符
-    final_text = re.sub(newlines, lambda m: m.group(1) + is_paragraph_break(m) + m.group(2), text)
-    return final_text.strip()
 def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
-    import time, glob, os, fitz
     print('begin analysis on:', file_name)
-    with fitz.open(file_name) as doc:
-        file_content = ""
-        for page in doc:
-            file_content += page.get_text()
-        file_content = clean_text(file_content)
-        # print(file_content)
-    split_number = 10000
-    split_group = (len(file_content)//split_number)+1
-    for i in range(0,split_group):
-        if i==0:
-            prefix = "接下来请你仔细分析下面的论文，学习里面的内容（专业术语、公式、数学概念）.并且注意：由于论文内容较多，将分批次发送，每次发送完之后，你只需要回答“接受完成”"
-            i_say = prefix + f'文件名是{file_name}，文章内容第{i+1}部分是 ```{file_content[i*split_number:(i+1)*split_number]}```'
-            i_say_show_user = f'文件名是：\n{file_name},\n由于论文内容过长，将分批请求（共{len(file_content)}字符，将分为{split_group}批，每批{split_number}字符）。\n当前发送{i+1}/{split_group}部分'
-        elif i==split_group-1:
-            i_say = f'你只需要回答“所有论文接受完成，请进行下一步”。文章内容第{i+1}/{split_group}部分是 ```{file_content[i*split_number:]}```'
-            i_say_show_user = f'当前发送{i+1}/{split_group}部分'
-        else:
-            i_say = f'你只需要回答“接受完成”。文章内容第{i+1}/{split_group}部分是 ```{file_content[i*split_number:(i+1)*split_number]}```'
-            i_say_show_user = f'当前发送{i+1}/{split_group}部分'
-        chatbot.append((i_say_show_user, "[Local Message] waiting gpt response."))
-        gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user, llm_kwargs, chatbot, history=[], sys_prompt="")   # 带超时倒计时
-        while "完成" not in gpt_say:
-            i_say = f'你只需要回答“接受完成”。文章内容第{i+1}/{split_group}部分是 ```{file_content[i*split_number:(i+1)*split_number]}```'
-            i_say_show_user = f'出现error，重新发送{i+1}/{split_group}部分'
-            gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user, llm_kwargs, chatbot, history=[], sys_prompt="")   # 带超时倒计时
-            time.sleep(1)
-        chatbot[-1] = (i_say_show_user, gpt_say)
-        history.append(i_say_show_user); history.append(gpt_say)
-        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-        time.sleep(2)
-    i_say = f'接下来，请你扮演一名专业的学术教授，利用你的所有知识并且结合这篇文章，回答我的问题。（请牢记：1.直到我说“退出”，你才能结束任务；2.所有问题需要紧密围绕文章内容;3.如果有公式，请使用tex渲染)'
-    chatbot.append((i_say, "[Local Message] waiting gpt response."))
-    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-    # ** gpt request **
-    gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say, llm_kwargs, chatbot, history=history, sys_prompt="")   # 带超时倒计时
-    chatbot[-1] = (i_say, gpt_say)
-    history.append(i_say); history.append(gpt_say)
-    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-@CatchException
-def 理解PDF文档内容(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
-    import glob, os
-    # 基本信息：功能、贡献者
-    chatbot.append([
-        "函数插件功能？",
-        "理解PDF论文内容，并且将结合上下文内容，进行学术解答。函数插件贡献者: Hanzoe。"])
-    yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-    import tkinter as tk
-    from tkinter import filedialog
-    root = tk.Tk()
-    root.withdraw()
-    txt = filedialog.askopenfilename()
-    # 尝试导入依赖，如果缺少依赖，则给出安装建议
-    try:
-        import fitz
-    except:
-        report_execption(chatbot, history,
-            a = f"解析项目: {txt}",
-            b = f"导入软件依赖失败。使用该模块需要额外依赖，安装方法```pip install --upgrade pymupdf```。")
-        yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
-        return
-    # 清空历史，以免输入溢出
-    history = []
-    # 开始正式执行任务
-    yield from 解析PDF(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
 @CatchException
@@ -146,7 +70,7 @@ def 理解PDF文档内容标准文件输入(txt, llm_kwargs, plugin_kwargs, chat
     # 基本信息：功能、贡献者
     chatbot.append([
         "函数插件功能？",
-        "理解PDF论文内容，并且将结合上下文内容，进行学术解答。函数插件贡献者: Hanzoe。"])
     yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
     # 尝试导入依赖，如果缺少依赖，则给出安装建议

 from toolbox import update_ui
 from toolbox import CatchException, report_execption
+from .crazy_utils import read_and_clean_pdf_text
 from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
 fast_debug = False
 def 解析PDF(file_name, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
+    import tiktoken
     print('begin analysis on:', file_name)
+    file_content, page_one = read_and_clean_pdf_text(file_name)
+    # 递归地切割PDF文件，每一块（尽量是完整的一个section，比如introduction，experiment等，必要时再进行切割）
+    # 的长度必须小于 2500 个 Token
+    TOKEN_LIMIT_PER_FRAGMENT = 2500
+    from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
+    from toolbox import get_conf
+    enc = tiktoken.encoding_for_model(*get_conf('LLM_MODEL'))
+    def get_token_num(txt): return len(enc.encode(txt))
+    paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
+        txt=file_content,  get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
+    page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
+        txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
+    # 为了更好的效果，我们剥离Introduction之后的部分（如果有）
+    paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]
+    ############################## <第一步，从摘要中提取高价值信息，放到history中> ##################################
+    final_results = []
+    final_results.append(paper_meta)
+    ############################## <第二步，迭代地历遍整个文章，提取精炼信息> ##################################
+    i_say_show_user = f'首先你在英文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。"           # 用户提示
+    chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[])    # 更新UI
+    iteration_results = []
+    last_iteration_result = paper_meta  # 初始值是摘要
+    MAX_WORD_TOTAL = 4096
+    n_fragment = len(paper_fragments)
+    if n_fragment >= 20: print('文章极长，不能达到预期效果')
+    for i in range(n_fragment):
+        NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment
+        i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i]}"
+        i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} words: {paper_fragments[i][:200]}"
+        gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user,  # i_say=真正给chatgpt的提问， i_say_show_user=给用户看的提问
+                                                                           llm_kwargs, chatbot,
+                                                                           history=["The main idea of the previous section is?", last_iteration_result], # 迭代上一次的结果
+                                                                           sys_prompt="Extract the main idea of this section."  # 提示
+                                                                        )
+        iteration_results.append(gpt_say)
+        last_iteration_result = gpt_say
+    ############################## <第三步，整理history> ##################################
+    final_results.extend(iteration_results)
+    final_results.append(f'接下来，你是一名专业的学术教授，利用以上信息，使用中文回答我的问题。')
+    # 接下来两句话只显示在界面上，不起实际作用
+    i_say_show_user = f'接下来，你是一名专业的学术教授，利用以上信息，使用中文回答我的问题。'; gpt_say = "[Local Message] 收到。"
+    chatbot.append([i_say_show_user, gpt_say])
+    ############################## <第四步，设置一个token上限，防止回答时Token溢出> ##################################
+    from .crazy_utils import input_clipping
+    _, final_results = input_clipping("", final_results, max_token_limit=3200)
+    yield from update_ui(chatbot=chatbot, history=final_results) # 注意这里的历史记录被替代了
 @CatchException
     # 基本信息：功能、贡献者
     chatbot.append([
         "函数插件功能？",
+        "理解PDF论文内容，并且将结合上下文内容，进行学术解答。函数插件贡献者: Hanzoe, binary-husky"])
     yield from update_ui(chatbot=chatbot, history=history) # 刷新界面
     # 尝试导入依赖，如果缺少依赖，则给出安装建议

version CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "version": 2.67,
   "show_feature": true,
-  "new_feature": "现可通过输入区更新临时api-key <-> 增强多线程稳定性（涉及代码解析、PDF翻译、自译解等） <-> 修复Token计数错误（解决PDF翻译的分割不合理的问题） <-> 如果一键更新失败，可前往github手动更新"
 }

 {
+  "version": 2.68,
   "show_feature": true,
+  "new_feature": "改善理解pdf（chatpdf）功能 <-> 如果一键更新失败，可前往github手动更新"
 }