新增谷歌学术统合小助手
Browse files- crazy_functional.py +6 -0
- crazy_functions/谷歌检索小助手.py +106 -0
- request_llm/bridge_chatgpt.py +4 -1
crazy_functional.py
CHANGED
@@ -72,6 +72,7 @@ def get_crazy_functions():
|
|
72 |
from crazy_functions.批量总结PDF文档pdfminer import 批量总结PDF文档pdfminer
|
73 |
from crazy_functions.总结word文档 import 总结word文档
|
74 |
from crazy_functions.批量翻译PDF文档_多线程 import 批量翻译PDF文档
|
|
|
75 |
|
76 |
function_plugins.update({
|
77 |
"批量翻译PDF文档(多线程)": {
|
@@ -90,6 +91,11 @@ def get_crazy_functions():
|
|
90 |
"AsButton": False, # 加入下拉菜单中
|
91 |
"Function": HotReload(批量总结PDF文档pdfminer)
|
92 |
},
|
|
|
|
|
|
|
|
|
|
|
93 |
"批量总结Word文档": {
|
94 |
"Color": "stop",
|
95 |
"Function": HotReload(总结word文档)
|
|
|
72 |
from crazy_functions.批量总结PDF文档pdfminer import 批量总结PDF文档pdfminer
|
73 |
from crazy_functions.总结word文档 import 总结word文档
|
74 |
from crazy_functions.批量翻译PDF文档_多线程 import 批量翻译PDF文档
|
75 |
+
from crazy_functions.谷歌检索小助手 import 谷歌检索小助手
|
76 |
|
77 |
function_plugins.update({
|
78 |
"批量翻译PDF文档(多线程)": {
|
|
|
91 |
"AsButton": False, # 加入下拉菜单中
|
92 |
"Function": HotReload(批量总结PDF文档pdfminer)
|
93 |
},
|
94 |
+
"谷歌学术检索助手(输入谷歌学术搜索页url)": {
|
95 |
+
"Color": "stop",
|
96 |
+
"AsButton": False, # 加入下拉菜单中
|
97 |
+
"Function": HotReload(谷歌检索小助手)
|
98 |
+
},
|
99 |
"批量总结Word文档": {
|
100 |
"Color": "stop",
|
101 |
"Function": HotReload(总结word文档)
|
crazy_functions/谷歌检索小助手.py
ADDED
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
|
2 |
+
from toolbox import CatchException, report_execption, write_results_to_file
|
3 |
+
|
4 |
+
def get_meta_information(url, chatbot, history):
    """Scrape a Google Scholar result page and collect metadata per paper.

    This is a generator. After each paper is processed it overwrites the last
    chatbot entry with that paper's title and abstract and yields
    ``(chatbot, [], "正常")`` so the UI can refresh. The collected list is
    delivered as the generator's return value, so callers should obtain it
    with ``yield from``.

    Args:
        url: Address of the Google Scholar search-result page to fetch.
        chatbot: List of ``[question, answer]`` pairs; must be non-empty,
            because its last entry is rewritten in place as progress is made.
        history: Accepted for interface compatibility; not used here.

    Returns:
        A list of dicts with the keys ``title``, ``author``, ``citation``,
        ``abstract`` and ``is_paper_in_arxiv``.
    """
    import requests
    import arxiv
    import difflib
    from bs4 import BeautifulSoup
    from toolbox import get_conf
    proxies, = get_conf('proxies')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36',
    }
    # Send the GET request. The timeout keeps a stalled proxy or server from
    # blocking the plugin forever.
    response = requests.get(url, proxies=proxies, headers=headers, timeout=30)

    # Parse the page content.
    soup = BeautifulSoup(response.text, "html.parser")

    def string_similar(s1, s2):
        # Cheap upper-bound similarity ratio between two strings.
        return difflib.SequenceMatcher(None, s1, s2).quick_ratio()

    def text_of(node, default=''):
        # select_one() returns None when nothing matches; avoid AttributeError.
        return node.text if node is not None else default

    profile = []
    # Walk over every search hit and extract title, authors, citations, abstract.
    for result in soup.select(".gs_ri"):
        if result.a is None:
            # Entry without a title link: nothing to look up, skip it.
            continue
        # Collapse newlines and runs of whitespace into single spaces.
        title = ' '.join(result.a.text.split())
        author = text_of(result.select_one(".gs_a"))
        # The citation count is the text of the "cites" link, when present.
        citation = text_of(
            result.select_one(".gs_fl > a[href*='cites']"), 'cited by 0')
        # The snippet lives in .gs_rs; strip surrounding whitespace.
        abstract = text_of(result.select_one(".gs_rs")).strip()

        search = arxiv.Search(
            query=title,
            max_results=1,
            sort_by=arxiv.SortCriterion.Relevance,
        )
        # An empty result set must not raise StopIteration: inside a generator
        # that would surface as RuntimeError (PEP 479) and abort the plugin.
        paper = next(search.results(), None)
        is_paper_in_arxiv = (
            paper is not None and string_similar(title, paper.title) > 0.90
        )
        if is_paper_in_arxiv:
            # Same paper: prefer the full arXiv abstract over the Scholar snippet.
            abstract = paper.summary.replace('\n', ' ')

        print(title)
        print(author)
        print(citation)
        profile.append({
            'title': title,
            'author': author,
            'citation': citation,
            'abstract': abstract,
            'is_paper_in_arxiv': is_paper_in_arxiv,
        })

        # Show progress for the paper just processed.
        chatbot[-1] = [chatbot[-1][0], title + f'\n\n是否在arxiv中(不在arxiv中无法获取完整摘要):{is_paper_in_arxiv}\n\n' + abstract]
        msg = "正常"
        yield chatbot, [], msg
    return profile
|
61 |
+
|
62 |
+
@CatchException
def 谷歌检索小助手(txt, top_p, temperature, chatbot, history, systemPromptTxt, WEB_PORT):
    """Plugin entry point: summarise every paper on a Google Scholar result page.

    Generator that yields ``(chatbot, history, msg)`` tuples to refresh the UI.

    Args:
        txt: URL of the Google Scholar search page (forwarded to
            ``get_meta_information``).
        top_p: Sampling parameter forwarded to the model request.
        temperature: Sampling parameter forwarded to the model request.
        chatbot: Conversation display list; appended to / updated in place.
        history: Incoming history; discarded and rebuilt from the model replies.
        systemPromptTxt: Unused by this plugin.
        WEB_PORT: Unused by this plugin.
    """
    # Basic information: what the plugin does and who contributed it.
    chatbot.append([
        "函数插件功能?",
        "分析用户提供的谷歌学术(google scholar)搜索页面中,出现的所有文章: binary-husky,插件初始化中..."])
    yield chatbot, history, '正常'

    # Probe the optional dependencies; if missing, tell the user how to install.
    try:
        import arxiv
        from bs4 import BeautifulSoup
    except ImportError:
        report_execption(chatbot, history,
            a = f"解析项目: {txt}",
            b = "导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade beautifulsoup4 arxiv```。")
        yield chatbot, history, '正常'
        return

    # Clear the history so the model input does not overflow.
    history = []

    meta_paper_info_list = yield from get_meta_information(txt, chatbot, history)

    # Feed the papers to the model in batches so that pages with more than
    # one batch worth of results are fully processed instead of truncated.
    batch_size = 10
    inputs_show_user = f"请分析此页面中出现的所有文章:{txt}"
    batch_starts = range(0, len(meta_paper_info_list), batch_size)
    for batch_no, start in enumerate(batch_starts, start=1):
        batch = meta_paper_info_list[start:start + batch_size]
        i_say = (
            "下面是一些学术文献的数据,请从中提取出以下内容。"
            "1、英文题目;2、中文题目翻译;3、作者;4、arxiv公开(is_paper_in_arxiv);5、引用数量(citation);6、中文摘要翻译。"
            f"以下是信息源:{batch}"
        )
        gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
            inputs=i_say, inputs_show_user=inputs_show_user,
            top_p=top_p, temperature=temperature, chatbot=chatbot, history=[],
            sys_prompt="你是一个学术翻译,请从数据中提取信息。你必须使用Markdown格式。你必须逐个文献进行处理。"
        )
        # Label each reply with its batch number so the saved report is ordered.
        history.extend([f"第{batch_no}批", gpt_say])

    chatbot.append(["状态?", "已经全部完成"])
    msg = '正常'
    yield chatbot, history, msg
    # Persist the collected replies; whatever the helper returns is shown to the user.
    res = write_results_to_file(history)
    chatbot.append(("完成了吗?", res))
    yield chatbot, history, msg
|
request_llm/bridge_chatgpt.py
CHANGED
@@ -104,7 +104,10 @@ def predict_no_ui_long_connection(inputs, top_p, temperature, history=[], sys_pr
|
|
104 |
result = ''
|
105 |
while True:
|
106 |
try: chunk = next(stream_response).decode()
|
107 |
-
except StopIteration:
|
|
|
|
|
|
|
108 |
if len(chunk)==0: continue
|
109 |
if not chunk.startswith('data:'):
|
110 |
error_msg = get_full_error(chunk.encode('utf8'), stream_response).decode()
|
|
|
104 |
result = ''
|
105 |
while True:
|
106 |
try: chunk = next(stream_response).decode()
|
107 |
+
except StopIteration:
|
108 |
+
break
|
109 |
+
except requests.exceptions.ConnectionError:
|
110 |
+
chunk = next(stream_response).decode() # 失败了,重试一次?再失败就没办法了。
|
111 |
if len(chunk)==0: continue
|
112 |
if not chunk.startswith('data:'):
|
113 |
error_msg = get_full_error(chunk.encode('utf8'), stream_response).decode()
|