watanabe3tipapa committed on
Commit
6b7ef92
1 Parent(s): 9bd5920

Upload 4 files

tools/__pycache__/fetch_page.cpython-311.pyc ADDED
Binary file (4.41 kB).
 
tools/__pycache__/search_ddg.cpython-311.pyc ADDED
Binary file (3.14 kB).
 
tools/fetch_page.py ADDED
@@ -0,0 +1,97 @@
+ import requests
+ import html2text
+ from readability import Document
+ from langchain.agents import Tool
+ from urllib.parse import urlparse, parse_qs, urlunparse
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+ def fetch_page(url, model_name='gpt-3.5-turbo', timeout_sec=10):
+     """Tool to fetch the content of a web page from a given URL.
+     - This returns `title`, `content`, and a `has_next` indicator. `content` is returned in Markdown format.
+     - By default, at most 1,000 tokens of content are retrieved per call (the splitter's chunk size).
+     - If more content is available on the page, the `has_next` value will be True.
+     - To read the continuation, increment the `page` query parameter on the same URL and fetch it again.
+
+     Returns
+     -------
+     Dict[str, Any]:
+     - status: int
+     - page_content: Dict[str, Any]
+         - title: str
+         - content: str
+         - has_next: bool
+     """
+     # Extract the 1-based `page` query parameter, convert it to a 0-based chunk index, and strip the query from the URL before fetching.
+     parsed_url = urlparse(url)
+     parsed_qs = parse_qs(parsed_url.query)
+     page = int(parsed_qs.get("page", [1])[0]) - 1
+     url = urlunparse(
+         (parsed_url.scheme, parsed_url.netloc, parsed_url.path, "", "", "")
+     )
+
+     try:
+         response = requests.get(url, timeout=timeout_sec)
+         response.encoding = 'utf-8'
+     except requests.exceptions.Timeout:
+         return {
+             "status": 500,
+             "page_content": {'error_message': 'Could not download page due to timeout. Please try to fetch other pages.'}
+         }
+
+     if response.status_code != 200:
+         return {
+             "status": response.status_code,
+             "page_content": {'error_message': 'Could not download page. Please try to fetch other pages.'}
+         }
+
+     try:
+         doc = Document(response.text)
+         title = doc.title()
+         html_content = doc.summary()
+         content = html2text.html2text(html_content)
+     except Exception:
+         return {
+             "status": 500,
+             "page_content": {'error_message': 'Could not parse page. Please try to fetch other pages.'}
+         }
+
+     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
+         model_name=model_name,
+         chunk_size=1000,
+         chunk_overlap=0,
+     )
+     chunks = text_splitter.split_text(content)
+     if page >= len(chunks):
+         return {
+             "status": 500,
+             "page_content": {'error_message': 'The `page` parameter looks invalid. Please try to fetch other pages.'}
+         }
+     else:
+         return {
+             "status": 200,
+             "page_content": {
+                 "title": title,
+                 "content": chunks[page],
+                 "has_next": page < len(chunks) - 1
+             }
+         }
+
+
+ def get_fetch_page_tool():
+     fetch_page_tool_description = """
+     Tool to fetch the content of a web page from a given URL.
+
+     This returns `status` and `page_content` (`title`, `content`, and a `has_next` indicator).
+     If the status is not 200, there was an error fetching the page. (Try to fetch other pages.)
+     If a status code other than 200 is returned, please don't give up and make sure to check other pages.
+
+     By default, at most 1,000 tokens of content are retrieved. If more content is available on the page, the `has_next` value will be True.
+     To read the continuation, increment the `page` query parameter on the same URL and input it again. (Paging starts at 1, so the next page is 2.)
+     e.g. https://www.obamalibrary.gov/obamas/president-barack-obama?page=2
+     """
+     return Tool(
+         name='fetch_page',
+         func=fetch_page,
+         description=fetch_page_tool_description
+     )
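
A minimal usage sketch of the paging contract above (editorial, not part of the commit): it assumes the file is importable as `tools.fetch_page` and that network access is available; the URL is the example from the tool description.

from tools.fetch_page import fetch_page

url = "https://www.obamalibrary.gov/obamas/president-barack-obama"
page = 1
while True:
    result = fetch_page(f"{url}?page={page}")  # paging starts at 1
    if result["status"] != 200:
        print(result["page_content"]["error_message"])
        break
    body = result["page_content"]
    print(f"--- chunk {page}: {body['title']} ---")
    print(body["content"])
    if not body["has_next"]:  # no further chunks on this page
        break
    page += 1

Each iteration re-downloads and re-splits the page, then returns only the requested 1,000-token chunk, trading repeated fetches for a bounded context size per tool call.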
tools/search_ddg.py ADDED
@@ -0,0 +1,55 @@
+ from itertools import islice
+ from langchain.agents import Tool
+ from duckduckgo_search import DDGS
+
+
+ def search_ddg(query, max_result_num=5):
+     """
+     Tool for performing DuckDuckGo searches.
+     - Enter the keywords you want to search for.
+     - The title, snippet (description), and URL of each page in the search results will be returned.
+
+     Sample response from the duckduckgo_search Python library
+     ----------------------------------------------------------
+     [
+         {
+             "title": "Schedule & Results | FIFA Women's World Cup Australia & New Zealand 2023 | Nadeshiko Japan | Japan National Team | JFA | Japan Football Association",
+             "href": "https://www.jfa.jp/nadeshikojapan/womensworldcup2023/schedule_result/",
+             "body": "Schedule & Results | FIFA Women's World Cup Australia & New Zealand 2023 | Nadeshiko Japan | Japan National Team | JFA | Japan Football Association. FIFA Women's World Cup. Australia & New Zealand 2023."
+         }, ...
+     ]
+
+     Returns
+     -------
+     List[Dict[str, str]]:
+     - title
+     - snippet
+     - url
+     """
+     res = DDGS().text(query, region='wt-wt', safesearch='off', backend="lite")
+     return [
+         {
+             "title": r.get('title', ""),
+             "snippet": r.get('body', ""),
+             "url": r.get('href', "")
+         }
+         for r in islice(res, max_result_num)
+     ]
+
+
+ def get_search_ddg_tool():
+     search_tool_description = """
+     Tool for performing DuckDuckGo searches.
+     Enter the keywords you want to search for.
+     The title, snippet (description), and URL of each page in the search results will be returned.
+     The information available through this tool is QUITE CONDENSED and sometimes outdated.
+
+     If you can't find the information you're looking for, please make sure to use the `fetch_page` tool to read the content of each page.
+     Feel free to use the most appropriate language for the context (not necessarily the same as the user's language).
+     For example, for programming-related questions, it's best to search in English.
+     """
+     return Tool(
+         name='search_ddg',
+         func=search_ddg,
+         description=search_tool_description
+     )
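
A sketch of the search-then-fetch flow that the tool descriptions recommend (editorial, not part of the commit); the query string and import paths are illustrative assumptions.

from tools.search_ddg import search_ddg
from tools.fetch_page import fetch_page

# Search first; the snippets are condensed, so read promising pages in full.
results = search_ddg("FIFA Women's World Cup 2023 final result", max_result_num=3)
for r in results:
    print(r["title"], "->", r["url"])

# Fetch the first 1,000-token chunk of the top hit.
if results:
    fetched = fetch_page(results[0]["url"])
    if fetched["status"] == 200:
        page_content = fetched["page_content"]
        print(page_content["title"])
        print(page_content["content"])
        print("has_next:", page_content["has_next"])

Wrapping both functions in `Tool` objects, as the `get_search_ddg_tool` and `get_fetch_page_tool` helpers do, exposes exactly this flow to a LangChain agent, which can then decide on its own when to search and when to page through a fetched document.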