# ControllerExecutorFlowModule / wikipediaAPI.py
# nbaldwin's picture
# Update wikipediaAPI.py
# 2016f17
# raw
# history blame
# 5.65 kB
"""Util that calls Wikipedia. references: https://github.com/hwchase17/langchain/blob/9b615022e2b6a3591347ad77a3e21aad6cf24c49/docs/extras/modules/agents/tools/integrations/wikipedia.ipynb#L36"""
import logging
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, root_validator
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Upper bound on the number of query characters passed to the Wikipedia search
# call; longer queries are truncated to this length before searching.
# NOTE(review): presumably mirrors a query-length limit of the Wikipedia search
# API -- confirm before changing.
WIKIPEDIA_MAX_QUERY_LENGTH = 300
class WikipediaAPIWrapper(BaseModel):
    """Wrapper around the ``wikipedia`` python package.

    To use, you should have the ``wikipedia`` python package installed.
    This wrapper uses the Wikipedia API to conduct searches and fetch page
    summaries. By default, :meth:`run` returns the page summaries of the
    top-k search results, truncated to ``doc_content_chars_max`` characters.

    :param top_k_results: The number of search results to use.
    :type top_k_results: int
    :param lang: The language to use for the Wikipedia API.
    :type lang: str
    :param doc_content_chars_max: The maximum number of characters returned
        by :meth:`run` and :meth:`_fetch_page`.
    :type doc_content_chars_max: int
    :param wiki_client: The Wikipedia API client. It is always overwritten by
        :meth:`validate_environment` with the imported ``wikipedia`` module.
    :type wiki_client: Any
    """

    # The imported ``wikipedia`` module (populated by ``validate_environment``).
    wiki_client: Any
    # How many search hits are kept by ``search_page_titles``.
    top_k_results: int = 5
    # Language code handed to ``wikipedia.set_lang``.
    lang: str = "en"
    # Character cap applied to fetched page content and to the output of ``run``.
    doc_content_chars_max: int = 4000

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that the python package exists in environment.

        Imports ``wikipedia``, sets its language and stores the module under
        ``values["wiki_client"]``.

        :param values: The values to validate.
        :type values: Dict
        :return: The validated values.
        :rtype: Dict
        :raises ImportError: If the package is not installed.
        """
        # Keep the try body limited to the import so that only a missing
        # package is reported as such.
        try:
            import wikipedia
        except ImportError as err:
            raise ImportError(
                "Could not import wikipedia python package. "
                "Please install it with `pip install wikipedia`."
            ) from err

        # This validator runs with pre=True, i.e. on the raw input before field
        # defaults are applied, so "lang" is absent unless the caller passed it.
        # Fall back to the same default as the ``lang`` field instead of
        # raising KeyError.
        wikipedia.set_lang(values.get("lang", "en"))
        values["wiki_client"] = wikipedia
        return values

    def run(self, query: str) -> str:
        """Run Wikipedia search and get page summaries.

        :param query: The query to search for.
        :type query: str
        :return: The page summaries joined by blank lines and truncated to
            ``doc_content_chars_max`` characters, or a fixed message when no
            page could be loaded.
        :rtype: str
        """
        summaries = []
        for page_title in self.search_page_titles(query):
            # The summary lives on the page object, not on the content string
            # returned by ``_fetch_page``, so load the page object here.
            wiki_page = self._load_page(page_title)
            if wiki_page is None:
                continue
            summary = self._formatted_page_summary(page_title, wiki_page)
            if summary:
                summaries.append(summary)
        if not summaries:
            return "No good Wikipedia Search Result was found"
        return "\n\n".join(summaries)[: self.doc_content_chars_max]

    def _load_page(self, page: str) -> Optional[Any]:
        """Load a Wikipedia page object by exact title.

        :param page: The title of the page to load.
        :type page: str
        :return: The page object returned by ``wiki_client.page``, or ``None``
            when the lookup raises ``PageError`` or ``DisambiguationError``.
        :rtype: Optional[Any]
        """
        try:
            return self.wiki_client.page(title=page, auto_suggest=False)
        except (
            self.wiki_client.exceptions.PageError,
            self.wiki_client.exceptions.DisambiguationError,
        ):
            return None

    def _fetch_page(self, page: str) -> Optional[str]:
        """Fetch page content from Wikipedia.

        :param page: The page to fetch.
        :type page: str
        :return: The page content truncated to ``doc_content_chars_max``
            characters, or ``None`` when the page could not be loaded.
        :rtype: Optional[str]
        """
        wiki_page = self._load_page(page)
        if wiki_page is None:
            return None
        return wiki_page.content[: self.doc_content_chars_max]

    def search_page_titles(self, query: str) -> List[str]:
        """Run Wikipedia search and get the matching page titles.

        The query is truncated to ``WIKIPEDIA_MAX_QUERY_LENGTH`` characters and
        only the first ``top_k_results`` hits are kept.

        :param query: The query to search for.
        :type query: str
        :return: The page titles.
        :rtype: List[str]
        """
        truncated_query = query[:WIKIPEDIA_MAX_QUERY_LENGTH]
        return self.wiki_client.search(truncated_query)[: self.top_k_results]

    # NOTE: disabled helper kept for reference. It relies on a ``Document``
    # type and a ``load_all_available_meta`` setting, neither of which is
    # defined in the visible part of this module.
    # def _page_to_document(self, page_title: str, wiki_page: Any) -> Document:
    #     main_meta = {
    #         "title": page_title,
    #         "summary": wiki_page.summary,
    #         "source": wiki_page.url,
    #     }
    #     add_meta = (
    #         {
    #             "categories": wiki_page.categories,
    #             "page_url": wiki_page.url,
    #             "image_urls": wiki_page.images,
    #             "related_titles": wiki_page.links,
    #             "parent_id": wiki_page.parent_id,
    #             "references": wiki_page.references,
    #             "revision_id": wiki_page.revision_id,
    #             "sections": wiki_page.sections,
    #         }
    #         if self.load_all_available_meta
    #         else {}
    #     )
    #     doc = Document(
    #         page_content=wiki_page.content[: self.doc_content_chars_max],
    #         metadata={
    #             **main_meta,
    #             **add_meta,
    #         },
    #     )
    #     return doc

    @staticmethod
    def _formatted_page_summary(page_title: str, wiki_page: Any) -> Optional[str]:
        """Format the page title and summary in a single string.

        :param page_title: The page title.
        :type page_title: str
        :param wiki_page: The Wikipedia page object; its ``summary`` attribute
            is read.
        :type wiki_page: Any
        :return: The formatted page summary.
        :rtype: Optional[str]
        """
        return f"Page: {page_title}\nSummary: {wiki_page.summary}"
# def load(self, query: str) -> List[Document]:
# """
# Run Wikipedia search and get the article text plus the meta information.
# See
#
# Returns: a list of documents.
#
# """
# page_titles = self.wiki_client.search(query[:WIKIPEDIA_MAX_QUERY_LENGTH])
# docs = []
# for page_title in page_titles[: self.top_k_results]:
# if wiki_page := self._fetch_page(page_title):
# if doc := self._page_to_document(page_title, wiki_page):
# docs.append(doc)
# return docs