import nltk
import pandas as pd
from langchain_community.utilities import GoogleSerperAPIWrapper
from newspaper import Article
from newspaper.article import ArticleException

# Treat +/-inf as missing so dropna() also removes them.
# (This option is deprecated as of pandas 2.1.)
pd.options.mode.use_inf_as_na = True


class GoogleSerperSearchAgent:
    """Searches a fixed list of news sites via Google Serper and scrapes the hits."""

    def __init__(self, sources_path: str) -> None:
        # Sources file: one domain per line, e.g. "reuters.com".
        with open(sources_path) as f:
            self._sources = [line.strip() for line in f if line.strip()]
        self._api = GoogleSerperAPIWrapper()
        # newspaper's nlp() step relies on NLTK's punkt tokenizer.
        nltk.download("punkt", quiet=True)

    def run(self, query: str, limit_per_source: int = 3) -> tuple[pd.DataFrame, dict[str, float]]:
        # Collect up to `limit_per_source` search hits from every source.
        results: list[dict[str, str]] = []
        for site in self._sources:
            results.extend(self._search(query, site, limit_per_source))

        # Enrich each hit with the scraped article; hits whose page could not
        # be fetched never get a "content" field and are dropped below.
        for result in results:
            article = self._fetch(result["url"])
            if article is None:
                continue
            result.update(article)

        # Passing columns= guarantees all five columns exist even when no
        # article was fetched, so dropna() cannot raise a KeyError.
        df = pd.DataFrame(results, columns=["title", "author", "date", "url", "content"])
        df.dropna(subset=["content"], inplace=True)
        return df, {"cost": 0.001 * len(self._sources)}

    def _search(self, query: str, site: str | None = None, limit: int = 3) -> list[dict[str, str]]:
        # Restrict the query to a single site when one is given.
        if site is not None:
            query += f" site:{site}"
        self._api.k = limit
        # Serper may return no "organic" section for empty result sets.
        organic = self._api.results(query).get("organic", [])
        return [
            {
                "title": hit["title"],
                "url": hit["link"],
                "date": hit.get("date", ""),
            }
            for hit in organic
        ]

    def _fetch(self, url: str) -> dict[str, str] | None:
        # Download and parse the article; None signals an unreachable page.
        # parse() sits inside the try block because newspaper raises
        # ArticleException there when the download did not succeed.
        article = Article(url=url)
        try:
            article.download()
            article.parse()
        except ArticleException:
            return None
        article.nlp()  # computes summary/keywords (needs NLTK punkt)
        return {
            "title": article.title,
            "author": article.authors[0] if article.authors else "",
            "content": article.text,
        }
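
# Minimal usage sketch, assuming a local "sources.txt" (one domain per line,
# e.g. "reuters.com") and the SERPER_API_KEY environment variable that
# GoogleSerperAPIWrapper reads; the file name and query are illustrative only.
if __name__ == "__main__":
    agent = GoogleSerperSearchAgent("sources.txt")
    articles, meta = agent.run("large language models", limit_per_source=3)
    print(articles[["title", "date", "url"]].head())
    print(f"estimated cost: ${meta['cost']:.3f}")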