huggi / src /lib /server /websearch /searchWebLocal.ts
nsarrazin's picture
nsarrazin HF staff
Option for running websearch locally (#563)
e3af794 unverified
raw
history blame
1.3 kB
import { JSDOM, VirtualConsole } from "jsdom";
/**
 * Runs a Google search for `query` by fetching the HTML results page and
 * scraping the outbound result links from it.
 *
 * Result anchors on the fetched page have the form `/url?q=<target>&sa=...`;
 * the `<target>` part is extracted, links pointing back at google.com are
 * dropped, and duplicates are removed.
 *
 * @param query - Free-text search query; it is URL-encoded before being sent.
 * @returns An object whose `organic_results` array holds one `{ link }` entry
 *   per unique result URL, in page order.
 * @throws If the request fails or is aborted after the 10 second timeout
 *   (the fetch rejection is propagated), or if the fetched page contains no
 *   `<a>` elements at all.
 */
export async function searchWebLocal(
	query: string
): Promise<{ organic_results: { link: string }[] }> {
	const redirectPrefix = "/url?q=";
	const trackingMarker = "&sa=";

	// Abort the request if it has not completed within 10 seconds.
	const abortController = new AbortController();
	const timeoutId = setTimeout(() => abortController.abort(), 10000);

	let htmlString: string;
	try {
		// Encode the query so characters such as `&`, `#`, `+` or `%` cannot
		// truncate or alter the search URL.
		const response = await fetch(
			"https://www.google.com/search?hl=en&q=" + encodeURIComponent(query),
			{ signal: abortController.signal }
		);
		htmlString = await response.text();
	} finally {
		// Do not leave the abort timer pending once the request has settled.
		clearTimeout(timeoutId);
	}

	const virtualConsole = new VirtualConsole();
	virtualConsole.on("error", () => {
		// No-op to skip console errors.
	});

	// Put the HTML string into a DOM.
	const dom = new JSDOM(htmlString, {
		virtualConsole,
	});
	const { document } = dom.window;

	// Get all anchor elements on the page.
	const links = document.querySelectorAll("a");
	if (!links.length) {
		throw new Error(`webpage doesn't have any "a" element`);
	}

	// Keep hrefs that start with "/url?q=" and do not contain google.com
	// links, then strip the prefix and everything from "&sa=" onwards.
	// NOTE(review): the extracted target is returned as-is and may still be
	// percent-encoded — confirm whether callers expect a decoded URL.
	const linksHref = Array.from(links)
		.filter((el) => el.href?.startsWith(redirectPrefix) && !el.href.includes("google.com/"))
		.map((el) => {
			const href = el.href;
			const markerIndex = href.indexOf(trackingMarker);
			// When the "&sa=" marker is missing, keep the rest of the string;
			// slicing with an end of -1 would drop the final character.
			return href.slice(redirectPrefix.length, markerIndex === -1 ? undefined : markerIndex);
		})
		// An empty target (e.g. "/url?q=&sa=...") is not a usable result.
		.filter((link) => link.length > 0);

	// Remove duplicate links and map links to the correct object shape.
	return { organic_results: [...new Set(linksHref)].map((link) => ({ link })) };
}