import json
import random
import traceback

from llmsearch import utilityV2 as ut

def findnth(haystack, needle, n):
    """Return the index of the zero-based n-th occurrence of needle in
    haystack, or -1 if needle occurs fewer than n + 1 times."""
    parts = haystack.split(needle, n + 1)
    if len(parts) <= n + 1:
        return -1
    return len(haystack) - len(parts[-1]) - len(needle)

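# For example (hypothetical inputs, not from the source): with n counted from
# zero, findnth("https://www.example.com/page", "/", 2) returns 23, the index
# of the third "/" (the one that ends the host), and returns -1 when the
# needle occurs fewer than n + 1 times.
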
def extract_site(url):
    """Extract a short site name from a URL, e.g.
    "https://www.example.com/page" -> "example"."""
    site = ""
    base = findnth(url, "/", 2)  # index of the third "/", which ends the host
    if base > 2:
        labels = url[:base].split(".")
        if len(labels) > 1:
            site = labels[-2]  # second-to-last label: "example" in "www.example.com"
        else:
            site = labels[0]  # dotless host such as "http://localhost"
    site = site.replace("https://", "")
    site = site.replace("http://", "")
    return site

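# A few illustrative mappings (hypothetical URLs, not from the source):
#   extract_site("https://www.example.com/page") -> "example"
#   extract_site("https://blog.sample.org/post") -> "sample"
#   extract_site("http://localhost/status")      -> "localhost"
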
# In-memory per-site statistics, persisted to site_stats.json.
site_stats = {}
stats_loaded = False  # set once site_stats.json has been read
stats_dirty = False  # set when in-memory stats have unsaved changes

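# The persisted file maps each site name to its counters; the shape below is
# inferred from the code that follows, with illustrative values:
#   {"example": {"name": "example", "hits": 12, "chars": 48231,
#                "get": 9.3, "extract": 2.1, "openai": 40.7}}
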
def open_site_stats():
    """Load persisted stats from site_stats.json, at most once per process."""
    global site_stats, stats_loaded
    if stats_loaded:
        return
    stats_loaded = True  # only attempt the load once
    try:
        with open("site_stats.json", "r") as f:
            site_stats = json.loads(f.read())
    except Exception:
        print("Failed to read site_stats.json")
        traceback.print_exc()

def ckpt():
    """Checkpoint: write the stats back to site_stats.json if they changed."""
    global stats_dirty
    if not stats_dirty:
        return
    try:
        with open("site_stats.json", "w") as ss:
            ss.write(json.dumps(site_stats))
        stats_dirty = False
    except Exception as e:
        print(f"Failed to write site_stats: {str(e)}")
        traceback.print_exc()

def update_site_stats(site, char_cnt, get_time, extract_time, openai_time):
    """Accumulate one request's counters into the entry for site."""
    global stats_dirty
    open_site_stats()
    entry = retrieve(site)  # creates a zeroed entry if the site is new
    if "hits" not in entry:
        entry["hits"] = 0  # entries from older stats files may lack "hits"
    entry["hits"] += 1
    entry["chars"] += char_cnt
    entry["get"] += get_time
    entry["extract"] += extract_time
    entry["openai"] += openai_time
    stats_dirty = True

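# A minimal sketch of the intended call pattern (assumed, not prescribed by
# the module): record counters after each request, then checkpoint so the
# numbers survive process exit.
#
#   update_site_stats("example", char_cnt=5000, get_time=120,
#                     extract_time=40, openai_time=300)
#   ckpt()
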
def retrieve(site):
    """Return the stats entry for site, creating a zeroed one if absent."""
    if site not in site_stats:
        site_stats[site] = {
            "name": site,
            "hits": 0,
            "chars": 0,
            "get": 0,
            "extract": 0,
            "openai": 0,
        }
    return site_stats[site]

def get_next(urls, sample_unknown=False):
    """Pick the next URL to fetch, biased toward sites that have historically
    delivered the most characters per unit of processing time.

    Returns a (stats_entry, url) tuple, or [] when no URL qualifies."""
    open_site_stats()  # make sure persisted stats are available for ranking

    candidates = []
    for url in urls:
        site = extract_site(url)
        candidate = retrieve(site)
        # skip sites that ut.sites marks as unknown/disabled unless the
        # caller asked to sample them
        if sample_unknown or (site in ut.sites and ut.sites[site] != 0):
            candidates.append((candidate, url))
    if len(candidates) == 0:
        return []
    if len(candidates) == 1:
        return candidates[0]

    # explore: 15% of the time pick uniformly at random
    if random.random() > 0.85:
        return candidates[random.randrange(len(candidates))]

    # exploit: rank by characters delivered per unit of combined fetch,
    # extract, and openai time (denominator floored to damp tiny samples)
    candidates.sort(
        reverse=True,
        key=lambda item: (
            (item[0]["chars"] * 1000000)
            / max(1000, item[0]["get"] + item[0]["extract"] + item[0]["openai"])
        ),
    )

    # cubing a uniform draw skews the index toward 0, i.e. toward the
    # top-ranked candidates (p = 0.5 lands in the top eighth of the list)
    p = random.random()
    return candidates[int(p * p * p * len(candidates))]

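# A minimal, hypothetical smoke test (not part of the module's public
# behavior): made-up URLs exercise extract_site and get_next; passing
# sample_unknown=True keeps the demo independent of what ut.sites contains.
if __name__ == "__main__":
    urls = [
        "https://www.example.com/page",
        "https://blog.sample.org/post",
        "https://docs.demo.net/guide",
    ]
    for u in urls:
        print(u, "->", extract_site(u))
    update_site_stats("example", char_cnt=5000, get_time=120,
                      extract_time=40, openai_time=300)
    print(get_next(urls, sample_unknown=True))
    ckpt()  # flush the demo counters to site_stats.json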