import gradio as gr
import requests
import bs4

import os
from huggingface_hub import HfApi, upload_file
import json
import uuid

# Repo settings: the crawl index is stored as a JSON file inside a
# Hugging Face dataset and fetched back over the raw endpoint.
token = os.environ.get("HF_TOKEN")
username = "omnibus"
dataset_name = "tmp"
save_data = f'https://huggingface.co/datasets/{username}/{dataset_name}/raw/main/'
api = HfApi(token=token)
filename = "urls"
filename2 = "pages"
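
# With the defaults above, init() below fetches this raw file URL
# (illustrative, derived directly from the config values):
#   https://huggingface.co/datasets/omnibus/tmp/raw/main/crawl/urls.json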

def init(filename=filename, save_data=save_data):
    # Pull the previously saved crawl index from the dataset repo;
    # start fresh with an empty dict if the file does not exist yet.
    r = requests.get(f'{save_data}crawl/{filename}.json')
    print(f'status code main:: {r.status_code}')
    if r.status_code == 200:
        lod = json.loads(r.text)
    else:
        lod = {}
    return lod
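
# Minimal usage sketch for init() (the keys shown are hypothetical base-62
# strings of the kind sort_doc below produces):
#   lod = init()        # -> {} on a fresh dataset (non-200 response)
#   lod = init("urls")  # -> {"00": "https://...", "01": "https://...", ...}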

def sort_doc(in_list: list, steps_in: int = 0, control: int = 0, prev_list: dict = None):
    # Assign each item in in_list a fixed-width base-62 key, resuming the
    # counter from the last key in prev_list (the saved index) if one exists.
    if prev_list is None:
        prev_list = init()
    control_json = {'control': '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', 'char': '', 'leng': 62}
    key_cnt = len(in_list)
    print(key_cnt)
    control_char = list(control_json['control'])
    char_len = len(control_char)
    if not steps_in:
        # Work out how many base-62 digits are needed to index key_cnt items.
        n_cnt = 0
        nx = key_cnt
        while True:
            if nx >= 1:
                n_cnt += 1
                nx = nx / char_len
            else:
                steps = n_cnt
                break
    if steps_in:
        steps = steps_in
    if control:
        # Reserve the last `control` characters of the alphabet as control values.
        control_len = control_json['leng'] - control
        control_char = list(control_json['control'][:control_len])
        control_val = list(control_json['control'][control_len:])
        val_len = len(control_val)
        control_val_box = list(control_val)
        print(f'CONTROL_VAL_BOX:: {control_val_box}')

    json_out = {}
    big_cnt = 0
    cnt = 0
    go = True
    step_cont_box = []

    if prev_list:
        # Resume: decode the last saved key back into its digit positions.
        print("LOD")
        last_key = list(prev_list.keys())[-1]
        print(last_key)
        for ea_dig in last_key:
            ea_dig = control_json['control'].index(ea_dig)
            print(f'{ea_dig} :: {control_json["control"][ea_dig]}')
            step_cont_box.append(ea_dig)
        print(step_cont_box)
        cnt = int(step_cont_box[-1]) + 1
    if not prev_list:
        # Fresh index: start every digit at zero.
        print("NOT LOD")
        for ii in range(steps):
            print(ii)
            step_cont_box.append(0)

    pos = len(step_cont_box) - 1
    if go:
        for i, ea in enumerate(in_list):
            if go:
                if cnt > char_len - 1:
                    # Overflow: reset maxed digits and carry one into the next digit up.
                    go1 = True
                    for ii, ev in enumerate(step_cont_box):
                        if go:
                            if ev >= char_len - 1:
                                step_cont_box[ii] = 0
                                if go1:
                                    step_cont_box[ii - 1] = step_cont_box[ii - 1] + 1
                                    go1 = False
                    cnt = 1
                else:
                    step_cont_box[pos] = cnt
                    cnt += 1
                # Build the key string from the current digit positions.
                out_js = ""
                for iii, j in enumerate(step_cont_box):
                    print(j)
                    out_js = out_js + control_char[j]
                json_out[out_js] = in_list[i]
                big_cnt += 1
                if big_cnt == key_cnt:
                    print("DONE")
                    go = False
    return json_out
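
# For reference, sort_doc above is in effect a resumable fixed-width base-62
# counter over the 'control' alphabet. The helper below is a minimal sketch of
# that keying scheme (illustrative only; it is defined here for comparison and
# never called by this script):
def _base62_key(n: int, width: int,
                alphabet: str = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ') -> str:
    # Convert n to base 62, then left-pad with '0' to the requested width.
    digits = []
    while n:
        n, rem = divmod(n, len(alphabet))
        digits.append(alphabet[rem])
    key = ''.join(reversed(digits)) or alphabet[0]
    return key.rjust(width, alphabet[0])
# _base62_key(0, 2) -> '00'   _base62_key(61, 2) -> '0Z'   _base62_key(62, 2) -> '10'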

def sort_doc_OG(in_list, steps_in=0, control=None):
    # Legacy version of sort_doc kept for reference: it fetches the saved
    # index itself and skips items that are already present in it.
    r = requests.get(f'{save_data}crawl/{filename}.json')
    print(f'status code main:: {r.status_code}')
    if r.status_code == 200:
        lod = json.loads(r.text)
    else:
        lod = {}
    control_json = {'control': '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', 'char': '', 'leng': 62}
    text = str(in_list)
    key_cnt = len(in_list)
    print(key_cnt)
    control_char = list(control_json['control'])
    char_len = len(control_char)
    if not steps_in:
        n_cnt = 0
        nx = key_cnt
        while True:
            if nx >= 1:
                n_cnt += 1
                nx = nx / char_len
            else:
                print("#######")
                print(n_cnt)
                print(nx)
                print("#######")
                steps = n_cnt
                break
    if steps_in:
        steps = steps_in

    if control:
        control_len = control_json['leng'] - steps
        control_char_val = list(control_json['control'][:control_len])
        control_val = list(control_json['control'][control_len:])
        val_len = len(control_val)

    json_out = lod
    noun_list = {}
    step_list = []
    big_cnt = 0
    cnt = 0
    go = True

    step_cont_box = []
    if lod:
        print("LOD")
        last_key = list(lod.keys())[-1]
        print(last_key)
        for ea_dig in last_key:
            ea_dig = control_json['control'].index(ea_dig)
            print(f'{ea_dig} :: {control_json["control"][ea_dig]}')
            step_cont_box.append(ea_dig)
        print(step_cont_box)
        cnt = int(step_cont_box[-1]) + 1
    if not lod:
        print("NOT LOD")
        for ii in range(steps):
            print(ii)
            step_cont_box.append(0)

    mod = 0
    pos = len(step_cont_box) - 1

    if go:
        for i, ea in enumerate(in_list):
            # Only key items that are not already in the saved index.
            if go and ea not in list(lod.values()):
                if cnt > char_len - 1:
                    go1 = True
                    for ii, ev in enumerate(step_cont_box):
                        if go:
                            if ev >= char_len - 1:
                                step_cont_box[ii] = 0
                                if go1:
                                    step_cont_box[ii - 1] = step_cont_box[ii - 1] + 1
                                    go1 = False
                    cnt = 1
                else:
                    step_cont_box[pos] = cnt
                    cnt += 1

                out_js = ""
                for iii, j in enumerate(step_cont_box):
                    print(j)
                    out_js = out_js + control_char[j]
                sen_obj = in_list[i]
                json_out[out_js] = sen_obj
                big_cnt += 1
                if big_cnt == key_cnt:
                    print("DONE")
                    go = False

    return json_out

link_box = []

def link_find(url):
    # Fetch one page and build two views of it: node1 carries page text and
    # link metadata, node2 carries only the link structure.
    node1 = {}
    node2 = {}
    print(f'Try URL:: {url}')
    source = requests.get(url)
    if source.status_code == 200:
        print("YES")
        soup = bs4.BeautifulSoup(source.content, 'html.parser')
        rawt = soup.text
        # `soup.description` is not a BeautifulSoup attribute (it would look up
        # a <description> tag and return None), so read the meta tag instead.
        desc_tag = soup.find("meta", attrs={"name": "description"})
        desc = desc_tag.get("content", "") if desc_tag else ""
        title = soup.title.string if soup.title else ""
        node1 = {"URL": url, "TITLE": title, "STRING": desc, "TEXT": rawt, "LINKS": [], "TREE": []}
        node2 = {"URL": url, "LINK_KEY": [], "LINKS": [], "TREE": []}

        for p in soup.find_all("a"):
            url0 = p.get('href')
            try:
                if url0.startswith("//"):
                    # Protocol-relative link: prepend the scheme of the source page.
                    print(url0)
                    uri1 = url.split("//")[0]
                    uri = f'{uri1}{url0}'
                elif url0.startswith("/"):
                    # Root-relative link: prepend scheme and host of the source page.
                    uri1 = url.split("//")[0]
                    uri2 = url.split("//")[1]
                    uri3 = uri2.split("/")[0]
                    uri = f'{uri1}//{uri3}{url0}'
                else:
                    uri = url0
                node1['LINKS'].append(uri)
                node1['TREE'].append({"URL": uri, "TITLE": p.get('title'), "STRING": p.string, "TEXT": "", "LINKS": [], "TREE": []})
                node2['TREE'].append({"URL": uri, "LINKS": [], "TREE": []})
                node2['LINKS'].append(uri)
                link_box.append(uri)
            except Exception as e:
                print(e)
    else:
        print("NO")
    return node1, node2
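
# For comparison, the standard library resolves all three href cases that
# link_find branches on (protocol-relative, root-relative, absolute) in one
# call; illustrative only, link_find keeps its explicit string handling:
#   from urllib.parse import urljoin
#   urljoin("https://example.com/a/b", "//cdn.example.com/x")  # -> "https://cdn.example.com/x"
#   urljoin("https://example.com/a/b", "/about")               # -> "https://example.com/about"
#   urljoin("https://example.com/a/b", "https://other.org/")   # -> "https://other.org/"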

def sitemap_test(url, file_state, level):
    # Batch variant of sitemap(): `url` is a list of seed URLs. Only the
    # scheme-and-host portion of each discovered link is kept for keying.
    url_front = []
    for each_url in url:
        uri = ""
        uri0 = ""
        if each_url != "" and each_url is not None:
            link1, link2 = link_find(each_url)
            if level >= 2:
                for i, ea in enumerate(link1['TREE']):
                    print(ea)
                    try:
                        out_list1, out_list2 = link_find(f"{uri}{ea['URL']}")
                        link1['TREE'][i] = out_list1
                        link2['TREE'][i] = out_list2
                        if level >= 3:
                            for n, na in enumerate(link1['TREE'][i]['TREE']):
                                print(na)
                                try:
                                    out_list1, out_list2 = link_find(f"{uri0}{na['URL']}")
                                    link1['TREE'][i]['TREE'][n] = out_list1
                                    link2['TREE'][i]['TREE'][n] = out_list2
                                except Exception as e:
                                    print(e)
                    except Exception as e:
                        print(e)
        try:
            for ea_link in link2['LINKS']:
                print(ea_link)
                try:
                    # e.g. "https://example.com/a/b" -> ['https:', '', 'example.com']
                    url_list = ea_link.split("/")
                    url_front.append(url_list[:3])
                except Exception as e:
                    print(e)
        except Exception as e:
            print(e)
    uri_key = sort_doc(url_front, file_state, 8)

    # Save the keyed index locally, then push it into the dataset repo.
    uid = uuid.uuid4()
    with open(f'{uid}.json', 'w') as f:
        f.write(json.dumps(uri_key, indent=4))

    upload_file(
        path_or_fileobj=f"{uid}.json",
        path_in_repo=f"crawl/{filename}.json",
        repo_id=f"{username}/{dataset_name}",
        repo_type="dataset",
        token=token,
    )
    return link1, link2, uri_key

def sitemap(url, file_state, level):
    uri = ""
    uri0 = ""
    if url != "" and url is not None:
        link1, link2 = link_find(url)
        if level >= 2:
            # Follow every link on the seed page one level down.
            for i, ea in enumerate(link1['TREE']):
                print(ea)
                try:
                    out_list1, out_list2 = link_find(f"{uri}{ea['URL']}")
                    link1['TREE'][i] = out_list1
                    link2['TREE'][i] = out_list2
                    if level >= 3:
                        for n, na in enumerate(link1['TREE'][i]['TREE']):
                            print(na)
                            try:
                                out_list1, out_list2 = link_find(f"{uri0}{na['URL']}")
                                link1['TREE'][i]['TREE'][n] = out_list1
                                link2['TREE'][i]['TREE'][n] = out_list2
                            except Exception as e:
                                print(e)
                except Exception as e:
                    print(e)

    # Key every discovered link, save the index locally, then push it to the
    # dataset repo.
    uri_key = sort_doc(link2['LINKS'], file_state, 8)

    uid = uuid.uuid4()
    with open(f'{uid}.json', 'w') as f:
        f.write(json.dumps(uri_key, indent=4))

    upload_file(
        path_or_fileobj=f"{uid}.json",
        path_in_repo=f"crawl/{filename}.json",
        repo_id=f"{username}/{dataset_name}",
        repo_type="dataset",
        token=token,
    )
    return link1, link2, uri_key
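
# Illustrative shapes of the three values sitemap() returns to the JSON
# components in the UI below (all field values hypothetical):
#   link1   -> {"URL": ..., "TITLE": ..., "STRING": ..., "TEXT": ..., "LINKS": [...], "TREE": [...]}
#   link2   -> {"URL": ..., "LINK_KEY": [], "LINKS": [...], "TREE": [...]}
#   uri_key -> {"0": "https://example.com/about", ...}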

def sitemap_OG(url, level):
    # Legacy crawler kept for reference: it predates link_find() returning a
    # (node1, node2) pair and is not wired to the UI.
    uri = ""
    uri0 = ""
    if url != "" and url is not None:
        link1 = link_find(url)
        if level >= 2:
            for i, ea in enumerate(link1):
                print(ea)
                try:
                    if not ea['URL'].startswith("http"):
                        # Rebuild scheme://host from the seed URL for relative links.
                        uri1 = url.split("//")[0]
                        uri2 = url.split("//")[1]
                        uri3 = uri2.split("/")[0]
                        uri = f'{uri1}//{uri3}'
                        print(uri)
                    out_list = link_find(f"{uri}{ea['URL']}")
                    link1[i]['TREE'] = out_list
                    if level >= 3:
                        for n, na in enumerate(link1[i]['TREE']):
                            print(na)
                            try:
                                if not na['URL'].startswith("http"):
                                    uri11 = url.split("//")[0]
                                    uri22 = url.split("//")[1]
                                    uri33 = uri22.split("/")[0]
                                    uri0 = f'{uri11}//{uri33}'
                                    print(uri0)
                                out_list1 = link_find(f"{uri0}{na['URL']}")
                                link1[i]['TREE'][n]['TREE'] = out_list1
                            except Exception as e:
                                print(e)
                except Exception as e:
                    print(e)
    return link1

def test():
    seed_box = []
    with open("./seed.txt") as f:
        this = f.readlines()
    for ea in this:
        ea = ea.strip().strip("\n")
        seed_box.append(ea)
    try:
        a, b, c = sitemap_test(seed_box, None, 1)
    except Exception as e:
        print(e)
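
# test() assumes a local seed.txt with one URL per line, e.g. (hypothetical):
#   https://example.com
#   https://example.org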

with gr.Blocks() as app:
    file_state = gr.State()
    with gr.Row():
        with gr.Column(scale=3):
            with gr.Row():
                inp = gr.Textbox(label="URL")
                # The crawl functions support depth 1-3 (seed page, its links,
                # and their links), so expose that full range here.
                level = gr.Slider(minimum=1, maximum=3, step=1, value=1, label="Depth")
            btn = gr.Button()
            test_btn = gr.Button("Test")
            key_json = gr.JSON()
            outp = gr.JSON()
        with gr.Column(scale=1):
            outmap = gr.JSON()

    # sitemap() takes (url, file_state, level), so the state component is
    # passed through as the second input.
    btn.click(sitemap, [inp, file_state, level], [outp, outmap, key_json])
    # Wire the Test button to the seed-list crawl; test() takes no inputs and
    # returns nothing to the UI.
    test_btn.click(test, None, None)
app.launch()