Spaces:
Runtime error
Runtime error
File size: 8,477 Bytes
ae4e988 0d9e44b 0e81052 0d9e44b 707168a 0d9e44b 707168a 0e81052 707168a 8786019 707168a 0e81052 707168a 0e81052 707168a 0d9e44b 707168a 0d9e44b 707168a 437cf54 1b682b7 437cf54 573c6d4 75dab34 366c803 0d9e44b 827c354 437cf54 8849abb 192d263 250787d 192d263 e89aaf6 192d263 0bd2623 efc018b 8849abb 51d8734 192d263 448ec0d 8849abb 0d9e44b e30ed28 437cf54 1b682b7 62f0b09 0d9e44b a57fdc7 c30642b 437cf54 f0e1870 b962252 8849abb 6660108 a57fdc7 f0e1870 3b1a5cc f0e1870 8849abb a57fdc7 6c531ab f0e1870 3b1a5cc f0e1870 8849abb a57fdc7 f0e1870 1f5e831 0d9e44b 3b1a5cc a1a7561 3b44741 a57fdc7 5e66ae2 a57fdc7 0d9e44b a57fdc7 0d9e44b ae4e988 |
|
import gradio as gr
import requests
import bs4
def sort_doc(in_list,steps_in=8,control=None):
    """Index the items of ``in_list`` under fixed-width base-62 keys.

    Item ``n`` (0-based) is stored under the base-62 representation of ``n``,
    left-padded with ``'0'`` to the key width, using the digit alphabet
    ``0-9a-zA-Z``.  With the default width of 8 the first items get the keys
    ``'00000000'``, ``'00000001'``, ...

    Args:
        in_list: Sized iterable of items to index.
        steps_in: Key width in characters.  A falsy value (``0``/``None``)
            derives the width from ``len(in_list)``.  If the requested width
            cannot represent every index, the width is enlarged so that keys
            never collide.
        control: Accepted for backward compatibility; it has no effect on the
            result.

    Returns:
        dict mapping each generated key to the corresponding item, in input
        order.

    Note:
        The function prints verbose progress information to stdout.
    """
    control_char=list('0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
    char_len=len(control_char)
    key_cnt=len(in_list)
    print(key_cnt)

    if steps_in:
        steps=steps_in
    else:
        # Derive the width from the item count: the number of base-62 digits
        # of key_cnt (0 for an empty list).
        n_cnt=0
        nx=key_cnt
        while nx >= 1:
            n_cnt+=1
            nx = nx/char_len
        print("#######")
        print(n_cnt)
        print(nx)
        print("#######")
        steps=n_cnt

    # Make sure every index 0..key_cnt-1 fits into `steps` digits; otherwise
    # high indexes would wrap around and overwrite earlier keys.
    if key_cnt:
        needed=1
        capacity=char_len
        while capacity < key_cnt:
            needed+=1
            capacity*=char_len
        if steps < needed:
            steps=needed

    for ii in range(steps):
        print(ii)

    json_out={}
    for i, ea in enumerate(in_list):
        # Positional encoding of the index, least-significant digit last.
        step_cont_box=[0]*steps
        rest=i
        for pos in range(steps-1,-1,-1):
            rest,step_cont_box[pos]=divmod(rest,char_len)
        print(step_cont_box)
        out_js=""
        for j in step_cont_box:
            print(j)
            out_js = out_js+control_char[j]
        json_out[out_js]=ea
        print ("#################")
        print (out_js)
        print (ea)
        print ("#################")
        if i+1==key_cnt:
            print("DONE")
    return json_out
# Module-level list shared by the crawler functions: link_find() appends every
# link URI it collects here, and sitemap() passes the list to sort_doc().
link_box = []
def link_find(url):
    """Fetch ``url`` and describe the page and the links it contains.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        A ``(node1, node2)`` tuple of dicts:

        * ``node1`` - full node: ``URL``, ``TITLE`` (text of ``<title>`` or
          ``None``), ``STRING`` (text of a ``<description>`` element or
          ``None``), ``TEXT`` (page text), ``LINKS`` (list of link URIs) and
          ``TREE`` (one child node per link, carrying the anchor's ``title``
          attribute and string).
        * ``node2`` - slim node: ``URL``, ``LINK_KEY``, ``LINKS`` and ``TREE``
          holding only the URIs.

        When the response status is not 200 both nodes are returned with
        empty ``LINKS``/``TREE``.

    Side effects:
        Appends every collected link URI to the module-level ``link_box`` and
        prints progress to stdout.

    Raises:
        requests.RequestException: if the request itself fails or times out.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    from urllib.parse import urljoin

    # Created up front so that a failed fetch still returns well-formed nodes.
    node1 = {"URL":url,"TITLE":None,"STRING":None,"TEXT":"","LINKS":[],"TREE":[]}
    node2 = {"URL":url,"LINK_KEY":[],"LINKS":[],"TREE":[]}

    # A timeout prevents one unresponsive host from blocking the crawl forever.
    source = requests.get(url, timeout=30)
    if source.status_code != 200:
        print("NO")
        return node1,node2

    print("YES")
    soup = bs4.BeautifulSoup(source.content,'html.parser')
    # Store plain strings rather than bs4 Tag objects so the nodes stay
    # JSON-serialisable.
    title_tag = soup.title
    if title_tag is not None:
        node1["TITLE"] = title_tag.get_text()
    description_tag = soup.description
    if description_tag is not None:
        node1["STRING"] = description_tag.get_text()
    node1["TEXT"] = soup.text

    for anchor in soup.find_all("a"):
        href = anchor.get('href')
        if href is None:
            # <a> without an href (e.g. a named anchor) is not a link.
            continue
        if href.startswith("//"):
            print(href)
        # Resolves protocol-relative, root-relative and page-relative links
        # against the page URL; absolute links pass through unchanged.
        uri = urljoin(url, href)
        if href.startswith("/"):
            print(uri)
        anchor_text = anchor.string
        if anchor_text is not None:
            anchor_text = str(anchor_text)
        node1['LINKS'].append(uri)
        node1['TREE'].append({"URL":uri,"TITLE":anchor.get('title'),"STRING":anchor_text,"TEXT":"","LINKS":[],"TREE":[]})
        node2['TREE'].append({"URL":uri,"LINKS":[],"TREE":[]})
        node2['LINKS'].append(uri)
        link_box.append(uri)
    return node1,node2
#https://huggingface.co/spaces/Omnibus/crawl
def _expand_children(full_node, slim_node, remaining):
    """Replace each child of the given nodes by its fetched page, ``remaining`` levels deep."""
    if remaining <= 0:
        return
    for i,child in enumerate(full_node['TREE']):
        print(child)
        try:
            child_full,child_slim=link_find(child['URL'])
        except Exception as e:
            # Best effort: a link that cannot be fetched keeps its placeholder.
            print (e)
            continue
        full_node['TREE'][i]=child_full
        slim_node['TREE'][i]=child_slim
        _expand_children(child_full,child_slim,remaining-1)


def sitemap(url,level):
    """Crawl ``url`` and build a link tree up to ``level`` pages deep.

    Args:
        url: Start URL.  An empty value or ``None`` produces empty results.
        level: Crawl depth.  ``1`` fetches only the start page, ``2`` also
            fetches every page it links to, ``3`` or more goes one level
            further (depth is capped at 3).

    Returns:
        ``(link1, link2, uri_key)``: the full node tree and the slim node
        tree returned by ``link_find`` (children replaced by their fetched
        nodes), and the ``sort_doc`` index of every link URI collected during
        this call.
    """
    # link_box is shared module state; reset it so the index only contains
    # links from this crawl rather than from every previous call.
    link_box.clear()
    link1,link2={},{}
    if url:
        link1,link2=link_find(url)
        _expand_children(link1,link2,min(level,3)-1)
    uri_key=sort_doc(link_box)
    return link1,link2,uri_key
def sitemap_OG(url,level):
    """Earlier crawler variant returning a flat list of link entries.

    Each entry is a dict describing one link found on ``url``; for
    ``level >= 2`` its ``TREE`` holds the entries found on the linked page,
    and for ``level >= 3`` those entries are expanded once more.

    Args:
        url: Start URL.  An empty value or ``None`` yields an empty list.
        level: Crawl depth (1, 2, or 3 and above).

    Returns:
        list of link-entry dicts for the start page.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    from urllib.parse import urljoin

    if not url:
        return []
    # link_find returns a (full node, slim node) pair; the link entries this
    # function works with are the children of the full node.
    link1=link_find(url)[0]['TREE']
    if level >=2:
        for i,ea in enumerate(link1):
            print(ea)
            try:
                # urljoin leaves absolute URLs untouched and resolves relative ones.
                child_url=urljoin(url,ea['URL'])
                link1[i]['TREE']=link_find(child_url)[0]['TREE']
                if level>=3:
                    for n,na in enumerate(link1[i]['TREE']):
                        print(na)
                        try:
                            grandchild_url=urljoin(child_url,na['URL'])
                            link1[i]['TREE'][n]['TREE']=link_find(grandchild_url)[0]['TREE']
                        except Exception as e:
                            # Best effort: skip links that cannot be fetched.
                            print (e)
            except Exception as e:
                print (e)
    return link1
# Gradio UI: URL and depth inputs on the left with the key index and full
# tree below them, the slim tree in a narrower column on the right.
with gr.Blocks(theme="Nymbo/Alyx_Theme") as app:
    with gr.Row():
        with gr.Column(scale=3):
            with gr.Row():
                inp=gr.Textbox(label="URL")
                level=gr.Slider(minimum=1,maximum=2,step=1,value=1)
            btn=gr.Button()
            key_json=gr.JSON()
            outp=gr.JSON()
        with gr.Column(scale=1):
            outmap=gr.JSON()
    # sitemap returns (full tree, slim tree, key index), in that output order.
    btn.click(sitemap,[inp,level],[outp,outmap,key_json])
app.launch()