File size: 2,386 Bytes
ae4e988
 
 
 
437cf54
 
 
 
 
 
 
 
 
 
75dab34
 
437cf54
 
 
 
62f0b09
437cf54
c30642b
437cf54
f0e1870
b962252
6660108
437cf54
f0e1870
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e343b9b
ae4e988
3b44741
 
 
ae4e988
 
f0e1870
ae4e988
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import gradio as gr
import requests
import bs4

def link_find(url):
    """Fetch *url* and return one dict per ``<a>`` tag found in the page.

    Each dict has keys:
      - "LINK TITLE": the anchor's ``title`` attribute (or None)
      - "URL":        the anchor's ``href`` attribute (or None)
      - "STRING":     the anchor's text content (or None)
      - "TREE":       an empty list, filled in later by sitemap()

    Returns an empty list when the response status is not 200.
    Raises requests.RequestException on network failure (callers in
    sitemap() wrap recursive calls in try/except).
    """
    out = []
    # timeout prevents a dead host from hanging the whole crawl
    source = requests.get(url, timeout=10)
    if source.status_code == 200:
        # html.parser is stdlib-only; avoids requiring lxml
        soup = bs4.BeautifulSoup(source.content, 'html.parser')
        for anchor in soup.find_all("a"):
            out.append({
                "LINK TITLE": anchor.get('title'),
                "URL": anchor.get('href'),
                "STRING": anchor.string,
                "TREE": [],
            })
    return out
#https://huggingface.co/spaces/Omnibus/crawl

def _absolute(base_url, href):
    """Return *href* as an absolute URL.

    Already-absolute hrefs (starting with "http") are returned unchanged;
    relative hrefs are prefixed with the scheme://host of *base_url*.
    """
    if href.startswith("http"):
        return href
    scheme = base_url.split("//")[0]          # e.g. "https:"
    host = base_url.split("//")[1].split("/")[0]  # e.g. "example.com"
    return f'{scheme}//{host}{href}'


def sitemap(url, level):
    """Crawl *url* and return its links as a nested list of dicts.

    level 1: just the links on the page.
    level 2: each link's "TREE" is filled with that page's links.
    level 3: one level deeper again.

    Returns [] for an empty/None url (the original raised
    UnboundLocalError here). Failures on individual sub-pages are
    printed and skipped, never aborting the whole crawl.
    """
    link1 = []
    if url:
        link1 = link_find(url)
        if level >= 2:
            for ea in link1:
                try:
                    # Original prepended a stale base even to absolute
                    # hrefs; _absolute() only prefixes relative ones.
                    ea['TREE'] = link_find(_absolute(url, ea['URL']))
                    if level >= 3:
                        for na in ea['TREE']:
                            try:
                                na['TREE'] = link_find(_absolute(url, na['URL']))
                            except Exception as e:
                                print(e)
                except Exception as e:
                    print(e)
    return link1
# Minimal Gradio UI: a URL box, a crawl-depth slider, and a button that
# feeds both into sitemap() and renders the result as JSON.
with gr.Blocks() as app:
    with gr.Row():
        url_box = gr.Textbox(label="URL")
        depth_slider = gr.Slider(minimum=1, maximum=3, step=1, value=2)
    crawl_btn = gr.Button()
    result_view = gr.JSON()
    crawl_btn.click(fn=sitemap, inputs=[url_box, depth_slider], outputs=result_view)
app.launch()