"""Small Gradio app that downloads a web page and inspects its tags with BeautifulSoup."""

import urllib.request

import bs4
import gradio as gr
import lxml  # noqa: F401 -- not referenced directly; bs4 is asked for the 'lxml' parser below
import requests


def _fetch_soup(url):
    """Download *url* with urllib and parse the body using the 'lxml' parser."""
    # NOTE(review): `url` comes straight from a UI textbox and urlopen() also
    # accepts non-HTTP schemes (e.g. file://). Consider restricting the scheme
    # before exposing this app to untrusted users.
    with urllib.request.urlopen(url) as response:
        # The context manager closes the connection even if read() fails.
        source = response.read()
    return bs4.BeautifulSoup(source, 'lxml')


def find_all(url, q=None, num=None):
    """Return the names of all tags found in the page at *url*.

    Args:
        url: Address of the page to download.
        q: Unused; accepted because the UI passes the same inputs as for
            ``find_it``.
        num: Unused; accepted for the same reason as ``q``.

    Returns:
        A one-element list whose only item is the list of tag names.
    """
    soup = _fetch_soup(url)

    # Debug output about the page title. Pages without a <title> used to
    # crash here with AttributeError, so the prints are guarded.
    title = soup.title
    if title is not None:
        print(title)
        print(title.name)
        print(title.string)
        print(title.parent.name)

    tag_names = [tag.name for tag in soup.find_all()]
    print(tag_names)
    return [tag_names]


def find_it(url, q=None, num=None):
    """Collect details about every ``q`` tag in the page at *url*.

    Args:
        url: Address of the page to download.
        q: Tag name to search for. ``None`` or an empty string is searched
            as an empty tag name.
        num: Optional attribute name; when given, that attribute's value is
            reported as ``"additional"`` for each matching tag.

    Returns:
        A tuple ``(out, out_l)``. ``out`` holds one single-element list per
        matching tag, wrapping a dict with the keys ``q``, ``"additional"``,
        ``"parent"``, ``"previous"``, ``"first-child"`` and ``"content"``.
        ``out_l`` holds, per matching tag, its string if it has one,
        otherwise the ``"additional"`` value.
    """
    soup = _fetch_soup(url)
    # Avoid formatting None into the literal tag name 'None'.
    tag_name = str(q) if q else ""

    out = []
    out_l = []
    for p in soup.find_all(tag_name):
        # Only look an attribute up when a name was actually supplied
        # (the old `num != ""` test also let None through).
        additional = p.get(str(num)) if num else ""
        text = None if p.string is None else str(p.string)
        previous = p.previous_element
        out.append([{
            tag_name: text,
            "additional": additional,
            "parent": p.parent.name,
            # Iterating the previous element keeps the original output shape;
            # items are stringified so the value is plain JSON data.
            "previous": [] if previous is None else [str(b) for b in previous],
            # getattr() tolerates children that carry no ``name`` attribute.
            "first-child": [getattr(child, "name", None) for child in p.children],
            "content": str(p),
        }])
        out_l.append(text if text is not None else additional)

    # Console log of every link target on the page.
    for link in soup.find_all('a'):
        print(link.get('href'))
    return out, out_l


def find_it2(url):
    """Return the text of every link in the page at *url*, fetched via requests.

    Args:
        url: Address of the page to download.

    Returns:
        A string starting with ``'URL Links:\\n'`` followed by one link text
        per line, or the ``requests`` exception object if the download failed.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.RequestException as e:
        print(e)
        return e
    soup = bs4.BeautifulSoup(response.content, 'lxml')
    # The header is a prefix, not the separator between link texts.
    return 'URL Links:\n' + '\n'.join(p.text for p in soup.find_all('a'))


# UI: URL / tag name / attribute name inputs, two action buttons, three outputs.
with gr.Blocks() as app:
    with gr.Row():
        with gr.Column(scale=1):
            inp = gr.Textbox()
        with gr.Column(scale=2):
            q = gr.Textbox(value="p")
        with gr.Column(scale=2):
            num = gr.Textbox()
    with gr.Row():
        all_btn = gr.Button("Load")
        find_btn = gr.Button("Find")
    with gr.Row():
        rawp = gr.JSON()
        outp = gr.JSON()
        outl = gr.Textbox()
    all_btn.click(find_all, [inp, q, num], [rawp])
    find_btn.click(find_it, [inp, q, num], [outp, outl])
app.launch()