import gradio as gr
import os

import requests
from bs4 import BeautifulSoup
from googlesearch import search
import groq

# The Groq API key is read from the environment.
api_key = os.getenv('groq')
client = groq.Client(api_key=api_key)

# Target schema for the contact details; included in the prompt so the model
# knows the expected shape of the JSON it should return.
json_schema = """
{
    "name": "",
    "email": "",
    "phone": "",
    "ort": ""
}
"""


def llm(message):
    """Ask the Groq-hosted model to extract contact details as JSON."""
    prompt = (
        "Return a JSON object with contact details.\n"
        f"Fill in the available values, which can be found here:\n{message}\n"
        f"Use this schema:\n{json_schema}\n"
        "Return valid JSON only."
    )
    try:
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error in response generation: {str(e)}"


def list_of_clubs(ort):
    """Collect the names of all clubs listed for a Bavarian town on vereine-in-deutschland.net."""
    base_url = "https://vereine-in-deutschland.net"
    all_links_text = []
    initial_url = f"{base_url}/vereine/Bayern/{ort}"

    try:
        response = requests.get(initial_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Determine the last page from the pagination widget; fall back to 10.
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
        last_page = 10
        if link_element and 'href' in link_element.attrs:
            href = link_element['href']
            last_page = int(href.split('/')[-1])

        # Loop through all pages and collect the link texts (club names).
        for page_number in range(1, last_page + 1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            target_div = soup.select_one('div.row-cols-1:nth-child(4)')
            if target_div:
                texts = [a.text for a in target_div.find_all('a', href=True)]
                all_links_text.extend(texts)
            else:
                print(f"Target div not found on page {page_number}")
    except Exception as e:
        print(f"Error while collecting club names: {e}")
        return []

    # The target div appears to yield two links per club, so keep every other entry.
    all_links_text = all_links_text[0::2]
    return all_links_text


def google_search(query):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    # Run the search and take the first result.
    url = next(iter(search(query, num_results=1)), None)
    if url is None:
        return ""
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    body = soup.find('body')
    return body.text.strip() if body else ""


def process_ort(ort):
    """For each club in the given town, fetch the Google results page for its Impressum."""
    links_text = list_of_clubs(ort)
    vereine = []

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    for verein in links_text:
        # Let requests URL-encode the query instead of interpolating it with spaces.
        response = requests.get(
            "https://www.google.com/search",
            params={"q": f"impressum {verein}"},
            headers=headers,
        )
        soup = BeautifulSoup(response.content, 'html.parser')
        impressum_div = soup.find('body')
        if impressum_div is None:
            continue
        contact_details = impressum_div.text
        # contact_details = llm(contact_details)  # optionally extract structured JSON
        vereine.append(contact_details)

    return vereine
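
# A minimal sketch of turning the per-club JSON strings into a table, assuming
# llm() returns one valid JSON string per club that matches json_schema.
# results_to_dataframe is a hypothetical helper and is not wired into the app below.
import json

import pandas as pd


def results_to_dataframe(json_strings):
    """Parse a list of JSON strings into a DataFrame, skipping unparseable entries."""
    records = []
    for s in json_strings:
        try:
            records.append(json.loads(s))
        except (json.JSONDecodeError, TypeError):
            # The model occasionally returns prose instead of valid JSON.
            continue
    return pd.DataFrame(records, columns=["name", "email", "phone", "ort"])
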
demo = gr.Interface(
    fn=process_ort,
    inputs=gr.Textbox(lines=1, placeholder="Enter your search query..."),
    outputs="text",
    title="google websearch",
    description="Enter a search query...",
)

demo.launch()
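
# Example usage when experimenting outside the Gradio UI ("München" is just a
# placeholder; any municipality listed under Bayern on the site should work):
#   clubs = list_of_clubs("München")
#   raw_pages = process_ort("München")
#   df = results_to_dataframe([llm(page) for page in raw_pages])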