mgokg commited on
Commit
6c2fae1
·
verified ·
1 Parent(s): 9e7dfc2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -0
app.py CHANGED
@@ -26,6 +26,52 @@ def llm(message):
26
  except Exception as e:
27
  return f"Error in response generation: {str(e)}"
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def google_search(query):
30
  headers = {
31
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
@@ -42,6 +88,10 @@ def google_search(query):
42
 
43
  return first_div.text.strip()
44
 
 
 
 
 
45
  demo = gr.Interface(
46
  fn=google_search,
47
  inputs=gr.Textbox(lines=1, placeholder="Geben Sie Ihre Suchanfrage ein..."),
 
26
  except Exception as e:
27
  return f"Error in response generation: {str(e)}"
28
 
29
def list_of_clubs(ort):
    """Scrape club names for a Bavarian town from vereine-in-deutschland.net.

    Walks every pagination page under ``/vereine/Bayern/<ort>`` and collects
    the anchor texts found in the listing container on each page.

    Args:
        ort: Town name exactly as it appears in the site's URL path.

    Returns:
        On success: a list of club-name strings (possibly empty).
        On failure: an ``(error_message, [])`` tuple — inconsistent with the
        success path's list, but preserved for backward compatibility with
        existing callers.
    """
    base_url = "https://vereine-in-deutschland.net"
    all_links_text = []
    initial_url = f"{base_url}/vereine/Bayern/{ort}"

    try:
        # Timeout prevents the request from hanging indefinitely.
        response = requests.get(initial_url, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Determine the last page from the pagination widget; fall back to
        # 10 pages when the selector misses or the href is not numeric.
        last_page = 10
        link_element = soup.select_one('li.page-item:nth-child(8) > a:nth-child(1)')
        if link_element and 'href' in link_element.attrs:
            try:
                last_page = int(link_element['href'].split('/')[-1])
            except ValueError:
                pass  # non-numeric href — keep the default page count

        # Loop through all pages and collect the link texts.
        for page_number in range(1, last_page + 1):
            page_url = f"{base_url}/vereine/Bayern/{ort}/p/{page_number}"
            response = requests.get(page_url, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            target_div = soup.select_one('div.row-cols-1:nth-child(4)')

            if target_div:
                texts = [a.text for a in target_div.find_all('a', href=True)]
                all_links_text.extend(texts)
            else:
                print(f"Target div not found on page {page_number}")

    except Exception as e:
        # NOTE(review): returning a tuple here while the success path returns
        # a list is inconsistent; kept as-is so existing callers don't break.
        return str(e), []

    # Presumably each club contributes two anchors (name + detail link), so
    # every other entry is kept — TODO confirm against the live markup.
    all_links_text = all_links_text[0::2]

    return all_links_text
68
+
69
def extract_vereinsname(url):
    """Derive a readable club name from a club-detail URL.

    Takes the final path segment of *url* and turns its dashes into spaces.
    """
    slug = url.rsplit('/', 1)[-1]
    return slug.replace("-", " ")
74
+
75
  def google_search(query):
76
  headers = {
77
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
 
88
 
89
  return first_div.text.strip()
90
 
91
def process_ort(ort):
    """Gradio handler: forward *ort* to list_of_clubs and return its result."""
    return list_of_clubs(ort)
94
+
95
  demo = gr.Interface(
96
  fn=google_search,
97
  inputs=gr.Textbox(lines=1, placeholder="Geben Sie Ihre Suchanfrage ein..."),