Spaces:

malteos
/

seed-crawl-annotator

Sleeping

App Files Files Community

malte.ostendorff@telekom.de committed on Dec 22, 2024

Commit

323a625

1 Parent(s): 23900aa

dataset integration

Browse files

Files changed (3) hide show

app.py +250 -124
requirements.txt +6 -0
texts.py +68 -0

app.py CHANGED Viewed

@@ -1,22 +1,67 @@
 from __future__ import annotations
 import os
 import random
 import time
 import gradio as gr
 from selenium import webdriver
 from selenium.common.exceptions import WebDriverException
 from PIL import Image
 from io import BytesIO
 import base64
 import trafilatura
 from huggingface_hub import whoami
 from languages import ISO_CODE_TO_LANGUAGE_NAME
-OFFLINE = os.environ.get("OFFLINE", False)
 def get_candidate_urls():
     return [
@@ -46,7 +91,7 @@ def fetch_screenshot_and_text_from_url(url):
     height = 350
     text = ""
-    if OFFLINE:
         screenshot = Image.new('RGB', (350, height))
         text = f"Some dummy text for {url} (offline mode enabled)"
@@ -103,143 +148,224 @@ with gr.Blocks(fill_height=True) as demo:
     # Seed Crawl Annotator
     """)
-    profile_state = gr.State([])
-    gr.LoginButton()
-    with gr.Column(visible=False) as wrapper_col:
-        login_status = gr.Markdown("no")
-        def handle_login(profile: gr.OAuthProfile | None) -> dict:
-            if profile:
-                gr.Info(f"Logged in as {profile.username}")
-                return {
-                    profile_state: f"{profile.username}",
-                    wrapper_col: gr.update(visible=True),
-                    login_status:  "yes",
-                }
-            else:
-                gr.Warning(f"You need to login to use this app.")
-                return {
-                    profile_state: [],
-                    wrapper_col: gr.update(visible=False),
-                    login_status:  "no",
-                }
-        demo.load(handle_login, inputs=None, outputs=[profile_state, wrapper_col, login_status])
-        url_field = gr.Textbox(label="Website URL", placeholder="Enter a URL you want to annotate", interactive=True)
-        with gr.Row():
-            set_random_btn = gr.Button("Set Random URL", variant="secondary", interactive=True)
-            load_btn = gr.Button("Annotate URL", variant="primary", interactive=True)
-        with gr.Row():
-            extracted_text = gr.Textbox(
-                label="Extracted text",
-                max_lines=15,
-                lines=15,
-                visible=True,
-                placeholder="Click on `Load URL` to fetch Web page's text content."
-            )
-            screenshot_scrollable = gr.HTML("", visible=False)
-        with gr.Column(visible=False) as output_col:
             with gr.Row():
-                language_codes = gr.Dropdown(
-                        [("unknown", "unknown")] + [(f"{code}: {name}", code) for code, name in ISO_CODE_TO_LANGUAGE_NAME.items()],
-                        label="Language codes",
-                        multiselect=True,
-                        # allow_custom_value=True,
                 )
-                categories = gr.CheckboxGroup(["News", "Culture/History", "Government", "Political Parties", "Other"], label="Categories")
-            with gr.Row():
-                do_crawl_btn = gr.Button("✅ Do Crawl", elem_classes="success")
-                dont_crawl_btn = gr.Button("❌ Don't Crawl", elem_classes="error")
-                # random_subpage_btn = gr.Button("🔁 Load Another Subpage", variant="secondary")
-        def set_random_url():
-            candidate_urls = get_candidate_urls()
-            selected_url = random.choice(candidate_urls)
-            return selected_url
-        set_random_btn.click(fn=set_random_url, outputs=url_field)
-        def load_url(url):
-            screenshot_html_str, text = fetch_screenshot_and_text_from_url(url)
-            if not screenshot_html_str or not text:
-                gr.Error("Could not fetch data for url")
-            else:
-                return {
-                    screenshot_scrollable: gr.update(value=screenshot_html_str, visible=True),
-                    extracted_text:  gr.update(value=text, visible=True),
-                    output_col: gr.update(visible=True),
-                    language_codes: "unknown", # Reset by set to invalid value # gr.update(None, label=url),
-                    categories:  gr.update(value=None),
-                }
-        load_btn.click(fn=load_url, inputs=url_field, outputs=[screenshot_scrollable, extracted_text, output_col, language_codes, categories], api_name="load_url")
-        def do_crawl(profile_state, url, language_codes, categories, do_crawl=True):
-            if profile_state:
-                # html_str = f"<b>Thanks {profile_state}, we have saved your feedback!</b>"
-                gr.Info("✅ Thanks for your feedback")
-            else:
-                gr.Error("❌ Feedback could not be saved")
-                # html_str = f"<b>Feedback could not be saved.</b> You are not authenticated."
-            return {
-                url_field: "",
-                output_col: gr.update(visible=False),
-                extracted_text: gr.update(value=None, visible=True),
-                screenshot_scrollable: gr.update(value="", visible=False),
-            }
-        # def do_crawl(profile_state, url, language_codes, categories):
-        #     return do_crawl_or_not(profile_state, url, language_codes, categories, do_crawl=True)
-        # def dont_crawl(profile_state, url, language_codes, categories):
-        #     return do_crawl_or_not(profile_state, url, language_codes, categories, do_crawl=False)
-        do_crawl_btn.click(
-            fn=do_crawl,
-            inputs=[profile_state, url_field, language_codes, categories],
-            outputs=[
-                url_field,
-                output_col,
-                extracted_text,
-                screenshot_scrollable
-            ],
-            api_name="do_crawl",
-        )
-        dont_crawl_btn.click(
-            fn=do_crawl,
-            inputs=[profile_state, url_field, language_codes, categories],
-            outputs=[
-                url_field,
-                output_col,
-                extracted_text,
-                screenshot_scrollable
-            ],
-            api_name="do_crawl",
-        )
-        # dont_crawl_btn.click(fn=dont_crawl, inputs=[profile_state, url, language_codes, categories], outputs=[url, output_col, extracted_text, screenshot_scrollable], api_name="dont_crawl")
-        # def random_subpage(url):
-        #     new_url = "http://example.com"
-        #     return [new_url, *fetch_screenshot_and_text_from_url(new_url)]
-        # random_subpage_btn.click(fn=random_subpage, inputs=url, outputs=[url, screenshot_scrollable, extracted_text, output_col], api_name="load_random_subpage")

 from __future__ import annotations
+import json
 import os
 import random
 import time
 import gradio as gr
+import pandas as pd
 from selenium import webdriver
 from selenium.common.exceptions import WebDriverException
 from PIL import Image
 from io import BytesIO
 import base64
+from datetime import datetime
+from pathlib import Path
+from uuid import uuid4
 import trafilatura
+from datasets import load_dataset
+from datasets import Features, Value, Sequence
+from huggingface_hub import CommitScheduler
 from huggingface_hub import whoami
 from languages import ISO_CODE_TO_LANGUAGE_NAME
+from texts import ABOUT_TEXT
+DISABLE_FETCH_URL = os.environ.get("DISABLE_FETCH_URL", False)
+if DISABLE_FETCH_URL:
+    print("Fetch URL is disabled: Only dummy screenshot and text will be returned.")
+DATASET_REPO_ID = os.environ.get("DATASET_REPO_ID", "malteos/seed-crawl-urls")
+JSON_DATASET_DIR = Path("jsonl_dataset")
+JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
+# Each instance of this space will spawn a unique file for each type of result
+# For the life of that space, it will append to that file pushed to a dataset every so often
+# It also is append_only, so no previous data will be overwritten
+JSON_DATASET_PATH = JSON_DATASET_DIR / f"urls-{uuid4()}.jsonl"
+if os.getenv("HF_TOKEN"):
+    scheduler = CommitScheduler(
+        repo_id=DATASET_REPO_ID,
+        repo_type="dataset",
+        folder_path=JSON_DATASET_DIR,
+        path_in_repo="data",
+    )
+else:
+    scheduler = None
+    print("No HF_TOKEN found, results will not be uploaded to the hub.")
+def save_to_jsonl(obj: dict) -> None:
+    if scheduler:
+        with scheduler.lock:
+            with JSON_DATASET_PATH.open("a") as f:
+                json.dump(obj, f)
+                f.write("\n")
 def get_candidate_urls():
     return [
     height = 350
     text = ""
+    if DISABLE_FETCH_URL:
         screenshot = Image.new('RGB', (350, height))
         text = f"Some dummy text for {url} (offline mode enabled)"
     # Seed Crawl Annotator
     """)
+    with gr.Tab("Contribute"):
+        gr.Markdown("Welcome! This is a crowd-sourced effort to improve crawling of low-resource languages. Your contributions will be part of a public dataset.")
+        profile_state = gr.State([])
+        gr.LoginButton()
+        with gr.Column(visible=False) as wrapper_col:
+            login_status = gr.Markdown("no", visible=False)
+            def handle_login(profile: gr.OAuthProfile | None) -> dict:
+                if profile:
+                    gr.Info(f"Logged in as {profile.username}")
+                    return {
+                        profile_state: f"{profile.username}",
+                        wrapper_col: gr.update(visible=True),
+                        login_status:  "yes",
+                    }
+                else:
+                    gr.Warning(f"You need to login to use this app.")
+                    return {
+                        profile_state: [],
+                        wrapper_col: gr.update(visible=False),
+                        login_status:  "no",
+                    }
+            demo.load(handle_login, inputs=None, outputs=[profile_state, wrapper_col, login_status])
+            url_field = gr.Textbox(label="Website URL", placeholder="Enter a URL you want to annotate", interactive=True)
+            with gr.Row():
+                set_random_btn = gr.Button("Pick Random URL", variant="secondary", interactive=True)
+                load_btn = gr.Button("Annotate URL", variant="primary", interactive=True)
             with gr.Row():
+                extracted_text = gr.Textbox(
+                    label="Extracted text",
+                    max_lines=15,
+                    lines=15,
+                    visible=True,
+                    placeholder="Click on `Load URL` to fetch Web page's text content."
                 )
+                screenshot_scrollable = gr.HTML("", visible=False)
+            with gr.Column(visible=False) as output_col:
+                with gr.Row():
+                    language_codes = gr.Dropdown(
+                            [("unknown", "unknown")] + [(f"{code}: {name}", code) for code, name in ISO_CODE_TO_LANGUAGE_NAME.items()],
+                            label="Language codes",
+                            multiselect=True,
+                            # allow_custom_value=True,
+                    )
+                    categories = gr.CheckboxGroup(["News", "Culture/History", "Government", "Political Parties", "Other"], label="Categories")
+                with gr.Row():
+                    do_crawl_btn = gr.Button("✅ Do Crawl", elem_classes="success")
+                    dont_crawl_btn = gr.Button("❌ Don't Crawl", elem_classes="error")
+                    # random_subpage_btn = gr.Button("🔁 Load Another Subpage", variant="secondary")
+            def set_random_url():
+                candidate_urls = get_candidate_urls()
+                selected_url = random.choice(candidate_urls)
+                return selected_url
+            set_random_btn.click(fn=set_random_url, outputs=url_field)
+            def load_url(url):
+                screenshot_html_str, text = fetch_screenshot_and_text_from_url(url)
+                if not screenshot_html_str or not text:
+                    gr.Error("Could not fetch data for url")
+                else:
+                    return {
+                        screenshot_scrollable: gr.update(value=screenshot_html_str, visible=True),
+                        extracted_text:  gr.update(value=text, visible=True),
+                        output_col: gr.update(visible=True),
+                        language_codes: "unknown", # Reset by set to invalid value # gr.update(None, label=url),
+                        categories:  gr.update(value=None),
+                    }
+            load_btn.click(fn=load_url, inputs=url_field, outputs=[screenshot_scrollable, extracted_text, output_col, language_codes, categories], api_name="load_url")
+            def do_crawl_error_handler(msg):
+                # error response
+                print("error -> no changes")
+                gr.Warning(f"❌ Error: {msg}")
+                return {
+                    url_field: gr.update(),
+                    output_col: gr.update(),
+                    extracted_text: gr.update(),
+                    screenshot_scrollable: gr.update(),
+                }
+            def do_crawl(profile_state, url, language_codes, categories, do_crawl=True):
+                print(f"{url=}")
+                print(f"{language_codes=}")
+                print(f"{categories=}")
+                print(f"{do_crawl=}")
+                if not profile_state:
+                    return do_crawl_error_handler("You are not authenticated.")
+                elif len(url) <= 0:
+                    return do_crawl_error_handler("URL is empty.")
+                elif len(categories) <= 0:
+                    return do_crawl_error_handler("You must select at least one category.")
+                elif len(language_codes) <= 0:
+                    return do_crawl_error_handler("You must select at least one language.")
+                else:
+                    #
+                    save_to_jsonl({
+                        "url": url,
+                        "language_codes": language_codes,
+                        "categories": categories,
+                        "do_crawl": int(do_crawl),
+                        "username": profile_state,
+                        "submission_datetime":  datetime.now().isoformat(),
+                    })
+                    # html_str = f"<b>Thanks {profile_state}, we have saved your feedback!</b>"
+                    gr.Info("✅ Thanks for your feedback. Let's continue!")
+                    return {
+                        url_field: "",  # TODO fetch new url
+                        output_col: gr.update(visible=False),
+                        extracted_text: gr.update(value=None, visible=True),
+                        screenshot_scrollable: gr.update(value="", visible=False),
+                    }
+            # def do_crawl(profile_state, url, language_codes, categories):
+            #     return do_crawl_or_not(profile_state, url, language_codes, categories, do_crawl=True)
+            # def dont_crawl(profile_state, url, language_codes, categories):
+            #     return do_crawl_or_not(profile_state, url, language_codes, categories, do_crawl=False)
+            do_crawl_btn.click(
+                fn=do_crawl,
+                inputs=[profile_state, url_field, language_codes, categories],
+                outputs=[
+                    url_field,
+                    output_col,
+                    extracted_text,
+                    screenshot_scrollable
+                ],
+                api_name="do_crawl",
+            )
+            dont_crawl_btn.click(
+                fn=do_crawl,
+                inputs=[profile_state, url_field, language_codes, categories],
+                outputs=[
+                    url_field,
+                    output_col,
+                    extracted_text,
+                    screenshot_scrollable
+                ],
+                api_name="do_crawl",
+            )
+            # dont_crawl_btn.click(fn=dont_crawl, inputs=[profile_state, url, language_codes, categories], outputs=[url, output_col, extracted_text, screenshot_scrollable], api_name="dont_crawl")
+            # def random_subpage(url):
+            #     new_url = "http://example.com"
+            #     return [new_url, *fetch_screenshot_and_text_from_url(new_url)]
+            # random_subpage_btn.click(fn=random_subpage, inputs=url, outputs=[url, screenshot_scrollable, extracted_text, output_col], api_name="load_random_subpage")
+    with gr.Tab("Browse Contributions"):
+        gr.Markdown("This page lists all the data we have collected so far. Please note that the list might be out-of-sync.")
+        """
+        dataset_info:
+        - config_name: base
+            features:
+            - name: url
+                dtype: string
+            - name: language_codes
+                list: string
+            - name: categories
+                list: string
+            - name: do_crawl
+                dtype: int32
+            - name: username
+                dtype: string
+            - name: submission_datetime
+                dtype: string
+        """
+        features = Features({
+            "url": Value("string"),
+            "language_codes": Sequence(Value(dtype="string")),
+            "categories": Sequence(Value(dtype="string")),
+            "do_crawl": Value("int32"),
+            "username": Value("string"),
+            "submission_datetime": Value("string"),
+        })
+        try:
+            ds = load_dataset(DATASET_REPO_ID, data_files={"train": "data/*.jsonl"}, features=features)
+            df = ds["train"].to_pandas()
+            gr.Dataframe(df)
+        except ValueError as e:
+            print(e)
+            gr.Markdown("> Error: Dataset cannot be loaded.")
+    with gr.Tab("About"):
+        gr.Markdown(ABOUT_TEXT)

requirements.txt CHANGED Viewed

@@ -11,3 +11,9 @@ trafilatura==2.0.0
 # ImportError: lxml.html.clean module is now a separate project lxml_html_clean.
 lxml_html_clean

 # ImportError: lxml.html.clean module is now a separate project lxml_html_clean.
 lxml_html_clean
+datasets
+huggingface-hub>=0.19
+hf-transfer>=0.1.4
+# protobuf<4
+# click<8.1
+# pydantic~=1.0

texts.py ADDED Viewed

	@@ -0,0 +1,68 @@

+ABOUT_TEXT = """
+## Web Languages Project
+Welcome! This is a crowd-sourced effort to improve crawling
+of low-resource languages. This dataset is public.
+[Common Crawl](https://commoncrawl.github.io/cc-crawl-statistics/plots/languages)
+recognizes a lot of languages, and we can see that we don't have
+enough of languages like Hindi (500 million speakers!), smaller
+country languages like Hungarian, and regional languages like Catalan.
+We are interested in languages from all over the world. If you choose
+to help, you'll be helping create lists of websites related to
+languages that you read or speak.
+### How can I contribute?
+If you look below you'll see a huge list of living languages. If you
+see one that looks interesting, click on it. You'll see a
+language-specific document, probably mostly blank, that you can fill
+out.
+There are 2 ways to add to this document. If you aren't very familiar
+with Github, you can copy the entire document into an email, fill it
+out, and send it to web-languages ZAT commoncrawl ZOT org. We'll do the rest.
+If you are familiar with Github, and are logged in, click on the pen
+icon in the upper right corner to start editing the document.
+Github will request that you fork the repo. Do that, edit the
+document, and finally create a pull request.
+To see a partially completed example, look at the
+[Welsh](living/welsh.md) entry.
+Sometimes asking a Large Language Model can be helpful: "What are some
+top websites written in the Welsh language?"
+### What kind of websites are you looking for?
+If you look at the template, we have requested urls in a few
+categories: News, Culture/History, Government, Political Parties, and
+Other. Remember that we're looking for websites in this particular
+language. If the language is only a part of the website, and that's
+visible in the URL as https://example.com/catalan/, then that's the
+URL you should add.
+For a language like Hindi, with 500 million speakers, there are a lot
+of websites to choose from. Please suggest websites that are important
+and influential, and please think about diversity. Are all geographic
+regions represented?
+For a country-wide language like Hungarian, there are still probably a
+wide variety of websites to choose from. If a website is all English,
+however, that's not what we're looking for.
+For a regional language like Catalan, things are trickier. Catalan has
+multiple names -- it's called Valencian in some parts of Spain -- and
+use of the Catalan language is a part of a vigorous debate in Spanish
+national and regional politics. You might not be able to find
+Catalan-language content for every political party, and government
+websites might offer Catalan content one day and remove it after
+the next election. In that case, please do the best you can.
+If your favorite language has its own Wikipedia -- [check the list here](https://en.wikipedia.org/wiki/List_of_Wikipedias) --
+please include this link under "Other".
+"""