malte.ostendorff@telekom.de committed on
Commit
b0a0635
·
1 Parent(s): e3a23db

update deps

Browse files
Files changed (3) hide show
  1. README.md +11 -1
  2. app.py +21 -15
  3. requirements.txt +10 -2
README.md CHANGED
@@ -12,6 +12,11 @@ license: mit
12
 
13
  # Annotate Web Languages
14
 
 
 
 
 
 
15
  ## Usage
16
 
17
  Run the Gradio app
@@ -30,8 +35,13 @@ conda activate seed-crawl-annotator
30
  pip install -r requirements.txt
31
  ```
32
 
33
-
34
  Build Docker image
35
  ```bash
36
  docker build -t seed-crawl-annotator .
 
 
 
 
 
 
37
  ```
 
12
 
13
  # Annotate Web Languages
14
 
15
+ ## Requirements
16
+
17
+ - Python 3.10 (same as HF space)
18
+ - Chromium
19
+
20
  ## Usage
21
 
22
  Run the Gradio app
 
35
  pip install -r requirements.txt
36
  ```
37
 
 
38
  Build Docker image
39
  ```bash
40
  docker build -t seed-crawl-annotator .
41
+ ```
42
+
43
+ Run the app within a container
44
+ ```bash
45
+ docker run --rm -p 7860:7860 -e HF_TOKEN=$HF_TOKEN seed-crawl-annotator gradio app.py
46
+
47
  ```
app.py CHANGED
@@ -18,6 +18,16 @@ from languages import ISO_CODE_TO_LANGUAGE_NAME
18
 
19
  OFFLINE = os.environ.get("OFFLINE", False)
20
 
 
 
 
 
 
 
 
 
 
 
21
  def pil_image_to_base64(image):
22
  # Save the image to a BytesIO buffer
23
  buffer = BytesIO()
@@ -96,23 +106,26 @@ with gr.Blocks(fill_height=True) as demo:
96
  profile_state = gr.State([])
97
  gr.LoginButton()
98
 
99
-
100
  with gr.Column(visible=False) as wrapper_col:
 
 
101
  def handle_login(profile: gr.OAuthProfile | None) -> dict:
102
  if profile:
103
  gr.Info(f"Logged in as {profile.username}")
104
  return {
105
  profile_state: f"{profile.username}",
106
  wrapper_col: gr.update(visible=True),
 
107
  }
108
  else:
109
  gr.Warning(f"You need to login to use this app.")
110
  return {
111
- profile_state: None,
112
  wrapper_col: gr.update(visible=False),
 
113
  }
114
 
115
- demo.load(handle_login, inputs=None, outputs=[profile_state, wrapper_col])
116
 
117
  url_field = gr.Textbox(label="Website URL", placeholder="Enter a URL you want to annotate", interactive=True)
118
 
@@ -149,14 +162,7 @@ with gr.Blocks(fill_height=True) as demo:
149
 
150
 
151
  def set_random_url():
152
- candidate_urls = [
153
- "http://example.com",
154
- "https://wikipedia.org/",
155
- "https://occiglot.eu",
156
- "https://ostendorff.org",
157
- "https://fr.wikipedia.org/",
158
- "https://amazon.com/"
159
- ]
160
  selected_url = random.choice(candidate_urls)
161
  return selected_url
162
 
@@ -183,11 +189,11 @@ with gr.Blocks(fill_height=True) as demo:
183
  def do_crawl(profile_state, url, language_codes, categories, do_crawl=True):
184
 
185
  if profile_state:
186
- html_str = f"<b>Thanks {profile_state}, we have saved your feedback!</b>"
187
- gr.Info("Thanks for your feedback")
188
  else:
189
- gr.Error("Feedback could not be saved")
190
- html_str = f"<b>Feedback could not be saved.</b> You are not authenticated."
191
 
192
  return {
193
  url_field: "",
 
18
 
19
  OFFLINE = os.environ.get("OFFLINE", False)
20
 
21
+ def get_candidate_urls():
22
+ return [
23
+ "http://example.com",
24
+ "https://wikipedia.org/",
25
+ "https://occiglot.eu",
26
+ "https://ostendorff.org",
27
+ "https://fr.wikipedia.org/",
28
+ "https://amazon.com/"
29
+ ]
30
+
31
  def pil_image_to_base64(image):
32
  # Save the image to a BytesIO buffer
33
  buffer = BytesIO()
 
106
  profile_state = gr.State([])
107
  gr.LoginButton()
108
 
 
109
  with gr.Column(visible=False) as wrapper_col:
110
+ login_status = gr.Markdown("no")
111
+
112
  def handle_login(profile: gr.OAuthProfile | None) -> dict:
113
  if profile:
114
  gr.Info(f"Logged in as {profile.username}")
115
  return {
116
  profile_state: f"{profile.username}",
117
  wrapper_col: gr.update(visible=True),
118
+ login_status: "yes",
119
  }
120
  else:
121
  gr.Warning(f"You need to login to use this app.")
122
  return {
123
+ profile_state: [],
124
  wrapper_col: gr.update(visible=False),
125
+ login_status: "no",
126
  }
127
 
128
+ demo.load(handle_login, inputs=None, outputs=[profile_state, wrapper_col, login_status])
129
 
130
  url_field = gr.Textbox(label="Website URL", placeholder="Enter a URL you want to annotate", interactive=True)
131
 
 
162
 
163
 
164
  def set_random_url():
165
+ candidate_urls = get_candidate_urls()
 
 
 
 
 
 
 
166
  selected_url = random.choice(candidate_urls)
167
  return selected_url
168
 
 
189
  def do_crawl(profile_state, url, language_codes, categories, do_crawl=True):
190
 
191
  if profile_state:
192
+ # html_str = f"<b>Thanks {profile_state}, we have saved your feedback!</b>"
193
+ gr.Info("Thanks for your feedback")
194
  else:
195
+ gr.Error("Feedback could not be saved")
196
+ # html_str = f"<b>Feedback could not be saved.</b> You are not authenticated."
197
 
198
  return {
199
  url_field: "",
requirements.txt CHANGED
@@ -1,5 +1,13 @@
1
- selenium >=4.0.0, < 5.0.0
2
  gradio==5.9.1
 
 
 
3
  Pillow>=8.3.1,<9.0
 
4
  trafilatura==2.0.0
5
- gradio[oauth]
 
 
 
 
 
1
+
2
  gradio==5.9.1
3
+ gradio[oauth]
4
+
5
+ selenium >=4.0.0, < 5.0.0
6
  Pillow>=8.3.1,<9.0
7
+
8
  trafilatura==2.0.0
9
+
10
+ # Fix for a trafilatura import error:
11
+ # ImportError: lxml.html.clean module is now a separate project lxml_html_clean.
12
+ lxml_html_clean
13
+