Tristan Thrush committed on
Commit
bce177f
1 Parent(s): e91bd7c

added hit-to-huggingface dataset code. cleaned everything up

Browse files
Files changed (4) hide show
  1. README.md +30 -0
  2. app.py +52 -20
  3. collect.py +20 -9
  4. requirements.txt +2 -1
README.md CHANGED
@@ -11,3 +11,33 @@ license: bigscience-bloom-rail-1.0
11
  ---
12
 
13
  A basic example of dynamic adversarial data collection with a Gradio app.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  ---
12
 
13
  A basic example of dynamic adversarial data collection with a Gradio app.
14
+
15
+ *Instructions for using this in your own project:*
16
+
17
+ **Setting up the Space**
18
+ 1. Clone this repo and deploy it on your own Hugging Face space.
19
+ 2. Add one of your Hugging Face tokens to the secrets for your space, with the
20
+ name `HF_TOKEN`. Now, create an empty Hugging Face dataset on the hub. Put
21
+ the url of this dataset in the secrets for your space, with the name
22
+ `DATASET_REPO_URL`. It can be a private or public dataset. When you run this
23
+ space on mturk in the steps below, the app will use your token to
24
+ automatically store new HITs in your dataset.
25
+
26
+ **Running Data Collection**
27
+ 1. In your local clone of this repo, create a copy of `config.py.example`
28
+ named `config.py`. Now, put keys from your AWS account in `config.py`.
29
+ These keys should be for an AWS account that has the
30
+ AmazonMechanicalTurkFullAccess permission. You also need to
31
+ create an mturk requester account associated with your AWS account.
32
+ 2. Run `python collect.py` locally. If you run it with the `--live_mode` flag,
33
+ it launches HITs on mturk, using the app you deployed on the space as the
34
+ data collection UI and backend. NOTE: this means that you will need to pay
35
+ real workers. If you don't use the `--live_mode` flag, then it will run the
36
+ HITs on mturk sandbox, which is identical to the normal mturk, but just for
37
+ testing. You can create a worker account and go to the sandbox version to
38
+ test your HIT.
39
+
40
+ **Profit**
41
+ Now you should see HITs arriving in your Hugging Face dataset
42
+ automatically!
43
+
app.py CHANGED
@@ -1,13 +1,24 @@
1
  # Basic example for doing model-in-the-loop dynamic adversarial data collection
2
  # using Gradio Blocks.
3
-
4
  import random
5
  from urllib.parse import parse_qs
6
 
7
  import gradio as gr
8
  import requests
9
  from transformers import pipeline
10
-
 
 
 
 
 
 
 
 
 
 
 
11
  pipe = pipeline("sentiment-analysis")
12
 
13
  demo = gr.Blocks()
@@ -16,7 +27,7 @@ with demo:
16
  total_cnt = 2 # How many examples per HIT
17
  dummy = gr.Textbox(visible=False) # dummy for passing assignmentId
18
 
19
- # We keep track of state as a Variable
20
  state_dict = {"assignmentId": "", "cnt": 0, "fooled": 0, "data": [], "metadata": {}}
21
  state = gr.JSON(state_dict, visible=False)
22
 
@@ -47,6 +58,9 @@ with demo:
47
  toggle_example_submit = gr.update(visible=not done)
48
  new_state_md = f"State: {state['cnt']}/{total_cnt} ({state['fooled']} fooled)"
49
 
 
 
 
50
  query = parse_qs(dummy[1:])
51
  state["assignmentId"] = query["assignmentId"][0]
52
 
@@ -64,33 +78,51 @@ with demo:
64
  with gr.Column(visible=False) as final_submit:
65
  submit_hit_button = gr.Button("Submit HIT")
66
 
67
- # Submit state to MTurk backend for ExternalQuestion
68
- # Update the URL below to switch from Sandbox to real data collection
69
- def _submit(state, dummy):
70
- query = parse_qs(dummy[1:])
71
- assert "assignmentId" in query, "No assignment ID provided, unable to submit"
72
- state["assignmentId"] = query["assignmentId"][0]
73
- url = f"https://workersandbox.mturk.com/mturk/externalSubmit?assignmentId={state['assignmentId']}&colorChoice=blue"
74
- x = requests.post(url)
75
- return str(x) + " With assignmentId " + state["assignmentId"] + "\n" + x.text, state, dummy
76
 
77
  # Button event handlers
 
 
 
 
 
 
78
  submit_ex_button.click(
79
  _predict,
80
  inputs=[text_input, label_input, state, dummy],
81
  outputs=[label_output, text_output, state, example_submit, final_submit, state_display, dummy],
82
- _js="function(text_input, label_input, state, dummy) { console.log(text_input); console.log(label_input); console.log(state); console.log(dummy); return [text_input, label_input, state, window.location.search]; }",
83
  )
84
 
85
- def _something(state):
86
- print(state)
87
- return state
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  submit_hit_button.click(
90
- _something,
91
  inputs=[state],
92
- outputs=[state],
93
- _js="function(state) { console.log(state); const form = document.createElement('form'); form.action='https://workersandbox.mturk.com/mturk/externalSubmit'; form.method='post'; for (const key in state) {const hiddenField = document.createElement('input'); hiddenField.type = 'hidden'; hiddenField.name = key; hiddenField.value = state[key]; form.appendChild(hiddenField)}; document.body.appendChild(form); console.log(state); console.log(form); form.submit(); return [state];}",
94
  )
95
 
96
- demo.launch(share=True)
 
1
  # Basic example for doing model-in-the-loop dynamic adversarial data collection
2
  # using Gradio Blocks.
3
+ import os
4
  import random
5
  from urllib.parse import parse_qs
6
 
7
  import gradio as gr
8
  import requests
9
  from transformers import pipeline
10
+ from huggingface_hub import Repository
11
+
12
+ # These variables are for storing the mturk HITs in a Hugging Face dataset.
13
+ DATA_FILENAME = "data.jsonl"
14
+ DATA_FILE = os.path.join("data", DATA_FILENAME)
15
+ DATASET_REPO_URL = os.environ.get(DATASET_REPO_URL)
16
+ HF_TOKEN = os.environ.get("HF_TOKEN")
17
+ repo = Repository(
18
+ local_dir="data", clone_from=DATASET_REPO_URL, use_auth_token=HF_TOKEN
19
+ )
20
+
21
+ # Now let's run the app!
22
  pipe = pipeline("sentiment-analysis")
23
 
24
  demo = gr.Blocks()
 
27
  total_cnt = 2 # How many examples per HIT
28
  dummy = gr.Textbox(visible=False) # dummy for passing assignmentId
29
 
30
+ # We keep track of state as a JSON
31
  state_dict = {"assignmentId": "", "cnt": 0, "fooled": 0, "data": [], "metadata": {}}
32
  state = gr.JSON(state_dict, visible=False)
33
 
 
58
  toggle_example_submit = gr.update(visible=not done)
59
  new_state_md = f"State: {state['cnt']}/{total_cnt} ({state['fooled']} fooled)"
60
 
61
+ # We need to store the assignmentId in the state before submit_hit_button
62
+ # is clicked. We can do this here in _predict, which is called before
63
+ # submit_hit_button is clicked
64
  query = parse_qs(dummy[1:])
65
  state["assignmentId"] = query["assignmentId"][0]
66
 
 
78
  with gr.Column(visible=False) as final_submit:
79
  submit_hit_button = gr.Button("Submit HIT")
80
 
81
+ # Store the HIT data into a Hugging Face dataset.
82
+ # The HIT is also stored and logged on mturk when post_hit_js is run below.
83
+ # This _store_in_huggingface_dataset function just demonstrates how easy it is
84
+ # to automatically create a Hugging Face dataset from mturk.
85
+ def _store_in_huggingface_dataset(state, dummy):
86
+ with open(DATA_FILE, "a") as jsonlfile:
87
+ jsonlfile.write(json.dumps(state))
88
+ repo.push_to_hub()
 
89
 
90
  # Button event handlers
91
+ get_window_location_search_js = """
92
+ function(text_input, label_input, state, dummy) {
93
+ return [text_input, label_input, state, window.location.search];
94
+ }
95
+ """
96
+
97
  submit_ex_button.click(
98
  _predict,
99
  inputs=[text_input, label_input, state, dummy],
100
  outputs=[label_output, text_output, state, example_submit, final_submit, state_display, dummy],
101
+ _js=get_window_location_search_js,
102
  )
103
 
104
+ post_hit_js = """
105
+ function(state) {
106
+ const form = document.createElement('form');
107
+ form.action = 'https://workersandbox.mturk.com/mturk/externalSubmit';
108
+ form.method = 'post';
109
+ for (const key in state) {
110
+ const hiddenField = document.createElement('input');
111
+ hiddenField.type = 'hidden';
112
+ hiddenField.name = key;
113
+ hiddenField.value = state[key];
114
+ form.appendChild(hiddenField)
115
+ };
116
+ document.body.appendChild(form);
117
+ form.submit();
118
+ }
119
+ """
120
 
121
  submit_hit_button.click(
122
+ _store_in_huggingface_dataset,
123
  inputs=[state],
124
+ outputs=None,
125
+ _js=post_hit_js,
126
  )
127
 
128
+ demo.launch()
collect.py CHANGED
@@ -5,20 +5,33 @@ import boto3
5
  from boto.mturk.question import ExternalQuestion
6
 
7
  from config import MTURK_KEY, MTURK_SECRET
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
- MTURK_REGION = "us-east-1"
10
- MTURK_SANDBOX = "https://mturk-requester-sandbox.us-east-1.amazonaws.com"
11
 
12
  mturk = boto3.client(
13
  "mturk",
14
  aws_access_key_id=MTURK_KEY,
15
  aws_secret_access_key=MTURK_SECRET,
16
  region_name=MTURK_REGION,
17
- endpoint_url=MTURK_SANDBOX,
18
  )
19
 
20
- # The + in the URL makes the Space easily embeddable in an iframe
21
- question = ExternalQuestion("https://hf.space/embed/Tristan/dadc/+?__theme=light",
22
  frame_height=600
23
  )
24
 
@@ -27,7 +40,7 @@ new_hit = mturk.create_hit(
27
  Description="Hello",
28
  Keywords="fool the model",
29
  Reward="0.15",
30
- MaxAssignments=1,
31
  LifetimeInSeconds=172800,
32
  AssignmentDurationInSeconds=600,
33
  AutoApprovalDelayInSeconds=14400,
@@ -35,8 +48,6 @@ new_hit = mturk.create_hit(
35
  )
36
 
37
  print(
38
- "Sandbox link: https://workersandbox.mturk.com/mturk/preview?groupId="
39
  + new_hit["HIT"]["HITGroupId"]
40
  )
41
-
42
- print("Hit Id:", new_hit["HIT"]["HITId"])
 
5
  from boto.mturk.question import ExternalQuestion
6
 
7
  from config import MTURK_KEY, MTURK_SECRET
8
+ import argparse
9
+
10
+ parser = argparse.ArgumentParser()
11
+ parser.add_argument("--mturk_region", default="us-east-1", help="The region for mturk (default: us-east-1)")
12
+ parser.add_argument("--space_name", default="Tristan/dadc", help="Name of the accompanying Hugging Face space (default: Tristan/dadc)")
13
+ parser.add_argument("--num_assignments", type=int, default=5, help="The number of times that the HIT can be accepted and completed.")
14
+ parser.add_argument("--live_mode", action="store_true", help="""
15
+ Whether to run in live mode with real turkers. This will charge your account money.
16
+ If you don't use this flag, the HITs will be deployed on the sandbox version of mturk,
17
+ which will not charge your account money.
18
+ """
19
+ )
20
+
21
+ args = parser.parse_args()
22
 
23
+ MTURK_URL = f"https://mturk-requester{"" if args.live_mode else "-sandbox"}.{args.mturk_region}.amazonaws.com"
 
24
 
25
  mturk = boto3.client(
26
  "mturk",
27
  aws_access_key_id=MTURK_KEY,
28
  aws_secret_access_key=MTURK_SECRET,
29
  region_name=MTURK_REGION,
30
+ endpoint_url=MTURK_URL,
31
  )
32
 
33
+ # This is the URL that makes the space embeddable in an mturk iframe
34
+ question = ExternalQuestion(f"https://hf.space/embed/{args.space_name}/+?__theme=light",
35
  frame_height=600
36
  )
37
 
 
40
  Description="Hello",
41
  Keywords="fool the model",
42
  Reward="0.15",
43
+ MaxAssignments=args.num_assignments,
44
  LifetimeInSeconds=172800,
45
  AssignmentDurationInSeconds=600,
46
  AutoApprovalDelayInSeconds=14400,
 
48
  )
49
 
50
  print(
51
+ f"Link: https://worker{"" if args.live_mode else "sandbox"}.mturk.com/mturk/preview?groupId="
52
  + new_hit["HIT"]["HITGroupId"]
53
  )
 
 
requirements.txt CHANGED
@@ -2,4 +2,5 @@ requests
2
  torch
3
  transformers
4
  gradio
5
- boto3
 
 
2
  torch
3
  transformers
4
  gradio
5
+ boto3
6
+ huggingface_hub