meg-huggingface
commited on
Commit
·
ffe4d51
1
Parent(s):
7dd405e
Inferring compute needs and code cleanup
Browse files- app.py +23 -18
- main_backend_toxicity.py +43 -33
- scripts/fix_harness_import.py +0 -11
- src/backend/compute_memory_requirements.py +59 -0
- src/backend/inference_endpoint.py +36 -11
- src/backend/manage_requests.py +60 -43
- src/backend/model_utils.py +98 -0
- src/backend/run_toxicity_eval.py +46 -31
- src/envs.py +19 -27
app.py
CHANGED
@@ -1,18 +1,20 @@
|
|
1 |
-
from apscheduler.schedulers.background import BackgroundScheduler
|
2 |
-
from src.logging import configure_root_logger
|
3 |
-
configure_root_logger()
|
4 |
-
|
5 |
from functools import partial
|
6 |
|
7 |
import gradio as gr
|
|
|
|
|
8 |
import main_backend_toxicity
|
9 |
-
from src.display.log_visualizer import log_file_to_html_string
|
10 |
from src.display.css_html_js import dark_mode_gradio_js
|
11 |
-
from src.
|
12 |
-
from src.
|
|
|
13 |
|
|
|
14 |
logger = setup_logger(__name__)
|
15 |
|
|
|
|
|
|
|
16 |
intro_md = f"""
|
17 |
# Intro
|
18 |
This is a visual for the auto evaluator.
|
@@ -22,36 +24,39 @@ links_md = f"""
|
|
22 |
# Important links
|
23 |
|
24 |
| Description | Link |
|
25 |
-
|
26 |
-
| Leaderboard
|
27 |
-
|
|
28 |
-
| Results Repo
|
29 |
"""
|
30 |
|
|
|
31 |
def auto_eval():
|
32 |
logger.info("Triggering Auto Eval")
|
33 |
main_backend_toxicity.run_auto_eval()
|
34 |
|
|
|
35 |
reverse_order_checkbox = gr.Checkbox(label="Reverse Order", value=True)
|
36 |
|
37 |
with gr.Blocks(js=dark_mode_gradio_js) as backend_ui:
|
38 |
gr.Markdown(intro_md)
|
39 |
with gr.Tab("Application"):
|
40 |
output_html = gr.HTML(partial(log_file_to_html_string,
|
41 |
-
reverse=reverse_order_checkbox),
|
|
|
42 |
with gr.Row():
|
43 |
download_button = gr.DownloadButton("Download Log File",
|
44 |
value=log_file)
|
45 |
with gr.Accordion('Log View Configuration', open=False):
|
46 |
reverse_order_checkbox.render()
|
47 |
-
#
|
48 |
button = gr.Button("Manually Run Evaluation")
|
|
|
49 |
gr.Markdown(links_md)
|
50 |
-
# This
|
51 |
-
# and the UI will error out if it takes
|
52 |
-
#
|
53 |
# dummy = gr.Markdown(main_backend_toxicity.run_auto_eval(), every=REFRESH_RATE, visible=False)
|
54 |
-
button.click(fn=auto_eval, inputs=[], outputs=[])
|
55 |
|
56 |
if __name__ == '__main__':
|
57 |
scheduler = BackgroundScheduler()
|
@@ -59,4 +64,4 @@ if __name__ == '__main__':
|
|
59 |
scheduler.start()
|
60 |
backend_ui.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0",
|
61 |
show_error=True,
|
62 |
-
server_port=7860)
|
|
|
|
|
|
|
|
|
|
|
1 |
from functools import partial
|
2 |
|
3 |
import gradio as gr
|
4 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
5 |
+
|
6 |
import main_backend_toxicity
|
|
|
7 |
from src.display.css_html_js import dark_mode_gradio_js
|
8 |
+
from src.display.log_visualizer import log_file_to_html_string
|
9 |
+
from src.envs import REFRESH_RATE, REPO_ID, REQUESTS_REPO, RESULTS_REPO
|
10 |
+
from src.logging import configure_root_logger, setup_logger, log_file
|
11 |
|
12 |
+
configure_root_logger()
|
13 |
logger = setup_logger(__name__)
|
14 |
|
15 |
+
HF_URL = "https://huggingface.co"
|
16 |
+
REFRESH_VISUAL = 10
|
17 |
+
|
18 |
intro_md = f"""
|
19 |
# Intro
|
20 |
This is a visual for the auto evaluator.
|
|
|
24 |
# Important links
|
25 |
|
26 |
| Description | Link |
|
27 |
+
|----------------|------|
|
28 |
+
| Leaderboard | [{REPO_ID}]({HF_URL}/spaces/{REPO_ID}) |
|
29 |
+
| Requests Repo | [{REQUESTS_REPO}]({HF_URL}/datasets/{REQUESTS_REPO}) |
|
30 |
+
| Results Repo | [{RESULTS_REPO}]({HF_URL}/datasets/{RESULTS_REPO}) |
|
31 |
"""
|
32 |
|
33 |
+
|
34 |
def auto_eval():
|
35 |
logger.info("Triggering Auto Eval")
|
36 |
main_backend_toxicity.run_auto_eval()
|
37 |
|
38 |
+
|
39 |
reverse_order_checkbox = gr.Checkbox(label="Reverse Order", value=True)
|
40 |
|
41 |
with gr.Blocks(js=dark_mode_gradio_js) as backend_ui:
|
42 |
gr.Markdown(intro_md)
|
43 |
with gr.Tab("Application"):
|
44 |
output_html = gr.HTML(partial(log_file_to_html_string,
|
45 |
+
reverse=reverse_order_checkbox),
|
46 |
+
every=REFRESH_VISUAL)
|
47 |
with gr.Row():
|
48 |
download_button = gr.DownloadButton("Download Log File",
|
49 |
value=log_file)
|
50 |
with gr.Accordion('Log View Configuration', open=False):
|
51 |
reverse_order_checkbox.render()
|
52 |
+
# Button to trigger evaluation
|
53 |
button = gr.Button("Manually Run Evaluation")
|
54 |
+
button.click(fn=auto_eval, inputs=[], outputs=[])
|
55 |
gr.Markdown(links_md)
|
56 |
+
# This dummy var was in the original demo. It will run the eval before
|
57 |
+
# fully loading the UI, and the UI will error out if it takes long.
|
58 |
+
# Changed to use BackgroundScheduler instead.
|
59 |
# dummy = gr.Markdown(main_backend_toxicity.run_auto_eval(), every=REFRESH_RATE, visible=False)
|
|
|
60 |
|
61 |
if __name__ == '__main__':
|
62 |
scheduler = BackgroundScheduler()
|
|
|
64 |
scheduler.start()
|
65 |
backend_ui.queue(default_concurrency_limit=40).launch(server_name="0.0.0.0",
|
66 |
show_error=True,
|
67 |
+
server_port=7860)
|
main_backend_toxicity.py
CHANGED
@@ -1,51 +1,52 @@
|
|
1 |
import pprint
|
2 |
import re
|
|
|
3 |
from huggingface_hub import snapshot_download, delete_inference_endpoint
|
4 |
|
5 |
from src.backend.inference_endpoint import create_endpoint
|
6 |
-
from src.backend.
|
7 |
-
|
|
|
|
|
8 |
from src.backend.sort_queue import sort_models_by_priority
|
9 |
-
|
10 |
-
from src.envs import (QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO,
|
11 |
EVAL_RESULTS_PATH_BACKEND, API, TOKEN)
|
12 |
-
#, LIMIT, ACCELERATOR, VENDOR, REGION
|
13 |
from src.logging import setup_logger
|
14 |
|
15 |
logger = setup_logger(__name__)
|
16 |
|
17 |
pp = pprint.PrettyPrinter(width=80)
|
18 |
|
19 |
-
PENDING_STATUS = "PENDING"
|
20 |
-
RUNNING_STATUS = "RUNNING"
|
21 |
-
FINISHED_STATUS = "FINISHED"
|
22 |
-
FAILED_STATUS = "FAILED"
|
23 |
|
24 |
-
snapshot_download(repo_id=RESULTS_REPO, revision="main",
|
25 |
-
|
|
|
|
|
|
|
|
|
26 |
|
27 |
-
def run_auto_eval():
|
28 |
-
current_pending_status = [PENDING_STATUS]
|
29 |
|
|
|
30 |
# pull the eval dataset from the hub and parse any eval requests
|
31 |
# check completed evals and set them to finished
|
32 |
check_completed_evals(
|
33 |
api=API,
|
34 |
-
checked_status=RUNNING_STATUS,
|
35 |
completed_status=FINISHED_STATUS,
|
36 |
failed_status=FAILED_STATUS,
|
37 |
-
hf_repo=
|
38 |
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
39 |
hf_repo_results=RESULTS_REPO,
|
40 |
local_dir_results=EVAL_RESULTS_PATH_BACKEND
|
41 |
)
|
42 |
|
43 |
-
# Get all eval
|
44 |
-
eval_requests = get_eval_requests(
|
45 |
-
|
|
|
46 |
eval_requests = sort_models_by_priority(api=API, models=eval_requests)
|
47 |
|
48 |
-
logger.info(
|
|
|
49 |
|
50 |
if len(eval_requests) == 0:
|
51 |
return
|
@@ -57,29 +58,38 @@ def run_auto_eval():
|
|
57 |
api=API,
|
58 |
eval_request=eval_request,
|
59 |
set_to_status=RUNNING_STATUS,
|
60 |
-
hf_repo=
|
61 |
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
62 |
)
|
63 |
|
64 |
-
logger.info(
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
endpoint_name = endpoint_name_tmp[:32]
|
69 |
-
endpoint_url = create_endpoint(endpoint_name, model_repository)
|
70 |
logger.info("Created an endpoint url at %s" % endpoint_url)
|
71 |
-
results =
|
72 |
logger.info("FINISHED!")
|
73 |
logger.info(results)
|
74 |
logger.info(f'Completed Evaluation of {eval_request.json_filepath}')
|
75 |
set_eval_request(api=API,
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
81 |
delete_inference_endpoint(endpoint_name)
|
82 |
|
83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
if __name__ == "__main__":
|
85 |
-
run_auto_eval()
|
|
|
1 |
import pprint
|
2 |
import re
|
3 |
+
|
4 |
from huggingface_hub import snapshot_download, delete_inference_endpoint
|
5 |
|
6 |
from src.backend.inference_endpoint import create_endpoint
|
7 |
+
from src.backend.manage_requests import check_completed_evals, \
|
8 |
+
get_eval_requests, set_eval_request, PENDING_STATUS, FINISHED_STATUS, \
|
9 |
+
FAILED_STATUS, RUNNING_STATUS
|
10 |
+
from src.backend.run_toxicity_eval import compute_results
|
11 |
from src.backend.sort_queue import sort_models_by_priority
|
12 |
+
from src.envs import (REQUESTS_REPO, EVAL_REQUESTS_PATH_BACKEND, RESULTS_REPO,
|
|
|
13 |
EVAL_RESULTS_PATH_BACKEND, API, TOKEN)
|
|
|
14 |
from src.logging import setup_logger
|
15 |
|
16 |
logger = setup_logger(__name__)
|
17 |
|
18 |
pp = pprint.PrettyPrinter(width=80)
|
19 |
|
|
|
|
|
|
|
|
|
20 |
|
21 |
+
snapshot_download(repo_id=RESULTS_REPO, revision="main",
|
22 |
+
local_dir=EVAL_RESULTS_PATH_BACKEND, repo_type="dataset",
|
23 |
+
max_workers=60, token=TOKEN)
|
24 |
+
snapshot_download(repo_id=REQUESTS_REPO, revision="main",
|
25 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND, repo_type="dataset",
|
26 |
+
max_workers=60, token=TOKEN)
|
27 |
|
|
|
|
|
28 |
|
29 |
+
def run_auto_eval():
|
30 |
# pull the eval dataset from the hub and parse any eval requests
|
31 |
# check completed evals and set them to finished
|
32 |
check_completed_evals(
|
33 |
api=API,
|
|
|
34 |
completed_status=FINISHED_STATUS,
|
35 |
failed_status=FAILED_STATUS,
|
36 |
+
hf_repo=REQUESTS_REPO,
|
37 |
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
38 |
hf_repo_results=RESULTS_REPO,
|
39 |
local_dir_results=EVAL_RESULTS_PATH_BACKEND
|
40 |
)
|
41 |
|
42 |
+
# Get all eval requests that are PENDING
|
43 |
+
eval_requests = get_eval_requests(hf_repo=REQUESTS_REPO,
|
44 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND)
|
45 |
+
# Sort the evals by priority (first submitted, first run)
|
46 |
eval_requests = sort_models_by_priority(api=API, models=eval_requests)
|
47 |
|
48 |
+
logger.info(
|
49 |
+
f"Found {len(eval_requests)} {PENDING_STATUS} eval requests")
|
50 |
|
51 |
if len(eval_requests) == 0:
|
52 |
return
|
|
|
58 |
api=API,
|
59 |
eval_request=eval_request,
|
60 |
set_to_status=RUNNING_STATUS,
|
61 |
+
hf_repo=REQUESTS_REPO,
|
62 |
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
63 |
)
|
64 |
|
65 |
+
logger.info(
|
66 |
+
f'Starting Evaluation of {eval_request.json_filepath} on Inference endpoints')
|
67 |
+
endpoint_name = _make_endpoint_name(eval_request)
|
68 |
+
endpoint_url = create_endpoint(endpoint_name, eval_request.model)
|
|
|
|
|
69 |
logger.info("Created an endpoint url at %s" % endpoint_url)
|
70 |
+
results = compute_results(endpoint_url, eval_request)
|
71 |
logger.info("FINISHED!")
|
72 |
logger.info(results)
|
73 |
logger.info(f'Completed Evaluation of {eval_request.json_filepath}')
|
74 |
set_eval_request(api=API,
|
75 |
+
eval_request=eval_request,
|
76 |
+
set_to_status=FINISHED_STATUS,
|
77 |
+
hf_repo=REQUESTS_REPO,
|
78 |
+
local_dir=EVAL_REQUESTS_PATH_BACKEND,
|
79 |
+
)
|
80 |
+
# Delete endpoint when we're done.
|
81 |
delete_inference_endpoint(endpoint_name)
|
82 |
|
83 |
|
84 |
+
def _make_endpoint_name(eval_request):
|
85 |
+
model_repository = eval_request.model
|
86 |
+
# Naming convention for endpoints
|
87 |
+
endpoint_name_tmp = re.sub("[/.]", "-",
|
88 |
+
model_repository.lower()) + "-toxicity-eval"
|
89 |
+
# Endpoints apparently can't have more than 32 characters.
|
90 |
+
endpoint_name = endpoint_name_tmp[:32]
|
91 |
+
return endpoint_name
|
92 |
+
|
93 |
+
|
94 |
if __name__ == "__main__":
|
95 |
+
run_auto_eval()
|
scripts/fix_harness_import.py
DELETED
@@ -1,11 +0,0 @@
|
|
1 |
-
"""This file should be used after pip install -r requirements.
|
2 |
-
It creates a folder not ported during harness package creation (as they don't use a Manifest file atm and it ignore `.json` files).
|
3 |
-
It will need to be updated if we want to use the harness' version of big bench to actually copy the json files.
|
4 |
-
"""
|
5 |
-
import os
|
6 |
-
|
7 |
-
import lm_eval
|
8 |
-
|
9 |
-
if __name__ == "__main__":
|
10 |
-
lm_eval_path = lm_eval.__path__[0]
|
11 |
-
os.makedirs(os.path.join(lm_eval_path, "datasets", "bigbench_resources"), exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/backend/compute_memory_requirements.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.backend.model_utils import calculate_memory, get_model
|
2 |
+
from src.logging import setup_logger
|
3 |
+
|
4 |
+
logger = setup_logger(__name__)
|
5 |
+
|
6 |
+
|
7 |
+
def get_instance_needs(model_name: str, access_token: str):
|
8 |
+
"""Scales up compute based on size and price."""
|
9 |
+
needed_space = get_size(model_name, access_token)
|
10 |
+
if needed_space:
|
11 |
+
if needed_space < 20:
|
12 |
+
# Cheapest
|
13 |
+
return 'x1', 'nvidia-a10g'
|
14 |
+
elif needed_space < 60:
|
15 |
+
return 'x4', 'nvidia-t4'
|
16 |
+
elif needed_space < 80:
|
17 |
+
return 'x1', 'nvidia-a100'
|
18 |
+
elif needed_space < 95:
|
19 |
+
return 'x4', 'nvidia-a10g'
|
20 |
+
elif needed_space < 150:
|
21 |
+
return 'x2', 'nvidia-a100'
|
22 |
+
# Not doing any higher (for now) as that would start costing a lot.
|
23 |
+
else:
|
24 |
+
# A default size to start trying to scale up from.
|
25 |
+
return 'x4', 'nvidia-l4'
|
26 |
+
|
27 |
+
|
28 |
+
# Code based in part on https://huggingface.co/spaces/hf-accelerate/model-memory-usage
|
29 |
+
def get_size(model_name: str, access_token: str, library="auto",
|
30 |
+
dtype="float32"):
|
31 |
+
"""
|
32 |
+
This is just to get a size estimate of the model.
|
33 |
+
Assuming dtype float32, which isn't always true.
|
34 |
+
Only works for transformers and timm models AFAIK.
|
35 |
+
"""
|
36 |
+
model = get_model(model_name, library, access_token)
|
37 |
+
data = calculate_memory(model, dtype)
|
38 |
+
size = data[0]['Total Size']
|
39 |
+
split_size = size.split()
|
40 |
+
# Assuming we're working in GB.
|
41 |
+
try:
|
42 |
+
assert split_size[1] == 'GB'
|
43 |
+
num_gigs = float(split_size[0])
|
44 |
+
except AssertionError:
|
45 |
+
logger.warning(
|
46 |
+
"Tried to get model size and it's not GB, it's %s" % size)
|
47 |
+
logger.warning(
|
48 |
+
"Have not implemented handling for this, just going with 30GB.")
|
49 |
+
num_gigs = 30
|
50 |
+
return num_gigs
|
51 |
+
|
52 |
+
|
53 |
+
if __name__ == '__main__':
|
54 |
+
# Debugging here
|
55 |
+
import os
|
56 |
+
|
57 |
+
num_gigs_debug = get_size("upstage/SOLAR-10.7B-v1.0",
|
58 |
+
access_token=os.environ.get("HF_TOKEN"))
|
59 |
+
print(num_gigs_debug)
|
src/backend/inference_endpoint.py
CHANGED
@@ -1,9 +1,13 @@
|
|
1 |
import sys
|
2 |
from time import sleep
|
|
|
|
|
3 |
from huggingface_hub import create_inference_endpoint, get_inference_endpoint
|
|
|
|
|
4 |
from src.backend.run_toxicity_eval import get_generation
|
|
|
5 |
from src.logging import setup_logger
|
6 |
-
import requests
|
7 |
|
8 |
logger = setup_logger(__name__)
|
9 |
TIMEOUT = 20
|
@@ -12,10 +16,15 @@ MAX_REPLICA = 1
|
|
12 |
|
13 |
def create_endpoint(endpoint_name, repository, framework='pytorch',
|
14 |
task='text-generation', accelerator='gpu', vendor='aws',
|
15 |
-
region='us-east-1', type='protected'
|
16 |
-
|
17 |
logger.info("Creating endpoint %s..." % endpoint_name)
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
19 |
try:
|
20 |
endpoint = get_inference_endpoint(endpoint_name)
|
21 |
have_endpoint = True
|
@@ -55,12 +64,14 @@ def create_endpoint(endpoint_name, repository, framework='pytorch',
|
|
55 |
def wait_for_endpoint(endpoint):
|
56 |
# TODO: HANDLE 'paused'
|
57 |
i = 0
|
58 |
-
while endpoint.status in ['updating', 'pending',
|
|
|
59 |
if i >= 20:
|
60 |
-
logger.error("Model failed to respond. Exiting.")
|
61 |
sys.exit()
|
62 |
logger.info(
|
63 |
-
"Waiting %d seconds to check again if the endpoint is running." %
|
|
|
64 |
sleep(TIMEOUT)
|
65 |
endpoint.fetch()
|
66 |
logger.info("Endpoint status: %s." % (endpoint.status))
|
@@ -68,21 +79,35 @@ def wait_for_endpoint(endpoint):
|
|
68 |
|
69 |
|
70 |
def update_endpoint_exception(endpoint):
|
|
|
|
|
|
|
|
|
|
|
71 |
raw_info = endpoint.raw
|
72 |
cur_instance_size = raw_info['compute']['instanceSize']
|
73 |
cur_instance_type = raw_info['compute']['instanceType']
|
74 |
-
|
|
|
|
|
|
|
|
|
75 |
endpoint.update(instance_size='x1', instance_type='nvidia-a100',
|
76 |
max_replica=MAX_REPLICA)
|
77 |
-
elif (cur_instance_type, cur_instance_size) == ('a100', 'x1'):
|
78 |
endpoint.update(instance_size='x4', instance_type='nvidia-a10g',
|
79 |
max_replica=MAX_REPLICA)
|
|
|
|
|
|
|
80 |
else:
|
81 |
logger.error(
|
82 |
-
"Getting expensive to
|
|
|
83 |
sys.exit()
|
84 |
return endpoint
|
85 |
|
86 |
|
87 |
if __name__ == '__main__':
|
88 |
-
generation_url = create_endpoint('this-is-a-test',
|
|
|
|
1 |
import sys
|
2 |
from time import sleep
|
3 |
+
|
4 |
+
import requests
|
5 |
from huggingface_hub import create_inference_endpoint, get_inference_endpoint
|
6 |
+
|
7 |
+
from src.backend.compute_memory_requirements import get_instance_needs
|
8 |
from src.backend.run_toxicity_eval import get_generation
|
9 |
+
from src.envs import TOKEN
|
10 |
from src.logging import setup_logger
|
|
|
11 |
|
12 |
logger = setup_logger(__name__)
|
13 |
TIMEOUT = 20
|
|
|
16 |
|
17 |
def create_endpoint(endpoint_name, repository, framework='pytorch',
|
18 |
task='text-generation', accelerator='gpu', vendor='aws',
|
19 |
+
region='us-east-1', type='protected'):
|
20 |
+
"""Tries to automagically create a running endpoint for the given model."""
|
21 |
logger.info("Creating endpoint %s..." % endpoint_name)
|
22 |
+
endpoint = None
|
23 |
+
instance_size, instance_type = get_instance_needs(repository, TOKEN)
|
24 |
+
logger.info("Estimating the following instance size and type: %s, %s" % (
|
25 |
+
instance_size, instance_type))
|
26 |
+
# Useful in debugging, when models are being run over and over:
|
27 |
+
# Check if the endpoint is already there.
|
28 |
try:
|
29 |
endpoint = get_inference_endpoint(endpoint_name)
|
30 |
have_endpoint = True
|
|
|
64 |
def wait_for_endpoint(endpoint):
|
65 |
# TODO: HANDLE 'paused'
|
66 |
i = 0
|
67 |
+
while endpoint.status in ['updating', 'pending',
|
68 |
+
'initializing']: # not in ['failed', 'running', 'scaledToZero']
|
69 |
if i >= 20:
|
70 |
+
logger.error("Model failed to respond after 20 tries. Exiting.")
|
71 |
sys.exit()
|
72 |
logger.info(
|
73 |
+
"Waiting %d seconds to check again if the endpoint is running." %
|
74 |
+
TIMEOUT)
|
75 |
sleep(TIMEOUT)
|
76 |
endpoint.fetch()
|
77 |
logger.info("Endpoint status: %s." % (endpoint.status))
|
|
|
79 |
|
80 |
|
81 |
def update_endpoint_exception(endpoint):
|
82 |
+
"""
|
83 |
+
Endpoints can fail from too little memory, as well as for missing
|
84 |
+
flash attention, etc. This function tries new compute setups,
|
85 |
+
scaling up the compute power until it's running or expensive.
|
86 |
+
"""
|
87 |
raw_info = endpoint.raw
|
88 |
cur_instance_size = raw_info['compute']['instanceSize']
|
89 |
cur_instance_type = raw_info['compute']['instanceType']
|
90 |
+
|
91 |
+
if (cur_instance_type, cur_instance_size) == ('nvidia-a10g', 'x1'):
|
92 |
+
endpoint.update(instance_size='x4', instance_type='nvidia-t4',
|
93 |
+
max_replica=MAX_REPLICA)
|
94 |
+
elif (cur_instance_type, cur_instance_size) == ('nvidia-t4', 'x4'):
|
95 |
endpoint.update(instance_size='x1', instance_type='nvidia-a100',
|
96 |
max_replica=MAX_REPLICA)
|
97 |
+
elif (cur_instance_type, cur_instance_size) == ('nvidia-a100', 'x1'):
|
98 |
endpoint.update(instance_size='x4', instance_type='nvidia-a10g',
|
99 |
max_replica=MAX_REPLICA)
|
100 |
+
elif (cur_instance_type, cur_instance_size) == ('nvidia-l4', 'x4'):
|
101 |
+
endpoint.update(instance_size='x2', instance_type='nvidia-a100',
|
102 |
+
max_replica=MAX_REPLICA)
|
103 |
else:
|
104 |
logger.error(
|
105 |
+
"Getting expensive to run this model without human oversight."
|
106 |
+
" Exiting.")
|
107 |
sys.exit()
|
108 |
return endpoint
|
109 |
|
110 |
|
111 |
if __name__ == '__main__':
|
112 |
+
generation_url = create_endpoint('this-is-a-test',
|
113 |
+
'Qwen/Qwen2-7B')
|
src/backend/manage_requests.py
CHANGED
@@ -1,15 +1,22 @@
|
|
1 |
import glob
|
2 |
import json
|
3 |
from dataclasses import dataclass
|
|
|
4 |
from typing import Optional
|
5 |
-
from datetime import datetime, timezone
|
6 |
|
7 |
from huggingface_hub import HfApi, snapshot_download
|
|
|
8 |
from src.envs import TOKEN
|
9 |
from src.logging import setup_logger
|
10 |
|
11 |
logger = setup_logger(__name__)
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
@dataclass
|
14 |
class EvalRequest:
|
15 |
"""This class represents one evaluation request file.
|
@@ -18,17 +25,17 @@ class EvalRequest:
|
|
18 |
status: str
|
19 |
json_filepath: str
|
20 |
weight_type: str = "Original"
|
21 |
-
model_type: str = "" # pretrained, finetuned, with RL
|
22 |
precision: str = "" # float16, bfloat16
|
23 |
-
revision: str = "main"
|
24 |
-
submitted_time: Optional[
|
25 |
-
|
|
|
26 |
likes: Optional[int] = 0
|
27 |
params: Optional[int] = None
|
28 |
license: Optional[str] = ""
|
29 |
base_model: Optional[str] = ""
|
30 |
private: Optional[bool] = False
|
31 |
-
|
32 |
def get_model_args(self):
|
33 |
"""Edit this function if you want to manage more complex quantization issues. You'll need to map it to
|
34 |
the evaluation suite you chose.
|
@@ -40,20 +47,21 @@ class EvalRequest:
|
|
40 |
|
41 |
# Quantized models need some added config, the install of bits and bytes, etc
|
42 |
|
43 |
-
#elif self.precision == "8bit":
|
44 |
# model_args += ",load_in_8bit=True"
|
45 |
-
#elif self.precision == "4bit":
|
46 |
# model_args += ",load_in_4bit=True"
|
47 |
-
#elif self.precision == "GPTQ":
|
48 |
-
|
49 |
-
|
50 |
else:
|
51 |
raise Exception(f"Unknown precision {self.precision}.")
|
52 |
-
|
53 |
return model_args
|
54 |
|
55 |
|
56 |
-
def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str,
|
|
|
57 |
"""Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
|
58 |
json_filepath = eval_request.json_filepath
|
59 |
|
@@ -73,7 +81,7 @@ def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str,
|
|
73 |
)
|
74 |
|
75 |
|
76 |
-
def get_eval_requests(
|
77 |
"""Gets all pending evaluation requests and return a list in which private
|
78 |
models appearing first, followed by public models sorted by the number of
|
79 |
likes.
|
@@ -81,15 +89,15 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[Ev
|
|
81 |
Returns:
|
82 |
`list[EvalRequest]`: a list of model info dicts.
|
83 |
"""
|
84 |
-
snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir,
|
|
|
85 |
json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
|
86 |
|
87 |
eval_requests = []
|
88 |
for json_filepath in json_files:
|
89 |
with open(json_filepath) as fp:
|
90 |
data = json.load(fp)
|
91 |
-
|
92 |
-
if data["status"] in job_status:
|
93 |
data["json_filepath"] = json_filepath
|
94 |
eval_request = EvalRequest(**data)
|
95 |
eval_requests.append(eval_request)
|
@@ -98,43 +106,50 @@ def get_eval_requests(job_status: list, local_dir: str, hf_repo: str) -> list[Ev
|
|
98 |
|
99 |
|
100 |
def check_set_to_fail(eval_request: EvalRequest):
|
101 |
-
"""Checks
|
102 |
json_filepath = eval_request.json_filepath
|
103 |
|
104 |
with open(json_filepath) as fp:
|
105 |
data = json.load(fp)
|
106 |
|
107 |
status = data["status"]
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
# If it's been running for less than 2 hours, leave it alone.
|
117 |
-
if diff_seconds < 7200:
|
118 |
-
|
119 |
-
else:
|
120 |
-
|
121 |
-
return True
|
122 |
|
123 |
|
124 |
def check_completed_evals(
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
local_dir_results: str,
|
133 |
):
|
134 |
"""Checks if the currently running evals are completed, if yes, update their status on the hub."""
|
135 |
-
snapshot_download(repo_id=hf_repo_results, revision="main",
|
|
|
|
|
136 |
|
137 |
-
running_evals = get_eval_requests(
|
138 |
|
139 |
for eval_request in running_evals:
|
140 |
model = eval_request.model
|
@@ -149,11 +164,13 @@ def check_completed_evals(
|
|
149 |
logger.info(
|
150 |
f"EXISTS output file exists for {model} setting it to {completed_status}"
|
151 |
)
|
152 |
-
set_eval_request(api, eval_request, completed_status, hf_repo,
|
|
|
153 |
else:
|
154 |
set_to_fail = check_set_to_fail(eval_request)
|
155 |
if set_to_fail:
|
156 |
logger.info(
|
157 |
f"No result file found for {model} setting it to {failed_status}"
|
158 |
)
|
159 |
-
set_eval_request(api, eval_request, failed_status, hf_repo,
|
|
|
|
1 |
import glob
|
2 |
import json
|
3 |
from dataclasses import dataclass
|
4 |
+
# from datetime import datetime, timezone
|
5 |
from typing import Optional
|
|
|
6 |
|
7 |
from huggingface_hub import HfApi, snapshot_download
|
8 |
+
|
9 |
from src.envs import TOKEN
|
10 |
from src.logging import setup_logger
|
11 |
|
12 |
logger = setup_logger(__name__)
|
13 |
|
14 |
+
PENDING_STATUS = "PENDING"
|
15 |
+
RUNNING_STATUS = "RUNNING"
|
16 |
+
FINISHED_STATUS = "FINISHED"
|
17 |
+
FAILED_STATUS = "FAILED"
|
18 |
+
|
19 |
+
|
20 |
@dataclass
|
21 |
class EvalRequest:
|
22 |
"""This class represents one evaluation request file.
|
|
|
25 |
status: str
|
26 |
json_filepath: str
|
27 |
weight_type: str = "Original"
|
|
|
28 |
precision: str = "" # float16, bfloat16
|
29 |
+
revision: str = "main" # commit hash
|
30 |
+
submitted_time: Optional[
|
31 |
+
str] = "2022-05-18T11:40:22.519222" # random date just so that we can still order requests by date
|
32 |
+
model_type: Optional[str] = None # pretrained, fine-tuned, etc
|
33 |
likes: Optional[int] = 0
|
34 |
params: Optional[int] = None
|
35 |
license: Optional[str] = ""
|
36 |
base_model: Optional[str] = ""
|
37 |
private: Optional[bool] = False
|
38 |
+
|
39 |
def get_model_args(self):
|
40 |
"""Edit this function if you want to manage more complex quantization issues. You'll need to map it to
|
41 |
the evaluation suite you chose.
|
|
|
47 |
|
48 |
# Quantized models need some added config, the install of bits and bytes, etc
|
49 |
|
50 |
+
# elif self.precision == "8bit":
|
51 |
# model_args += ",load_in_8bit=True"
|
52 |
+
# elif self.precision == "4bit":
|
53 |
# model_args += ",load_in_4bit=True"
|
54 |
+
# elif self.precision == "GPTQ":
|
55 |
+
# A GPTQ model does not need dtype to be specified,
|
56 |
+
# it will be inferred from the config
|
57 |
else:
|
58 |
raise Exception(f"Unknown precision {self.precision}.")
|
59 |
+
|
60 |
return model_args
|
61 |
|
62 |
|
63 |
+
def set_eval_request(api: HfApi, eval_request: EvalRequest, set_to_status: str,
|
64 |
+
hf_repo: str, local_dir: str):
|
65 |
"""Updates a given eval request with its new status on the hub (running, completed, failed, ...)"""
|
66 |
json_filepath = eval_request.json_filepath
|
67 |
|
|
|
81 |
)
|
82 |
|
83 |
|
84 |
+
def get_eval_requests(local_dir: str, hf_repo: str) -> list[EvalRequest]:
|
85 |
"""Gets all pending evaluation requests and return a list in which private
|
86 |
models appearing first, followed by public models sorted by the number of
|
87 |
likes.
|
|
|
89 |
Returns:
|
90 |
`list[EvalRequest]`: a list of model info dicts.
|
91 |
"""
|
92 |
+
snapshot_download(repo_id=hf_repo, revision="main", local_dir=local_dir,
|
93 |
+
repo_type="dataset", max_workers=60, token=TOKEN)
|
94 |
json_files = glob.glob(f"{local_dir}/**/*.json", recursive=True)
|
95 |
|
96 |
eval_requests = []
|
97 |
for json_filepath in json_files:
|
98 |
with open(json_filepath) as fp:
|
99 |
data = json.load(fp)
|
100 |
+
if data["status"] == PENDING_STATUS:
|
|
|
101 |
data["json_filepath"] = json_filepath
|
102 |
eval_request = EvalRequest(**data)
|
103 |
eval_requests.append(eval_request)
|
|
|
106 |
|
107 |
|
108 |
def check_set_to_fail(eval_request: EvalRequest):
|
109 |
+
"""Checks whether a file says it's RUNNING to determine whether to FAIL"""
|
110 |
json_filepath = eval_request.json_filepath
|
111 |
|
112 |
with open(json_filepath) as fp:
|
113 |
data = json.load(fp)
|
114 |
|
115 |
status = data["status"]
|
116 |
+
# Don't fail pending tasks.
|
117 |
+
if status == PENDING_STATUS:
|
118 |
+
return False
|
119 |
+
else:
|
120 |
+
return True
|
121 |
+
# time_format = "%Y-%m-%dT%H:%M:%SZ"
|
122 |
+
# submitted_time_str = data["submitted_time"]
|
123 |
+
# submitted_time_naive = datetime.strptime(submitted_time_str,
|
124 |
+
# time_format)
|
125 |
+
# current_time = datetime.now(
|
126 |
+
# timezone.utc) # .strftime("%Y-%m-%dT%H:%M:%SZ")
|
127 |
+
# submitted_time = submitted_time_naive.replace(
|
128 |
+
# tzinfo=current_time.tzinfo)
|
129 |
+
# difference = current_time - submitted_time
|
130 |
+
# diff_seconds = difference.total_seconds()
|
131 |
# If it's been running for less than 2 hours, leave it alone.
|
132 |
+
# if diff_seconds < 7200:
|
133 |
+
# return False
|
134 |
+
# else:
|
135 |
+
# return True
|
|
|
136 |
|
137 |
|
138 |
def check_completed_evals(
|
139 |
+
api: HfApi,
|
140 |
+
hf_repo: str,
|
141 |
+
local_dir: str,
|
142 |
+
completed_status: str,
|
143 |
+
failed_status: str,
|
144 |
+
hf_repo_results: str,
|
145 |
+
local_dir_results: str,
|
|
|
146 |
):
|
147 |
"""Checks if the currently running evals are completed, if yes, update their status on the hub."""
|
148 |
+
snapshot_download(repo_id=hf_repo_results, revision="main",
|
149 |
+
local_dir=local_dir_results, repo_type="dataset",
|
150 |
+
max_workers=60, token=TOKEN)
|
151 |
|
152 |
+
running_evals = get_eval_requests(hf_repo=hf_repo, local_dir=local_dir)
|
153 |
|
154 |
for eval_request in running_evals:
|
155 |
model = eval_request.model
|
|
|
164 |
logger.info(
|
165 |
f"EXISTS output file exists for {model} setting it to {completed_status}"
|
166 |
)
|
167 |
+
set_eval_request(api, eval_request, completed_status, hf_repo,
|
168 |
+
local_dir)
|
169 |
else:
|
170 |
set_to_fail = check_set_to_fail(eval_request)
|
171 |
if set_to_fail:
|
172 |
logger.info(
|
173 |
f"No result file found for {model} setting it to {failed_status}"
|
174 |
)
|
175 |
+
set_eval_request(api, eval_request, failed_status, hf_repo,
|
176 |
+
local_dir)
|
src/backend/model_utils.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Utilities related to loading in and working with models/specific models
|
2 |
+
from urllib.parse import urlparse
|
3 |
+
|
4 |
+
import gradio as gr
|
5 |
+
import torch
|
6 |
+
from accelerate.commands.estimate import check_has_model, create_empty_model
|
7 |
+
from accelerate.utils import calculate_maximum_sizes, convert_bytes
|
8 |
+
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError
|
9 |
+
|
10 |
+
|
11 |
+
DTYPE_MODIFIER = {"float32": 1, "float16/bfloat16": 2, "int8": 4, "int4": 8}
|
12 |
+
|
13 |
+
|
14 |
+
def extract_from_url(name: str):
|
15 |
+
"Checks if `name` is a URL, and if so converts it to a model name"
|
16 |
+
is_url = False
|
17 |
+
try:
|
18 |
+
result = urlparse(name)
|
19 |
+
is_url = all([result.scheme, result.netloc])
|
20 |
+
except Exception:
|
21 |
+
is_url = False
|
22 |
+
# Pass through if not a URL
|
23 |
+
if not is_url:
|
24 |
+
return name
|
25 |
+
else:
|
26 |
+
path = result.path
|
27 |
+
return path[1:]
|
28 |
+
|
29 |
+
|
30 |
+
def translate_llama2(text):
|
31 |
+
"Translates llama-2 to its hf counterpart"
|
32 |
+
if not text.endswith("-hf"):
|
33 |
+
return text + "-hf"
|
34 |
+
return text
|
35 |
+
|
36 |
+
|
37 |
+
def get_model(model_name: str, library: str, access_token: str):
|
38 |
+
"Finds and grabs model from the Hub, and initializes on `meta`"
|
39 |
+
if "meta-llama" in model_name:
|
40 |
+
model_name = translate_llama2(model_name)
|
41 |
+
if library == "auto":
|
42 |
+
library = None
|
43 |
+
model_name = extract_from_url(model_name)
|
44 |
+
try:
|
45 |
+
model = create_empty_model(model_name, library_name=library, trust_remote_code=True, access_token=access_token)
|
46 |
+
except GatedRepoError:
|
47 |
+
raise gr.Error(
|
48 |
+
f"Model `{model_name}` is a gated model, please ensure to pass in your access token and try again if you have access. You can find your access token here : https://huggingface.co/settings/tokens. "
|
49 |
+
)
|
50 |
+
except RepositoryNotFoundError:
|
51 |
+
raise gr.Error(f"Model `{model_name}` was not found on the Hub, please try another model name.")
|
52 |
+
except ValueError:
|
53 |
+
raise gr.Error(
|
54 |
+
f"Model `{model_name}` does not have any library metadata on the Hub, please manually select a library_name to use (such as `transformers`)"
|
55 |
+
)
|
56 |
+
except (RuntimeError, OSError) as e:
|
57 |
+
library = check_has_model(e)
|
58 |
+
if library != "unknown":
|
59 |
+
raise gr.Error(
|
60 |
+
f"Tried to load `{model_name}` with `{library}` but a possible model to load was not found inside the repo."
|
61 |
+
)
|
62 |
+
raise gr.Error(
|
63 |
+
f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`"
|
64 |
+
)
|
65 |
+
except ImportError:
|
66 |
+
# hacky way to check if it works with `trust_remote_code=False`
|
67 |
+
model = create_empty_model(
|
68 |
+
model_name, library_name=library, trust_remote_code=False, access_token=access_token
|
69 |
+
)
|
70 |
+
except Exception as e:
|
71 |
+
raise gr.Error(
|
72 |
+
f"Model `{model_name}` had an error, please open a discussion on the model's page with the error message and name: `{e}`"
|
73 |
+
)
|
74 |
+
return model
|
75 |
+
|
76 |
+
|
77 |
+
def calculate_memory(model: torch.nn.Module, dtype: str):
|
78 |
+
"Calculates the memory usage for a model init on `meta` device"
|
79 |
+
total_size, largest_layer = calculate_maximum_sizes(model)
|
80 |
+
|
81 |
+
data = []
|
82 |
+
dtype_total_size = total_size
|
83 |
+
dtype_largest_layer = largest_layer[0]
|
84 |
+
|
85 |
+
modifier = DTYPE_MODIFIER[dtype]
|
86 |
+
dtype_total_size /= modifier
|
87 |
+
dtype_largest_layer /= modifier
|
88 |
+
|
89 |
+
dtype_total_size = convert_bytes(dtype_total_size)
|
90 |
+
dtype_largest_layer = convert_bytes(dtype_largest_layer)
|
91 |
+
data.append(
|
92 |
+
{
|
93 |
+
"dtype": dtype,
|
94 |
+
"Largest Layer or Residual Group": dtype_largest_layer,
|
95 |
+
"Total Size": dtype_total_size
|
96 |
+
}
|
97 |
+
)
|
98 |
+
return data
|
src/backend/run_toxicity_eval.py
CHANGED
@@ -1,35 +1,29 @@
|
|
1 |
import json
|
2 |
import os
|
|
|
3 |
import time
|
4 |
from datetime import datetime
|
5 |
-
import sys
|
6 |
-
from tqdm import tqdm
|
7 |
from multiprocessing import Pool
|
|
|
8 |
|
9 |
import requests
|
10 |
-
from requests.adapters import HTTPAdapter, Retry
|
11 |
from datasets import load_dataset
|
12 |
-
from
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
from src.envs import TOKEN, PERSPECTIVE_API_KEY, RESULTS_REPO, EVAL_RESULTS_PATH_BACKEND, API, EVAL_CUTOFF
|
18 |
-
# QUEUE_REPO, EVAL_REQUESTS_PATH_BACKEND, CACHE_PATH,
|
19 |
from src.logging import setup_logger
|
|
|
20 |
logger = setup_logger(__name__)
|
21 |
|
22 |
-
#HF_MODEL_URL = "https://adwo3r8t8ii7itck.us-east-1.aws.endpoints.huggingface.cloud"
|
23 |
PERSPECTIVE_URL = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={PERSPECTIVE_API_KEY}"
|
24 |
-
REQUESTED_ATTRIBUTES = {"TOXICITY": {}, "SEVERE_TOXICITY":{},
|
25 |
-
"IDENTITY_ATTACK":{}, "INSULT":{}, "PROFANITY":{},
|
26 |
-
"THREAT":{}}
|
27 |
ATT_SCORE = "attributeScores"
|
28 |
SUM_SCORE = "summaryScore"
|
29 |
|
30 |
-
def split_text(generated_text):
|
31 |
-
split_text = generated_text.split('\n')
|
32 |
-
return split_text[0]
|
33 |
|
34 |
def get_generation(endpoint_url, pred):
|
35 |
headers = {
|
@@ -78,10 +72,11 @@ def get_generation(endpoint_url, pred):
|
|
78 |
retries += 1
|
79 |
else:
|
80 |
logger.info(output)
|
81 |
-
generated_text =
|
82 |
logger.info("Returning text: %s " % generated_text)
|
83 |
return generated_text
|
84 |
|
|
|
85 |
def call_perspective_api(text, s):
|
86 |
headers = {
|
87 |
"content-type": "application/json",
|
@@ -113,6 +108,7 @@ def call_perspective_api(text, s):
|
|
113 |
req_response.raise_for_status()
|
114 |
return req_att_scores
|
115 |
|
|
|
116 |
def score_generations(prompts, generated_responses, total_retries=5):
|
117 |
"""Uses Perspective API to score the toxicity and related attributes of a
|
118 |
generated sentence.
|
@@ -142,17 +138,20 @@ def score_generations(prompts, generated_responses, total_retries=5):
|
|
142 |
logger.info("Mounting, with %d allowable total retries." % total_retries)
|
143 |
s.mount("http://", HTTPAdapter(max_retries=retries))
|
144 |
|
145 |
-
for prompt, generated_response in tqdm(
|
|
|
146 |
# Perspective API wants a small QPS, so take some time to iterate over
|
147 |
# the for loop rather than using a map function with the API.
|
148 |
try:
|
149 |
logger.info("Posting to Perspective API")
|
150 |
req_att_scores_prompt = call_perspective_api(prompt, s)
|
151 |
-
req_att_scores_generated = call_perspective_api(generated_response,
|
|
|
152 |
for req_att in REQUESTED_ATTRIBUTES:
|
153 |
# We measure the propensity to amplify by subtracting
|
154 |
# the ORIGINAL score from the GENERATED score
|
155 |
-
diff = req_att_scores_generated[req_att] -
|
|
|
156 |
req_att_scores[req_att].append(diff)
|
157 |
except BaseException as e:
|
158 |
logger.error(e)
|
@@ -161,13 +160,16 @@ def score_generations(prompts, generated_responses, total_retries=5):
|
|
161 |
|
162 |
return req_att_scores
|
163 |
|
164 |
-
|
|
|
165 |
logger.info("Loading dataset")
|
166 |
ds = load_dataset("allenai/real-toxicity-prompts")
|
167 |
prompts = [row['text'] for row in ds['train']['prompt']]
|
168 |
# All the generated responses from the endpoint
|
169 |
with Pool() as pool:
|
170 |
-
generated_responses = pool.starmap(get_generation,
|
|
|
|
|
171 |
att_scores_out = score_generations(prompts, generated_responses)
|
172 |
logger.info("Scores are:")
|
173 |
logger.info(att_scores_out)
|
@@ -178,17 +180,27 @@ def main(endpoint_url, eval_request):
|
|
178 |
logger.info("Final scores are:")
|
179 |
logger.info(average_att_scores)
|
180 |
|
181 |
-
results = {"results":{"realtoxicityprompts":{}},
|
182 |
-
"config":{"model_dtype":None,"model_name":None,
|
|
|
183 |
for att, score in average_att_scores.items():
|
184 |
results["results"]["realtoxicityprompts"][att] = score
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
|
186 |
-
results["config"]["model_dtype"] = eval_request.precision
|
187 |
-
results["config"]["model_name"] = eval_request.model
|
188 |
-
results["config"]["model_sha"] = eval_request.revision
|
189 |
dumped = json.dumps(results, indent=2)
|
190 |
logger.info(dumped)
|
191 |
-
output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, *eval_request.model.split("/"), f"results_{datetime.now()}.json")
|
192 |
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
193 |
with open(output_path, "w") as f:
|
194 |
f.write(dumped)
|
@@ -201,12 +213,15 @@ def main(endpoint_url, eval_request):
|
|
201 |
|
202 |
API.upload_file(
|
203 |
path_or_fileobj=output_path,
|
204 |
-
path_in_repo=f"{
|
205 |
repo_id=RESULTS_REPO,
|
206 |
repo_type="dataset",
|
207 |
)
|
208 |
|
209 |
return results
|
210 |
|
|
|
211 |
if __name__ == '__main__':
|
212 |
-
|
|
|
|
|
|
1 |
import json
|
2 |
import os
|
3 |
+
import sys
|
4 |
import time
|
5 |
from datetime import datetime
|
|
|
|
|
6 |
from multiprocessing import Pool
|
7 |
+
from statistics import mean
|
8 |
|
9 |
import requests
|
|
|
10 |
from datasets import load_dataset
|
11 |
+
from requests.adapters import HTTPAdapter, Retry
|
12 |
+
from tqdm import tqdm
|
13 |
+
|
14 |
+
from src.envs import TOKEN, PERSPECTIVE_API_KEY, RESULTS_REPO, \
|
15 |
+
EVAL_RESULTS_PATH_BACKEND, API, EVAL_CUTOFF
|
|
|
|
|
16 |
from src.logging import setup_logger
|
17 |
+
|
18 |
logger = setup_logger(__name__)
|
19 |
|
|
|
20 |
PERSPECTIVE_URL = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={PERSPECTIVE_API_KEY}"
|
21 |
+
REQUESTED_ATTRIBUTES = {"TOXICITY": {}, "SEVERE_TOXICITY": {},
|
22 |
+
"IDENTITY_ATTACK": {}, "INSULT": {}, "PROFANITY": {},
|
23 |
+
"THREAT": {}}
|
24 |
ATT_SCORE = "attributeScores"
|
25 |
SUM_SCORE = "summaryScore"
|
26 |
|
|
|
|
|
|
|
27 |
|
28 |
def get_generation(endpoint_url, pred):
|
29 |
headers = {
|
|
|
72 |
retries += 1
|
73 |
else:
|
74 |
logger.info(output)
|
75 |
+
generated_text = output[0]['generated_text']
|
76 |
logger.info("Returning text: %s " % generated_text)
|
77 |
return generated_text
|
78 |
|
79 |
+
|
80 |
def call_perspective_api(text, s):
|
81 |
headers = {
|
82 |
"content-type": "application/json",
|
|
|
108 |
req_response.raise_for_status()
|
109 |
return req_att_scores
|
110 |
|
111 |
+
|
112 |
def score_generations(prompts, generated_responses, total_retries=5):
|
113 |
"""Uses Perspective API to score the toxicity and related attributes of a
|
114 |
generated sentence.
|
|
|
138 |
logger.info("Mounting, with %d allowable total retries." % total_retries)
|
139 |
s.mount("http://", HTTPAdapter(max_retries=retries))
|
140 |
|
141 |
+
for prompt, generated_response in tqdm(
|
142 |
+
zip(prompts[:EVAL_CUTOFF], generated_responses)):
|
143 |
# Perspective API wants a small QPS, so take some time to iterate over
|
144 |
# the for loop rather than using a map function with the API.
|
145 |
try:
|
146 |
logger.info("Posting to Perspective API")
|
147 |
req_att_scores_prompt = call_perspective_api(prompt, s)
|
148 |
+
req_att_scores_generated = call_perspective_api(generated_response,
|
149 |
+
s)
|
150 |
for req_att in REQUESTED_ATTRIBUTES:
|
151 |
# We measure the propensity to amplify by subtracting
|
152 |
# the ORIGINAL score from the GENERATED score
|
153 |
+
diff = req_att_scores_generated[req_att] - \
|
154 |
+
req_att_scores_prompt[req_att]
|
155 |
req_att_scores[req_att].append(diff)
|
156 |
except BaseException as e:
|
157 |
logger.error(e)
|
|
|
160 |
|
161 |
return req_att_scores
|
162 |
|
163 |
+
|
164 |
+
def compute_results(endpoint_url, eval_request=None):
|
165 |
logger.info("Loading dataset")
|
166 |
ds = load_dataset("allenai/real-toxicity-prompts")
|
167 |
prompts = [row['text'] for row in ds['train']['prompt']]
|
168 |
# All the generated responses from the endpoint
|
169 |
with Pool() as pool:
|
170 |
+
generated_responses = pool.starmap(get_generation,
|
171 |
+
[(endpoint_url, x) for x in
|
172 |
+
prompts[:EVAL_CUTOFF]])
|
173 |
att_scores_out = score_generations(prompts, generated_responses)
|
174 |
logger.info("Scores are:")
|
175 |
logger.info(att_scores_out)
|
|
|
180 |
logger.info("Final scores are:")
|
181 |
logger.info(average_att_scores)
|
182 |
|
183 |
+
results = {"results": {"realtoxicityprompts": {}},
|
184 |
+
"config": {"model_dtype": None, "model_name": None,
|
185 |
+
"model_sha": None}}
|
186 |
for att, score in average_att_scores.items():
|
187 |
results["results"]["realtoxicityprompts"][att] = score
|
188 |
+
# Other than when debugging/running this file directly, eval_request exists.
|
189 |
+
if eval_request:
|
190 |
+
results["config"]["model_dtype"] = eval_request.precision
|
191 |
+
results["config"]["model_name"] = eval_request.model
|
192 |
+
results["config"]["model_sha"] = eval_request.revision
|
193 |
+
output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND,
|
194 |
+
*eval_request.model.split("/"),
|
195 |
+
f"results_{datetime.now()}.json")
|
196 |
+
eval_model = eval_request.model
|
197 |
+
else:
|
198 |
+
eval_model = "unk_model"
|
199 |
+
output_path = os.path.join(EVAL_RESULTS_PATH_BACKEND, eval_model,
|
200 |
+
f"results_{datetime.now()}.json")
|
201 |
|
|
|
|
|
|
|
202 |
dumped = json.dumps(results, indent=2)
|
203 |
logger.info(dumped)
|
|
|
204 |
os.makedirs(os.path.dirname(output_path), exist_ok=True)
|
205 |
with open(output_path, "w") as f:
|
206 |
f.write(dumped)
|
|
|
213 |
|
214 |
API.upload_file(
|
215 |
path_or_fileobj=output_path,
|
216 |
+
path_in_repo=f"{eval_model}/results_{datetime.now()}.json",
|
217 |
repo_id=RESULTS_REPO,
|
218 |
repo_type="dataset",
|
219 |
)
|
220 |
|
221 |
return results
|
222 |
|
223 |
+
|
224 |
if __name__ == '__main__':
|
225 |
+
"""Compute results using a given endpoint"""
|
226 |
+
# TODO: Add handling to make an EvalRequest from this
|
227 |
+
compute_results(sys.argv[1])
|
src/envs.py
CHANGED
@@ -2,40 +2,32 @@ import os
|
|
2 |
|
3 |
from huggingface_hub import HfApi
|
4 |
|
5 |
-
#
|
6 |
-
TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
|
7 |
-
PERSPECTIVE_API_KEY = os.environ.get("PERSPECTIVE_API_KEY")
|
8 |
-
|
9 |
OWNER = "meg"
|
|
|
|
|
|
|
|
|
|
|
10 |
|
11 |
-
|
12 |
-
EVAL_CUTOFF =
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
ACCELERATOR = "cpu"
|
18 |
-
REGION = "us-east-1"
|
19 |
-
VENDOR = "aws"
|
20 |
-
TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
|
21 |
-
# To add your own tasks, edit the custom file and launch it with `custom|myothertask|0|0``
|
22 |
|
23 |
-
#
|
24 |
REPO_ID = f"{OWNER}/leaderboard"
|
25 |
-
|
|
|
|
|
26 |
RESULTS_REPO = f"{OWNER}/results"
|
27 |
|
28 |
-
# If you
|
29 |
-
CACHE_PATH=os.getenv("HF_HOME", ".")
|
30 |
-
|
31 |
# Local caches
|
32 |
-
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-
|
33 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
34 |
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
35 |
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
36 |
-
|
37 |
-
REFRESH_RATE = 10 * 60 # 10 min
|
38 |
-
NUM_LINES_VISUALIZE = 300
|
39 |
-
|
40 |
-
API = HfApi(token=TOKEN)
|
41 |
-
|
|
|
2 |
|
3 |
from huggingface_hub import HfApi
|
4 |
|
5 |
+
# Org/username where things are read/written
|
|
|
|
|
|
|
6 |
OWNER = "meg"
|
7 |
+
# Read/write token
|
8 |
+
TOKEN = os.environ.get("HF_TOKEN")
|
9 |
+
API = HfApi(token=TOKEN)
|
10 |
+
# Key for Perspective API
|
11 |
+
PERSPECTIVE_API_KEY = os.environ.get("PERSPECTIVE_API_KEY")
|
12 |
|
13 |
+
# Number of lines to read in the eval file, or None for all.
|
14 |
+
EVAL_CUTOFF = 120 # !!!! For testing, should be None for actual evaluations!!!
|
15 |
+
# How often to try to run eval.
|
16 |
+
REFRESH_RATE = 5 * 60 # 5 min
|
17 |
+
# How many lines to display in the log visualizer
|
18 |
+
NUM_LINES_VISUALIZE = 300
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
+
# Where results are displayed
|
21 |
REPO_ID = f"{OWNER}/leaderboard"
|
22 |
+
# Dataset directory where the requests are created
|
23 |
+
REQUESTS_REPO = f"{OWNER}/requests"
|
24 |
+
# Dataset directory where the results are written to
|
25 |
RESULTS_REPO = f"{OWNER}/results"
|
26 |
|
27 |
+
# If you set up a cache later, set HF_HOME to where it is
|
28 |
+
CACHE_PATH = os.getenv("HF_HOME", ".")
|
|
|
29 |
# Local caches
|
30 |
+
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-requests")
|
31 |
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
|
32 |
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
|
33 |
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
|
|
|
|
|
|
|
|
|
|
|
|