# NOTE: the following three lines are residue from the web file viewer
# (file size, per-line blame hashes, line-number gutter), not program text.
# File size: 2,840 Bytes
# 977063a 5051da6 f9d0ccd 977063a 5051da6 f9d0ccd 5051da6 f9d0ccd 5051da6 f9d0ccd 5051da6 11fbe39 6853b2a f9d0ccd 6853b2a f9d0ccd 5051da6 f9d0ccd 5051da6 11fbe39 6853b2a 5051da6 f9d0ccd 5051da6 49c6a0b 5051da6 f9d0ccd 5051da6 f9d0ccd 5051da6 49c6a0b 5051da6 49c6a0b 977063a |
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
import gradio as gr
import subprocess
import os
import re
from datetime import datetime
def run_evaluation(model_name):
    """Run inference and then evaluation for one OpenRouter model.

    Two scripts are executed inside the ``duckdb-nsql/`` directory:
    ``eval/predict.py predict`` followed by ``eval/evaluate.py evaluate``.
    The prediction file name is taken from the first ``*.json`` path that
    appears in the inference script's stdout.

    Args:
        model_name: Engine identifier passed to ``--manifest-engine``
            (e.g. ``qwen/qwen-2.5-72b-instruct``). This comes straight from
            a UI textbox, so it is treated as untrusted input.

    Returns:
        A human-readable status report: progress and error messages joined
        by blank lines, or a single ``Error: ...`` line when a precondition
        fails. Errors are reported in the returned text rather than raised.
    """
    # The API key is read from the process environment (a Space secret).
    if "OPENROUTER_API_KEY" not in os.environ:
        return "Error: OPENROUTER_API_KEY not found in environment variables."

    # Surrounding whitespace was previously discarded by shell word
    # splitting; strip it explicitly now that no shell is involved.
    model_name = (model_name or "").strip()
    if not model_name:
        return "Error: model name must not be empty."
    # A leading dash would be parsed by the CLI as an option, not a value.
    if model_name.startswith("-"):
        return "Error: invalid model name."

    results = []
    repo_dir = "duckdb-nsql"

    try:
        # Commands are argv lists executed without a shell, so the
        # user-supplied model name can never be interpreted as shell syntax.
        inference_cmd = [
            "python", "eval/predict.py",
            "predict",
            "eval/data/dev.json",
            "eval/data/tables.json",
            "--output-dir", "output/",
            "--stop-tokens", ";",
            "--max-tokens", "30000",
            "--overwrite-manifest",
            "--manifest-client", "openrouter",
            "--manifest-engine", model_name,
            "--prompt-format", "duckdbinstgraniteshort",
        ]
        inference_result = subprocess.run(
            inference_cmd,
            cwd=repo_dir,
            check=True,
            capture_output=True,
            text=True,
            env=os.environ.copy(),
        )
        results.append("Inference completed.")

        # Extract JSON file path from inference output.
        json_path_match = re.search(r'(.*\.json)', inference_result.stdout)
        if not json_path_match:
            raise ValueError("Could not find JSON file path in inference output")
        # Only the file name is kept; the file is looked up under output/.
        json_file = os.path.basename(json_path_match.group(1))
        results.append(f"Generated JSON file: {json_file}")

        eval_cmd = [
            "python", "eval/evaluate.py", "evaluate",
            "--gold", "eval/data/dev.json",
            "--db", "eval/data/databases/",
            "--tables", "eval/data/tables.json",
            "--output-dir", "output/",
            "--pred", f"output/{json_file}",
        ]
        eval_result = subprocess.run(
            eval_cmd,
            cwd=repo_dir,
            check=True,
            capture_output=True,
            text=True,
        )

        # The evaluation script's stdout is reported verbatim as the metrics.
        metrics = eval_result.stdout
        if metrics:
            results.append(f"Evaluation completed:\n{metrics}")
        else:
            results.append("Evaluation completed, but couldn't get metrics.")
    except subprocess.CalledProcessError as e:
        results.append(f"Error occurred: {str(e)}")
        results.append(f"Command output: {e.output}")
        # stderr is captured separately from stdout and usually holds the
        # actual failure reason (e.g. a traceback), so surface it as well.
        if e.stderr:
            results.append(f"Command stderr: {e.stderr}")
    except Exception as e:
        # UI boundary: report any other failure as text instead of raising.
        results.append(f"An unexpected error occurred: {str(e)}")

    return "\n\n".join(results)
# Gradio UI: a single text input for the model name, a button that starts the
# run, and a multi-line text box that shows the report from run_evaluation.
with gr.Blocks() as demo:
    gr.Markdown("# DuckDB SQL Evaluation App (OpenRouter)")
    model_name = gr.Textbox(label="Model Name (e.g., qwen/qwen-2.5-72b-instruct)")
    start_btn = gr.Button("Start Evaluation")
    output = gr.Textbox(label="Output", lines=20)
    # On click, the textbox value is passed to run_evaluation and the
    # returned string is written to the output box.
    start_btn.click(fn=run_evaluation, inputs=[model_name], outputs=output)

# Start the app when the module is executed.
demo.launch()