import gradio as gr
import os
import subprocess
from pathlib import Path
import time
import requests

# Seconds to wait for a service to answer before treating it as down.
# Without a timeout, requests.get() can block forever on a half-open socket.
SERVICE_CHECK_TIMEOUT = 5


def check_services():
    """Check if all required services are running.

    Each service URL is probed with an HTTP GET. Any HTTP response counts as
    "running"; a refused connection or a timeout counts as "not running".

    Returns:
        A ``(ok, message)`` tuple. ``ok`` is False as soon as the first
        unreachable service is found, and ``message`` names that service.
    """
    services = [
        ("Controller", "http://localhost:21001"),
        ("API Server", "http://localhost:8000"),
        ("Model Worker", "http://localhost:8080"),
    ]
    for service_name, url in services:
        try:
            requests.get(url, timeout=SERVICE_CHECK_TIMEOUT)
        except (
            requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
        ):
            return False, f"{service_name} is not running"
        print(f"{service_name} is running")
    return True, "All services are running"


def check_training_status():
    """Return a human-readable training progress message.

    Reports the service error if any service is down; otherwise counts the
    ``iter_*`` entries under ``/app/results``.
    """
    # First check if services are running
    services_ok, message = check_services()
    if not services_ok:
        return message

    results_dir = Path("/app/results")
    if not results_dir.exists():
        return "Training hasn't started yet."

    iterations = sum(1 for _ in results_dir.glob("iter_*"))
    return f"Completed {iterations} training iterations."


def start_training(model_path, instruct_count, max_iter):
    """Run ``run.sh`` in ``/app/qwen`` with the given settings and report the result.

    The settings are handed to the script through the ``MODEL_PATH``,
    ``INSTRUCT_COUNT`` and ``MAX_ITER`` environment variables. The call blocks
    until the script exits.

    Args:
        model_path: Path to the base model, exported as ``MODEL_PATH``.
        instruct_count: Number of instructions; truncated to an integer.
        max_iter: Number of iterations; truncated to an integer.

    Returns:
        A status message string (success, service error, invalid input, or
        the subprocess failure).
    """
    # Check if services are running
    services_ok, message = check_services()
    if not services_ok:
        return message

    # The numeric inputs may arrive as floats or None (cleared field); export
    # clean integers so the shell script never sees "5000.0" or "None".
    try:
        instruct_total = int(instruct_count)
        iteration_limit = int(max_iter)
    except (TypeError, ValueError, OverflowError):
        return "Instruction Count and Max Iterations must be whole numbers."

    # Pass settings via a per-call environment copy rather than mutating the
    # process-wide os.environ, which would race between concurrent requests.
    child_env = {
        **os.environ,
        "MODEL_PATH": model_path,
        "INSTRUCT_COUNT": str(instruct_total),
        "MAX_ITER": str(iteration_limit),
    }

    try:
        subprocess.run(
            ["bash", "run.sh"], check=True, cwd="/app/qwen", env=child_env
        )
    except subprocess.CalledProcessError as e:
        return f"Error during training: {str(e)}"
    except OSError as e:
        # bash or the working directory is missing / not executable.
        return f"Error during training: {str(e)}"
    return "Training completed successfully!"


# Create the interface
with gr.Blocks() as iface:
    gr.Markdown("# Self-Lengthen Training Interface")

    with gr.Row():
        with gr.Column():
            model_path = gr.Textbox(
                label="Model Path",
                value="/app/models/base_model",
                info="Path to the base model",
            )
            instruct_count = gr.Number(
                label="Instruction Count",
                value=5000,
                minimum=100,
                info="Number of instructions to generate",
            )
            max_iter = gr.Number(
                label="Max Iterations",
                value=3,
                minimum=1,
                info="Number of training iterations",
            )
            train_btn = gr.Button("Start Training")

        with gr.Column():
            status_output = gr.Textbox(
                label="Status",
                value="Ready to start training...",
                interactive=False,
            )
            refresh_btn = gr.Button("Refresh Status")

    train_btn.click(
        fn=start_training,
        inputs=[model_path, instruct_count, max_iter],
        outputs=status_output,
    )
    refresh_btn.click(
        fn=check_training_status,
        inputs=None,
        outputs=status_output,
    )


if __name__ == "__main__":
    # Wait for services to be ready
    print("Waiting for services to start...")
    while True:
        services_ok, message = check_services()
        if services_ok:
            break
        print(message)
        time.sleep(5)

    print("All services are running, starting web interface...")
    iface.launch(server_name="0.0.0.0", server_port=7860)