"""Zephyr-7B coding assistant: fine-tunes on code datasets, then serves a Gradio chat UI."""

import gradio as gr
from huggingface_hub import InferenceClient
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset

# Model id used both for local fine-tuning and for hosted inference.
model_name = "HuggingFaceH4/zephyr-7b-beta"
client = InferenceClient(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Causal-LM tokenizers frequently ship without a pad token; the
# padding="max_length" call in preprocess_code_data would raise without one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def load_code_datasets():
    """Load the coding datasets used for fine-tuning.

    Returns:
        dict: dataset name -> loaded dataset object.

    NOTE(review): "stackexchange" and "github" do not look like valid
    Hugging Face Hub dataset ids as written — confirm the intended dataset
    names before running the fine-tuning path.
    """
    return {
        "CodeSearchNet": load_dataset("code_search_net", "python"),
        "StackOverflow": load_dataset("stackexchange", "stack_overflow"),
        "GitHub": load_dataset("github", "python"),
    }


datasets = load_code_datasets()


def preprocess_code_data(examples):
    """Tokenize the ``code`` column with fixed-length padding and truncation.

    NOTE(review): CodeSearchNet's source-code column is named
    ``func_code_string``, not ``code`` — verify against the loaded schema.
    """
    return tokenizer(examples["code"], padding="max_length", truncation=True)


# Tokenize every dataset once, up front.
tokenized_datasets = {
    name: dataset.map(preprocess_code_data, batched=True)
    for name, dataset in datasets.items()
}

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["CodeSearchNet"]["train"],
    eval_dataset=tokenized_datasets["CodeSearchNet"]["test"],
)

# NOTE(review): this fine-tunes a 7B model at import time; consider guarding
# it behind a CLI flag so importing the module does not start training.
trainer.train()

# System prompt shown (and editable) in the UI; also the default "system"
# role message for every conversation.
system_message = """\
You are an advanced AI assistant specialized in coding. Your purpose is to:
1. Provide error-free, optimal code in multiple programming languages (e.g., Python, JavaScript, Java, C++).
2. Ensure your answers are precise, functional, and concise, avoiding redundant explanations.
3. When handling coding problems, break them into smaller, actionable steps, and provide solutions for each step if applicable.
4. Focus on real-world coding practices, including debugging, refactoring, and optimizing code.
5. In case of incorrect code or errors, identify the issue, explain it briefly, and provide a corrected solution.
6. Always prioritize clear, correct syntax, and follow best practices for coding.

Guidelines:
1. If given code with issues, explain the issues and provide the corrected code without excessive verbosity.
2. Ensure code is tested and runnable with minimal dependencies.
3. Use meaningful variable names and comments where necessary for clarity.
4. If asked to explain code, provide a concise but sufficient explanation for the key parts.

Thank you for using this system. Please proceed with your query.
"""


def validate_inputs(max_tokens, temperature, top_p):
    """Validate generation parameters, raising ValueError on bad values.

    BUG FIX: the original code called ``validate_inputs`` without ever
    defining it, so every chat request died with NameError.

    Args:
        max_tokens: maximum tokens to generate; must be >= 1.
        temperature: sampling temperature; must be > 0.
        top_p: nucleus-sampling probability mass; must be in (0, 1].

    Raises:
        ValueError: if any parameter is out of range.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be >= 1")
    if temperature <= 0:
        raise ValueError("temperature must be > 0")
    if not 0 < top_p <= 1:
        raise ValueError("top_p must be in (0, 1]")


def respond(message, history, system_message, max_tokens, temperature, top_p,
            task_description=None, language=None):
    """Stream a chat completion for *message* given the conversation *history*.

    Args:
        message: the new user message.
        history: list of (user, assistant) message pairs from prior turns.
        system_message: system prompt prepended to the conversation.
        max_tokens / temperature / top_p: generation parameters.
        task_description, language: extra UI inputs (currently unused by the
            generation call). BUG FIX: the Gradio interface wires six
            additional inputs into this callback; without these two
            parameters every call raised TypeError.

    Yields:
        The accumulated response text after each streamed token.
    """
    validate_inputs(max_tokens, temperature, top_p)

    # Rebuild the full conversation for the model.
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    response = ""
    try:
        # Stream tokens; `chunk` deliberately does not shadow the `message`
        # parameter (the original loop variable did).
        for chunk in client.chat_completion(
            messages,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
        ):
            token = chunk.choices[0].delta.content
            # Some stream chunks carry no content (e.g. role-only deltas);
            # concatenating None would raise TypeError mid-stream.
            if token:
                response += token
            yield response
    except Exception as e:
        yield f"An error occurred while generating the response: {str(e)}"


def multi_step_code_generation(problem_statement):
    """Generate code in multiple stages, breaking down the problem.

    Currently a placeholder: it emits a labelled section per stage rather
    than calling the model.
    """
    stages = [
        "1. Understand the problem: Analyze the requirements.",
        "2. Design the basic structure of the solution.",
        "3. Implement core functions and logic.",
        "4. Optimize and refactor the code.",
    ]
    solution_parts = []
    for stage in stages:
        # Simulate AI providing code in steps
        solution_parts.append(f"Solution for Stage: {stage}\n")
    return "\n".join(solution_parts)


def generate_prompt(language, task):
    """Generate a coding prompt for different programming languages.

    Falls back to a language-agnostic prompt for unknown languages.
    """
    prompts = {
        "python": f"Write a Python program to {task}.",
        "javascript": f"Write a JavaScript function to {task}.",
        "java": f"Write a Java program to {task}.",
        "c++": f"Write a C++ function to {task}.",
    }
    return prompts.get(language.lower(), f"Write a program to {task}.")


# Chat UI; the six additional inputs map positionally onto respond()'s
# parameters after (message, history).
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value=system_message, label="System message"),
        gr.Slider(minimum=1, maximum=32768, value=17012, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
        gr.Textbox(label="Task Description", placeholder="Describe your coding task here..."),
        gr.Textbox(label="Programming Language", placeholder="Python, JavaScript, Java, C++, etc."),
    ],
)

if __name__ == "__main__":
    demo.launch()