import json

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from pydantic import BaseModel, ValidationError
import spaces

# Fix the torch RNG seed so repeated runs start from the same state.
torch.random.manual_seed(0)

# The model, tokenizer and pipeline are created once at import time and
# reused by every call to process_description.
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)


# Pydantic class for output validation
class VideoAnalysis(BaseModel):
    """Schema of the JSON answer requested from the LLM.

    Each field holds the integer flag the model returned for one question.
    """

    # NOTE(review): the prompt asks for 0 = True and 1 = False, the reverse of
    # the usual convention -- confirm downstream consumers expect this.
    indoor: int
    hands_free: int
    screen_interaction: int
    standing: int


def _extract_json_object(text):
    """Return the first decodable JSON object embedded in *text*.

    LLM output frequently wraps the requested JSON in extra prose or a
    Markdown code fence. Every ``{`` is tried in order as the start of an
    object; the first span that decodes is returned verbatim. When nothing
    decodes, *text* is returned unchanged so the caller's validation reports
    the original failure.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            _, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            start = text.find("{", start + 1)
        else:
            return text[start:end]
    return text


@spaces.GPU(duration=100)
def process_description(description):
    """Ask the LLM four yes/no questions about a video description.

    Args:
        description: Free-text description of the video, interpolated into
            the prompt as-is.

    Returns:
        On success, a JSON string with the integer fields ``indoor``,
        ``hands_free``, ``screen_interaction`` and ``standing``.
        On failure, the dict ``{"error": "Could not process the video
        description."}``. NOTE(review): the success and failure paths return
        different types (str vs. dict); kept as-is for existing callers.
    """
    # Construct a prompt for your LLM based on the video description
    prompt = f"""
    You are a helpful AI assistant. Analyze the following video description and answer the questions with 0 for True and 1 for False:

    Video Description: {description}

    Questions:
    - Is the scene indoors?
    - Are the subject's hands free?
    - Is there screen interaction by the subject?
    - Is the subject standing?

    Provide your answers in JSON format like this:
    {{"indoor": 0, "hands_free": 1, "screen_interaction": 0, "standing": 1}}
    """

    generation_args = {
        "max_new_tokens": 100,  # Adjust as needed
        "return_full_text": False,  # only the completion, not the prompt
        "temperature": 0.0,
        "do_sample": False,  # greedy decoding
    }

    output = pipe(prompt, **generation_args)
    generated_text = output[0]['generated_text']

    # Strip any prose or code fences the model placed around the JSON answer.
    json_text = _extract_json_object(generated_text)

    try:
        # Parse and validate in one step; ValidationError covers both
        # malformed JSON and a schema mismatch.
        analysis_result = VideoAnalysis.model_validate_json(json_text)
    except ValidationError as e:
        print(f"Error processing LLM output: {e}")
        return {"error": "Could not process the video description."}

    return analysis_result.model_dump_json()  # Return as valid JSON