Spaces:

ManishThota
/

GSoC-Super-Rapid-Annotator

Runtime error

App Files Community

ManishThota committed on Aug 21, 2024

Commit

89b6fe6

verified ·

1 Parent(s): 88cedc3

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -26

app.py CHANGED Viewed

@@ -6,6 +6,7 @@ from src.video_model import describe_video
 from src.utils import parse_string, parse_annotations
 import os
 # # --- Function to construct the final query ---
 # def process_video_and_questions(video, sitting, hands, location, screen):
 #     # Extract the video name (filename)
@@ -15,32 +16,29 @@ import os
 #     query = f"Describe the video in detail and answer the questions"
 #     additional_info = []
 #     if sitting:
-#         additional_info.append("Is the subject in the video standing or sitting?")
 #     if hands:
-#         additional_info.append("Is the subject holding any object in their hands, if so the hands are not free else they are free?")
 #     if location:
-#         additional_info.append("Is the subject present indoors or outdoors?")
 #     if screen:
-#         additional_info.append("Is the subject interacting with a screen in the background by facing the screen?")
 #     end_query = """Provide the results in <annotation> tags, where 0 indicates False, 1 indicates True, and None indicates that no information is present. Below is an example:
-#         <instructions>
-#             <annotation>indoors: 0</annotation>
-#             <annotation>standing: 1</annotation>
-#             <annotation>hands.free: None</annotation>
-#             <annotation>screen.interaction_yes: None</annotation>
-#         </instructions>
 #         """
-    # final_query = query + " " + " ".join(additional_info)
-    # final_prompt = final_query + " " + end_query
-    # # Assuming your describe_video function handles the video processing
-    # response = describe_video(video, final_prompt)
-    # final_response = f"<video_name>{video_name}</video_name>" + " " + response
-    # return final_response
 def process_video_and_questions(video, sitting, hands, location, screen):
     # Extract the video name (filename)
@@ -71,13 +69,22 @@ def process_video_and_questions(video, sitting, hands, location, screen):
     else:
         additional_info.append("<annotation>screen.interaction_yes: None</annotation>")
-    # Updated end_query string with clear explanation and example
-    end_query = """Provide the results in <annotation> tags, where 0 indicates False, 1 indicates True, and None indicates that no information is present. Below is an example:
-    <annotation>indoors: 0</annotation>
-    <annotation>standing: 1</annotation>
-    <annotation>hands.free: None</annotation>
-    <annotation>screen.interaction_yes: None</annotation>
     """
     final_query = query + " " + " ".join(additional_info)
@@ -89,7 +96,6 @@ def process_video_and_questions(video, sitting, hands, location, screen):
     return final_response
 def output_to_csv(final_response):
     # Parse the string to get the content
     parsed_content = parse_string(final_response, ["video_name", "annotation"])

 from src.utils import parse_string, parse_annotations
 import os
 # # --- Function to construct the final query ---
 # def process_video_and_questions(video, sitting, hands, location, screen):
 #     # Extract the video name (filename)
 #     query = f"Describe the video in detail and answer the questions"
 #     additional_info = []
 #     if sitting:
+#         additional_info.append("sitting/standing : Is the subject in the video standing or sitting?")
 #     if hands:
+#         additional_info.append("hands_free: Is the subject holding any object in their hands, if so the hands are not free else they are free?")
 #     if location:
+#         additional_info.append("indoors/outdoors: Is the subject present indoors or outdoors?")
 #     if screen:
+#         additional_info.append("screen_interactions: Is the subject interacting with a screen in the background by facing the screen?")
 #     end_query = """Provide the results in <annotation> tags, where 0 indicates False, 1 indicates True, and None indicates that no information is present. Below is an example:
+#         <annotation>indoors: 0</annotation>
+#         <annotation>standing: 1</annotation>
+#         <annotation>hands.free: None</annotation>
+#         <annotation>screen.interaction_yes: None</annotation>
 #         """
+#     final_query = query + " " + " ".join(additional_info)
+#     final_prompt = final_query + " " + end_query
+#     # Assuming your describe_video function handles the video processing
+#     response = describe_video(video, final_prompt)
+#     final_response = f"<video_name>{video_name}</video_name>" + " " + response
+#     return final_response
 def process_video_and_questions(video, sitting, hands, location, screen):
     # Extract the video name (filename)
     else:
         additional_info.append("<annotation>screen.interaction_yes: None</annotation>")
+    # Updated end_query with structured prompt
+    end_query = """
+    You're an AI assistant, and your goal is to provide the results of the video analysis in the correct format as described below:
+    <annotations>
+    - Provide the results in <annotation> tags, where 0 indicates False, 1 indicates True, and None indicates that no information is present.
+    - Use <annotation> tags for each attribute like indoors, standing, hands.free, and screen.interaction_yes.
+    </annotations>
+    <example>
+    - Here's an example of the expected format:
+      <annotation>indoors: 0</annotation>
+      <annotation>standing: 1</annotation>
+      <annotation>hands.free: None</annotation>
+      <annotation>screen.interaction_yes: None</annotation>
+    </example>
     """
     final_query = query + " " + " ".join(additional_info)
     return final_response
 def output_to_csv(final_response):
     # Parse the string to get the content
     parsed_content = parse_string(final_response, ["video_name", "annotation"])