add file

- .gitignore +4 -0
- app.py +690 -0
- assets/assistant.png +0 -0
- assets/human.png +0 -0
- builtin_plan.json +15 -0
- cllm/agents/__init__.py +2 -0
- cllm/agents/base.py +173 -0
- cllm/agents/builtin/__init__.py +3 -0
- cllm/agents/builtin/plans.py +634 -0
- cllm/agents/builtin/tools.py +1512 -0
- cllm/agents/container.py +98 -0
- cllm/agents/tog/__init__.py +2 -0
- cllm/agents/tog/compiler.py +62 -0
- cllm/agents/tog/controller.py +157 -0
- cllm/agents/tog/interpretor.py +262 -0
- cllm/agents/tog/planner.py +156 -0
- cllm/agents/tog/responser.py +66 -0
- cllm/services/audio/__init__.py +0 -0
- cllm/services/audio/api.py +140 -0
- cllm/services/general/__init__.py +0 -0
- cllm/services/general/api.py +65 -0
- cllm/services/image_editing/__init__.py +0 -0
- cllm/services/image_editing/api.py +277 -0
- cllm/services/image_generation/__init__.py +0 -0
- cllm/services/image_generation/api.py +96 -0
- cllm/services/image_inpainting/__init__.py +0 -0
- cllm/services/image_inpainting/api.py +76 -0
- cllm/services/image_perception/__init__.py +0 -0
- cllm/services/image_perception/api.py +202 -0
- cllm/services/image_processing/__init__.py +0 -0
- cllm/services/image_processing/api.py +63 -0
- cllm/services/nlp/__init__.py +0 -0
- cllm/services/nlp/api.py +163 -0
- cllm/services/nlp/llms/__init__.py +2 -0
- cllm/services/nlp/llms/chat_models.py +219 -0
- cllm/services/nlp/llms/memory/__init__.py +1 -0
- cllm/services/nlp/llms/memory/message_memory.py +131 -0
- cllm/services/nlp/llms/memory/utils.py +52 -0
- cllm/services/tog/__init__.py +2 -0
- cllm/services/tog/api.py +40 -0
- cllm/services/utils.py +50 -0
- cllm/services/video/__init__.py +0 -0
- cllm/services/video/api.py +135 -0
- cllm/services/vqa/__init__.py +0 -0
- cllm/services/vqa/api.py +28 -0
- cllm/utils.py +79 -0
- requirements.txt +14 -0
.gitignore
ADDED
@@ -0,0 +1,4 @@
__pycache__/
run.sh
client_resources/
cllm.log
app.py
ADDED
@@ -0,0 +1,690 @@
import copy
import logging
import os
import os.path as osp
from functools import partial
from pydoc import locate
import shutil
import json
from traceback import print_exc
import uuid
from pathlib import Path
from collections import OrderedDict
import numpy as np
from PIL import Image

import whisper
import fire
import gradio as gr
import gradio.themes.base as ThemeBase
from gradio.themes.utils import colors, fonts, sizes
import os
import sys

sys.path.append(os.getcwd())
from cllm.agents.builtin import plans
from cllm.services.general.api import remote_logging
from cllm.agents import container, FILE_EXT
from cllm.utils import get_real_path, plain2md, md2plain
import openai

openai.api_base = os.environ.get("OPENAI_API_BASE", None)
openai.api_key = os.environ.get("OPENAI_API_KEY", None)


logging.basicConfig(
    filename="cllm.log",
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
)

logger = logging.getLogger(__name__)

RESOURCE_ROOT = os.environ.get("CLIENT_ROOT", "./client_resources")


def is_image(file_path):
    ext = FILE_EXT["image"]
    _, extension = os.path.splitext(file_path)
    return extension[1:] in ext


def is_video(file_path):
    ext = FILE_EXT["video"]
    _, extension = os.path.splitext(file_path)
    return extension[1:] in ext


def is_audio(file_path):
    ext = FILE_EXT["audio"]
    _, extension = os.path.splitext(file_path)
    return extension[1:] in ext


def get_file_type(file_path):
    if is_image(file_path):
        if "mask" in file_path:
            return "mask"
        return "image"
    elif is_video(file_path):
        return "video"
    elif is_audio(file_path):
        return "audio"
    raise ValueError("Invalid file type")


def convert_dict_to_frame(data):
    import pandas

    outputs = []
    for k, v in data.items():
        output = {"Resource": k}
        if not isinstance(v, str):
            output["Type"] = str(v.__class__)
        else:
            output["Type"] = v
        outputs.append(output)
    if len(outputs) == 0:
        return None
    return pandas.DataFrame(outputs)


class Seafoam(ThemeBase.Base):
    def __init__(
        self,
        *,
        primary_hue=colors.emerald,
        secondary_hue=colors.blue,
        neutral_hue=colors.gray,
        spacing_size=sizes.spacing_md,
        radius_size=sizes.radius_md,
        text_size=sizes.text_sm,
    ):
        super().__init__(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            spacing_size=spacing_size,
            radius_size=radius_size,
            text_size=text_size,
        )
        super().set(
            body_background_fill_dark="#111111",
            button_primary_background_fill="*primary_300",
            button_primary_background_fill_hover="*primary_200",
            button_primary_text_color="black",
            button_secondary_background_fill="*secondary_300",
            button_secondary_background_fill_hover="*secondary_200",
            border_color_primary="#0BB9BF",
            slider_color="*secondary_300",
            slider_color_dark="*secondary_600",
            block_title_text_weight="600",
            block_border_width="3px",
            block_shadow="*shadow_drop_lg",
            button_shadow="*shadow_drop_lg",
            button_large_padding="10px",
        )


class InteractionLoop:
    def __init__(
        self,
        controller="cllm.agents.code.Controller",
    ):
        self.stream = True
        Controller = locate(controller)
        self.controller = Controller(stream=self.stream, interpretor_kwargs=dict())
        self.whisper = whisper.load_model("base")

    def _gen_new_name(self, r_type, ext="png"):
        this_new_uuid = str(uuid.uuid4())[:6]
        new_file_name = f"{this_new_uuid}_{r_type}.{ext}"
        return new_file_name

    def init_state(self):
        user_state = OrderedDict()
        user_state["resources"] = OrderedDict()
        user_state["history_msgs"] = []
        resources = OrderedDict()
        for item in sorted(os.listdir("./assets/resources")):
            if item.startswith("."):
                continue
            shutil.copy(
                osp.join("./assets/resources", item),
                osp.join(RESOURCE_ROOT, item),
            )
            resources[item] = get_file_type(item)
        # return user_state, user_state["resources"]
        return user_state, resources

    def add_file(self, user_state, history, file):
        if user_state.get("resources", None) is None:
            user_state["resources"] = OrderedDict()

        if file is None:
            return user_state, None, history, None
        # filename = os.path.basename(file.name)
        file = Path(file)
        ext = file.suffix[1:]
        if ext in FILE_EXT["image"]:
            ext = "png"
        r_type = get_file_type(file.name)
        new_filename = self._gen_new_name(r_type, ext)
        saved_path = get_real_path(new_filename)
        if ext in FILE_EXT["image"]:
            Image.open(file).convert("RGB").save(saved_path, "png")
            user_state["input_image"] = new_filename
        else:
            shutil.copy(file, saved_path)
        logger.info(f"add file: {saved_path}")
        user_state["resources"][new_filename] = r_type
        for key, val in user_state["resources"].items():
            if key == "prompt_points":
                user_state["resources"].pop(key)
                break
        history, _ = self.add_text(history, (saved_path,), role="human", append=False)
        history, _ = self.add_text(
            history, f"Received file {new_filename}", role="assistant", append=False
        )
        memory = convert_dict_to_frame(user_state["resources"])
        image_name = None
        if Path(saved_path).suffix[1:] in FILE_EXT["image"]:
            image_name = saved_path
        return user_state, image_name, history, memory

    def add_msg(self, history, text, audio, role="assistant", append=False):
        if text is not None and text.strip() != "":
            return self.add_text(history, text, role=role, append=append)
        elif audio is not None:
            return self.add_audio(history, audio, role=role, append=append)
        return history, ""

    def add_text(self, history, text, role="assistant", append=False):
        if history is None:
            return history, ""
        assert role in ["human", "assistant"]
        idx = 0
        if len(history) == 0 or role == "human":
            history.append([None, None])
        if role == "assistant":
            idx = 1
            if not append and history[-1][1] is not None:
                history.append([None, None])

        if append:
            history[-1][idx] = (
                text if history[-1][idx] is None else history[-1][idx] + text
            )
        else:
            history[-1][idx] = text
        if isinstance(text, str):
            logger.info(f"add text: {md2plain(text)}")

        return history, ""

    def add_audio(self, history, audio, role="assistant", append=False):
        assert role in ["human", "assistant"]
        result = self.whisper.transcribe(audio)
        text = result["text"]
        logger.info(f"add audio: {text}")
        return self.add_text(history, text, role=role, append=append)

    def plan(self, user_state, input_image, history, history_plan):
        logger.info(f"Task plan...")
        if user_state.get("resources", None) is None:
            user_state["resources"] = OrderedDict()

        request = history[-1][0]
        user_state["request"] = request
        if isinstance(request, str) and request.startswith("$"):
            solution = f'show$("{request[1:]}")'
        else:
            solution = self.controller.plan(request, state=user_state)
        print(f"request: {request}")
        if solution == self.controller.SHORTCUT:
            # md_text = "**Using builtin shortcut solution.**"
            history, _ = self.add_text(
                history, solution, role="assistant", append=False
            )
            user_state["solution"] = solution
            user_state["history_msgs"] = history
            yield user_state, input_image, history, [solution]
        elif isinstance(solution, str) and solution.startswith("show$"):
            user_state["solution"] = solution
            yield user_state, input_image, history, solution
        else:
            output_text = (
                "The whole process will take some time, please be patient.<br><br>"
            )
            history, _ = self.add_text(
                history, output_text, role="assistant", append=True
            )
            yield user_state, input_image, history, history_plan
            task_decomposition = next(solution)
            if task_decomposition in [None, [], ""]:
                output = "Error: unrecognized resource(s) in task decomposition."
                task_decomposition = "[]"
            else:
                output = task_decomposition

            output = f"**Task Decomposition:**\n{output}"
            output = plain2md(output)
            history, _ = self.add_text(history, output, role="assistant", append=True)
            user_state["task_decomposition"] = json.loads(task_decomposition)
            yield user_state, input_image, history, history_plan

            history, _ = self.add_text(
                history,
                plain2md("\n\n**Thoughts-on-Graph:**\n"),
                role="assistant",
                append=True,
            )
            yield user_state, input_image, history, history_plan
            solution_str = next(solution)
            logger.info(f"Thoughts-on-Graph: \n{solution_str}")
            if solution_str in [None, [], ""]:
                output = "Empty solution possibly due to some internal errors."
                solution_str = "[]"
            else:
                output = solution_str

            output_md = plain2md(output)
            history, _ = self.add_text(
                history, output_md, role="assistant", append=True
            )
            solution = json.loads(solution_str)
            user_state["solution"] = solution
            user_state["history_msgs"] = history
            yield user_state, input_image, history, solution

    def execute(self, user_state, input_image, history, history_plan):
        resources_state = user_state.get("resources", OrderedDict())
        solution = user_state.get("solution", None)
        if not solution:
            yield user_state, input_image, history, history_plan
            return
        logger.info(f"Tool execution...")
        if isinstance(solution, str) and solution.startswith("show$"):
            key = solution[7:-2]
            r_type = resources_state.get(key)
            if r_type is None:
                resource = f"{key} not found"
            resource = container.auto_type("None", r_type, key)
            history, _ = self.add_text(
                history, (resource.to_chatbot(),), role="assistant"
            )
            user_state["history_msgs"] = history
            yield user_state, input_image, history, history_plan
            return
        elif solution:
            results = self.controller.execute(solution, state=user_state)
            if not results:
                yield user_state, input_image, history, history_plan
                return

            user_state["outputs"] = []
            for result_per_step, executed_solutions, wrapped_outputs in results:
                tool_name = json.dumps(result_per_step[0], ensure_ascii=False)
                args = json.dumps(result_per_step[1], ensure_ascii=False)
                if isinstance(result_per_step[2], Exception):
                    ret = f"Internal error: {result_per_step[2]}"
                else:
                    ret = json.dumps(result_per_step[2], ensure_ascii=False)
                history, _ = self.add_text(
                    history,
                    f"Call **{tool_name}:**<br> **Args**: {plain2md(args)}<br> **Ret**: {plain2md(ret)}",
                    role="assistant",
                )
                user_state["history_msgs"] = history
                user_state["executed_solutions"] = executed_solutions
                yield user_state, input_image, history, history_plan
                for _, output in enumerate(wrapped_outputs):
                    if output is None or output.value is None:
                        continue
                    if isinstance(output, container.File):
                        history, _ = self.add_text(
                            history,
                            f"Here is {output.filename}:",
                            role="assistant",
                        )
                        history, _ = self.add_text(
                            history, (output.to_chatbot(),), role="assistant"
                        )
                user_state["outputs"].extend(wrapped_outputs)
                user_state["history_msgs"] = history
                yield user_state, input_image, history, history_plan

        else:
            yield user_state, input_image, history, history_plan

    def reply(self, user_state, history):
        logger.info(f"Make response...")
        executed_solution = user_state.get("executed_solutions", None)
        resources_state = user_state.get("resources", OrderedDict())
        solution = user_state.get("solution", None)
        memory = convert_dict_to_frame(resources_state)
        if isinstance(solution, str) and solution.startswith("show$"):
            return user_state, history, memory

        outputs = user_state.get("outputs", None)
        response, user_state = self.controller.reply(
            executed_solution, outputs, user_state
        )
        # prompt_mask_out = None
        for i, output in enumerate(response):
            if isinstance(output, container.File):
                history, _ = self.add_text(history, f"Here is [{output.filename}]: ")
                history, _ = self.add_text(history, (output.to_chatbot(),))
            elif i == 0:
                history, _ = self.add_text(history, output.to_chatbot())

        user_state["history_msgs"] = history
        return user_state, history, memory

    def vote(self, user_state, history, data: gr.LikeData):
        data_value = data.value
        if isinstance(data_value, dict):
            data_value = json.dumps(data_value)

        if data.liked:
            print("You upvoted this response: ", data_value)
            logger.info("You upvoted this response: " + data_value)
        else:
            print("You downvoted this response: ", data_value)
            logger.info("You downvoted this response: " + data_value)

        remote_logging(
            user_state.get("history_msgs", []),
            user_state.get("task_decomposition", ""),
            user_state.get("solution", []),
            data_value,
            data.liked,
        )

        msg = f"Thanks for your feedback! Your feedback will contribute a lot to improving our ControlLLM."
        history, _ = self.add_text(history, msg)
        user_state["history_msgs"] = history
        return user_state, history

    def save_point(self, user_state, history, data: gr.SelectData):
        if isinstance(data, gr.LikeData):
            return self.vote(user_state, history, data)

        if not isinstance(data, gr.SelectData):
            return user_state, history

        resource_state = user_state.get("resources")
        input_image = user_state.get("input_image", None)
        if input_image is None:
            history, _ = self.add_text(history, "Please upload an image at first.")
            history, _ = self.add_text(history, plans.BUILTIN_SEG_BY_POINTS, "human")
            user_state["history_msg"] = history
            return user_state, history

        resource_state.pop(input_image, None)
        resource_state[input_image] = "image"

        history = history + [[plans.BUILTIN_SEG_BY_POINTS, None]]
        points = []
        if isinstance(points, str):
            points = json.loads(points)

        points.append(data.index)
        resource_state[json.dumps(points)] = "prompt_points"
        user_state["resources"] = resource_state
        return user_state, history


def on_switch_input(state_input, text, audio, disable=False):
    if state_input == "audio" or disable:
        return "text", gr.update(visible=True), gr.update(visible=False)
    return "audio", gr.update(visible=False), gr.update(visible=True)


def on_mask_submit(history):
    history = history + [(plans.BUILTIN_SEG_BY_MASK, None)]
    return history


def app(controller="cllm.agents.tog.Controller", https=False, **kwargs):
    loop = InteractionLoop(controller=controller)
    init_state, builtin_resources = loop.init_state()
    css = """
    code {
        font-size: var(--text-sm);
        white-space: pre-wrap; /* Since CSS 2.1 */
        white-space: -moz-pre-wrap; /* Mozilla, since 1999 */
        white-space: -pre-wrap; /* Opera 4-6 */
        white-space: -o-pre-wrap; /* Opera 7 */
        word-wrap: break-word; /* Internet Explorer 5.5+ */
    }
    """
    with gr.Blocks(theme=Seafoam(), css=css) as demo:
        gr.HTML(
            """
            <div align='center'> <h1>ControlLLM </h1> </div>
            <p align="center"> A framework for multi-modal interaction which is able to control LLMs over invoking tools more accurately. </p>
            <p align="center"><a href="https://github.com/OpenGVLab/ControlLLM"><b>GitHub</b></a>
            <a href="https://arxiv.org/abs/2311.11797"><b>ArXiv</b></a></p>
            """,
        )

        state_input = gr.State("text")
        user_state = gr.State(copy.deepcopy(init_state))
        with gr.Row():
            with gr.Column(scale=6):
                with gr.Tabs():
                    with gr.Tab("Chat"):
                        chatbot = gr.Chatbot(
                            [],
                            elem_id="chatbot",
                            avatar_images=[
                                "assets/human.png",
                                "assets/assistant.png",
                            ],
                            show_copy_button=True,
                            height=550,
                        )

                        with gr.Row():
                            with gr.Column(scale=12):
                                text = gr.Textbox(
                                    show_label=False,
                                    placeholder="Enter text and press enter, or upload an image.",
                                    container=False,
                                )
                                audio = gr.Audio(
                                    sources="microphone", type="filepath", visible=False
                                )
                            with gr.Column(scale=2, min_width=80):
                                submit = gr.Button("Submit", variant="primary")
                            with gr.Column(scale=1, min_width=40):
                                record = gr.Button("🎙️")
                            with gr.Column(scale=1, min_width=40):
                                upload_btn = gr.UploadButton(
                                    "📁",
                                    file_types=[
                                        "image",
                                        "video",
                                        "audio",
                                        ".pdf",
                                    ],
                                )

                        gr.Examples(
                            [
                                "Who are you?",
                                "How is the weather in Beijing?",
                                "Describe the given image.",
                                "find the woman wearing the red skirt in the image",
                                "Generate a video that shows Pikachu surfing in waves.",
                                "How many horses are there in the image?",
                                "Can you erase the dog in the given image?",
                                "Remove the object based on the given mask.",
                                "Can you make a video of a serene lake with vibrant green grass and trees all around? And then create a webpage using HTML to showcase this video?",
                                "Generate an image that shows a beautiful landscape with a calm lake reflecting the blue sky and white clouds. Then generate a video to introduce this image.",
                                "replace the masked object with a cute yellow dog",
                                "replace the sheep with a cute dog in the image",
                                "Recognize the action in the video",
                                "Generate an image where an astronaut is riding a horse",
                                "Please generate a piece of music from the given image",
                                "Please give me an image that shows an astronaut riding a horse on mars.",
                                "What’s the weather situation in Berlin? Can you generate a new image that represents the weather there?",
                                "Can you recognize the text from the image and tell me how much is Eggs Florentine?",
                                "Generate a piece of music for this video and dub this video with generated music",
                                "Generate a new image based on depth map from input image",
                                "Remove the cats from the image_1.png, image_2.png, image_3.png",
                                "I need the banana removed from the c4c40e_image.png, 9e867c_image.png, 9e13sc_image.png",
                                "I would be so happy if you could create a new image using the scribble from input image. The new image should be a tropical island with a dog. Write a detailed description of the given image. and highlight the dog in image",
                                "Please generate a piece of music and a new video from the input image",
                                "generate a new image conditioned on the segmentation from input image and the new image shows that a gorgeous lady is dancing",
                                "generate a new image with a different background but maintaining the same composition as input image",
                                "Generate a new image that shows an insect robot preparing a delicious meal. Then give me a video based on new image. Finally, dub the video with suitable background music.",
                                "Translate the text into speech: I have a dream that one day this nation will rise up and live out the true meaning of its creed: We hold these truths to be self-evident that all men are created equal.I have a dream that one day on the red hills of Georgia the sons of former slaves and the sons of former slave owners will be able to sit down together at the table of brotherhood. I have a dream that one day even the state of Mississippi, a state sweltering with the heat of injustice, sweltering with the heat of oppression, will be transformed into an oasis of freedom and justice.",
                            ],
                            inputs=[text],
                        )
                        gr.Examples(
                            list(plans.BUILTIN_PLANS.keys()),
                            inputs=[text],
                            label="Builtin Examples",
                        )

            with gr.Column(scale=5):
                with gr.Tabs():
                    with gr.Tab("Mask Input"):
                        image_mask = gr.components.Image(
                            sources="upload",
                            interactive=True,
                            type="filepath",
                        )
                        # with gr.Row():
                        #     mask_submit_btn = gr.Button("Segment", variant="primary")
                        with gr.Row():
                            image_submit_btn = gr.Button("Upload", variant="primary")

                    with gr.Tab("Plan"):
                        planbot = gr.JSON(elem_classes="json")

                    with gr.Tab("Memory"):
                        memory_table = gr.DataFrame(
                            # value=convert_dict_to_frame(builtin_resources),
                            label="Memory",
                            headers=["Resource", "Type"],
                            row_count=5,
                            wrap=True,
                        )
                gr.Examples(
                    [
                        osp.join("./assets/resources", item)
                        for item in builtin_resources.keys()
                        if item.endswith(".png")
                    ],
                    inputs=[image_mask],
                    label="File Examples",
                )

        chatbot.like(
            loop.vote,
            [
                user_state,
                chatbot,
            ],
            [
                user_state,
                chatbot,
            ],
        )
        reply_inputs = [user_state, image_mask, chatbot, planbot]
        reply_outputs = [
            user_state,
            # image_mask,
            chatbot,
            memory_table,
            # planbot,
        ]

        add_text = [
            partial(loop.add_text, role="human"),
            [chatbot, text],
            [chatbot, text],
        ]

        text.submit(*add_text).then(loop.plan, reply_inputs, reply_inputs).then(
            loop.execute, reply_inputs, reply_inputs
        ).then(loop.reply, [user_state, chatbot], reply_outputs)

        add_msg = [
            partial(loop.add_msg, role="human"),
            [chatbot, text, audio],
            [chatbot, text],
        ]

        submit.click(*add_msg).then(
            partial(on_switch_input, disable=True),
            [state_input, text, audio],
            [state_input, text, audio],
        ).then(loop.plan, reply_inputs, reply_inputs).then(
            loop.execute, reply_inputs, reply_inputs
        ).then(
            loop.reply, [user_state, chatbot], reply_outputs
        )

        upload_btn.upload(
            loop.add_file,
            inputs=[user_state, chatbot, upload_btn],
            outputs=[user_state, image_mask, chatbot, memory_table],
        )
        record.click(
            on_switch_input,
            [state_input, text, audio],
            [state_input, text, audio],
        )

        image_mask.select(
            loop.save_point, [user_state, chatbot], [user_state, chatbot]
        ).then(loop.plan, reply_inputs, reply_inputs).then(
            loop.execute, reply_inputs, reply_inputs
        ).then(
            loop.reply, [user_state, chatbot], reply_outputs
        )

        image_mask.upload(
            loop.add_file,
            inputs=[user_state, chatbot, image_mask],
            outputs=[user_state, image_mask, chatbot, memory_table],
        )
        image_submit_btn.click(
            loop.add_file,
            inputs=[user_state, chatbot, image_mask],
            outputs=[user_state, image_mask, chatbot, memory_table],
        )

    if https:
        demo.queue().launch(
            server_name="0.0.0.0",
            # ssl_certfile="./certificate/cert.pem",
            # ssl_keyfile="./certificate/key.pem",
            ssl_verify=False,
            show_api=False,
            allowed_paths=[
                "assets/human.png",
                "assets/assistant.png",
            ],
            **kwargs,
        )
    else:
        demo.queue().launch(
            server_name="0.0.0.0",
            show_api=False,
            allowed_paths=[
                "assets/human.png",
                "assets/assistant.png",
            ],
            **kwargs,
        )


if __name__ == "__main__":
    os.makedirs(RESOURCE_ROOT, exist_ok=True)
    app(controller="cllm.agents.tog.Controller", server_port=10024)
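
Note: app.py wires each user turn through Gradio event chaining, running loop.plan, then loop.execute, then loop.reply; the first two are generators, so intermediate task decompositions and tool results stream into the Chatbot as they are produced. The following is a stripped-down sketch of that pattern only, with dummy handlers standing in for the ControlLLM controller; it is not part of the commit.

import gradio as gr

def plan(history):
    # Generators used as event handlers stream intermediate chatbot states.
    history = history + [[None, "Planning..."]]
    yield history
    history[-1][1] = "Plan ready."
    yield history

def execute(history):
    history[-1][1] += " Executed."
    return history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot([])
    text = gr.Textbox()
    # Mirror of app.py's text.submit(...).then(plan).then(execute) chain.
    text.submit(
        lambda h, t: (h + [[t, None]], ""), [chatbot, text], [chatbot, text]
    ).then(plan, chatbot, chatbot).then(execute, chatbot, chatbot)

if __name__ == "__main__":
    demo.queue().launch()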
assets/assistant.png
ADDED
assets/human.png
ADDED
builtin_plan.json
ADDED
@@ -0,0 +1,15 @@
{
    "you know what I want": [
        [
            {
                "tool_name": "text_to_image",
                "inputs": {
                    "text": "a dog"
                },
                "outputs": [
                    "image"
                ]
            }
        ]
    ]
}
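
Note: the commit does not show how builtin_plan.json is consumed (load_builtin_plans lives in cllm/agents/builtin/plans.py, beyond the excerpt below). The following is a minimal, hypothetical sketch of turning a plan file with this layout into Action objects from cllm/agents/base.py; the loader name and behavior are assumptions, not the repository's implementation.

import json
from cllm.agents.base import Action

def load_plan_file(path="builtin_plan.json"):
    # Each request maps to a list of candidate solutions;
    # each solution is an ordered list of tool-call steps.
    with open(path) as f:
        raw = json.load(f)
    plans = {}
    for request, solutions in raw.items():
        plans[request] = [
            [
                Action(
                    tool_name=step["tool_name"],
                    inputs=step["inputs"],
                    outputs=step["outputs"],
                )
                for step in solution
            ]
            for solution in solutions
        ]
    return plans

# plans["you know what I want"][0][0].tool_name == "text_to_image"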
cllm/agents/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .base import Tool, Action
from .container import *
cllm/agents/base.py
ADDED
@@ -0,0 +1,173 @@
from dataclasses import dataclass, field
from enum import Enum
from typing import Callable, List
import json
from pathlib import Path
from collections import OrderedDict


@dataclass
class Action:
    """The action represents an assignment.
    `output = tool_name(**inputs)`

    Examples:
        >>> mask = segmentation_by_mask(image=image, prompt_mask=prompt_mask)
        >>> image = image_inpainting(image=image, mask=mask)
    """

    tool_name: str = (None,)
    inputs: dict = (None,)
    outputs: List[str] = (None,)

    def __str__(self) -> str:
        args = ", ".join([f"{k}={v}" for k, v in self.inputs.items()])
        return "{} = {}(".format(", ".join(self.outputs), self.tool_name) + args + ")"

    def dict(self):
        args = {str(k): str(v) for k, v in self.inputs.items()}
        # args = {str(item["name"]): str(item["value"]) for item in self.inputs}
        rets = [o if isinstance(o, str) else str(o) for o in self.outputs]
        return {
            "tool": self.tool_name,
            "inputs": args,
            "outputs": rets,
        }


class DataType(Enum):
    TEXT = "text"
    TAGS = "tags"
    TITLE = "title"
    # HTML = "text.html"
    HTML = "html"
    LOCATION = "location"
    WEATHER = "weather"
    TIME = "time"

    IMAGE = "image"
    VIDEO = "video"
    AUDIO = "audio"
    ANY = "any"
    NONE = "none"

    SEGMENTATION = "image.segmentation"
    EDGE = "image.edge"
    LINE = "image.line"
    HED = "image.hed"
    CANNY = "image.canny"
    SCRIBBLE = "image.scribble"
    POSE = "image.pose"
    DEPTH = "image.depth"
    NORMAL = "image.normal"

    MASK = "image.mask"  # SAM mask
    POINT = "point"
    BBOX = "bbox"  # {'label': 'dog', 'box': [1,2,3,4], 'score': 0.9}
    CATEGORY = "category"

    LIST = "list"

    def __str__(self):
        return self.value

    def __eq__(self, other):
        if isinstance(other, str):
            return self.value == other
        elif isinstance(other, self.__class__):
            return self.value == other.value
        else:
            return False


@dataclass
class Resource:
    name: str
    type: DataType
    value: None
    # description: str = None

    def dict(self):
        return {
            "name": self.name,
            "type": str(self.type),
            "value": str(self.value),
            # "description": self.description,
        }


@dataclass
class Tool:
    class Domain(Enum):
        IMAGE_PERCEPTION = "image-perception"
        IMAGE_GENERATION = "image-generation"
        IMAGE_EDITING = "image-editing"
        IMAGE_PROCESSING = "image-processing"
        AUDIO_PERCEPTION = "audio-perception"
        AUDIO_GENERATION = "audio-generation"
        VIDEO_PERCEPTION = "video-perception"
        VIDEO_GENERATION = "video-generation"
        VIDEO_PROCESSING = "video-processing"
        VIDEO_EDITING = "video-editing"
        VIDEO_CUTTING = "video-cutting"
        NATURAL_LANGUAGE_PROCESSING = "natural-language-processing"
        CODE_GENERATION = "code-generation"
        VISUAL_QUESTION_ANSWERING = "visual-question-answering"
        QUESTION_ANSWERING = "question-answering"
        GENERAL = "general"

        def __str__(self):
            return self.value

    @dataclass
    class Argument:
        name: str
        type: DataType
        description: str

        def dict(self):
            return {
                "name": self.name,
                "type": str(self.type),
                "description": self.description,
            }

    name: str
    description: str
    domain: Domain
    model: Callable

    usages: List[str] = field(default_factory=lambda: [])
    args: List[Argument] = field(default_factory=lambda: [])
    returns: List[Argument] = field(default_factory=lambda: [])

    def dict(self):
        return {
            "name": self.name,
            "description": self.description,
            "domain": str(self.domain),
            "args": [a.dict() for a in self.args],
            "returns": [r.dict() for r in self.returns],
        }


NON_FILE_TYPES = [
    DataType.TAGS,
    DataType.TEXT,
    DataType.TITLE,
    DataType.BBOX,
    DataType.CATEGORY,
    DataType.LIST,
    DataType.LOCATION,
    DataType.POINT,
    DataType.WEATHER,
    DataType.TIME,
]


if __name__ == "__main__":
    s = [
        [Action("a", {"aa": [Path("/a/d/e/t.txt")]}, [Path("/a/aa.txt")])],
        Action("b", {"bb": "bbb"}, ["bbb"]),
    ]
    print(json.dumps(s, indent=4, default=lambda o: o.dict()))
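
For reference, a minimal sketch of how a tool could be described with the Tool and Tool.Argument dataclasses above. The text_to_image callable, its description, and the argument descriptions here are placeholders for illustration, not the registry entries from cllm/agents/builtin/tools.py.

from cllm.agents.base import DataType, Tool

def text_to_image(text):
    # Placeholder model callable; the real one is registered in builtin/tools.py.
    return None

tool = Tool(
    name="text_to_image",
    description="Generate an image from a text prompt.",
    domain=Tool.Domain.IMAGE_GENERATION,
    model=text_to_image,
    args=[Tool.Argument("text", DataType.TEXT, "the text prompt")],
    returns=[Tool.Argument("image", DataType.IMAGE, "the generated image")],
)
# .dict() yields a JSON-friendly description of the tool's interface.
print(tool.dict())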
cllm/agents/builtin/__init__.py
ADDED
@@ -0,0 +1,3 @@
from . import plans
from .plans import BUILTIN_PLANS, load_builtin_plans
from .tools import TOOLS
cllm/agents/builtin/plans.py
ADDED
@@ -0,0 +1,634 @@
import os
import sys

sys.path.append(os.getcwd())

from cllm.agents.base import Action

BUILTIN_SEG_BY_POINTS = "Segment the given image based on the prompt points."
BUILTIN_SEG_BY_MASK = "Segment the given image based on the prompt mask."
# BUILTIN_REMOVE_BY_MASK = "Remove the object based on the given mask."
BUILTIN_IMAGE_TO_EDGE = "Generate the edge from the given image."
BUILTIN_GENERATE_SIMILAR_IMAGE = "Generate a new image similar to the input image"
# BUILTIN_GENERATE_SIMILAR_IMAGE2 = "Generate a similar image from the given image 2"
# BUILTIN_GENERATE_SIMILAR_IMAGE3 = "Image to image. 3"
BUILTIN_GENERATE_SIMILAR_IMAGE4 = "Generate a new image similar to image 4"
BUILTIN_GENERATE_IMAGE_HED = "Generate a new image based on HED result from input image"
BUILTIN_GENERATE_IMAGE_DEPTH = (
    "Generate a new image based on depth map from input image"
)
BUILTIN_GENERATE_IMAGE_OCR = "Please extract the text from the image"
BUILTIN_TEXT_EDGE_TO_IMAGE = "Generate an image based on the given edge map."
BUILTIN_GENERATE_IMAGE = "Generate a new image that shows a woman is skiing"
BUILTIN_IMAGE_TO_VIDEO = "Generate a video from the image"
BUILTIN_COUNT_OBJECTS = "Provide me with the count of bears in the input image"
BUILTIN_VIDEO_TO_WEBPAGE = "Generate a web page for input video"
BUILTIN_TEXT_TO_MUSIC = "Please generate a piece of music based on given prompt. Here is the prompt: An 80s driving pop song with heavy drums and synth pads in the background"
BUILTIN_IMAGE_ERASING1 = "Erase the wine glass from the photo"
BUILTIN_IMAGE_ERASING2 = "Erase the cats in the photo"
BUILTIN_IMAGE_CROPPING = "Crop the cats from the photo"
BUILTIN_IMAGE_SEG = "give me the mask of elephant."
BUILTIN_IMAGE_HIGHLIGHT = "highlight the elephant."
BUILTIN_TEXT_SPEECH = "translate text into speech"
BUILTIN_DUBBING = "dub this video with the given audio"
BUILTIN_COUNT_OBJECTS2 = "Count the horse in the image."
BUILTIN_IMAGE_TO_VIDEO2 = "Generate an image that shows a serene and beautiful landscape with a calm lake reflecting the blue sky and white clouds. Then generate a video to introduce this image."
BUILTIN_IMAGE_TO_VIDEO3 = "Create a visual and auditory representation of a peaceful and scenic landscape. The image should depict a serene and beautiful landscape with a calm lake reflecting the blue sky. The music should match the image. Finally, combine the image and the music into a video that showcases the beauty of nature."
BUILTIN_VIDEO_CLS = "Recognize the action in the video"
BUILTIN_AUDIO_CLS = "Recognize the event in this audio"
BUILTIN_IMAGE2MUSIC = "Generate a piece of music for this image"
BUILTIN_VIDEO2MUSIC = (
    "Generate a piece of music for this video and dub the video with generated music"
)

BUILTIN_PLANS = {
    # BUILTIN_REMOVE_BY_MASK: [
    #     [
    #         Action(
    #             tool_name="image_inpainting",
    #             inputs={"image": "image", "mask": "image.mask"},
    #             outputs=["<GENERATED>-0"],
    #         )
    #     ]
    # ],
    BUILTIN_IMAGE_TO_EDGE: [
        [
            Action(
                tool_name="image_to_edge",
                inputs={"image": "image"},
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    BUILTIN_TEXT_EDGE_TO_IMAGE: [
        [
            Action(
                tool_name="image_captioning",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-prompt"],
            ),
            Action(
                tool_name="edge_text_to_image",
                inputs={
                    "edge": "image.edge",
                    "text": "<TOOL-GENERATED>-prompt",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_GENERATE_SIMILAR_IMAGE: [
        [
            Action(
                tool_name="image_to_edge",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-edge"],
            ),
            Action(
                tool_name="image_captioning",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-prompt"],
            ),
            Action(
                tool_name="edge_text_to_image",
                inputs={
                    "edge": "<TOOL-GENERATED>-edge",
                    "text": "<TOOL-GENERATED>-prompt",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    # BUILTIN_GENERATE_SIMILAR_IMAGE2: [
    #     [
    #         Action(
    #             tool_name="image_captioning",
    #             inputs={"image": "image"},
    #             outputs=["<TOOL-GENERATED>-prompt"],
    #         ),
    #         Action(
    #             tool_name="text_to_image",
    #             inputs={"text": "<TOOL-GENERATED>-prompt"},
    #             outputs=["<GENERATED>-0"],
    #         ),
    #     ]
    # ],
    # BUILTIN_GENERATE_SIMILAR_IMAGE3: [
    #     [
    #         Action(
    #             tool_name="image_to_image",
    #             inputs={"image": "image"},
    #             outputs=["<GENERATED>-0"],
    #         ),
    #     ]
    # ],
    BUILTIN_GENERATE_IMAGE_HED: [
        [
            Action(
                tool_name="image_to_hed",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-image_to_hed-hed-0"],
            ),
            Action(
                tool_name="hed_text_to_image",
                inputs={
                    "text": "beautiful mountains and sunset",
                    "hed": "<TOOL-GENERATED>-image_to_hed-hed-0",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_GENERATE_IMAGE_DEPTH: [
        [
            Action(
                tool_name="image_captioning",
                inputs={
                    "image": "image",
                },
                outputs=["<TOOL-GENERATED>-image_captioning-text-0"],
            ),
            Action(
                tool_name="image_to_depth",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-image_to_depth-depth-0"],
            ),
            Action(
                tool_name="depth_text_to_image",
                inputs={
                    "text": "<TOOL-GENERATED>-image_captioning-text-0",
                    "depth": "<TOOL-GENERATED>-image_to_depth-depth-0",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_GENERATE_IMAGE_OCR: [
        [
            Action(
                tool_name="optical_character_recognition",
                inputs={"image": "image"},
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    BUILTIN_COUNT_OBJECTS: [
        [
            Action(
                tool_name="object_detection",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-object_detection-bbox-0"],
            ),
            Action(
                tool_name="select_bbox",
                inputs={
                    "bbox_list": "<TOOL-GENERATED>-object_detection-bbox-0",
                    "condition": "bear",
                },
                outputs=["<TOOL-GENERATED>-select_bbox-bbox-0"],
            ),
            Action(
                tool_name="count_objects",
                inputs={"bbox_list": "<TOOL-GENERATED>-select_bbox-bbox-0"},
                outputs=["<GENERATED>-0"],
            ),
        ],
        [
            Action(
                tool_name="image_question_answering",
                inputs={
                    "text": "Provide me with the count of bears in the input image",
                    "image": "image",
                },
                outputs=["<GENERATED>-1"],
            )
        ],
    ],
    BUILTIN_VIDEO_TO_WEBPAGE: [
        [
            Action(
                tool_name="video_captioning",
                inputs={"video": "video"},
                outputs=["<TOOL-GENERATED>-text-0"],
            ),
            Action(
                tool_name="text_to_music",
                inputs={"text": "<TOOL-GENERATED>-text-0"},
                outputs=["<TOOL-GENERATED>-text_to_music-audio-0"],
            ),
            Action(
                tool_name="dub_video",
                inputs={
                    "video": "video",
                    "audio": "<TOOL-GENERATED>-text_to_music-audio-0",
                },
                outputs=["<TOOL-GENERATED>-dub_video-video-0"],
            ),
            Action(
                tool_name="title_generation",
                inputs={"text": "<TOOL-GENERATED>-text-0"},
                outputs=["<TOOL-GENERATED>-text-1"],
            ),
            Action(
                tool_name="text_to_tags",
                inputs={"text": "<TOOL-GENERATED>-text-0"},
                outputs=["<TOOL-GENERATED>-tags-0"],
            ),
            Action(
                tool_name="video_to_webpage",
                inputs={
                    "video": "<TOOL-GENERATED>-dub_video-video-0",
                    "title": "<TOOL-GENERATED>-text-1",
                    "tags": "<TOOL-GENERATED>-tags-0",
                    "description": "<TOOL-GENERATED>-text-0",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_TEXT_TO_MUSIC: [
        [
            Action(
                tool_name="text_to_music",
                inputs={
                    "text": "An 80s driving pop song with heavy drums and synth pads in the background"
                },
                outputs=["<GENERATED>-audio-0"],
            )
        ]
    ],
    BUILTIN_IMAGE_ERASING1: [
        [
            Action(
                tool_name="image_instance_segmentation",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-image_instance_segmentation-mask-0"],
            ),
            Action(
                tool_name="select_mask",
                inputs={
                    "mask_list": "<TOOL-GENERATED>-image_instance_segmentation-mask-0",
                    "condition": "wine glass",
                },
                outputs=["<TOOL-GENERATED>-select_mask-mask-0"],
            ),
            Action(
                tool_name="image_inpainting",
                inputs={
                    "image": "image",
                    "mask": "<TOOL-GENERATED>-select_mask-mask-0",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_IMAGE_ERASING2: [
        [
            Action(
                tool_name="image_instance_segmentation",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-image_instance_segmentation-mask-0"],
            ),
            Action(
                tool_name="select_mask",
                inputs={
                    "mask_list": "<TOOL-GENERATED>-image_instance_segmentation-mask-0",
                    "condition": "cat",
                },
                outputs=["<TOOL-GENERATED>-select_mask-mask-0"],
            ),
            Action(
                tool_name="image_inpainting",
                inputs={
                    "image": "image",
                    "mask": "<TOOL-GENERATED>-select_mask-mask-0",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_IMAGE_CROPPING: [
        [
            Action(
                tool_name="object_detection",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-object_detection-bbox-0"],
            ),
            Action(
                tool_name="select_bbox",
                inputs={
                    "bbox_list": "<TOOL-GENERATED>-object_detection-bbox-0",
                    "condition": "cat",
                },
                outputs=["<TOOL-GENERATED>-select_bbox-bbox-0"],
            ),
            Action(
                tool_name="image_cropping",
                inputs={
                    "image": "image",
                    "object": "<TOOL-GENERATED>-select_bbox-bbox-0",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_IMAGE_SEG: [
        [
            Action(
                tool_name="image_instance_segmentation",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-image_instance_segmentation-mask-0"],
            ),
            Action(
                tool_name="select_mask",
                inputs={
                    "mask_list": "<TOOL-GENERATED>-image_instance_segmentation-mask-0",
                    "condition": "elephant",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_IMAGE_HIGHLIGHT: [
        [
            Action(
                tool_name="object_detection",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-object_detection-bbox-0"],
            ),
            Action(
                tool_name="select_bbox",
                inputs={
                    "bbox_list": "<TOOL-GENERATED>-object_detection-bbox-0",
                    "condition": "elephant",
                },
                outputs=["<TOOL-GENERATED>-select_bbox-bbox-0"],
            ),
            Action(
                tool_name="highlight_object_on_image",
                inputs={
                    "image": "image",
                    "bbox": "<TOOL-GENERATED>-select_bbox-bbox-0",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_TEXT_SPEECH: [
        [
            Action(
                tool_name="text_to_speech",
                inputs={
                    "text": "Hope is the thing with feathers That perches in the soul, And sings the tune without the words, And never stops at all"
                },
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    BUILTIN_DUBBING: [
        [
            Action(
                tool_name="dub_video",
                inputs={"video": "video", "audio": "audio"},
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    BUILTIN_GENERATE_SIMILAR_IMAGE4: [
        [
            Action(
                tool_name="segment_anything",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-seg"],
            ),
            Action(
                tool_name="image_captioning",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-prompt"],
            ),
            Action(
                tool_name="segmentation_text_to_image",
                inputs={
                    "segmentation": "<TOOL-GENERATED>-seg",
                    "text": "<TOOL-GENERATED>-prompt",
                },
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_GENERATE_IMAGE: [
        [
            Action(
                tool_name="text_to_image",
                inputs={"text": "a woman is skiing"},
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    BUILTIN_IMAGE_TO_VIDEO: [
        [
            Action(
                tool_name="image_to_video",
                inputs={"image": "image"},
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    BUILTIN_COUNT_OBJECTS2: [
        [
            Action(
                tool_name="object_detection",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-object_detection-bbox-0"],
            ),
            Action(
                tool_name="select_bbox",
                inputs={
                    "bbox_list": "<TOOL-GENERATED>-object_detection-bbox-0",
                    "condition": "horse",
                },
                outputs=["<TOOL-GENERATED>-select_bbox-bbox-0"],
            ),
            Action(
                tool_name="count_objects",
                inputs={"bbox_list": "<TOOL-GENERATED>-select_bbox-bbox-0"},
                outputs=["<GENERATED>-0"],
            ),
        ],
        [
            Action(
                tool_name="image_question_answering",
                inputs={
                    "text": "Provide me with the count of horses in the input image",
                    "image": "image",
                },
                outputs=["<GENERATED>-1"],
            )
        ],
    ],
    BUILTIN_IMAGE_TO_VIDEO2: [
        [
            Action(
                tool_name="text_to_image",
                inputs={
                    "text": "A serene and beautiful landscape with a calm lake reflecting the blue sky and white clouds."
                },
                outputs=["<GENERATED>-0"],
            ),
        ],
        [
            Action(
                tool_name="image_captioning",
                inputs={"image": "<GENERATED>-0"},
                outputs=["<TOOL-GENERATED>-text-0"],
            ),
            Action(
                tool_name="text_to_speech",
                inputs={"text": "<TOOL-GENERATED>-text-0"},
                outputs=["<TOOL-GENERATED>-text_to_speech-audio-0"],
            ),
            Action(
                tool_name="image_audio_to_video",
                inputs={
                    "image": "<GENERATED>-0",
                    "audio": "<TOOL-GENERATED>-text_to_speech-audio-0",
                },
                outputs=["<GENERATED>-1"],
            ),
        ],
    ],
    BUILTIN_IMAGE_TO_VIDEO3: [
        [
            Action(
                tool_name="text_to_image",
                inputs={
                    "text": "A serene and beautiful landscape with a calm lake reflecting the blue sky."
                },
                outputs=["<GENERATED>-0"],
            ),
        ],
        [
            Action(
                tool_name="image_captioning",
                inputs={"image": "<GENERATED>-0"},
                outputs=["<TOOL-GENERATED>-text-0"],
            ),
            Action(
                tool_name="text_to_music",
                inputs={"text": "<TOOL-GENERATED>-text-0"},
                outputs=["<GENERATED>-1"],
            ),
        ],
        [
            Action(
                tool_name="image_to_video",
                inputs={
                    "image": "<GENERATED>-0",
                },
                outputs=["<TOOL-GENERATED>-image_to_video-video-0"],
            ),
            Action(
                tool_name="dub_video",
                inputs={
                    "video": "<TOOL-GENERATED>-image_to_video-video-0",
                    "audio": "<GENERATED>-1",
                },
                outputs=["<GENERATED>-2"],
            ),
        ],
    ],
    BUILTIN_VIDEO_CLS: [
        [
            Action(
                tool_name="video_classification",
                inputs={"video": "video"},
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    BUILTIN_AUDIO_CLS: [
        [
            Action(
                tool_name="audio_classification",
                inputs={"audio": "audio"},
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    BUILTIN_IMAGE2MUSIC: [
        [
            Action(
                tool_name="image_captioning",
                inputs={"image": "image"},
                outputs=["<TOOL-GENERATED>-text-0"],
            ),
            Action(
                tool_name="text_to_music",
                inputs={"text": "<TOOL-GENERATED>-text-0"},
                outputs=["<GENERATED>-0"],
            ),
        ]
    ],
    BUILTIN_VIDEO2MUSIC: [
        [
            Action(
                tool_name="video_captioning",
                inputs={"video": "video"},
                outputs=["<TOOL-GENERATED>-text-0"],
            ),
            Action(
                tool_name="text_to_music",
                inputs={"text": "<TOOL-GENERATED>-text-0"},
                outputs=["<GENERATED>-0"],
            ),
        ],
        [
            Action(
                tool_name="dub_video",
                inputs={
                    "video": "video",
                    "audio": "<GENERATED>-0",
                },
                outputs=["<GENERATED>-1"],
            ),
        ],
    ],
    BUILTIN_SEG_BY_POINTS: [
        [
            Action(
                tool_name="image_segmentation_by_points",
                inputs={"image": "image", "prompt_points": "prompt_points"},
                outputs=["<GENERATED>-0"],
            )
        ]
    ],
    # BUILTIN_SEG_BY_MASK: [
    #     [
+
# Action(
|
609 |
+
# tool_name='image_segmentation_by_mask',
|
610 |
+
# inputs={'image': 'image', 'prompt_mask': 'prompt_mask'},
|
611 |
+
# outputs=['<GENERATED>-0'],
|
612 |
+
# )
|
613 |
+
# ]
|
614 |
+
# ],
|
615 |
+
}
|
616 |
+
|
617 |
+
|
618 |
+
def load_builtin_plans(path):
|
619 |
+
import json
|
620 |
+
|
621 |
+
plans = json.load(open(path, "r"))
|
622 |
+
processed_plan = {}
|
623 |
+
for query, actions in plans.items():
|
624 |
+
actions2 = []
|
625 |
+
for ac in actions[0]:
|
626 |
+
actions2.append(
|
627 |
+
Action(
|
628 |
+
tool_name=ac["tool_name"],
|
629 |
+
inputs=ac["inputs"],
|
630 |
+
outputs=ac["outputs"],
|
631 |
+
),
|
632 |
+
)
|
633 |
+
processed_plan[query] = [actions2]
|
634 |
+
return processed_plan
|
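A minimal sketch of the plan-file layout that load_builtin_plans above expects (not part of the commit; the JSON shape is inferred from the loop, and the example query is illustrative only):

import json

# Hypothetical plan file: {query: [[{tool_name, inputs, outputs}, ...]]}
example_plans = {
    "highlight the elephant in the image": [
        [
            {
                "tool_name": "object_detection",
                "inputs": {"image": "image"},
                "outputs": ["<TOOL-GENERATED>-object_detection-bbox-0"],
            }
        ]
    ]
}

with open("example_plan.json", "w") as f:
    json.dump(example_plans, f)

# load_builtin_plans("example_plan.json") would then return the same mapping with
# each inner dict converted into an Action dataclass instance.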
cllm/agents/builtin/tools.py
ADDED
@@ -0,0 +1,1512 @@
1 |
+
import sys
|
2 |
+
import os
|
3 |
+
|
4 |
+
sys.path.append(os.getcwd())
|
5 |
+
from cllm.services.image_editing.api import (
|
6 |
+
inpainting_ldm,
|
7 |
+
inpainting_ldm_general,
|
8 |
+
partial_image_editing,
|
9 |
+
instruct_pix2pix,
|
10 |
+
image_cropping,
|
11 |
+
image_matting,
|
12 |
+
draw_bbox_on_image,
|
13 |
+
)
|
14 |
+
from cllm.services.image_generation.api import (
|
15 |
+
text2image,
|
16 |
+
image2image,
|
17 |
+
cannytext2image,
|
18 |
+
linetext2image,
|
19 |
+
hedtext2image,
|
20 |
+
scribbletext2image,
|
21 |
+
posetext2image,
|
22 |
+
segtext2image,
|
23 |
+
depthtext2image,
|
24 |
+
normaltext2image,
|
25 |
+
)
|
26 |
+
|
27 |
+
from cllm.services.image_processing.api import (
|
28 |
+
image2canny,
|
29 |
+
image2line,
|
30 |
+
image2hed,
|
31 |
+
image2scribble,
|
32 |
+
image2pose,
|
33 |
+
image2depth,
|
34 |
+
image2normal,
|
35 |
+
)
|
36 |
+
from cllm.services.image_perception.api import (
|
37 |
+
object_detection,
|
38 |
+
image_classification,
|
39 |
+
ocr,
|
40 |
+
segment_objects,
|
41 |
+
visual_grounding,
|
42 |
+
image_captioning,
|
43 |
+
segment_all,
|
44 |
+
seg_by_mask,
|
45 |
+
seg_by_points,
|
46 |
+
)
|
47 |
+
from cllm.services.video.api import (
|
48 |
+
video_classification,
|
49 |
+
video_captioning,
|
50 |
+
image_audio_to_video,
|
51 |
+
video_to_webpage,
|
52 |
+
dub_video,
|
53 |
+
image_to_video,
|
54 |
+
text_to_video,
|
55 |
+
)
|
56 |
+
from cllm.services.audio.api import (
|
57 |
+
text_to_music,
|
58 |
+
text_to_speech,
|
59 |
+
audio_classification,
|
60 |
+
)
|
61 |
+
|
62 |
+
# from cllm.services.sam.api import (
|
63 |
+
# segment_by_mask,
|
64 |
+
# segment_by_points,
|
65 |
+
# set_image,
|
66 |
+
# segment_all,
|
67 |
+
# )
|
68 |
+
from cllm.services.general.api import (
|
69 |
+
select,
|
70 |
+
count,
|
71 |
+
remote_logging,
|
72 |
+
)
|
73 |
+
from cllm.services.nlp.api import (
|
74 |
+
text_to_text_generation,
|
75 |
+
title_generation,
|
76 |
+
text_to_tags,
|
77 |
+
question_answering_with_context,
|
78 |
+
openai_chat_model,
|
79 |
+
summarization,
|
80 |
+
extract_location,
|
81 |
+
sentiment_analysis,
|
82 |
+
get_weather,
|
83 |
+
summarize_weather_condition,
|
84 |
+
get_time,
|
85 |
+
)
|
86 |
+
from cllm.services.vqa.api import image_qa
|
87 |
+
from cllm.agents.base import Tool, DataType
|
88 |
+
|
89 |
+
|
90 |
+
QUESTION_ANSWERING_TOOLS = [
|
91 |
+
Tool(
|
92 |
+
name="image_question_answering",
|
93 |
+
description="answers a question about an image",
|
94 |
+
domain=Tool.Domain.VISUAL_QUESTION_ANSWERING,
|
95 |
+
args=[
|
96 |
+
Tool.Argument(
|
97 |
+
name="image",
|
98 |
+
type=DataType.IMAGE,
|
99 |
+
description="the image containing the information",
|
100 |
+
),
|
101 |
+
Tool.Argument(
|
102 |
+
name="text",
|
103 |
+
type=DataType.TEXT,
|
104 |
+
description="the question about the image",
|
105 |
+
),
|
106 |
+
],
|
107 |
+
returns=[
|
108 |
+
Tool.Argument(
|
109 |
+
name="response",
|
110 |
+
type=DataType.TEXT,
|
111 |
+
description="output response",
|
112 |
+
)
|
113 |
+
],
|
114 |
+
model=image_qa,
|
115 |
+
),
|
116 |
+
Tool(
|
117 |
+
name="get_weather",
|
118 |
+
description="Query the weather conditions by given location. For example: what is the weather in Beijing? how cold is in New York? etc.",
|
119 |
+
domain=Tool.Domain.QUESTION_ANSWERING,
|
120 |
+
args=[
|
121 |
+
Tool.Argument(
|
122 |
+
name="location",
|
123 |
+
type=DataType.LOCATION,
|
124 |
+
description="the location where the weather is to be queried",
|
125 |
+
),
|
126 |
+
],
|
127 |
+
returns=[
|
128 |
+
Tool.Argument(
|
129 |
+
name="result",
|
130 |
+
# type=DataType.WEATHER,
|
131 |
+
type=DataType.WEATHER,
|
132 |
+
description="weather information",
|
133 |
+
)
|
134 |
+
],
|
135 |
+
model=get_weather,
|
136 |
+
),
|
137 |
+
Tool(
|
138 |
+
name="get_time",
|
139 |
+
description="get current date",
|
140 |
+
domain=Tool.Domain.QUESTION_ANSWERING,
|
141 |
+
args=[
|
142 |
+
# Tool.Argument(
|
143 |
+
# name="location",
|
144 |
+
# type=DataType.LOCATION,
|
145 |
+
# description="location where the time is to be queried",
|
146 |
+
# ),
|
147 |
+
Tool.Argument(
|
148 |
+
name="text",
|
149 |
+
type=DataType.TEXT,
|
150 |
+
description="input text",
|
151 |
+
),
|
152 |
+
],
|
153 |
+
returns=[
|
154 |
+
Tool.Argument(
|
155 |
+
name="response",
|
156 |
+
type=DataType.TIME,
|
157 |
+
description="output time",
|
158 |
+
)
|
159 |
+
],
|
160 |
+
model=get_time,
|
161 |
+
),
|
162 |
+
# Tool(
|
163 |
+
# name="calculator",
|
164 |
+
# description="It can solve mathematics problems and support various mathematical expressions: from basic arithmetic to more complex expressions.",
|
165 |
+
# domain=Tool.Domain.QUESTION_ANSWERING,
|
166 |
+
# args=[
|
167 |
+
# Tool.Argument(
|
168 |
+
# name="text",
|
169 |
+
# type=DataType.TEXT,
|
170 |
+
# description="input instructions",
|
171 |
+
# ),
|
172 |
+
# ],
|
173 |
+
# returns=[
|
174 |
+
# Tool.Argument(
|
175 |
+
# name="result",
|
176 |
+
# type=DataType.TEXT,
|
177 |
+
# description="result about weather",
|
178 |
+
# )
|
179 |
+
# ],
|
180 |
+
# model=None,
|
181 |
+
# ),
|
182 |
+
]
|
183 |
+
|
184 |
+
IMAGE_CAPTIONING_TOOLS = [
|
185 |
+
Tool(
|
186 |
+
name="image_captioning",
|
187 |
+
description='Generate a caption or description for the image. It can generate a detailed description that can be used for image perception and image generation. For example: a) you can use this tool when you want to know what is it in the image"; and b) when you want to generate a new image similar or resemble to input.png, you can use `image_captioning` to obtain the description about image input.png.',
|
188 |
+
domain=Tool.Domain.IMAGE_PERCEPTION,
|
189 |
+
args=[
|
190 |
+
Tool.Argument(
|
191 |
+
name="image",
|
192 |
+
type=DataType.IMAGE,
|
193 |
+
description="the image to be captioned",
|
194 |
+
),
|
195 |
+
],
|
196 |
+
returns=[
|
197 |
+
Tool.Argument(
|
198 |
+
name="text",
|
199 |
+
type=DataType.TEXT,
|
200 |
+
description="the description for the input image",
|
201 |
+
)
|
202 |
+
],
|
203 |
+
model=image_captioning,
|
204 |
+
),
|
205 |
+
]
|
206 |
+
|
207 |
+
IMAGE_EDITING_TOOLS = [
|
208 |
+
Tool(
|
209 |
+
name="partial_image_editing",
|
210 |
+
description="Given the mask denoting the region to edit, Edit the given image at local region. Useful when you want to replace an object via a mask image. "
|
211 |
+
"like: replace the masked object with a dog. ",
|
212 |
+
domain=Tool.Domain.IMAGE_EDITING,
|
213 |
+
args=[
|
214 |
+
Tool.Argument(
|
215 |
+
name="image",
|
216 |
+
type=DataType.IMAGE,
|
217 |
+
description="the image to be edited",
|
218 |
+
),
|
219 |
+
Tool.Argument(
|
220 |
+
name="mask",
|
221 |
+
type=DataType.MASK,
|
222 |
+
description="the mask image representing the editing position",
|
223 |
+
),
|
224 |
+
Tool.Argument(
|
225 |
+
name="prompt",
|
226 |
+
type=DataType.TEXT,
|
227 |
+
description="the prompt specified the edition",
|
228 |
+
),
|
229 |
+
],
|
230 |
+
returns=[
|
231 |
+
Tool.Argument(
|
232 |
+
name="image",
|
233 |
+
type=DataType.IMAGE,
|
234 |
+
description="the edited image",
|
235 |
+
)
|
236 |
+
],
|
237 |
+
model=partial_image_editing,
|
238 |
+
),
|
239 |
+
Tool(
|
240 |
+
name="text_image_editing",
|
241 |
+
description="Edit the given image based on the text prompt.",
|
242 |
+
domain=Tool.Domain.IMAGE_EDITING,
|
243 |
+
args=[
|
244 |
+
Tool.Argument(
|
245 |
+
name="image",
|
246 |
+
type=DataType.IMAGE,
|
247 |
+
description="the image to be edited",
|
248 |
+
),
|
249 |
+
Tool.Argument(
|
250 |
+
name="text",
|
251 |
+
type=DataType.TEXT,
|
252 |
+
description="the prompt specified the edition",
|
253 |
+
),
|
254 |
+
],
|
255 |
+
returns=[
|
256 |
+
Tool.Argument(
|
257 |
+
name="image",
|
258 |
+
type=DataType.IMAGE,
|
259 |
+
description="the edited image",
|
260 |
+
)
|
261 |
+
],
|
262 |
+
model=instruct_pix2pix,
|
263 |
+
),
|
264 |
+
Tool(
|
265 |
+
name="image_inpainting",
|
266 |
+
description="inpaint the region of the image based on the given mask. For example: remove the dog in the image, erase the spoon in given image, etc.",
|
267 |
+
domain=Tool.Domain.IMAGE_EDITING,
|
268 |
+
usages=["remove some objects"],
|
269 |
+
args=[
|
270 |
+
Tool.Argument(
|
271 |
+
name="image",
|
272 |
+
type=DataType.IMAGE,
|
273 |
+
description="the image to be inpainted",
|
274 |
+
),
|
275 |
+
Tool.Argument(
|
276 |
+
name="mask",
|
277 |
+
type=DataType.MASK,
|
278 |
+
description="the segmentation mask for the inpainting region",
|
279 |
+
),
|
280 |
+
],
|
281 |
+
returns=[
|
282 |
+
Tool.Argument(
|
283 |
+
name="image",
|
284 |
+
type=DataType.IMAGE,
|
285 |
+
description="the processed image",
|
286 |
+
)
|
287 |
+
],
|
288 |
+
model=inpainting_ldm_general,
|
289 |
+
),
|
290 |
+
Tool(
|
291 |
+
name="highlight_object_on_image",
|
292 |
+
description="This tool is usually used after `object_detection` `visual_grounding` and `select_bbox`. Useful when you want to: 1) highlight the region of interest on the image; 2) know where the object is. For example: highlight the elephant from image, locate the dog in the image, find the spoon in given image, detect if the object is present in the image, etc.",
|
293 |
+
domain=Tool.Domain.IMAGE_EDITING,
|
294 |
+
usages=["highlight the region of interest on the image"],
|
295 |
+
args=[
|
296 |
+
Tool.Argument(
|
297 |
+
name="image",
|
298 |
+
type=DataType.IMAGE,
|
299 |
+
description="the image to be processed",
|
300 |
+
),
|
301 |
+
Tool.Argument(
|
302 |
+
name="bbox",
|
303 |
+
type=DataType.BBOX,
|
304 |
+
description="the bounding boxes that need to be drawn on the image",
|
305 |
+
),
|
306 |
+
],
|
307 |
+
returns=[
|
308 |
+
Tool.Argument(
|
309 |
+
name="result",
|
310 |
+
type=DataType.IMAGE,
|
311 |
+
description="the new image on which the tool highlight the the region of interest by bounding boxes",
|
312 |
+
)
|
313 |
+
],
|
314 |
+
model=draw_bbox_on_image,
|
315 |
+
),
|
316 |
+
Tool(
|
317 |
+
name="image_cropping",
|
318 |
+
description="Crop the image based on the given bounding box. Useful when you want to crop the dog in the image, crop the spoon in given image, etc.",
|
319 |
+
domain=Tool.Domain.IMAGE_EDITING,
|
320 |
+
args=[
|
321 |
+
Tool.Argument(
|
322 |
+
name="image",
|
323 |
+
type=DataType.IMAGE,
|
324 |
+
description="the image to be processed",
|
325 |
+
),
|
326 |
+
Tool.Argument(
|
327 |
+
name="object",
|
328 |
+
type=DataType.BBOX,
|
329 |
+
description="the detected object",
|
330 |
+
),
|
331 |
+
],
|
332 |
+
returns=[
|
333 |
+
Tool.Argument(
|
334 |
+
name="image",
|
335 |
+
type=DataType.IMAGE,
|
336 |
+
description="the cropped image",
|
337 |
+
)
|
338 |
+
],
|
339 |
+
model=image_cropping,
|
340 |
+
),
|
341 |
+
# Tool(
|
342 |
+
# name="mask_image",
|
343 |
+
# description="Mask the background from the image based on the given mask. For example: mask anything except the dog in the image, extract the spoon from given image without any inpainting, etc.",
|
344 |
+
# domain=Tool.Domain.IMAGE_EDITING,
|
345 |
+
# args=[
|
346 |
+
# Tool.Argument(
|
347 |
+
# name="image",
|
348 |
+
# type=DataType.IMAGE,
|
349 |
+
# description="the image to be processed",
|
350 |
+
# ),
|
351 |
+
# Tool.Argument(
|
352 |
+
# name="mask",
|
353 |
+
# type=DataType.MASK,
|
354 |
+
# description="the mask of the matted region",
|
355 |
+
# ),
|
356 |
+
# ],
|
357 |
+
# returns=[
|
358 |
+
# Tool.Argument(
|
359 |
+
# name="image",
|
360 |
+
# type=DataType.IMAGE,
|
361 |
+
# description="the matted image",
|
362 |
+
# )
|
363 |
+
# ],
|
364 |
+
# model=image_matting,
|
365 |
+
# ),
|
366 |
+
]
|
367 |
+
|
368 |
+
IMAGE_GENERATION_TOOLS = [
|
369 |
+
Tool(
|
370 |
+
name="text_to_image",
|
371 |
+
description="generate an image based on the given description.",
|
372 |
+
domain=Tool.Domain.IMAGE_GENERATION,
|
373 |
+
args=[
|
374 |
+
Tool.Argument(
|
375 |
+
name="text",
|
376 |
+
type=DataType.TEXT,
|
377 |
+
description="the text describing the image",
|
378 |
+
),
|
379 |
+
],
|
380 |
+
returns=[
|
381 |
+
Tool.Argument(
|
382 |
+
name="image",
|
383 |
+
type=DataType.IMAGE,
|
384 |
+
description="the generated image",
|
385 |
+
)
|
386 |
+
],
|
387 |
+
model=text2image,
|
388 |
+
),
|
389 |
+
Tool(
|
390 |
+
name="image_to_image",
|
391 |
+
description="generate an new image based on the given image.",
|
392 |
+
domain=Tool.Domain.IMAGE_GENERATION,
|
393 |
+
args=[
|
394 |
+
Tool.Argument(
|
395 |
+
name="image",
|
396 |
+
type=DataType.IMAGE,
|
397 |
+
description="the given image",
|
398 |
+
),
|
399 |
+
],
|
400 |
+
returns=[
|
401 |
+
Tool.Argument(
|
402 |
+
name="image",
|
403 |
+
type=DataType.IMAGE,
|
404 |
+
description="the generated image",
|
405 |
+
)
|
406 |
+
],
|
407 |
+
model=image2image,
|
408 |
+
),
|
409 |
+
Tool(
|
410 |
+
name="line_text_to_image",
|
411 |
+
description="generate an image based on the given description and line map.",
|
412 |
+
domain=Tool.Domain.IMAGE_GENERATION,
|
413 |
+
args=[
|
414 |
+
Tool.Argument(
|
415 |
+
name="text",
|
416 |
+
type=DataType.TEXT,
|
417 |
+
description="the text describing the image",
|
418 |
+
),
|
419 |
+
Tool.Argument(
|
420 |
+
name="line",
|
421 |
+
type=DataType.LINE,
|
422 |
+
description="the line map outlining the line of the image",
|
423 |
+
),
|
424 |
+
],
|
425 |
+
returns=[
|
426 |
+
Tool.Argument(
|
427 |
+
name="image",
|
428 |
+
type=DataType.IMAGE,
|
429 |
+
description="the generated image",
|
430 |
+
)
|
431 |
+
],
|
432 |
+
model=linetext2image,
|
433 |
+
),
|
434 |
+
Tool(
|
435 |
+
name="hed_text_to_image",
|
436 |
+
description="generate an image based on the given description and HED map (holistically-nested edge detection).",
|
437 |
+
domain=Tool.Domain.IMAGE_GENERATION,
|
438 |
+
args=[
|
439 |
+
Tool.Argument(
|
440 |
+
name="text",
|
441 |
+
type=DataType.TEXT,
|
442 |
+
description="the text describing the image",
|
443 |
+
),
|
444 |
+
Tool.Argument(
|
445 |
+
name="hed",
|
446 |
+
type=DataType.HED,
|
447 |
+
description="the HED map outlining the edge of the image",
|
448 |
+
),
|
449 |
+
],
|
450 |
+
returns=[
|
451 |
+
Tool.Argument(
|
452 |
+
name="image",
|
453 |
+
type=DataType.IMAGE,
|
454 |
+
description="the generated image",
|
455 |
+
)
|
456 |
+
],
|
457 |
+
model=hedtext2image,
|
458 |
+
),
|
459 |
+
Tool(
|
460 |
+
name="scribble_text_to_image",
|
461 |
+
description="generate an image based on the given description and the scribble.",
|
462 |
+
domain=Tool.Domain.IMAGE_GENERATION,
|
463 |
+
args=[
|
464 |
+
Tool.Argument(
|
465 |
+
name="text",
|
466 |
+
type=DataType.TEXT,
|
467 |
+
description="the text describing the image",
|
468 |
+
),
|
469 |
+
Tool.Argument(
|
470 |
+
name="scribble",
|
471 |
+
type=DataType.SCRIBBLE,
|
472 |
+
description="the scribble outlining the image",
|
473 |
+
),
|
474 |
+
],
|
475 |
+
returns=[
|
476 |
+
Tool.Argument(
|
477 |
+
name="image",
|
478 |
+
type=DataType.IMAGE,
|
479 |
+
description="the generated image",
|
480 |
+
)
|
481 |
+
],
|
482 |
+
model=scribbletext2image,
|
483 |
+
),
|
484 |
+
Tool(
|
485 |
+
name="pose_text_to_image",
|
486 |
+
description="generate an image based on the given description and the pose.",
|
487 |
+
domain=Tool.Domain.IMAGE_GENERATION,
|
488 |
+
args=[
|
489 |
+
Tool.Argument(
|
490 |
+
name="text",
|
491 |
+
type=DataType.TEXT,
|
492 |
+
description="the text describing the image",
|
493 |
+
),
|
494 |
+
Tool.Argument(
|
495 |
+
name="pose",
|
496 |
+
type=DataType.POSE,
|
497 |
+
description="the pose of the human in the image",
|
498 |
+
),
|
499 |
+
],
|
500 |
+
returns=[
|
501 |
+
Tool.Argument(
|
502 |
+
name="image",
|
503 |
+
type=DataType.IMAGE,
|
504 |
+
description="the generated image",
|
505 |
+
)
|
506 |
+
],
|
507 |
+
model=posetext2image,
|
508 |
+
),
|
509 |
+
Tool(
|
510 |
+
name="segmentation_text_to_image",
|
511 |
+
description="generate an image based on the given description and segmentation mask.",
|
512 |
+
domain=Tool.Domain.IMAGE_GENERATION,
|
513 |
+
args=[
|
514 |
+
Tool.Argument(
|
515 |
+
name="text",
|
516 |
+
type=DataType.TEXT,
|
517 |
+
description="the text describing the image",
|
518 |
+
),
|
519 |
+
Tool.Argument(
|
520 |
+
name="segmentation",
|
521 |
+
type=DataType.SEGMENTATION,
|
522 |
+
description="the segmentation mask describing the structure of the image",
|
523 |
+
),
|
524 |
+
],
|
525 |
+
returns=[
|
526 |
+
Tool.Argument(
|
527 |
+
name="image",
|
528 |
+
type=DataType.IMAGE,
|
529 |
+
description="the generated image",
|
530 |
+
)
|
531 |
+
],
|
532 |
+
model=segtext2image,
|
533 |
+
),
|
534 |
+
Tool(
|
535 |
+
name="edge_text_to_image",
|
536 |
+
description="generate an image based on the given description and edge map.",
|
537 |
+
domain=Tool.Domain.IMAGE_GENERATION,
|
538 |
+
args=[
|
539 |
+
Tool.Argument(
|
540 |
+
name="text",
|
541 |
+
type=DataType.TEXT,
|
542 |
+
description="the text describing the image",
|
543 |
+
),
|
544 |
+
Tool.Argument(
|
545 |
+
name="edge",
|
546 |
+
type=DataType.EDGE,
|
547 |
+
description="the edge map describing the structure of the image",
|
548 |
+
),
|
549 |
+
],
|
550 |
+
returns=[
|
551 |
+
Tool.Argument(
|
552 |
+
name="image",
|
553 |
+
type=DataType.IMAGE,
|
554 |
+
description="the generated image",
|
555 |
+
)
|
556 |
+
],
|
557 |
+
model=cannytext2image,
|
558 |
+
),
|
559 |
+
Tool(
|
560 |
+
name="depth_text_to_image",
|
561 |
+
description="generate an image based on the given description and depth map.",
|
562 |
+
domain=Tool.Domain.IMAGE_GENERATION,
|
563 |
+
args=[
|
564 |
+
Tool.Argument(
|
565 |
+
name="text",
|
566 |
+
type=DataType.TEXT,
|
567 |
+
description="the text describing the image",
|
568 |
+
),
|
569 |
+
Tool.Argument(
|
570 |
+
name="depth",
|
571 |
+
type=DataType.DEPTH,
|
572 |
+
description="the depth map describing the structure of the image",
|
573 |
+
),
|
574 |
+
],
|
575 |
+
returns=[
|
576 |
+
Tool.Argument(
|
577 |
+
name="image",
|
578 |
+
type=DataType.IMAGE,
|
579 |
+
description="the generated image",
|
580 |
+
)
|
581 |
+
],
|
582 |
+
model=depthtext2image,
|
583 |
+
),
|
584 |
+
Tool(
|
585 |
+
name="normal_text_to_image",
|
586 |
+
description="generate an image based on the given description and normal map.",
|
587 |
+
domain=Tool.Domain.IMAGE_GENERATION,
|
588 |
+
args=[
|
589 |
+
Tool.Argument(
|
590 |
+
name="text",
|
591 |
+
type=DataType.TEXT,
|
592 |
+
description="the text describing the image",
|
593 |
+
),
|
594 |
+
Tool.Argument(
|
595 |
+
name="normal",
|
596 |
+
type=DataType.NORMAL,
|
597 |
+
description="the normal map describing the structure of the image",
|
598 |
+
),
|
599 |
+
],
|
600 |
+
returns=[
|
601 |
+
Tool.Argument(
|
602 |
+
name="image",
|
603 |
+
type=DataType.IMAGE,
|
604 |
+
description="the generated image",
|
605 |
+
)
|
606 |
+
],
|
607 |
+
model=normaltext2image,
|
608 |
+
),
|
609 |
+
]
|
610 |
+
|
611 |
+
IMAGE_TRANSFORM_TOOLS = [
|
612 |
+
Tool(
|
613 |
+
name="image_to_edge",
|
614 |
+
description="get the edge map of the image.",
|
615 |
+
domain=Tool.Domain.IMAGE_PROCESSING,
|
616 |
+
args=[
|
617 |
+
Tool.Argument(
|
618 |
+
name="image",
|
619 |
+
type=DataType.IMAGE,
|
620 |
+
description="the image to be processed",
|
621 |
+
),
|
622 |
+
],
|
623 |
+
returns=[
|
624 |
+
Tool.Argument(
|
625 |
+
name="edge",
|
626 |
+
type=DataType.EDGE,
|
627 |
+
description="the edge map of the image",
|
628 |
+
)
|
629 |
+
],
|
630 |
+
model=image2canny,
|
631 |
+
),
|
632 |
+
Tool(
|
633 |
+
name="image_to_line",
|
634 |
+
description="get the line map of the image.",
|
635 |
+
domain=Tool.Domain.IMAGE_PROCESSING,
|
636 |
+
args=[
|
637 |
+
Tool.Argument(
|
638 |
+
name="image",
|
639 |
+
type=DataType.IMAGE,
|
640 |
+
description="the image to be processed",
|
641 |
+
),
|
642 |
+
],
|
643 |
+
returns=[
|
644 |
+
Tool.Argument(
|
645 |
+
name="line",
|
646 |
+
type=DataType.LINE,
|
647 |
+
description="the line map of the image",
|
648 |
+
)
|
649 |
+
],
|
650 |
+
model=image2line,
|
651 |
+
),
|
652 |
+
Tool(
|
653 |
+
name="image_to_hed",
|
654 |
+
description="get the HED map of the image.",
|
655 |
+
domain=Tool.Domain.IMAGE_PROCESSING,
|
656 |
+
args=[
|
657 |
+
Tool.Argument(
|
658 |
+
name="image",
|
659 |
+
type=DataType.IMAGE,
|
660 |
+
description="the image to be processed",
|
661 |
+
),
|
662 |
+
],
|
663 |
+
returns=[
|
664 |
+
Tool.Argument(
|
665 |
+
name="hed",
|
666 |
+
type=DataType.HED,
|
667 |
+
description="the hed map of the image",
|
668 |
+
)
|
669 |
+
],
|
670 |
+
model=image2hed,
|
671 |
+
),
|
672 |
+
Tool(
|
673 |
+
name="image_to_scribble",
|
674 |
+
description="get the scribble of the image.",
|
675 |
+
domain=Tool.Domain.IMAGE_PROCESSING,
|
676 |
+
args=[
|
677 |
+
Tool.Argument(
|
678 |
+
name="image",
|
679 |
+
type=DataType.IMAGE,
|
680 |
+
description="the image to be processed",
|
681 |
+
),
|
682 |
+
],
|
683 |
+
returns=[
|
684 |
+
Tool.Argument(
|
685 |
+
name="scribble",
|
686 |
+
type=DataType.SCRIBBLE,
|
687 |
+
description="the scribble of the image",
|
688 |
+
)
|
689 |
+
],
|
690 |
+
model=image2scribble,
|
691 |
+
),
|
692 |
+
Tool(
|
693 |
+
name="image_to_pose",
|
694 |
+
description="Get the pose of the image. It is usually used in image generation conditioned on pose map from input image.",
|
695 |
+
domain=Tool.Domain.IMAGE_PROCESSING,
|
696 |
+
args=[
|
697 |
+
Tool.Argument(
|
698 |
+
name="image",
|
699 |
+
type=DataType.IMAGE,
|
700 |
+
description="the image to be processed",
|
701 |
+
),
|
702 |
+
],
|
703 |
+
returns=[
|
704 |
+
Tool.Argument(
|
705 |
+
name="pose",
|
706 |
+
type=DataType.POSE,
|
707 |
+
description="the pose of the image",
|
708 |
+
)
|
709 |
+
],
|
710 |
+
model=image2pose,
|
711 |
+
),
|
712 |
+
Tool(
|
713 |
+
name="image_to_depth",
|
714 |
+
description="get the depth map of the image.",
|
715 |
+
domain=Tool.Domain.IMAGE_PROCESSING,
|
716 |
+
args=[
|
717 |
+
Tool.Argument(
|
718 |
+
name="image",
|
719 |
+
type=DataType.IMAGE,
|
720 |
+
description="the image to be processed",
|
721 |
+
),
|
722 |
+
],
|
723 |
+
returns=[
|
724 |
+
Tool.Argument(
|
725 |
+
name="depth",
|
726 |
+
type=DataType.DEPTH,
|
727 |
+
description="the depth map",
|
728 |
+
)
|
729 |
+
],
|
730 |
+
model=image2depth,
|
731 |
+
),
|
732 |
+
Tool(
|
733 |
+
name="image_to_normal",
|
734 |
+
description="get the normal map of the image.",
|
735 |
+
domain=Tool.Domain.IMAGE_PROCESSING,
|
736 |
+
args=[
|
737 |
+
Tool.Argument(
|
738 |
+
name="image",
|
739 |
+
type=DataType.IMAGE,
|
740 |
+
description="the image to be processed",
|
741 |
+
),
|
742 |
+
],
|
743 |
+
returns=[
|
744 |
+
Tool.Argument(
|
745 |
+
name="normal",
|
746 |
+
type=DataType.NORMAL,
|
747 |
+
description="the normal map",
|
748 |
+
)
|
749 |
+
],
|
750 |
+
model=image2normal,
|
751 |
+
),
|
752 |
+
]
|
753 |
+
|
754 |
+
IMAGE_PERCEPTION_TOOLS = [
|
755 |
+
Tool(
|
756 |
+
name="object_detection",
|
757 |
+
description="detect all the objects in the image.",
|
758 |
+
domain=Tool.Domain.IMAGE_PERCEPTION,
|
759 |
+
args=[
|
760 |
+
Tool.Argument(
|
761 |
+
name="image",
|
762 |
+
type=DataType.IMAGE,
|
763 |
+
description="the image that contains the objects",
|
764 |
+
),
|
765 |
+
],
|
766 |
+
returns=[
|
767 |
+
Tool.Argument(
|
768 |
+
name="object",
|
769 |
+
type=DataType.BBOX,
|
770 |
+
description="the detected objects in json format. "
|
771 |
+
"example output: [\{'score': 0.9994931221008301, 'label': 'dog', 'box': \{'xmin': 466, 'ymin': 301, 'xmax': 1045, 'ymax': 583\}\}]",
|
772 |
+
)
|
773 |
+
],
|
774 |
+
model=object_detection,
|
775 |
+
),
|
776 |
+
Tool(
|
777 |
+
name="image_classification",
|
778 |
+
description="classify the objects in the image.",
|
779 |
+
domain=Tool.Domain.IMAGE_PERCEPTION,
|
780 |
+
usages=["ask about the class of the image"],
|
781 |
+
args=[
|
782 |
+
Tool.Argument(
|
783 |
+
name="image",
|
784 |
+
type=DataType.IMAGE,
|
785 |
+
description="the image that contains the objects",
|
786 |
+
),
|
787 |
+
],
|
788 |
+
returns=[
|
789 |
+
Tool.Argument(
|
790 |
+
name="category",
|
791 |
+
type=DataType.CATEGORY,
|
792 |
+
description="the categories in json format. "
|
793 |
+
"example output: [\{'score': 0.9, 'label': 'dog'\}]",
|
794 |
+
)
|
795 |
+
],
|
796 |
+
model=image_classification,
|
797 |
+
),
|
798 |
+
Tool(
|
799 |
+
name="video_classification",
|
800 |
+
description="Classify the video and detect the actions in the video.",
|
801 |
+
domain=Tool.Domain.VIDEO_PERCEPTION,
|
802 |
+
usages=["ask about the class of the video"],
|
803 |
+
args=[
|
804 |
+
Tool.Argument(
|
805 |
+
name="video",
|
806 |
+
type=DataType.VIDEO,
|
807 |
+
description="the given video",
|
808 |
+
),
|
809 |
+
],
|
810 |
+
returns=[
|
811 |
+
Tool.Argument(
|
812 |
+
name="category",
|
813 |
+
type=DataType.CATEGORY,
|
814 |
+
description="the categories in json format. "
|
815 |
+
"example output: [\{'score': 0.9, 'label': 'Playing basketball'\}]",
|
816 |
+
)
|
817 |
+
],
|
818 |
+
model=video_classification,
|
819 |
+
),
|
820 |
+
Tool(
|
821 |
+
name="image_instance_segmentation",
|
822 |
+
description="segment the common objects in the given image.",
|
823 |
+
domain=Tool.Domain.IMAGE_PERCEPTION,
|
824 |
+
args=[
|
825 |
+
Tool.Argument(
|
826 |
+
name="image",
|
827 |
+
type=DataType.IMAGE,
|
828 |
+
description="the image that need to be segmented",
|
829 |
+
),
|
830 |
+
],
|
831 |
+
returns=[
|
832 |
+
Tool.Argument(
|
833 |
+
name="mask", type=DataType.MASK, description="the output mask"
|
834 |
+
)
|
835 |
+
],
|
836 |
+
model=segment_objects,
|
837 |
+
),
|
838 |
+
Tool(
|
839 |
+
name="image_segmentation_by_mask",
|
840 |
+
description="segment the given image with the prompt mask.",
|
841 |
+
domain=Tool.Domain.IMAGE_PERCEPTION,
|
842 |
+
args=[
|
843 |
+
Tool.Argument(
|
844 |
+
name="image",
|
845 |
+
type=DataType.IMAGE,
|
846 |
+
description="the image that need to be segmented",
|
847 |
+
),
|
848 |
+
Tool.Argument(
|
849 |
+
name="prompt_mask",
|
850 |
+
type=DataType.MASK,
|
851 |
+
description="the prompt mask that guides the segmentation",
|
852 |
+
),
|
853 |
+
],
|
854 |
+
returns=[
|
855 |
+
Tool.Argument(
|
856 |
+
name="mask", type=DataType.MASK, description="the output mask"
|
857 |
+
)
|
858 |
+
],
|
859 |
+
model=seg_by_mask,
|
860 |
+
),
|
861 |
+
Tool(
|
862 |
+
name="image_segmentation_by_points",
|
863 |
+
description="segment the given image with the prompt points.",
|
864 |
+
domain=Tool.Domain.IMAGE_PERCEPTION,
|
865 |
+
args=[
|
866 |
+
Tool.Argument(
|
867 |
+
name="image",
|
868 |
+
type=DataType.IMAGE,
|
869 |
+
description="the image that need to be segmented",
|
870 |
+
),
|
871 |
+
Tool.Argument(
|
872 |
+
name="prompt_points",
|
873 |
+
type=DataType.POINT,
|
874 |
+
description="the prompt points that guides the segmentation",
|
875 |
+
),
|
876 |
+
],
|
877 |
+
returns=[
|
878 |
+
Tool.Argument(
|
879 |
+
name="mask", type=DataType.MASK, description="the output mask"
|
880 |
+
)
|
881 |
+
],
|
882 |
+
model=seg_by_points,
|
883 |
+
),
|
884 |
+
Tool(
|
885 |
+
name="segment_anything",
|
886 |
+
description="Segment the given image without other inputs. This tool return the segmentation map for input image. The segmentation can be used to generate a new image.",
|
887 |
+
domain=Tool.Domain.IMAGE_PERCEPTION,
|
888 |
+
args=[
|
889 |
+
Tool.Argument(
|
890 |
+
name="image",
|
891 |
+
type=DataType.IMAGE,
|
892 |
+
description="the image that need to be segmented",
|
893 |
+
),
|
894 |
+
],
|
895 |
+
returns=[
|
896 |
+
Tool.Argument(
|
897 |
+
name="segmentation",
|
898 |
+
type=DataType.SEGMENTATION,
|
899 |
+
description="the output segmentation",
|
900 |
+
)
|
901 |
+
],
|
902 |
+
model=segment_all,
|
903 |
+
),
|
904 |
+
Tool(
|
905 |
+
name="visual_grounding",
|
906 |
+
description="Visual Grounding (VG) aims to locate the most relevant object or region in an image, based on a natural language query. The query can be a phrase, a sentence or even a multi-round dialogue.",
|
907 |
+
domain=Tool.Domain.IMAGE_PERCEPTION,
|
908 |
+
args=[
|
909 |
+
Tool.Argument(
|
910 |
+
name="image",
|
911 |
+
type=DataType.IMAGE,
|
912 |
+
description="the image that need to be processed",
|
913 |
+
),
|
914 |
+
Tool.Argument(
|
915 |
+
name="query",
|
916 |
+
type=DataType.TEXT,
|
917 |
+
description="a query that can be a phrase, a sentence",
|
918 |
+
),
|
919 |
+
],
|
920 |
+
returns=[
|
921 |
+
Tool.Argument(
|
922 |
+
name="bbox",
|
923 |
+
type=DataType.BBOX,
|
924 |
+
description="the detected bounding boxes for ",
|
925 |
+
)
|
926 |
+
],
|
927 |
+
model=visual_grounding,
|
928 |
+
),
|
929 |
+
Tool(
|
930 |
+
name="optical_character_recognition",
|
931 |
+
description="Optical Character Recognition (OCR) is the process that converts an image of text into a machine-readable text format.",
|
932 |
+
domain=Tool.Domain.IMAGE_PERCEPTION,
|
933 |
+
args=[
|
934 |
+
Tool.Argument(
|
935 |
+
name="image",
|
936 |
+
type=DataType.IMAGE,
|
937 |
+
description="the image that need to be processed",
|
938 |
+
)
|
939 |
+
],
|
940 |
+
returns=[
|
941 |
+
Tool.Argument(
|
942 |
+
name="text",
|
943 |
+
type=DataType.TEXT,
|
944 |
+
description="the recognized text",
|
945 |
+
)
|
946 |
+
],
|
947 |
+
model=ocr,
|
948 |
+
),
|
949 |
+
]
|
950 |
+
|
951 |
+
GENERAL_TOOLS = [
|
952 |
+
Tool(
|
953 |
+
name="select_category",
|
954 |
+
description="select the target classes in category list with the given condition.",
|
955 |
+
domain=Tool.Domain.GENERAL,
|
956 |
+
usages=["pick out the objects with the same type"],
|
957 |
+
args=[
|
958 |
+
Tool.Argument(
|
959 |
+
name="category_list",
|
960 |
+
type=DataType.CATEGORY,
|
961 |
+
description="the list to be processed",
|
962 |
+
),
|
963 |
+
Tool.Argument(
|
964 |
+
name="condition",
|
965 |
+
type=DataType.TEXT,
|
966 |
+
description="the condition to select objects",
|
967 |
+
),
|
968 |
+
],
|
969 |
+
returns=[
|
970 |
+
Tool.Argument(
|
971 |
+
name="target_category_result",
|
972 |
+
type=DataType.CATEGORY,
|
973 |
+
description="the selected list",
|
974 |
+
)
|
975 |
+
],
|
976 |
+
model=select,
|
977 |
+
),
|
978 |
+
Tool(
|
979 |
+
name="select_bbox",
|
980 |
+
description="select the bounding boxes with the given condition.",
|
981 |
+
domain=Tool.Domain.GENERAL,
|
982 |
+
usages=["filter out the bounding boxes with the same type"],
|
983 |
+
args=[
|
984 |
+
Tool.Argument(
|
985 |
+
name="bbox_list",
|
986 |
+
type=DataType.BBOX,
|
987 |
+
description="the bounding box list to be processed",
|
988 |
+
),
|
989 |
+
Tool.Argument(
|
990 |
+
name="condition",
|
991 |
+
type=DataType.TEXT,
|
992 |
+
description="the condition to select objects",
|
993 |
+
),
|
994 |
+
],
|
995 |
+
returns=[
|
996 |
+
Tool.Argument(
|
997 |
+
name="result",
|
998 |
+
type=DataType.BBOX,
|
999 |
+
description="the selected bbox list",
|
1000 |
+
)
|
1001 |
+
],
|
1002 |
+
model=select,
|
1003 |
+
),
|
1004 |
+
Tool(
|
1005 |
+
name="select_mask",
|
1006 |
+
description="select the masks with the given condition.",
|
1007 |
+
domain=Tool.Domain.GENERAL,
|
1008 |
+
args=[
|
1009 |
+
Tool.Argument(
|
1010 |
+
name="mask_list",
|
1011 |
+
type=DataType.MASK,
|
1012 |
+
description="the list to be processed",
|
1013 |
+
),
|
1014 |
+
Tool.Argument(
|
1015 |
+
name="condition",
|
1016 |
+
type=DataType.TEXT,
|
1017 |
+
description="the condition to select objects",
|
1018 |
+
),
|
1019 |
+
],
|
1020 |
+
returns=[
|
1021 |
+
Tool.Argument(
|
1022 |
+
name="result",
|
1023 |
+
type=DataType.MASK,
|
1024 |
+
description="the selected mask list",
|
1025 |
+
)
|
1026 |
+
],
|
1027 |
+
model=select,
|
1028 |
+
),
|
1029 |
+
Tool(
|
1030 |
+
name="count_categories",
|
1031 |
+
description="count target categories in the given list.",
|
1032 |
+
domain=Tool.Domain.GENERAL,
|
1033 |
+
args=[
|
1034 |
+
Tool.Argument(
|
1035 |
+
name="category_list",
|
1036 |
+
type=DataType.CATEGORY,
|
1037 |
+
description="the list to be processed",
|
1038 |
+
),
|
1039 |
+
],
|
1040 |
+
returns=[
|
1041 |
+
Tool.Argument(
|
1042 |
+
name="length",
|
1043 |
+
type=DataType.TEXT,
|
1044 |
+
description="the length of the given list, return in the string format."
|
1045 |
+
"Example: The length of the given list is 10",
|
1046 |
+
)
|
1047 |
+
],
|
1048 |
+
model=count,
|
1049 |
+
),
|
1050 |
+
Tool(
|
1051 |
+
name="count_objects",
|
1052 |
+
description="count target objects in the given list. It is useful when you want to count the number of objects in the image",
|
1053 |
+
domain=Tool.Domain.GENERAL,
|
1054 |
+
args=[
|
1055 |
+
Tool.Argument(
|
1056 |
+
name="bbox_list",
|
1057 |
+
type=DataType.BBOX,
|
1058 |
+
description="the bounding box list to be counted",
|
1059 |
+
),
|
1060 |
+
],
|
1061 |
+
returns=[
|
1062 |
+
Tool.Argument(
|
1063 |
+
name="length",
|
1064 |
+
type=DataType.TEXT,
|
1065 |
+
description="the length of the given list, return in the string format."
|
1066 |
+
"Example: The length of the given list is 10",
|
1067 |
+
)
|
1068 |
+
],
|
1069 |
+
model=count,
|
1070 |
+
),
|
1071 |
+
Tool(
|
1072 |
+
name="count_masks",
|
1073 |
+
description="count target mask in the given list.",
|
1074 |
+
domain=Tool.Domain.GENERAL,
|
1075 |
+
args=[
|
1076 |
+
Tool.Argument(
|
1077 |
+
name="mask_list",
|
1078 |
+
type=DataType.MASK,
|
1079 |
+
description="the list to be processed",
|
1080 |
+
),
|
1081 |
+
],
|
1082 |
+
returns=[
|
1083 |
+
Tool.Argument(
|
1084 |
+
name="length",
|
1085 |
+
type=DataType.TEXT,
|
1086 |
+
description="the length of the given list, return in the string format."
|
1087 |
+
"Example: The length of the given list is 10",
|
1088 |
+
)
|
1089 |
+
],
|
1090 |
+
model=count,
|
1091 |
+
),
|
1092 |
+
]
|
1093 |
+
|
1094 |
+
VIDEO_TOOLS = [
|
1095 |
+
# VIDEO
|
1096 |
+
Tool(
|
1097 |
+
name="video_captioning",
|
1098 |
+
description='Generate a caption or description for video. It can generate a detailed description that can be used for video perception and video generation. For example: a) you can use this tool when you want to know what happened in the video"; and b) when you want to generate tags for input video, you can use translate description obtained from `image_captioning` into tags.',
|
1099 |
+
domain=Tool.Domain.VIDEO_PERCEPTION,
|
1100 |
+
args=[
|
1101 |
+
Tool.Argument(
|
1102 |
+
name="video",
|
1103 |
+
type=DataType.VIDEO,
|
1104 |
+
description="the video to be captioned.",
|
1105 |
+
),
|
1106 |
+
],
|
1107 |
+
returns=[
|
1108 |
+
Tool.Argument(
|
1109 |
+
name="caption",
|
1110 |
+
type=DataType.TEXT,
|
1111 |
+
description="the caption or description of input video.",
|
1112 |
+
)
|
1113 |
+
],
|
1114 |
+
model=video_captioning,
|
1115 |
+
),
|
1116 |
+
Tool(
|
1117 |
+
name="image_audio_to_video",
|
1118 |
+
description="Generate a video with speech to introduce the image.",
|
1119 |
+
domain=Tool.Domain.VIDEO_GENERATION,
|
1120 |
+
args=[
|
1121 |
+
Tool.Argument(
|
1122 |
+
name="image",
|
1123 |
+
type=DataType.IMAGE,
|
1124 |
+
description="The input image to be introduced.",
|
1125 |
+
),
|
1126 |
+
Tool.Argument(
|
1127 |
+
name="audio",
|
1128 |
+
type=DataType.AUDIO,
|
1129 |
+
description="The audio contained the speech of image description.",
|
1130 |
+
),
|
1131 |
+
],
|
1132 |
+
returns=[
|
1133 |
+
Tool.Argument(
|
1134 |
+
name="video",
|
1135 |
+
type=DataType.VIDEO,
|
1136 |
+
description="Generated video that can introduce the image with speech",
|
1137 |
+
)
|
1138 |
+
],
|
1139 |
+
model=image_audio_to_video,
|
1140 |
+
),
|
1141 |
+
Tool(
|
1142 |
+
name="image_to_video",
|
1143 |
+
description="Generate a video based on image.",
|
1144 |
+
domain=Tool.Domain.VIDEO_GENERATION,
|
1145 |
+
args=[
|
1146 |
+
Tool.Argument(
|
1147 |
+
name="image",
|
1148 |
+
type=DataType.IMAGE,
|
1149 |
+
description="The input image.",
|
1150 |
+
),
|
1151 |
+
],
|
1152 |
+
returns=[
|
1153 |
+
Tool.Argument(
|
1154 |
+
name="video",
|
1155 |
+
type=DataType.VIDEO,
|
1156 |
+
description="Generated video from the input image.",
|
1157 |
+
)
|
1158 |
+
],
|
1159 |
+
model=image_to_video,
|
1160 |
+
),
|
1161 |
+
Tool(
|
1162 |
+
name="video_to_webpage",
|
1163 |
+
description="Generate a web page to promote and introduce the video.",
|
1164 |
+
domain=Tool.Domain.VIDEO_PROCESSING,
|
1165 |
+
args=[
|
1166 |
+
Tool.Argument(
|
1167 |
+
name="video",
|
1168 |
+
type=DataType.VIDEO,
|
1169 |
+
description="The input image to be introduced.",
|
1170 |
+
),
|
1171 |
+
Tool.Argument(
|
1172 |
+
name="title",
|
1173 |
+
type=DataType.TITLE,
|
1174 |
+
description="The title of video.",
|
1175 |
+
),
|
1176 |
+
Tool.Argument(
|
1177 |
+
name="tags",
|
1178 |
+
type=DataType.TAGS,
|
1179 |
+
description="The tags of video.",
|
1180 |
+
),
|
1181 |
+
Tool.Argument(
|
1182 |
+
name="description",
|
1183 |
+
type=DataType.TEXT,
|
1184 |
+
description="The description of video.",
|
1185 |
+
),
|
1186 |
+
],
|
1187 |
+
returns=[
|
1188 |
+
Tool.Argument(
|
1189 |
+
name="html_code",
|
1190 |
+
type=DataType.HTML,
|
1191 |
+
description="Generated HTML webpage with code that can introduce the video with speech.",
|
1192 |
+
)
|
1193 |
+
],
|
1194 |
+
model=video_to_webpage,
|
1195 |
+
),
|
1196 |
+
Tool(
|
1197 |
+
name="dub_video",
|
1198 |
+
description="Dub the input video with given audio track.",
|
1199 |
+
domain=Tool.Domain.VIDEO_EDITING,
|
1200 |
+
args=[
|
1201 |
+
Tool.Argument(
|
1202 |
+
name="video",
|
1203 |
+
type=DataType.VIDEO,
|
1204 |
+
description="The input image to be introduced.",
|
1205 |
+
),
|
1206 |
+
Tool.Argument(
|
1207 |
+
name="audio",
|
1208 |
+
type=DataType.AUDIO,
|
1209 |
+
description="The audio of video.",
|
1210 |
+
),
|
1211 |
+
],
|
1212 |
+
returns=[
|
1213 |
+
Tool.Argument(
|
1214 |
+
name="video",
|
1215 |
+
type=DataType.VIDEO,
|
1216 |
+
description="Output video with designated audio.",
|
1217 |
+
)
|
1218 |
+
],
|
1219 |
+
model=dub_video,
|
1220 |
+
),
|
1221 |
+
Tool(
|
1222 |
+
name="text_to_video",
|
1223 |
+
description="It takes as input a natural language description and produces a video matching that description",
|
1224 |
+
domain=Tool.Domain.VIDEO_GENERATION,
|
1225 |
+
args=[
|
1226 |
+
Tool.Argument(
|
1227 |
+
name="prompt",
|
1228 |
+
type=DataType.TEXT,
|
1229 |
+
description="the text describing the image",
|
1230 |
+
)
|
1231 |
+
],
|
1232 |
+
returns=[
|
1233 |
+
Tool.Argument(
|
1234 |
+
name="video",
|
1235 |
+
type=DataType.VIDEO,
|
1236 |
+
description="the generated video",
|
1237 |
+
)
|
1238 |
+
],
|
1239 |
+
model=text_to_video,
|
1240 |
+
),
|
1241 |
+
]
|
1242 |
+
|
1243 |
+
AUDIO_TOOLS = [
|
1244 |
+
# AUDIO
|
1245 |
+
Tool(
|
1246 |
+
name="text_to_music",
|
1247 |
+
description="Generate music condioned on input text/prompt. For example, you can use this tool when you want to generate music for a poem, generate a piece of music from image.",
|
1248 |
+
domain=Tool.Domain.AUDIO_GENERATION,
|
1249 |
+
args=[
|
1250 |
+
Tool.Argument(
|
1251 |
+
name="text",
|
1252 |
+
type=DataType.TEXT,
|
1253 |
+
description="Input text for music generation.",
|
1254 |
+
),
|
1255 |
+
],
|
1256 |
+
returns=[
|
1257 |
+
Tool.Argument(
|
1258 |
+
name="music",
|
1259 |
+
type=DataType.AUDIO,
|
1260 |
+
description="Generated music conditioned on text.",
|
1261 |
+
)
|
1262 |
+
],
|
1263 |
+
model=text_to_music,
|
1264 |
+
),
|
1265 |
+
Tool(
|
1266 |
+
name="text_to_speech",
|
1267 |
+
description="Create natural-sounding speech from text, where the speech can be generated in multiple languages and for multiple speakers",
|
1268 |
+
domain=Tool.Domain.AUDIO_GENERATION,
|
1269 |
+
args=[
|
1270 |
+
Tool.Argument(
|
1271 |
+
name="text",
|
1272 |
+
type=DataType.TEXT,
|
1273 |
+
description="The input text that will be translated into speech.",
|
1274 |
+
),
|
1275 |
+
],
|
1276 |
+
returns=[
|
1277 |
+
Tool.Argument(
|
1278 |
+
name="speech",
|
1279 |
+
type=DataType.AUDIO,
|
1280 |
+
description="Generated speech or voice conditioned on text.",
|
1281 |
+
)
|
1282 |
+
],
|
1283 |
+
model=text_to_speech,
|
1284 |
+
),
|
1285 |
+
Tool(
|
1286 |
+
name="audio_classification",
|
1287 |
+
description="Audio classification is the task of assigning a label or class to a given audio. It can be used for recognizing which command a user is giving or the emotion of a statement, as well as identifying a speaker.",
|
1288 |
+
domain=Tool.Domain.AUDIO_PERCEPTION,
|
1289 |
+
args=[
|
1290 |
+
Tool.Argument(
|
1291 |
+
name="audio",
|
1292 |
+
type=DataType.AUDIO,
|
1293 |
+
description="The input audio that will be classified.",
|
1294 |
+
),
|
1295 |
+
],
|
1296 |
+
returns=[
|
1297 |
+
Tool.Argument(
|
1298 |
+
name="speech",
|
1299 |
+
type=DataType.CATEGORY,
|
1300 |
+
description="The recognized categories in json format.",
|
1301 |
+
)
|
1302 |
+
],
|
1303 |
+
model=audio_classification,
|
1304 |
+
),
|
1305 |
+
]
|
1306 |
+
|
1307 |
+
NLP_TOOLS = [
|
1308 |
+
# Text
|
1309 |
+
Tool(
|
1310 |
+
name="text_to_text_generation",
|
1311 |
+
description="Text to text generation. It can be used for sentence acceptability judgment, Sentiment analysis, Paraphrasing/sentence similarity, Natural language inference, Sentence completion, Word sense disambiguation, Question answering.",
|
1312 |
+
domain=Tool.Domain.NATURAL_LANGUAGE_PROCESSING,
|
1313 |
+
args=[
|
1314 |
+
Tool.Argument(
|
1315 |
+
name="text",
|
1316 |
+
type=DataType.TEXT,
|
1317 |
+
description="The input text",
|
1318 |
+
),
|
1319 |
+
],
|
1320 |
+
returns=[
|
1321 |
+
Tool.Argument(
|
1322 |
+
name="answer",
|
1323 |
+
type=DataType.TEXT,
|
1324 |
+
description="Generated answer for given input.",
|
1325 |
+
)
|
1326 |
+
],
|
1327 |
+
model=text_to_text_generation,
|
1328 |
+
),
|
1329 |
+
Tool(
|
1330 |
+
name="title_generation",
|
1331 |
+
description="Generate a title for given text.",
|
1332 |
+
domain=Tool.Domain.NATURAL_LANGUAGE_PROCESSING,
|
1333 |
+
args=[
|
1334 |
+
Tool.Argument(
|
1335 |
+
name="text",
|
1336 |
+
type=DataType.TEXT,
|
1337 |
+
description="The input text",
|
1338 |
+
),
|
1339 |
+
],
|
1340 |
+
returns=[
|
1341 |
+
Tool.Argument(
|
1342 |
+
name="title",
|
1343 |
+
type=DataType.TITLE,
|
1344 |
+
description="Generated title based given sentences.",
|
1345 |
+
)
|
1346 |
+
],
|
1347 |
+
model=title_generation,
|
1348 |
+
),
|
1349 |
+
Tool(
|
1350 |
+
name="openai_chat_model",
|
1351 |
+
description="Answer the question by Large Language Model.",
|
1352 |
+
domain=Tool.Domain.QUESTION_ANSWERING,
|
1353 |
+
args=[
|
1354 |
+
Tool.Argument(
|
1355 |
+
name="input_msg",
|
1356 |
+
type=DataType.TEXT,
|
1357 |
+
description="The input text",
|
1358 |
+
)
|
1359 |
+
],
|
1360 |
+
returns=[
|
1361 |
+
Tool.Argument(
|
1362 |
+
name="answer",
|
1363 |
+
type=DataType.TEXT,
|
1364 |
+
description="Generated answer based given text.",
|
1365 |
+
)
|
1366 |
+
],
|
1367 |
+
model=openai_chat_model,
|
1368 |
+
),
|
1369 |
+
Tool(
|
1370 |
+
name="summarization",
|
1371 |
+
description="Summarize sentences, long narratives, articles, papers, textbooks.",
|
1372 |
+
domain=Tool.Domain.NATURAL_LANGUAGE_PROCESSING,
|
1373 |
+
args=[
|
1374 |
+
Tool.Argument(
|
1375 |
+
name="text",
|
1376 |
+
type=DataType.TEXT,
|
1377 |
+
description="The input text to be Summarized.",
|
1378 |
+
),
|
1379 |
+
],
|
1380 |
+
returns=[
|
1381 |
+
Tool.Argument(
|
1382 |
+
name="summarized_text",
|
1383 |
+
type=DataType.TEXT,
|
1384 |
+
description="Summarized text.",
|
1385 |
+
)
|
1386 |
+
],
|
1387 |
+
model=summarization,
|
1388 |
+
),
|
1389 |
+
Tool(
|
1390 |
+
name="text_to_tags",
|
1391 |
+
description="Predict the tags of text, article and papers by using the their textual content as input",
|
1392 |
+
domain=Tool.Domain.NATURAL_LANGUAGE_PROCESSING,
|
1393 |
+
args=[
|
1394 |
+
Tool.Argument(
|
1395 |
+
name="text",
|
1396 |
+
type=DataType.TEXT,
|
1397 |
+
description="The input text to be Summarized.",
|
1398 |
+
),
|
1399 |
+
],
|
1400 |
+
returns=[
|
1401 |
+
Tool.Argument(
|
1402 |
+
name="tags",
|
1403 |
+
type=DataType.TAGS,
|
1404 |
+
description="The extracted tags from input text",
|
1405 |
+
)
|
1406 |
+
],
|
1407 |
+
model=text_to_tags,
|
1408 |
+
),
|
1409 |
+
Tool(
|
1410 |
+
name="named_entity_recognition",
|
1411 |
+
description="Named-entity recognition (NER) (also known as (named) entity identification, entity chunking, and entity extraction) is a subtask of information extraction that seeks to locate and classify named entities mentioned in unstructured text into pre-defined categories such as person names, organizations, locations, medical codes, time expressions, quantities, monetary values, percentages, etc.",
|
1412 |
+
domain=Tool.Domain.NATURAL_LANGUAGE_PROCESSING,
|
1413 |
+
args=[
|
1414 |
+
Tool.Argument(
|
1415 |
+
name="text",
|
1416 |
+
type=DataType.TEXT,
|
1417 |
+
description="The input text from which the named entities are extracted",
|
1418 |
+
),
|
1419 |
+
],
|
1420 |
+
returns=[
|
1421 |
+
Tool.Argument(
|
1422 |
+
name="tags",
|
1423 |
+
type=DataType.TAGS,
|
1424 |
+
description="The extracted entities",
|
1425 |
+
)
|
1426 |
+
],
|
1427 |
+
model=None,
|
1428 |
+
),
|
1429 |
+
Tool(
|
1430 |
+
name="sentiment_analysis",
|
1431 |
+
description="Sentiment analysis is the process of analyzing digital text to determine if the emotional tone of the message is positive, negative, or neutral.",
|
1432 |
+
domain=Tool.Domain.NATURAL_LANGUAGE_PROCESSING,
|
1433 |
+
args=[
|
1434 |
+
Tool.Argument(
|
1435 |
+
name="text",
|
1436 |
+
type=DataType.TEXT,
|
1437 |
+
description="The input text to be analyzed",
|
1438 |
+
),
|
1439 |
+
],
|
1440 |
+
returns=[
|
1441 |
+
Tool.Argument(
|
1442 |
+
name="text",
|
1443 |
+
type=DataType.TEXT,
|
1444 |
+
description="The sentiment of text",
|
1445 |
+
)
|
1446 |
+
],
|
1447 |
+
model=sentiment_analysis,
|
1448 |
+
),
|
1449 |
+
Tool(
|
1450 |
+
name="extract_location",
|
1451 |
+
description="Extracts the locale name from the text. For example, if the text is 'what is the weather in Beijing', the tool will return 'Beijing'. If the text is 'Samuel ppops in a happy plce called Berlin which happens to be Kazakhstan', the tool will return 'Berlin,Kazakhstan'.",
|
1452 |
+
domain=Tool.Domain.NATURAL_LANGUAGE_PROCESSING,
|
1453 |
+
args=[
|
1454 |
+
Tool.Argument(
|
1455 |
+
name="text",
|
1456 |
+
type=DataType.TEXT,
|
1457 |
+
description="The input text to be analyzed",
|
1458 |
+
),
|
1459 |
+
],
|
1460 |
+
returns=[
|
1461 |
+
Tool.Argument(
|
1462 |
+
name="location",
|
1463 |
+
type=DataType.LOCATION,
|
1464 |
+
description="The sentiment of text",
|
1465 |
+
)
|
1466 |
+
],
|
1467 |
+
model=extract_location,
|
1468 |
+
),
|
1469 |
+
Tool(
|
1470 |
+
name="summarize_weather_condition",
|
1471 |
+
description="Translate the json formatted weather information into the text that human can understand. For example, when you want to generate a new image based on weather information",
|
1472 |
+
domain=Tool.Domain.NATURAL_LANGUAGE_PROCESSING,
|
1473 |
+
args=[
|
1474 |
+
Tool.Argument(
|
1475 |
+
name="weather",
|
1476 |
+
type=DataType.WEATHER,
|
1477 |
+
description="weather condition",
|
1478 |
+
)
|
1479 |
+
],
|
1480 |
+
returns=[
|
1481 |
+
Tool.Argument(
|
1482 |
+
name="weather_summary",
|
1483 |
+
type=DataType.TEXT,
|
1484 |
+
description="the weather summary",
|
1485 |
+
)
|
1486 |
+
],
|
1487 |
+
model=summarize_weather_condition,
|
1488 |
+
),
|
1489 |
+
]
|
1490 |
+
|
1491 |
+
TOOLS = (
|
1492 |
+
QUESTION_ANSWERING_TOOLS
|
1493 |
+
+ IMAGE_CAPTIONING_TOOLS
|
1494 |
+
+ IMAGE_EDITING_TOOLS
|
1495 |
+
+ IMAGE_GENERATION_TOOLS
|
1496 |
+
+ IMAGE_TRANSFORM_TOOLS
|
1497 |
+
+ IMAGE_PERCEPTION_TOOLS
|
1498 |
+
+ GENERAL_TOOLS
|
1499 |
+
+ VIDEO_TOOLS
|
1500 |
+
+ AUDIO_TOOLS
|
1501 |
+
+ NLP_TOOLS
|
1502 |
+
)
|
1503 |
+
TOOLS = {tool.name: tool for tool in TOOLS}
|
1504 |
+
|
1505 |
+
if __name__ == "__main__":
|
1506 |
+
tools = []
|
1507 |
+
for tool in TOOLS.values():
|
1508 |
+
tools.append(tool.dict())
|
1509 |
+
import json
|
1510 |
+
|
1511 |
+
with open("tools.json", "w") as f:
|
1512 |
+
json.dump(tools, f, indent=4)
|
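A minimal usage sketch (not part of the diff) of the registry built above, assuming the `Tool`/`DataType` definitions in cllm/agents/base.py and a running tool backend; the request text is illustrative:

    from cllm.agents.builtin.tools import TOOLS

    tool = TOOLS["sentiment_analysis"]            # look up a registered tool by name
    print(tool.description)                       # human-readable description
    print([arg.name for arg in tool.args])        # expected argument names: ["text"]
    # tool.model is the service wrapper; calling it requires the NLP service to be up:
    # result = tool.model(text="I love this demo!")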
cllm/agents/container.py
ADDED
@@ -0,0 +1,98 @@
import os
import sys

sys.path.append(os.getcwd())
import os.path as osp
from pathlib import Path
import json
from .base import DataType
from cllm.utils import get_real_path


# sys.path.insert(0, sys.path[0] + "/../")
FILE_EXT = {
    "image": ["png", "jpeg", "jpg", "gif", "bmp", "tiff", "webp"],
    "video": ["mp4", "mov", "avi", "mkv"],
    "audio": ["wav", "mp3"],
}


class Container:
    def __init__(self, name, rtype, value) -> None:
        self.name = name
        self.rtype = rtype
        self.value = value

    def to_chatbot(self):
        pass

    def __str__(self):
        pass

    def __repr__(self) -> str:
        return str(self)


class File(Container):
    def to_chatbot(self):
        return str(self.value)

    @property
    def filename(self):
        return os.path.basename(self.value)

    def __str__(self):
        return f"`{self.filename}`"


class HTML(File):
    def to_chatbot(self):
        return str(self.value)

    def __str__(self):
        return f"`{self.filename}`"


class Image(File):
    def __str__(self):
        return f"`{self.filename}`"


class Video(File):
    def __str__(self):
        return f"`{self.filename}`"


class Audio(File):
    def __str__(self):
        return f"`{self.filename}`"


class Text(Container):
    def to_chatbot(self):
        if isinstance(self.value, str):
            return self.value
        elif isinstance(self.value, (list, tuple, dict)):
            return json.dumps(self.value, indent=2)
        return self.value

    def __str__(self):
        if isinstance(self.value, (list, dict)):
            return json.dumps(self.value)
        elif isinstance(self.value, str):
            return self.value
        return str(self.value)


def auto_type(name, rtype, value):
    if value is None:
        return None
    if "image" in str(rtype):
        return Image(name, rtype, get_real_path(value))
    if DataType.VIDEO == rtype:
        return Video(name, rtype, get_real_path(value))
    if DataType.AUDIO == rtype:
        return Audio(name, rtype, get_real_path(value))
    if DataType.HTML == rtype:
        return HTML(name, rtype, get_real_path(value))
    return Text(name, rtype, value)
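A small sketch (not part of the diff) of how `auto_type` wraps tool outputs into containers; `DataType.IMAGE` is assumed here purely for illustration (any member whose string contains "image" behaves the same):

    from cllm.agents.base import DataType
    from cllm.agents.container import auto_type

    # a generated PNG is wrapped as an Image container pointing at its resolved path
    img = auto_type("edited_image", DataType.IMAGE, "3fa2b1_IMAGE.png")
    print(str(img))           # -> `3fa2b1_IMAGE.png`
    print(img.to_chatbot())   # the resolved path string shown to the chatbot

    # non-file values fall through to Text
    tags = auto_type("tags", DataType.TAGS, ["cat", "dog"])
    print(tags.to_chatbot())  # JSON-formatted list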
cllm/agents/tog/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .planner import Planner
from .controller import Controller
cllm/agents/tog/compiler.py
ADDED
@@ -0,0 +1,62 @@
from typing import List, Union
import ast
import sys
import os

sys.path.append(os.getcwd())
from cllm.agents.base import Action


class Parser:
    def parse(self, plan) -> List[Action]:
        # ignore indent
        input = "\n".join([line.strip() for line in plan.split("\n")])
        actions = []
        for stmt in ast.parse(input).body:
            if isinstance(stmt, ast.Assign):
                assign: ast.Assign = stmt
                output: ast.Name = assign.targets[0]
                func_call: ast.Call = assign.value
                func_name: ast.Name = func_call.func
                kwargs: List[ast.keyword] = func_call.keywords
                args = {}
                for kwarg in kwargs:
                    k = kwarg.arg
                    if isinstance(kwarg.value, ast.Name):
                        v = kwarg.value.id
                    else:
                        v = ast.literal_eval(kwarg.value)
                    args[k] = v
                action = Action(
                    tool_name=func_name.id, outputs=[output.id], inputs=args
                )
                actions.append(action)
        return actions


class Compiler:
    def __init__(self):
        self.parser = Parser()

    def compile(self, plan: Union[str, List[Union[Action, str]]]) -> List[Action]:
        """The input could be a plain string, a list of structured `Action`,
        or a mix of structured `Action` objects and unstructured action strings.
        """
        actions = self.parse(plan)
        actions = self.correct(actions)
        return actions

    def parse(self, plan) -> List[Action]:
        if isinstance(plan, str):
            return self.parser.parse(plan)

        actions = []
        for action in plan:
            if isinstance(action, str):
                action = self.parser.parse(action)[0]
            actions.append(action)

        return actions

    def correct(self, actions):
        return actions
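A minimal sketch (not part of the diff) of the plan syntax this parser accepts, assuming the `Action` dataclass from cllm/agents/base.py; the tool name and prompt are illustrative:

    from cllm.agents.tog.compiler import Compiler

    plan = 'image = text2image(text="a cat wearing a hat")'
    actions = Compiler().compile(plan)
    # one Action with tool_name="text2image", inputs={"text": "a cat wearing a hat"}, outputs=["image"]
    print(actions[0].tool_name, actions[0].inputs, actions[0].outputs)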
cllm/agents/tog/controller.py
ADDED
@@ -0,0 +1,157 @@
import traceback
import logging
from typing import Tuple, List
import copy
from pathlib import Path

import json
from collections import OrderedDict
import os
import sys

sys.path.append(os.getcwd())
from cllm.agents import container
from cllm.agents.builtin import BUILTIN_PLANS, load_builtin_plans
from cllm.agents.container import auto_type
from cllm.agents.base import DataType, NON_FILE_TYPES

from .interpretor import Interpretor
from .planner import Planner
from .responser import generate_response

logger = logging.getLogger(__name__)


class Controller:
    def __init__(self, stream=True, interpretor_kwargs={}):
        self.stream = stream
        self.planner = Planner(self.stream)
        self.interpretor = Interpretor(**interpretor_kwargs)
        self.SHORTCUT = "**Using builtin shortcut solution.**"
        BUILTIN_PLANS.update(load_builtin_plans("builtin_plan.json"))
        logger.info(BUILTIN_PLANS)

    def plan(self, request: str, state: dict):
        logger.info(request)

        resource_memory = state.get("resources", {})
        raw_solution = None
        # shortcut for builtin plan
        for trigger_prompt, _ in BUILTIN_PLANS.items():
            if request == trigger_prompt:
                return self.SHORTCUT

        # dynamic execution
        if raw_solution is None:
            raw_solution = self.planner.plan(request, resource_memory)
        return raw_solution

    def parse_solution_from_stream(self, raw_solution):
        return self.planner.parse(raw_solution)

    def execute(self, raw_solution: str, state: dict):
        resource_memory = state.get("resources")
        request = state["request"]
        solution = None
        if raw_solution == self.SHORTCUT:
            for trigger_prompt, builtin_plan in BUILTIN_PLANS.items():
                if request == trigger_prompt:
                    solution = builtin_plan
                    solution = self._fill_args(solution, resource_memory)
        else:
            solution = self.planner.parse(raw_solution)

        if not solution:
            return None
        try:
            history_msgs = state.get("history_msgs")
            return self.interpretor.interpret(solution, history_msgs)
        except Exception as e:
            traceback.print_exc()
            return None

    def reply(self, executed_plan: dict, outputs: list, state: dict):
        error_response = [
            auto_type(
                "response",
                DataType.TEXT,
                "Sorry, I cannot understand your request due to an internal error.",
            )
        ]
        state = copy.deepcopy(state)
        if (
            executed_plan is None
            or len(executed_plan) == 0
            or outputs is None
            or len(outputs) == 0
        ):
            return error_response, state
        resources = state.get("resources", OrderedDict())
        for o in outputs:
            if isinstance(o, container.File):
                resources[str(o.filename)] = str(o.rtype)
        state["resources"] = resources
        response = generate_response(state["request"], executed_plan, outputs)
        if len(response) == 0:
            return error_response, state
        logger.info(response)
        return response, state

    def run(self, task: str, state: dict) -> Tuple[List, str]:
        try:
            return self._run(task, state)
        except:
            traceback.print_exc()
            logger.info(traceback.format_exc())
            return [
                auto_type(
                    "response",
                    DataType.TEXT,
                    "Sorry, I cannot understand your request due to an internal error.",
                )
            ], "Internal Error"

    def _run(self, task: str, state: dict) -> Tuple[List, str]:
        logger.info(task)
        BUILTIN_PLANS.update(load_builtin_plans("builtin_plan.json"))
        logger.info(BUILTIN_PLANS)
        resource_memory = state.get("resources", OrderedDict())
        history_msgs = state.get("history_msgs", [])
        plan = None

        # shortcut for builtin plan
        for trigger_prompt, builtin_plan in BUILTIN_PLANS.items():
            if task == trigger_prompt:
                plan = builtin_plan
                plan = self._fill_args(plan, resource_memory)

        # dynamic execution
        if plan is None:
            plan = self.planner.planning(task, resource_memory)
        logger.info(plan)

        executed_plan, output_files = self.interpretor.interpret(
            plan, resource_memory, history_msgs
        )
        logger.info(output_files)
        for o in output_files:
            if isinstance(o, container.File):
                resource_memory[o.filename] = str(o.rtype)

        outputs = generate_response(task, executed_plan, output_files)

        logger.info(outputs)
        return outputs, executed_plan

    def _fill_args(self, plan, memory):
        plan = copy.deepcopy(plan)
        latest_resource = OrderedDict()
        for key, val in memory.items():
            latest_resource[val] = key

        for actions in plan:
            for action in actions:
                for key, val in action.inputs.items():
                    if "<TOOL-GENERATED>" not in val:
                        action.inputs[key] = latest_resource.get(val, val)
        return plan
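A rough sketch (not part of the diff) of the plan → execute → reply loop these methods expose; it assumes the caller drains the planner stream to its final solution chunk (as the Gradio app is expected to do), and the request text is illustrative:

    from cllm.agents.tog.controller import Controller

    controller = Controller(stream=True)
    state = {"request": "generate an image of a sunset over the sea",
             "resources": {},          # filename -> DataType string, filled as tools produce files
             "history_msgs": []}

    raw = controller.plan(state["request"], state)
    if raw != controller.SHORTCUT:
        *_, raw = raw                  # drain the planner stream; the last chunk is the solution

    outputs, executed_plan = [], None
    stream = controller.execute(raw, state)
    if stream is not None:
        for _, executed_plan, wrapped in stream:   # one yield per executed action
            outputs.extend(wrapped)

    response, state = controller.reply(executed_plan, outputs, state)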
cllm/agents/tog/interpretor.py
ADDED
@@ -0,0 +1,262 @@
import logging
from traceback import print_exc
from typing import List, Dict
import os.path as osp
import io
import copy
import re
import uuid

import sys
import os

sys.path.append(os.getcwd())
from cllm.agents.base import Action, DataType, Tool, NON_FILE_TYPES
from cllm.agents.builtin import TOOLS
from cllm.agents.container import auto_type
from cllm.utils import get_real_path, get_root_dir, transform_msgs

logger = logging.getLogger(__name__)


def code(source, type="py"):
    return f"```{type}\n{source}\n```"


class Interpretor:
    def __init__(self):
        self.tools = TOOLS
        self.non_file_types = NON_FILE_TYPES

    def interpret(self, stages: List[List[Action]], history_msgs: List = []):
        memory = {}
        solution = copy.deepcopy(stages)
        history_msgs = copy.deepcopy(history_msgs)
        history_msgs = transform_msgs(history_msgs)
        has_error = False
        for actions in solution:
            for action in actions:
                tool = self.load_tool(name=action.tool_name)
                tool_inputs = self.load_args(tool, action.inputs, memory)
                tool_inputs["history_msgs"] = history_msgs
                tool_inputs["root_dir"] = get_root_dir()
                try:
                    tool_outputs = tool.model(**tool_inputs)
                    action.inputs = self._update_inputs(memory, action.inputs)
                    action.outputs, wrapped_outputs = self._update_output(
                        memory, action, tool_outputs, tool
                    )
                    logger.info(
                        "Call {}, args {}, return {}".format(
                            action.tool_name, action.inputs, action.outputs
                        )
                    )
                    executed_action = (
                        action.tool_name,
                        action.inputs,
                        action.outputs,
                    )
                except FileNotFoundError as e:
                    print_exc()
                    tool_outputs = None
                    logger.error(f"Error when executing {action.tool_name}: {e}")
                    has_error = True
                    wrapped_outputs = []
                    executed_action = (
                        action.tool_name,
                        action.inputs,
                        f"FileNotFoundError: No such file or directory: {osp.basename(e.filename)}",
                    )
                except Exception as e:
                    print_exc()
                    tool_outputs = None
                    has_error = True
                    logger.error(f"Error when executing {action.tool_name}: {e}")
                    wrapped_outputs = []
                    executed_action = (
                        action.tool_name,
                        action.inputs,
                        f"Internal error: {e}",
                    )
                yield executed_action, solution, wrapped_outputs
                if has_error:
                    return

    def _update_output(self, memory, action, tool_outputs, tool):
        outputs = []
        wrapped_outputs = []
        if action.outputs is not None:
            if len(action.outputs) == 1:
                tool_outputs = [tool_outputs]
            for i, (arg_name, arg_value) in enumerate(
                zip(action.outputs, tool_outputs)
            ):
                memory[arg_name] = arg_value
                if arg_value is None:
                    outputs.append(arg_value)
                    wrapped_outputs.append(
                        auto_type(
                            arg_name,
                            DataType.TEXT,
                            None,
                        )
                    )
                    continue

                if isinstance(arg_value, (dict, list)):
                    arg_value = self.pretty_floats(arg_value)

                if tool.returns[i].type in self.non_file_types:
                    outputs.append(arg_value)
                    wrapped_outputs.append(
                        auto_type(
                            arg_name,
                            tool.returns[i].type,
                            arg_value,
                        )
                    )

                    continue

                transformed_output = self.transform_output(
                    action.inputs,
                    tool.name,
                    tool.args,
                    arg_value,
                    tool.returns[i].type,
                )

                outputs.append(transformed_output)
                memory[arg_name] = transformed_output
                if not isinstance(transformed_output, list):
                    wrapped_outputs.append(
                        auto_type(
                            arg_name,
                            tool.returns[i].type,
                            transformed_output,
                        )
                    )
                    continue

                for output in transformed_output:
                    if DataType.MASK == tool.returns[i].type:
                        output = output if isinstance(output, str) else output["mask"]
                    wrapped_outputs.append(
                        auto_type(
                            arg_name,
                            tool.returns[i].type,
                            output if isinstance(output, str) else output["mask"],
                        )
                    )
        return outputs, wrapped_outputs

    def pretty_floats(self, obj):
        if isinstance(obj, float):
            return round(obj, 4)
        elif isinstance(obj, dict):
            return dict((k, self.pretty_floats(v)) for k, v in obj.items())
        elif isinstance(obj, (list, tuple)):
            return list(map(self.pretty_floats, obj))
        return obj

    def _update_inputs(self, memory, action_inputs):
        action_inputs = copy.deepcopy(action_inputs)
        for key, value in action_inputs.items():
            if "<TOOL-GENERATED>" in value:
                action_inputs[key] = memory.get(value, value)
            elif "<GENERATED>" in value:
                action_inputs[key] = memory.get(value, value)

        return action_inputs

    def gen_filename(self, too_name, resource_type):
        def to_camelcase(s):
            res = re.sub(r"(?!^)_([a-zA-Z])", lambda m: m.group(1).upper(), s)
            res = res[0].upper() + res[1:]
            return res

        if resource_type == DataType.VIDEO:
            ext = "mp4"
        elif resource_type == DataType.AUDIO:
            ext = "wav"
        elif resource_type == DataType.HTML:
            ext = "html"
        else:
            ext = "png"
        too_name = too_name.replace("_to_", "2_")
        too_name = to_camelcase(too_name)
        this_file_id = str(uuid.uuid4())[:6]
        type_str = str(resource_type).split(".")[-1]
        return f"{this_file_id}_{type_str}.{ext}"

    def _save_resource(self, file_name, resource, resource_type):
        if isinstance(resource, dict):
            if "mask" in resource:
                resource = resource["mask"]
        if resource_type == DataType.HTML:
            with open(get_real_path(file_name), "w") as fout:
                fout.write(resource)
        elif resource is not None:
            if isinstance(resource, io.BufferedReader):
                resource = resource.read()
            with open(get_real_path(file_name), "wb") as fout:
                fout.write(resource)
        else:
            return None

    def transform_output(
        self, action_inputs, tool_name, tool_args, tool_output, output_type
    ):
        if output_type != DataType.MASK:
            if isinstance(tool_output, list):
                results = []
                for output in tool_output:
                    file_name = self.gen_filename(tool_name, output_type)
                    self._save_resource(file_name, output, output_type)
                    results.append(file_name)
                return results
            else:
                file_name = self.gen_filename(tool_name, output_type)
                self._save_resource(file_name, tool_output, output_type)
                return file_name

        tool_output = copy.deepcopy(tool_output)
        if isinstance(tool_output, list):
            for output in tool_output:
                if isinstance(output["mask"], str):
                    continue

                file_name = self.gen_filename(tool_name, output_type)
                self._save_resource(file_name, output, output_type)
                output["mask"] = file_name
        elif isinstance(tool_output, bytes):
            file_name = self.gen_filename(tool_name, output_type)
            self._save_resource(file_name, tool_output, output_type)
            tool_output = file_name
        elif tool_output is None:
            pass
        else:
            raise RuntimeError("Wrong type.")

        return tool_output

    def load_tool(self, name):
        return self.tools[name]

    def load_args(self, tool: Tool, action_inputs, memory):
        real_args = {}
        for item in tool.args:
            arg_name = item.name
            arg_value = action_inputs[arg_name]
            if "<GENERATED>" in arg_value or "<TOOL-GENERATED>" in arg_value:
                assert arg_value in memory, print(f"Unknown {arg_name}: {arg_value}")
                real_args[arg_name] = memory[arg_value]
            else:
                real_args[arg_name] = arg_value
        return real_args

    @property
    def variables(self):
        return {k: v for k, v in self.memory.items() if k not in TOOLS and k != "print"}
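A brief illustration (not part of the diff) of the placeholder convention `load_args` and `_update_inputs` rely on; the placeholder key and tool names below are hypothetical:

    # a planner solution refers to intermediate results symbolically, e.g.
    #   action 1: image_segmentation(image="abc_image.png")  -> outputs ["<TOOL-GENERATED>-masks"]
    #   action 2: image_matting(image="abc_image.png", mask="<TOOL-GENERATED>-masks")
    # after action 1 runs, its result is stored in memory under that key:
    #   memory["<TOOL-GENERATED>-masks"] = [...mask dicts...]
    # when action 2 is loaded, load_args sees "<TOOL-GENERATED>" in the value and
    # substitutes memory["<TOOL-GENERATED>-masks"], so the real masks reach the tool.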
cllm/agents/tog/planner.py
ADDED
@@ -0,0 +1,156 @@
import json
from typing import List
import logging
import os
import sys

sys.path.append(os.getcwd())
from ..base import Action, NON_FILE_TYPES

# from cllm.services.tog import TaskSolver, TaskDecomposer, config
# from cllm.services.nlp.llms import ChatOpenAI, MessageMemory
from cllm.services.tog.api import tog, task_decomposer
from collections import OrderedDict
import copy


logger = logging.getLogger(__name__)


class Planner:
    def __init__(
        self, streaming=False, backend="remote", device="cuda:0", **llm_kwargs
    ):
        self.streaming = streaming
        if backend == "local":
            pass
            # self.cfg = config
            # self.device = device
            # self.mem = MessageMemory(**self.cfg.memory)
            # self.llm = ChatOpenAI(temperature=0.2, **llm_kwargs)
            # self.tog = TaskSolver(self.llm, self.cfg.task_solver_config, device).solve
            # self.decomposer = TaskDecomposer(device, self.cfg.task_decomposer_cfg).solve
        elif backend == "remote":
            self.decomposer = task_decomposer
            self.tog = tog
        else:
            raise ValueError("Backend should be chosen from [remote, local]")

    def _find_latest_resource(self, resources, type):
        for key, val in list(resources.items())[::-1]:
            if val == type:
                return key
        return None

    def _check_task_decomposition(
        self, task_decomposition: str | list, available_resources: dict
    ):
        copy_task_decomposition = copy.deepcopy(task_decomposition)
        available_resources = copy.deepcopy(available_resources)
        if isinstance(copy_task_decomposition, str):
            copy_task_decomposition = json.loads(copy_task_decomposition)

        for subtask in copy_task_decomposition:
            for arg in subtask["args"]:
                if arg["type"] in NON_FILE_TYPES:
                    continue

                r_type = available_resources.get(arg["value"], "None").split(".")[-1]
                if arg["value"] not in available_resources or arg["type"] != r_type:
                    new_value = self._find_latest_resource(
                        available_resources, arg["type"]
                    )
                    if new_value is None:
                        logger.error(
                            f"No available resource for {arg['value']} with type {arg['type']}"
                        )
                        return None

                    arg["value"] = new_value

            available_resources[subtask["returns"][0]["value"]] = subtask["returns"][0][
                "type"
            ]
        return json.dumps(copy_task_decomposition, indent=2, ensure_ascii=False)

    def wrap_request(self, request, memory):
        logger.info(memory)
        resource_list = {k: v.split(".")[-1] for k, v in memory.items()}
        request = f"Resource list: {resource_list}\n{request}"
        logger.info(f"Input: {request}")
        return request

    def solve_streaming(self, request: str, memory: dict = OrderedDict()):
        request = self.wrap_request(request, memory)
        sub_tasks = self.decomposer(request, streaming=self.streaming)
        logger.info(f"Task decomposition: \n{sub_tasks}")
        sub_tasks = self._check_task_decomposition(sub_tasks, memory)
        yield sub_tasks
        if sub_tasks in [None, "", []]:
            yield None
        else:
            solutions = self.tog(request, sub_tasks, streaming=self.streaming)
            yield solutions

    def solve(self, request: str, memory: dict = OrderedDict()) -> List:
        self.wrap_request(request, memory)
        sub_tasks = self.decomposer(request)
        solutions = self.tog(request, sub_tasks)
        print(f"solutions: {solutions}")
        return sub_tasks, solutions

    def plan(self, task, memory: dict = OrderedDict()) -> List:
        if self.streaming:
            return self.solve_streaming(task, memory)
        else:
            return self.solve(task, memory)

    def _check_solutions(self, solution: List | str) -> bool:
        if isinstance(solution, str):
            solution = json.loads(solution)
        if len(solution) == 0:
            return False

        valid = True
        for i, stage_candidate in enumerate(solution):
            if len(stage_candidate) == 0:
                logger.error(f"No solution is found in {i}-th subtask.")
                valid = False
            elif (
                "solution" in stage_candidate[0]
                and len(stage_candidate[0]["solution"]) == 0
            ):
                logger.error(f"No solution is found in {i+1}-th subtask.")
                valid = False
            else:
                logger.info(f"Solutions for {i+1}-th subtask:\n{stage_candidate}")
        return valid

    def parse(self, solution: List | str) -> List[List[Action]]:
        if isinstance(solution, str):
            solution = json.loads(solution)

        if not self._check_solutions(solution):
            return None

        if isinstance(solution[0], Action):
            return solution

        stages = []
        for i, stage_candidate in enumerate(solution):
            stage = stage_candidate[0]["solution"]
            actions = []
            for action in stage:
                inputs = {arg["name"]: arg["value"] for arg in action["args"]}
                outputs = [r["value"] for r in action["returns"]]
                actions.append(
                    Action(action["tool_name"], inputs=inputs, outputs=outputs)
                )
            stages.append(actions)
        return stages

    def __call__(
        self, request: str, memory: dict = OrderedDict()
    ) -> List[List[Action]]:
        solution = self.solve(request, memory)
        return self.parse(solution)
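A small note (not part of the diff) on the resource memory this planner consumes; the file name and type string are illustrative:

    from collections import OrderedDict

    # memory maps produced file names to their DataType string
    memory = OrderedDict({"a1b2c3_IMAGE.png": "DataType.IMAGE"})
    # wrap_request prepends a compact resource list to the user request, e.g.
    #   "Resource list: {'a1b2c3_IMAGE.png': 'IMAGE'}\n<user request>"
    # parse() then expects, per subtask, a list of candidates whose first entry carries a
    # "solution" list of {"tool_name": ..., "args": [...], "returns": [...]} dicts.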
cllm/agents/tog/responser.py
ADDED
@@ -0,0 +1,66 @@
import openai
import logging
import os
import sys

sys.path.append(os.getcwd())
from cllm.services.nlp.llms.chat_models import ChatOpenAI

# from cllm.services.nlp.llms.memory import MessageMemory
from langchain.schema import SystemMessage

from cllm.agents.base import DataType
from cllm.agents import container


RESPONSE_GENERATION_PROMPT = """Your name is ControlLLM, an AI-powered assistant developed by OpenGVLab from Shanghai Artificial Intelligence Laboratory. For the user's request, the system executes the solution and collects the results based on the following workflow. You need to respond to user requests based on the following information.
Here is the information for your reference.

## User Request
{}

## Workflow and Execution Results
{}

Now you should pay attention to the collected results. You first must answer the user's request in a straightforward manner. Then you need to summarize the workflow and intermediate results in a friendly way. Some of the results may not be accurate and need you to use your judgement in making decisions. If the results contain file names, you have to output the file name directly. Only if there is nothing returned by the tools should you tell the user you cannot finish the task. Now, please summarize the results in a friendly manner and answer the question for the user request `{}`.
""".strip()


SIMPLE_RESPONSE_GENERATION_PROMPT = """Your name is ControlLLM, an AI-powered assistant developed by OpenGVLab from Shanghai Artificial Intelligence Laboratory. You need to respond to user requests based on the following information.
Here is the information for your reference.

## User Request
{}

## Workflow and Execution Results
{}

Now, please summarize the results in a friendly manner and answer the question for the user request `{}`.
""".strip()

logger = logging.getLogger(__name__)


def generate_response(user_input, solution, output_files):
    if (
        len(solution) <= 1
        and len(solution[0]) <= 1
        and solution[0][0].tool_name == "question_answering"
    ):
        content = SIMPLE_RESPONSE_GENERATION_PROMPT.format(
            user_input, solution, user_input
        )
    else:
        content = RESPONSE_GENERATION_PROMPT.format(user_input, solution, user_input)

    logger.info("##### Response Generation #####")
    logger.info(content)

    chat = ChatOpenAI(model_name="gpt-3.5-turbo-1106")
    messages = [SystemMessage(content=content)]
    output = chat(messages)
    logger.info(output)

    # files = [output for output in output_files if isinstance(output, container.File)]
    # return [container.Text('Response', DataType.TEXT, output)] + files
    return [container.Text("Response", DataType.TEXT, output)]
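For reference (not part of the diff), the arguments this function receives from the controller look roughly like the following; the values are illustrative:

    # solution: the executed plan, a list of stages, each a list of Action objects
    # output_files: container objects produced by the interpretor, e.g.
    #   [container.Text("answer", DataType.TEXT, "NER locates and classifies named entities ...")]
    # generate_response(user_request, solution, output_files)
    #   -> [container.Text("Response", DataType.TEXT, "<LLM-written summary>")]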
cllm/services/audio/__init__.py
ADDED
File without changes
cllm/services/audio/api.py
ADDED
@@ -0,0 +1,140 @@
import io
import os
import uuid
import requests

from cllm.services.nlp.api import openai_chat_model
from cllm.services.utils import get_bytes_value

__ALL__ = [
    "audio_classification",
    "automatic_speech_recognition",
    "text_to_speech",
]


HOST = os.environ.get("CLLM_SERVICES_HOST", "localhost")
PORT = os.environ.get("CLLM_SERVICES_PORT", 10056)


def setup(host="localhost", port=10057):
    global HOST, PORT
    HOST = host
    PORT = port


def audio_classification(audio, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/audio_classification"
    if isinstance(audio, str):
        audio = open(audio, "rb").read()
    files = {"audio": (audio, get_bytes_value(audio))}
    response = requests.post(url, files=files)
    return response.json()


def automatic_speech_recognition(audio: str, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/automatic_speech_recognition"
    # audio_file = open(audio, "rb")
    files = {"audio": (audio, get_bytes_value(audio))}
    response = requests.post(url, files=files)
    return response.json()


def text_to_speech(text: str, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    human_msg = f"""Your task is to extract the prompt from the input. Here are examples:

Input:
translate the text into speech: \"Hope is the thing with feathers That perches in the soul, And sings the tune without the words, And never stops at all\"

Answer:
Hope is the thing with feathers That perches in the soul, And sings the tune without the words, And never stops at all

Input:
Can you help me transcribe the text into audio: I have a dream that one day this nation will rise up and live out the true meaning of its creed: We hold these truths to be self-evident, that all men are created equal.I have a dream that one day on the red hills of Georgia, the sons of former slaves and the sons of former slave owners will be able to sit down together at the table of brotherhood. I have a dream that one day even the state of Mississippi, a state sweltering with the heat of injustice, sweltering with the heat of oppression, will be transformed into an oasis of freedom and justice. I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character.

Answer:
I have a dream that one day this nation will rise up and live out the true meaning of its creed: We hold these truths to be self-evident, that all men are created equal.I have a dream that one day on the red hills of Georgia, the sons of former slaves and the sons of former slave owners will be able to sit down together at the table of brotherhood. I have a dream that one day even the state of Mississippi, a state sweltering with the heat of injustice, sweltering with the heat of oppression, will be transformed into an oasis of freedom and justice. I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character.

Input:
Create speech using the text: And so, my fellow Americans: ask not what your country can do for you — ask what you can do for your country.

Answer:
And so, my fellow Americans: ask not what your country can do for you — ask what you can do for your country.

Input:
The image features a large brown and white dog standing on a tree stump, accompanied by a small cat. The dog is positioned on the right side of the stump, while the cat is on the left side. Both animals appear to be looking at the camera, creating a captivating scene.\n\nThe dog and cat are the main focus of the image, with the dog being larger and more prominent, while the cat is smaller and positioned closer to the ground. The tree stump serves as a natural and interesting backdrop for the two animals, making the scene unique and engaging.

Answer:
The image features a large brown and white dog standing on a tree stump, accompanied by a small cat. The dog is positioned on the right side of the stump, while the cat is on the left side. Both animals appear to be looking at the camera, creating a captivating scene.\n\nThe dog and cat are the main focus of the image, with the dog being larger and more prominent, while the cat is smaller and positioned closer to the ground. The tree stump serves as a natural and interesting backdrop for the two animals, making the scene unique and engaging.

Input:
Life, thin and light-off time and time again\nFrivolous tireless\nI heard the echo, from the valleys and the heart\nOpen to the lonely soul of sickle harvesting\nRepeat outrightly, but also repeat the well-being of eventually swaying in the desert oasis\nI believe I am\nBorn as the bright summer flowers\nDo not withered undefeated fiery demon rule\nHeart rate and breathing to bear the load of the cumbersome Bored\nI heard the music, from the moon and carcass\nAuxiliary extreme aestheticism bait to capture misty\nFilling the intense life, but also filling the pure\nThere are always memories throughout the earth

Answer:
Life, thin and light-off time and time again\nFrivolous tireless\nI heard the echo, from the valleys and the heart\nOpen to the lonely soul of sickle harvesting\nRepeat outrightly, but also repeat the well-being of eventually swaying in the desert oasis\nI believe I am\nBorn as the bright summer flowers\nDo not withered undefeated fiery demon rule\nHeart rate and breathing to bear the load of the cumbersome Bored\nI heard the music, from the moon and carcass\nAuxiliary extreme aestheticism bait to capture misty\nFilling the intense life, but also filling the pure\nThere are always memories throughout the earth

Input:
{text}

Answer:
"""
    extracted_prompt = openai_chat_model(human_msg)
    print(f"extracted_prompt: {extracted_prompt}")
    url = f"http://{host}:{port}/text_to_speech"
    data = {"text": extracted_prompt}
    response = requests.post(url, data=data)
    return response.content


def text_to_music(text: str, **kwargs):
    # print('a' * 40)
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    human_msg = f"""Your task is to extract the prompt from the input. Here are examples:

Input:
Please generate a piece of music based on given prompt. Here is the prompt: An 80s driving pop song with heavy drums

Answer:
An 80s driving pop song with heavy drums

Input:
I would like you to provide me with a new song that represents an energetic and lively 80s pop track with prominent drums and synthesizer pads

Answer:
an energetic and lively 80s pop track with prominent drums and synthesizer pads

Input:
I'm looking for a song that has a driving pop vibe from the 80s, with heavy drums and synth pads playing in the background

Answer:
a driving pop vibe from the 80s, with heavy drums and synth pads playing in the background

Input:
Can you make a song that has a lively and energetic rhythm with prominent drums and electronic keyboard sounds in the background

Answer:
a lively and energetic rhythm with prominent drums and electronic keyboard sounds in the background

Input:
Can you make a piece of light and relaxing music

Answer:
a piece of light and relaxing music

Input:
{text}

Answer:
"""
    extracted_prompt = openai_chat_model(human_msg)
    url = f"http://{host}:{port}/text_to_music"
    data = {"text": extracted_prompt}
    response = requests.post(url, data=data)
    return response.content
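A quick sketch (not part of the diff) of calling these client wrappers, assuming the audio service (and the OpenAI-backed prompt extraction) is reachable; file names are illustrative:

    from cllm.services.audio import api as audio_api

    audio_api.setup(host="localhost", port=10056)    # point the client at the running service
    wav_bytes = audio_api.text_to_speech("read this sentence aloud: hello world")
    with open("speech.wav", "wb") as f:
        f.write(wav_bytes)

    transcript = audio_api.automatic_speech_recognition("speech.wav")   # the service's JSON result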
cllm/services/general/__init__.py
ADDED
File without changes
cllm/services/general/api.py
ADDED
@@ -0,0 +1,65 @@
from typing import List
from pathlib import Path
import os
import requests

__ALL__ = ["remote_logging", "select", "count"]

HOST = os.environ.get("CLLM_SERVICES_HOST", "localhost")
PORT = os.environ.get("CLLM_SERVICES_PORT", 10056)


def setup(host="localhost", port=10056):
    global HOST, PORT
    HOST = host
    PORT = port


def select(**kwargs):
    if "bbox_list" in kwargs:
        items = kwargs["bbox_list"]
        condition = kwargs["condition"]
        return [l for l in items if l["label"] == condition]
    if "mask_list" in kwargs:
        items = kwargs["mask_list"]
        condition = kwargs["condition"]
        # return combine_masks([l for l in items if l['label'] == condition])
        return [l for l in items if l["label"] == condition]
    if "category_list" in kwargs:
        items = kwargs["category_list"]
        condition = kwargs["condition"]
        # return combine_masks([l for l in items if l['label'] == condition])
        return [l for l in items if l["label"] == condition]


def count(**kwargs):
    len_of_list = 0
    if "bbox_list" in kwargs:
        len_of_list = len(kwargs["bbox_list"])
    elif "mask_list" in kwargs:
        len_of_list = len(kwargs["mask_list"])

    return f"The length of the given list is {len_of_list}"


def remote_logging(
    history_msgs: list,
    task_decomposition: list,
    solution: list,
    record: str,
    like: bool,
    **kwargs,
):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/remote_logging"
    data = {
        "history_msgs": history_msgs,
        "task_decomposition": task_decomposition,
        "solution": solution,
        "record": record,
        "like": like,
    }
    response = requests.post(url, data=data)
    return response.content
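A tiny sketch (not part of the diff) of how `select` and `count` filter detection results; the bbox dicts follow the format documented in image_cropping below:

    from cllm.services.general.api import select, count

    bboxes = [
        {"score": 0.99, "label": "dog", "box": {"xmin": 10, "ymin": 20, "xmax": 200, "ymax": 300}},
        {"score": 0.97, "label": "cat", "box": {"xmin": 220, "ymin": 40, "xmax": 380, "ymax": 280}},
    ]
    dogs = select(bbox_list=bboxes, condition="dog")   # keeps only the "dog" entries
    message = count(bbox_list=dogs)                    # -> "The length of the given list is 1"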
cllm/services/image_editing/__init__.py
ADDED
File without changes
cllm/services/image_editing/api.py
ADDED
@@ -0,0 +1,277 @@
import copy
import io
import os
from PIL import Image, ImageDraw, ImageChops
import numpy as np
import requests
from PIL import Image
from typing import List, Union
from pathlib import Path
import os
import sys

sys.path.append(os.getcwd())
from cllm.services.utils import get_bytes_value
from cllm.utils import get_real_path
from cllm.services.nlp.api import openai_chat_model

__ALL__ = [
    "instruct_pix2pix",
    "image_cropping",
    "image_matting",
    "draw_bbox_on_image",
    "partial_image_editing",
]


HOST = os.environ.get("CLLM_SERVICES_HOST", "localhost")
PORT = os.environ.get("CLLM_SERVICES_PORT", 10056)


def setup(host="localhost", port=10049):
    global HOST, PORT
    HOST = host
    PORT = port


def image_cropping(image: str | Path, object: List[dict], **kwargs):
    """
    bbox format: {'score': 0.997, 'label': 'bird', 'box': {'xmin': 69, 'ymin': 171, 'xmax': 396, 'ymax': 507}}
    """
    if object in [None, b"", []]:
        return None

    if isinstance(image, (str, Path)):
        image = Image.open(get_real_path(image)).convert("RGB")
    elif isinstance(image, bytes):
        image = Image.open(io.BytesIO(image)).convert("RGB")
    w, h = image.size
    cropped_images = []
    for box in object:
        box = copy.deepcopy(box["box"])
        box = unify_bbox(box, w, h)
        (left, upper, right, lower) = (
            box["xmin"],
            box["ymin"],
            box["xmax"],
            box["ymax"],
        )
        cropped_image = image.crop((left, upper, right, lower))
        # cropped_image.save('test.png')
        img_stream = io.BytesIO()
        cropped_image.save(img_stream, format="png")
        img_stream.seek(0)
        cropped_images.append(img_stream.getvalue())
    if len(cropped_images) == 0:
        return None
    return cropped_images


def image_matting(image: str | Path, mask: Union[str, bytes, List], **kwargs):
    """
    {'score': 0.999025,
    'label': 'person',
    'mask': <PIL.Image.Image image mode=L size=386x384>}
    """
    if mask in [None, b"", []]:
        return None
    image = Image.open(get_bytes_value(image)).convert("RGB")

    mask = copy.deepcopy(mask)
    if isinstance(mask, List):
        mask_list = []
        for m in mask:
            if isinstance(m, dict):
                mask_list.append(get_bytes_value(m["mask"]))
            else:
                mask_list.append(get_bytes_value(m))
        mask = combine_masks(mask_list)
    elif isinstance(mask, str):
        mask = get_bytes_value(mask)

    mask = Image.open(mask).convert("L")

    mask = np.array(mask) > 0
    image = np.array(image)
    image = image * np.expand_dims(mask, -1)
    image = Image.fromarray(image.astype(np.uint8))
    img_stream = io.BytesIO()
    image.save(img_stream, format="png")
    img_stream.seek(0)
    return img_stream.getvalue()


def unify_bbox(bbox, w, h):
    bbox["xmin"] = (
        bbox["xmin"] if isinstance(bbox["xmin"], int) else int(bbox["xmin"] * w)
    )

    bbox["ymin"] = (
        bbox["ymin"] if isinstance(bbox["ymin"], int) else int(bbox["ymin"] * h)
    )
    bbox["xmax"] = (
        bbox["xmax"] if isinstance(bbox["xmax"], int) else int(bbox["xmax"] * w)
    )
    bbox["ymax"] = (
        bbox["ymax"] if isinstance(bbox["ymax"], int) else int(bbox["ymax"] * h)
    )
    return bbox


def draw_bbox_on_image(image: str | Path, bbox: list, **kwargs):
    if isinstance(image, (str, Path)):
        image = Image.open(get_real_path(image)).convert("RGB")
    elif isinstance(image, bytes):
        image = Image.open(io.BytesIO(image)).convert("RGB")
    image = image.copy()
    w, h = image.size
    for box in bbox:
        box = copy.deepcopy(box["box"])
        box = unify_bbox(box, w, h)
        (left, upper, right, lower) = (
            box["xmin"],
            box["ymin"],
            box["xmax"],
            box["ymax"],
        )
        draw = ImageDraw.Draw(image)
        font_width = int(
            min(box["xmax"] - box["xmin"], box["ymax"] - box["ymin"]) * 0.01
        )
        draw.rectangle(((left, upper), (right, lower)), outline="Red", width=font_width)
    img_stream = io.BytesIO()
    image.save(img_stream, format="png")
    img_stream.seek(0)
    # image = Image.save(image, format='png')
    return img_stream.getvalue()


def _imagetext2image(image, text, endpoint, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/{endpoint}"
    data = {"text": text}
    files = {"image": (image, get_bytes_value(image))}
    response = requests.post(url, files=files, data=data)
    return response.content


def instruct_pix2pix(image, text, **kwargs):
    return _imagetext2image(image, text, endpoint="instruct_pix2pix", **kwargs)


def partial_image_editing(
    image: str | bytes, mask: str | list | bytes, prompt: str, **kwargs
):
    if mask in [None, b"", []]:
        return None

    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/partial_image_editing"
    human_msg = f"""Your task is to extract the prompt from the input. Here are examples:

Input:
Replace the masked object in the given image with a yellow horse

Answer:
a yellow horse

Input:
Use the c1s5af_mask.png to replace the object with a man in the image

Answer:
a man

Input:
Modify the given image by replacing the object indicated in the mask with a bouquet of flowers

Answer:
with a bouquet of flowers

Input:
Use the 7a3c72_mask.png file to replace the object in the a9430b_image.png with a bus colored yellow and red with the number 5 on its front sign

Answer:
a bus colored yellow and red with the number 5 on its front sign.

Input:
Replace the masked area in image with a fat boy wearing a black jacket.

Answer:
a fat boy wearing a black jacket

Input:
{prompt}

Answer:
"""
    extracted_prompt = openai_chat_model(human_msg)
    data = {"prompt": extracted_prompt}
    if isinstance(mask, List):
        mask_list = []
        for m in mask:
            if isinstance(m, dict):
                mask_list.append(get_bytes_value(m["mask"]))
            else:
                mask_list.append(get_bytes_value(m))
        mask = combine_masks(mask_list)

    files = {
        "image": (image, get_bytes_value(image)),
        "mask": ("mask", get_bytes_value(mask)),
    }
    response = requests.post(url, files=files, data=data)
    return response.content


def combine_masks(mask_images):
    if mask_images is None or len(mask_images) == 0:
        return None

    # Create a new blank image to store the combined mask
    combined_mask = Image.open(io.BytesIO(mask_images[0])).convert("1")

    # Iterate through each mask image and combine them
    for mask_image in mask_images:
        mask = Image.open(io.BytesIO(mask_image)).convert("1")
        combined_mask = ImageChops.logical_or(combined_mask, mask)
    stream = io.BytesIO()
    combined_mask.save(stream, "png")
    stream.seek(0)
    # return {"label": mask_images[0]["label"], "mask": stream.getvalue()}
    return stream.getvalue()


def inpainting_ldm_general(image, mask: Union[str, bytes, List], **kwargs):
    if mask in [None, b"", []]:
        return get_bytes_value(image)

    mask = copy.deepcopy(mask)
    if isinstance(mask, List):
        mask_list = []
        for m in mask:
            if isinstance(m, dict):
                mask_list.append(get_bytes_value(m["mask"]))
            else:
                mask_list.append(get_bytes_value(m))
        mask = combine_masks(mask_list)
    elif isinstance(mask, str):
        mask = get_bytes_value(mask)
    # mask = Image.open(mask).convert("1")

    return inpainting_ldm(image, mask, **kwargs)


def inpainting_ldm(image, mask, **kwargs):
    if mask in [None, b""]:
        return get_bytes_value(image)

    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/inpainting_ldm"
    files = {
        "image": (image, get_bytes_value(image)),
        "mask": get_bytes_value(mask),
    }
    response = requests.post(url, files=files)
    return response.content
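A short sketch (not part of the diff) chaining a detection-style bbox list into the local helpers above, assuming the image path resolves via get_real_path; names are illustrative:

    from cllm.services.image_editing.api import draw_bbox_on_image, image_cropping

    boxes = [{"score": 0.99, "label": "bird", "box": {"xmin": 69, "ymin": 171, "xmax": 396, "ymax": 507}}]
    png_bytes = draw_bbox_on_image("bird_image.png", boxes)   # red rectangles drawn on a copy
    with open("annotated.png", "wb") as f:
        f.write(png_bytes)

    crops = image_cropping("bird_image.png", boxes)           # list of PNG byte strings, one per box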
cllm/services/image_generation/__init__.py
ADDED
File without changes
cllm/services/image_generation/api.py
ADDED
@@ -0,0 +1,96 @@
import io
import os
import sys

import requests

sys.path.append(os.getcwd())
from PIL import Image
from cllm.services.utils import get_bytes_value


__ALL__ = [
    "text2image",
    "cannytext2image",
    "linetext2image",
    "hedtext2image",
    "scribbletext2image",
    "posetext2image",
    "segtext2image",
    "depthtext2image",
    "normaltext2image",
    "image2image",
]


HOST = os.environ.get("CLLM_SERVICES_HOST", "localhost")
PORT = os.environ.get("CLLM_SERVICES_PORT", 10056)


def setup(host="localhost", port=10049):
    global HOST, PORT
    HOST = host
    PORT = port


def text2image(text, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/text2image"
    data = {"text": text}
    response = requests.post(url, data=data)
    return response.content


def image2image(image, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/image2image"
    files = {"image": (image, get_bytes_value(image))}
    response = requests.post(url, files=files)
    return response.content


def _imagetext2image(image, text, endpoint, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/{endpoint}"
    data = {"text": text}
    files = {"image": (image, get_bytes_value(image))}
    response = requests.post(url, files=files, data=data)
    return response.content


def cannytext2image(edge, text, **kwargs):
    return _imagetext2image(edge, text, endpoint="cannytext2image", **kwargs)


def linetext2image(line, text, **kwargs):
    return _imagetext2image(line, text, endpoint="linetext2image", **kwargs)


def hedtext2image(hed, text, **kwargs):
    return _imagetext2image(hed, text, endpoint="hedtext2image", **kwargs)


def scribbletext2image(scribble, text, **kwargs):
    return _imagetext2image(scribble, text, endpoint="scribbletext2image", **kwargs)


def posetext2image(pose, text, **kwargs):
    return _imagetext2image(pose, text, endpoint="posetext2image", **kwargs)


def segtext2image(segmentation, text, **kwargs):
    return _imagetext2image(segmentation, text, endpoint="segtext2image", **kwargs)


def depthtext2image(depth, text, **kwargs):
    return _imagetext2image(depth, text, endpoint="depthtext2image", **kwargs)


def normaltext2image(normal, text, **kwargs):
    return _imagetext2image(normal, text, endpoint="normaltext2image", **kwargs)
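A minimal usage sketch for this module (illustrative only; it assumes the generation service is reachable at the default CLLM_SERVICES_HOST/PORT and that the chosen output filename is arbitrary):

from cllm.services.image_generation.api import text2image

png_bytes = text2image("a watercolor painting of a lighthouse at dusk")
with open("lighthouse.png", "wb") as f:   # hypothetical output path
    f.write(png_bytes)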
cllm/services/image_inpainting/__init__.py
ADDED
File without changes
cllm/services/image_inpainting/api.py
ADDED
@@ -0,0 +1,76 @@
import copy
import io
import os
import sys
from typing import Union, List, Dict

import requests
from PIL import Image, ImageChops

sys.path.append(os.getcwd())
from cllm.services.utils import get_bytes_value

__ALL__ = [
    "inpainting_ldm",
]


HOST = os.environ.get("CLLM_SERVICES_HOST", "localhost")
PORT = os.environ.get("CLLM_SERVICES_PORT", 10056)


def setup(host="localhost", port=10052):
    global HOST, PORT
    HOST = host
    PORT = port


def combine_masks(mask_images):
    if mask_images is None or len(mask_images) == 0:
        return None

    # Create a new blank image to store the combined mask
    combined_mask = Image.open(io.BytesIO(mask_images[0])).convert("1")

    # Iterate through each mask image and merge it into the combined mask
    for mask_image in mask_images:
        mask = Image.open(io.BytesIO(mask_image)).convert("1")
        combined_mask = ImageChops.logical_or(combined_mask, mask)
    stream = io.BytesIO()
    combined_mask.save(stream, "png")
    stream.seek(0)
    return stream.getvalue()


def inpainting_ldm_general(image, mask: Union[bytes, List], **kwargs):
    if mask in [None, b"", []]:
        return get_bytes_value(image)

    mask = copy.deepcopy(mask)
    if isinstance(mask, List):
        mask_list = []
        for m in mask:
            if isinstance(m, dict):
                mask_list.append(get_bytes_value(m["mask"]))
            else:
                mask_list.append(get_bytes_value(m))
        mask = combine_masks(mask_list)

    return inpainting_ldm(image, mask, **kwargs)


def inpainting_ldm(image, mask, **kwargs):
    if mask in [None, b""]:
        return get_bytes_value(image)

    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/inpainting_ldm"
    files = {
        "image": (image, get_bytes_value(image)),
        "mask": get_bytes_value(mask),
    }
    response = requests.post(url, files=files)
    return response.content
cllm/services/image_perception/__init__.py
ADDED
File without changes
cllm/services/image_perception/api.py
ADDED
@@ -0,0 +1,202 @@
import codecs
import io
import os
import pickle
import sys
from pathlib import Path

import requests
from PIL import Image

sys.path.append(os.getcwd())
from cllm.services.utils import get_bytes_value
from cllm.services.nlp.api import openai_chat_model

__ALL__ = [
    "object_detection",
    "image_classification",
    "ocr",
    "image_to_text",
    "segment_objects",
]


HOST = os.environ.get("CLLM_SERVICES_HOST", "localhost")
PORT = os.environ.get("CLLM_SERVICES_PORT", 10056)


def setup(host="localhost", port=10049):
    global HOST, PORT
    HOST = host
    PORT = port


def object_detection(image, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/object_detection"
    files = {"image": (image, get_bytes_value(image))}
    response = requests.post(url, files=files)
    return response.json()


def image_classification(image, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/image_classification"
    files = {"image": (image, get_bytes_value(image))}
    response = requests.post(url, files=files)
    return response.json()


def image_to_text(image, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/image_to_text"
    files = {"image": (image, get_bytes_value(image))}
    response = requests.post(url, files=files)
    return response.json()


def ocr(image, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/ocr"
    files = {"image": (image, get_bytes_value(image))}
    response = requests.post(url, files=files)
    return response.json()


def segment_objects(image, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/segment_objects"
    files = {"image": (image, get_bytes_value(image))}
    response = requests.post(url, files=files)
    pickled = response.json()["data"]
    output = pickle.loads(codecs.decode(pickled.encode(), "base64"))
    for o in output:
        stream = io.BytesIO()
        o["mask"].save(stream, format="png")
        stream.seek(0)
        o["mask"] = stream.getvalue()

    return output


def visual_grounding(image, query, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = rf"http://{host}:{port}/visual_grounding"
    human_msg = f"""Your task is to extract the prompt from the input. Here are some examples:

Input:
find the region of interest in the da9619_image.png: "An elephant in right corner"

Answer:
An elephant in right corner

Input:
locate "A maintenance vehicle on a railway" in the image

Answer:
A maintenance vehicle on a railway

Input:
use visual grounding method to detect the region of interest in the 1ba6e2_image.png: "The motorcycle with the rainbow flag"

Answer:
The motorcycle with the rainbow flag

Input:
for given image, find "A little baby girl with brunette hair, a pink and white dress, and is being fed frosting from her mom."

Answer:
A little baby girl with brunette hair, a pink and white dress, and is being fed frosting from her mom

Input:
find the policeman on the motorcycle in the 851522_image.png

Answer:
the policeman on the motorcycle

Input:
The legs of a zebra shown under the neck of another zebra.

Answer:
The legs of a zebra shown under the neck of another zebra.

Input:
{query}

Answer:
"""

    extracted_prompt = openai_chat_model(human_msg)
    files = {"image": get_bytes_value(image)}
    data = {"query": extracted_prompt}
    response = requests.post(url, data=data, files=files)

    return response.json()


def image_captioning(image, endpoint="llava", **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/{endpoint}"
    data = None
    if endpoint == "llava":
        data = {"text": "Please describe the image in details."}
    files = {"image": (image, get_bytes_value(image))}
    response = requests.post(url, files=files, data=data)
    return response.content.decode("utf-8")


def segment_all(image: str | Path, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/segment_all"
    files = {"image": (image, get_bytes_value(image))}
    response = requests.post(url, files=files)
    return response.content


def set_image(image: str | Path, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/set_image"
    files = {"image": (image, get_bytes_value(image))}
    response = requests.post(url, files=files)
    return response.content.decode()


def segment_by_mask(mask: str | Path, image_id: str, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/segment_by_mask"
    data = {"image_id": image_id}
    files = {"mask": (mask, get_bytes_value(mask))}
    response = requests.post(url, files=files, data=data)
    return response.content


def segment_by_points(points: list | tuple | str, image_id: str, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/segment_by_points"
    data = {"points": points, "image_id": image_id}
    response = requests.post(url, data=data)
    return response.content


def seg_by_mask(image, prompt_mask, **kwargs):
    image_id = set_image(image)
    mask = segment_by_mask(mask=prompt_mask, image_id=image_id)
    return mask


def seg_by_points(image, prompt_points, **kwargs):
    image_id = set_image(image)
    mask = segment_by_points(points=prompt_points, image_id=image_id)
    return mask
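A usage sketch for the point-prompted segmentation flow above (illustrative only; it assumes the perception service is running, that street.png exists locally, and that the point encoding shown is accepted by the backend, which ultimately defines the exact format):

from cllm.services.image_perception.api import set_image, segment_by_points

# Register the image with the segmentation service, then prompt it with one point.
image_id = set_image("street.png")
mask_png = segment_by_points(points="[[320, 240]]", image_id=image_id)  # point format is an assumption
with open("mask.png", "wb") as f:
    f.write(mask_png)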
cllm/services/image_processing/__init__.py
ADDED
File without changes
cllm/services/image_processing/api.py
ADDED
@@ -0,0 +1,63 @@
import io
import os

import requests
from PIL import Image
from cllm.services.utils import get_bytes_value

__ALL__ = [
    "image2canny",
    "image2line",
    "image2hed",
    "image2scribble",
    "image2pose",
    "image2depth",
    "image2normal",
]


HOST = os.environ.get("CLLM_SERVICES_HOST", "localhost")
PORT = os.environ.get("CLLM_SERVICES_PORT", 10056)


def setup(host="localhost", port=10049):
    global HOST, PORT
    HOST = host
    PORT = port


def image2anything(image: Image, endpoint="image2line", **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/{endpoint}"
    files = {"image": (image, get_bytes_value(image))}
    response = requests.post(url, files=files)
    return response.content


def image2canny(image: Image, **kwargs):
    return image2anything(image, endpoint="image2canny", **kwargs)


def image2line(image: Image, **kwargs):
    return image2anything(image, endpoint="image2line", **kwargs)


def image2hed(image: Image, **kwargs):
    return image2anything(image, endpoint="image2hed", **kwargs)


def image2scribble(image: Image, **kwargs):
    return image2anything(image, endpoint="image2scribble", **kwargs)


def image2pose(image: Image, **kwargs):
    return image2anything(image, endpoint="image2pose", **kwargs)


def image2depth(image: Image, **kwargs):
    return image2anything(image, endpoint="image2depth", **kwargs)


def image2normal(image: Image, **kwargs):
    return image2anything(image, endpoint="image2normal", **kwargs)
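An illustrative sketch of chaining these processing helpers with the conditional generation endpoints from image_generation/api.py (assumptions: both services are running on the default host/port and room.png is a local file):

from cllm.services.image_processing.api import image2canny
from cllm.services.image_generation.api import cannytext2image

edges = image2canny("room.png")  # PNG bytes of the Canny edge map
styled = cannytext2image(edges, "a cozy cabin interior, warm lighting")
with open("room_restyled.png", "wb") as f:
    f.write(styled)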
cllm/services/nlp/__init__.py
ADDED
File without changes
cllm/services/nlp/api.py
ADDED
@@ -0,0 +1,163 @@
import io
import os
import time
import json

import requests
from .llms.chat_models import ChatOpenAI
from langchain.schema import (
    HumanMessage,
    SystemMessage,
    AIMessage,
)
from typing import (
    TYPE_CHECKING,
    Any,
    AsyncIterator,
    Callable,
    Dict,
    Iterator,
    List,
    Mapping,
    Optional,
    Tuple,
    Type,
    Union,
)

__ALL__ = [
    "text_to_text_generation",
    "title_generation",
    "text_to_tags",
    "question_answering",
    "summarization",
]


HOST = os.environ.get("CLLM_SERVICES_HOST", "localhost")
PORT = os.environ.get("CLLM_SERVICES_PORT", 10056)


def setup(host="localhost", port=10056):
    global HOST, PORT
    HOST = host
    PORT = port


def text_to_text_generation(text: str, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/text_to_text_generation"
    data = {"text": text}
    response = requests.post(url, data=data)
    return response.json()


def question_answering_with_context(context: str, question: str, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/question_answering_with_context"
    data = {"context": context, "question": question}
    response = requests.post(url, data=data)
    return response.json()


def openai_chat_model(input_msg: str, **kwargs):
    chat = ChatOpenAI()
    chat_log = []
    default_sys_msg = "Your name is ControlLLM, an AI-powered assistant developed by OpenGVLab from Shanghai AI Lab. You need to respond to user requests based on the following information."
    sys_msg = kwargs.get("sys_msg", default_sys_msg)
    if sys_msg is not None:
        chat_log.append(SystemMessage(content=sys_msg))
    # history_msgs is a list of [user_turn, assistant_turn] pairs
    history_msgs = []
    if "history_msgs" in kwargs:
        history_msgs = kwargs.get("history_msgs", [])

    for item in history_msgs:
        if isinstance(item[0], (list, tuple)):
            item[0] = "Received file: " + item[0][0]
        if isinstance(item[1], (list, tuple)):
            item[1] = "Generated file: " + item[1][0]
        if item[0] is not None:
            chat_log.append(HumanMessage(content=item[0]))
        if item[1] is not None:
            chat_log.append(AIMessage(content=item[1]))
    if not isinstance(input_msg, str):
        input_msg = json.dumps(input_msg, ensure_ascii=False)
    output = chat(chat_log + [HumanMessage(content=input_msg)])
    return output


def title_generation(text: str, **kwargs):
    question = "summarize"
    response = question_answering_with_context(text, question)
    return response


def summarization(text: str, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/summarization"
    data = {"text": text}
    response = requests.post(url, data=data)
    return response.json()


def text_to_tags(text: str, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/text_to_tags"
    data = {"text": text}
    response = requests.post(url, data=data)
    return response.json()


def get_time(location: str = None, **kwargs):
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())


def get_weather(location: str | list, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/get_weather"
    if isinstance(location, list):
        t = {"CITY": "", "COUNTRY": ""}
        for l in location:
            if l["entity_group"] not in t.keys():
                continue
            if t[l["entity_group"]] == "":
                t[l["entity_group"]] = l["word"].title()
        location = ",".join([t["CITY"], t["COUNTRY"]])

    data = {"location": location}
    response = requests.post(url, data=data)
    return response.json()


def summarize_weather_condition(weather: str | list, **kwargs):
    if isinstance(weather, list):
        weather = json.dumps(weather, ensure_ascii=False)
    result = openai_chat_model(
        f"Please summarize the following weather conditions so that they are easier for the user to understand: \n {weather}"
    )
    return result


def extract_location(text: str, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/extract_location"
    data = {"text": text}
    response = requests.post(url, data=data)
    return response.json()


def sentiment_analysis(text: str, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/sentiment_analysis"
    data = {"text": text}
    response = requests.post(url, data=data)
    return response.json()
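A minimal sketch of calling openai_chat_model with chat history (illustrative only; it assumes OPENAI_API_KEY is set in the environment and uses made-up conversation turns to show the expected [user, assistant] pair format):

from cllm.services.nlp.api import openai_chat_model

history = [
    ["What is the capital of France?", "Paris."],  # one [user turn, assistant turn] pair
]
reply = openai_chat_model(
    "And roughly how many people live there?",
    history_msgs=history,
    sys_msg="You are a concise assistant.",
)
print(reply)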
cllm/services/nlp/llms/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .chat_models import ChatOpenAI
from .memory import MessageMemory
cllm/services/nlp/llms/chat_models.py
ADDED
@@ -0,0 +1,219 @@
import os
import sys

import openai
import requests
from typing import (
    Any,
    Dict,
    List,
    Optional,
)
from langchain.schema import (
    AIMessage,
    BaseMessage,
    ChatMessage,
    HumanMessage,
    SystemMessage,
)
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.chat_models.base import SimpleChatModel

sys.path.append(os.getcwd())

from cllm.services.nlp.llms.memory import MessageMemory
from cllm.utils import timeout


class ChatOpenAI:
    def __init__(
        self,
        model_name: str = "gpt-3.5-turbo",
        temperature: float = 0.7,
        model_kwargs: Dict[str, Any] = dict(),
        openai_api_key: Optional[str] = None,
        openai_base_url: Optional[str] = None,
    ) -> None:
        self.model_name = model_name
        self.temperature = temperature
        self.model_kwargs = model_kwargs
        self.api_key = os.environ.get("OPENAI_API_KEY", openai_api_key)
        self.base_url = os.environ.get("OPENAI_BASE_URL", openai_base_url)

    def __call__(self, messages: List[BaseMessage], **kwargs):
        stream = kwargs.get("stream", False)
        context = MessageMemory(messages=messages)
        context.cut_memory(self.model_name)
        response = self.send_message(messages=context.to_dict(), stream=stream)
        return response

    def get_response(self, response):
        return response.choices[0].message.content

    def send_message(self, messages, stream=False):
        # Retry up to 10 times before giving up and returning the last error.
        cnt = 10
        while cnt > 0:
            try:
                result = self.get_response(
                    self._send_message(
                        model=self.model_name,
                        messages=messages,
                        temperature=self.temperature,
                        stream=stream,
                        timeout=5,
                    )
                )
                break
            except Exception as e:
                cnt -= 1
                print(e)
                result = e
        return result

    # @timeout(5)
    def _send_message(self, *args, **kwargs):
        return openai.chat.completions.create(*args, **kwargs)


class ChatLLAMA2(SimpleChatModel):
    """Wrapper around LLAMA2.

    To use, you should launch your local model as a web service.
    """

    client: Any = None  #: :meta private:
    endpoint: str = "http://localhost:10051"

    HUMAN_PROMPT = "user"
    AI_PROMPT = "assistant"

    @property
    def _llm_type(self) -> str:
        """Return type of chat model."""
        return "local-chat"

    def _call(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
    ) -> str:
        data = self._convert_messages_to_prompt(messages)
        response = requests.post(self.endpoint, json=data)
        return response.content.decode()

    def _convert_one_message_to_text(self, message: BaseMessage) -> dict:
        if isinstance(message, ChatMessage):
            message_text = {
                "role": message.role.capitalize(),
                "content": message.content,
            }
        elif isinstance(message, HumanMessage):
            message_text = {"role": self.HUMAN_PROMPT, "content": message.content}
        elif isinstance(message, AIMessage):
            message_text = {"role": self.AI_PROMPT, "content": message.content}
        elif isinstance(message, SystemMessage):
            message_text = {"role": "system", "content": message.content}
        else:
            raise ValueError(f"Got unknown type {message}")
        return message_text

    def _convert_messages_to_text(self, messages: List[BaseMessage]) -> list:
        """Convert a list of BaseMessage into a list of role/content dicts.

        Args:
            messages (List[BaseMessage]): List of BaseMessage to combine.

        Returns:
            list: role/content dicts ready to be sent to the local service.
        """
        return [self._convert_one_message_to_text(message) for message in messages]

    def _convert_messages_to_prompt(self, messages: List[BaseMessage]) -> list:
        """Format a list of messages into the full prompt payload for the LLAMA2 service.

        Args:
            messages (List[BaseMessage]): List of BaseMessage to combine.

        Returns:
            list: the payload with the HUMAN_PROMPT and AI_PROMPT roles applied.
        """
        return self._convert_messages_to_text(messages)


if __name__ == "__main__":
    chat = ChatOpenAI()
    msg = [
        SystemMessage(content="You are a helpful assistant."),
        HumanMessage(content="Hello!"),
    ]
    response = chat(msg)
    print(response)
cllm/services/nlp/llms/memory/__init__.py
ADDED
@@ -0,0 +1 @@
from .message_memory import MessageMemory
cllm/services/nlp/llms/memory/message_memory.py
ADDED
@@ -0,0 +1,131 @@
from typing import List, Optional, Dict
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage,
    BaseMessage,
)

from .utils import count_tokens, get_max_context_length


class MessageMemory:
    def __init__(
        self,
        max_tokens: int = -1,
        margin: int = 1500,
        messages: Optional[List[BaseMessage]] = None,
    ) -> None:
        self.max_tokens = max_tokens if max_tokens > 0 else 8e8
        self.margin = margin
        self.init_messages(messages)

    def reset(self) -> List[BaseMessage]:
        self.init_messages()
        return self.stored_messages

    def init_messages(self, messages=None) -> None:
        if messages is not None:
            self.stored_messages = messages
        else:
            self.stored_messages = []

    @classmethod
    def to_messages(cls, items: List[Dict]):
        messages = []
        for m in items:
            if (
                not isinstance(m, dict)
                or m.get("role", None) is None
                or m.get("role") not in ["user", "assistant", "system"]
            ):
                raise TypeError()

            if m["role"] == "system":
                messages.append(SystemMessage(content=m["content"]))
            elif m["role"] == "user":
                messages.append(HumanMessage(content=m["content"]))
            elif m["role"] == "assistant":
                messages.append(AIMessage(content=m["content"]))

        return messages

    def to_dict(self):
        messages = []
        for m in self.stored_messages:
            if not isinstance(m, BaseMessage) or m.type is None:
                raise TypeError()

            if isinstance(m, SystemMessage):
                messages.append({"role": "system", "content": m.content})
            elif isinstance(m, HumanMessage):
                messages.append({"role": "user", "content": m.content})
            elif isinstance(m, AIMessage):
                messages.append({"role": "assistant", "content": m.content})

        return messages

    def get_memory(self):
        return self.stored_messages

    def update_message(self, message: BaseMessage) -> List[BaseMessage]:
        self.stored_messages.append(message)
        return self.stored_messages

    def insert_messages(
        self, idx: int = 0, messages: List[BaseMessage] = None
    ) -> List[BaseMessage]:
        for m in messages[::-1]:
            self.stored_messages.insert(idx, m)
        return self.stored_messages

    @classmethod
    def messages2str(cls, history):
        history_text = ""
        for m in history:
            if isinstance(m, SystemMessage):
                history_text += "<system>: " + m.content + "\n"
            elif isinstance(m, HumanMessage):
                history_text += "<user>: " + m.content + "\n"
            elif isinstance(m, AIMessage):
                history_text += "<assistant>: " + m.content + "\n"
        return history_text

    def memory2str(self):
        return self.messages2str(self.stored_messages)

    def cut_memory(self, LLM_encoding: str):
        # Drop the oldest messages until the remaining history fits in the
        # model's context window with `margin` tokens to spare.
        start = 0
        while start <= len(self.stored_messages):
            history = self.stored_messages[start:]
            history_text = self.messages2str(history)
            num = count_tokens(LLM_encoding, history_text)
            max_tokens = min(self.max_tokens, get_max_context_length(LLM_encoding))
            if max_tokens - num > self.margin:
                self.stored_messages = self.stored_messages[start:]
                return self.stored_messages

            start += 1
        self.init_messages()
        return self.stored_messages


if __name__ == "__main__":
    import os

    os.environ["TIKTOKEN_CACHE_DIR"] = "/mnt/petrelfs/liuzhaoyang/workspace/tmp"
    messages = [
        SystemMessage(content="SystemMessage 1"),
        HumanMessage(content="Remember a = 5 * 4."),
        AIMessage(content="SystemMessage 2"),
        HumanMessage(content="what is the value of a?"),
    ] * 400
    print(SystemMessage(content="SystemMessage 1").content)
    print(len(messages))
    mem = MessageMemory(max_tokens=-1, messages=messages)
    messages = mem.cut_memory("gpt-3.5-turbo")
    print(len(messages))
cllm/services/nlp/llms/memory/utils.py
ADDED
@@ -0,0 +1,52 @@
import tiktoken
import os

os.environ["TIKTOKEN_CACHE_DIR"] = os.path.join(os.path.expanduser("~"), "tmp")

encodings = {
    "gpt-4": tiktoken.get_encoding("cl100k_base"),
    "gpt-4-32k": tiktoken.get_encoding("cl100k_base"),
    "gpt-3.5-turbo": tiktoken.get_encoding("cl100k_base"),
    "gpt-3.5-turbo-0301": tiktoken.get_encoding("cl100k_base"),
    "gpt-3.5-turbo-0613": tiktoken.get_encoding("cl100k_base"),
    "gpt-3.5-turbo-16k": tiktoken.get_encoding("cl100k_base"),
    "gpt-3.5-turbo-1106": tiktoken.get_encoding("cl100k_base"),
    "text-davinci-003": tiktoken.get_encoding("p50k_base"),
    "text-davinci-002": tiktoken.get_encoding("p50k_base"),
    "text-davinci-001": tiktoken.get_encoding("r50k_base"),
    "text-curie-001": tiktoken.get_encoding("r50k_base"),
    "text-babbage-001": tiktoken.get_encoding("r50k_base"),
    "text-ada-001": tiktoken.get_encoding("r50k_base"),
    "davinci": tiktoken.get_encoding("r50k_base"),
    "curie": tiktoken.get_encoding("r50k_base"),
    "babbage": tiktoken.get_encoding("r50k_base"),
    "ada": tiktoken.get_encoding("r50k_base"),
}

max_length = {
    "gpt-4": 8192,
    "gpt-4-32k": 32768,
    "gpt-3.5-turbo": 4096,
    "gpt-3.5-turbo-0301": 4096,
    "gpt-3.5-turbo-0613": 4096,
    "gpt-3.5-turbo-16k": 16385,
    "gpt-3.5-turbo-1106": 16385,
    "text-davinci-003": 4096,
    "text-davinci-002": 4096,
    "text-davinci-001": 2049,
    "text-curie-001": 2049,
    "text-babbage-001": 2049,
    "text-ada-001": 2049,
    "davinci": 2049,
    "curie": 2049,
    "babbage": 2049,
    "ada": 2049,
}


def count_tokens(model_name, text):
    return len(encodings[model_name].encode(text))


def get_max_context_length(model_name):
    return max_length[model_name]
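A short sketch of how these helpers are meant to be used when budgeting prompt length (illustrative; it assumes tiktoken can load or download the encodings on this machine):

from cllm.services.nlp.llms.memory.utils import count_tokens, get_max_context_length

model = "gpt-3.5-turbo"
prompt = "<user>: Summarize the plot of Hamlet in two sentences.\n"
used = count_tokens(model, prompt)
budget = get_max_context_length(model) - used
print(f"{used} tokens used, {budget} tokens left in the {model} context window")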
cllm/services/tog/__init__.py
ADDED
@@ -0,0 +1,2 @@
# from .tool import TaskSolver, TaskDecomposer
# from .configs.tog_config import config
cllm/services/tog/api.py
ADDED
@@ -0,0 +1,40 @@
import os
import requests

__ALL__ = ["tog", "task_decomposer"]


HOST = os.environ.get("TOG_SERVICE_HOST", "localhost")
PORT = os.environ.get("TOG_SERVICE_PORT", 10052)


def setup(host="localhost", port=10052):
    global HOST, PORT
    HOST = host
    PORT = port


def tog(request, subtasks, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    stream = kwargs.get("stream", False)
    url = f"http://{host}:{port}/tog"
    data = {"request": request, "subtasks": subtasks, "stream": stream}
    response = requests.post(url, data=data, stream=stream)
    return response.json()


def task_decomposer(request, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    stream = kwargs.get("stream", False)
    url = f"http://{host}:{port}/task_decomposer"
    data = {"request": request, "stream": stream}
    response = requests.post(url, data=data, stream=stream)
    return response.json()
cllm/services/utils.py
ADDED
@@ -0,0 +1,50 @@
import os
import io
from pathlib import Path
from typing import Union, List, Dict

from fastapi.responses import Response, StreamingResponse

from cllm.utils import get_real_path


def get_bytes_value(path):
    if isinstance(path, (str, Path)):
        real_path = get_real_path(path)
        try:
            return open(real_path, "rb").read()
        except Exception:
            return open(path, "rb").read()
    elif isinstance(path, io.BufferedReader):
        return path.read()
    elif isinstance(path, bytes):
        return path

    return None


def ImageResponse(image):
    img_stream = io.BytesIO()
    image.save(img_stream, format="png")
    img_stream.seek(0)

    return StreamingResponse(img_stream, media_type="image/png")


def VideoResponse(video: Union[str, Path, io.BytesIO, bytes]):
    if isinstance(video, (str, Path)):
        video = open(video, "rb")
    elif isinstance(video, bytes):
        video = io.BytesIO(video)
    return StreamingResponse(video, media_type="video/mp4")


def AudioResponse(audio: str | Path | io.BytesIO):
    if isinstance(audio, (str, Path)):
        audio = open(audio, "rb")
    return StreamingResponse(audio, media_type="audio/wav")


class RawResponse(Response):
    media_type = "binary/octet-stream"

    def render(self, content: bytes) -> bytes:
        # Obfuscate the payload by XORing every byte with 0x54.
        return bytes([b ^ 0x54 for b in content])
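Because RawResponse XORs every byte with 0x54 before sending, a client has to apply the same XOR to recover the payload. A minimal client-side sketch (the endpoint name below is hypothetical; any route that returns a RawResponse would work):

import requests

def decode_raw(content: bytes) -> bytes:
    # XOR is its own inverse, so re-applying 0x54 restores the original bytes.
    return bytes(b ^ 0x54 for b in content)

# resp = requests.get("http://localhost:10056/some_raw_endpoint")  # hypothetical endpoint
# payload = decode_raw(resp.content)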
cllm/services/video/__init__.py
ADDED
File without changes
cllm/services/video/api.py
ADDED
@@ -0,0 +1,135 @@
import io
import os
import os.path as osp
import uuid
import requests
from pathlib import Path
import av
import numpy as np
import moviepy.editor as mpe
from cllm.services.utils import get_bytes_value

__ALL__ = [
    "video_classification",
    "video_captioning",
    "image_to_video",
    "text_to_video",
    "video_to_webpage",
    "dub_video",
]


HOST = os.environ.get("CLLM_SERVICES_HOST", "localhost")
PORT = os.environ.get("CLLM_SERVICES_PORT", 10056)


def setup(host="localhost", port=10056):
    global HOST, PORT
    HOST = host
    PORT = port


def video_classification(video: str | Path | bytes, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/video_classification"
    files = {"video": (video, get_bytes_value(video))}
    response = requests.post(url, files=files)
    return response.json()


def video_captioning(video: str | Path, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/video_captioning"
    files = {"video": (video, get_bytes_value(video))}
    response = requests.post(url, files=files)
    return response.json()


def image_audio_to_video(image: str | Path, audio: str | Path, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/image_audio_to_video"

    files = {
        "image": (image, get_bytes_value(image)),
        "audio": (audio, get_bytes_value(audio)),
    }
    response = requests.post(url, files=files)
    return response.content


def image_to_video(image: str | Path, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/image_to_video"
    files = {"image": (image, get_bytes_value(image))}
    response = requests.post(url, files=files)
    return response.content


def text_to_video(prompt: str, **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/text_to_video"
    data = {"prompt": prompt}
    response = requests.post(url, data=data)
    return response.content


def video_to_webpage(
    video: str | Path,
    title: str,
    tags: list[str],
    description: str,
    **kwargs,
):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/video_to_webpage"

    files = {"video": (video, get_bytes_value(video))}
    data = {
        "title": title,
        "tags": tags,
        "description": description,
    }
    response = requests.post(url, files=files, data=data)
    return response.json()


def dub_video(video: str | Path | bytes, audio: str | Path | bytes, **kwargs):
    root_dir = kwargs["root_dir"]
    vid_file_location = osp.join(root_dir, video)
    aud_file_location = osp.join(root_dir, audio)
    video = mpe.VideoFileClip(vid_file_location)

    # read audio file
    audio = mpe.AudioFileClip(aud_file_location)

    # set audio for video
    new_video = video.set_audio(audio)

    # export the video file
    save_path = osp.join(root_dir, f"new_{str(uuid.uuid4())[:6]}.mp4")
    new_video.write_videofile(save_path)
    return open(save_path, "rb").read()


def decoding_key_frames(video: str | Path | bytes, **kwargs):
    video = io.BytesIO(get_bytes_value(video))
    container = av.open(video)
    # extract evenly spaced frames from the video
    seg_len = container.streams.video[0].frames
    indices = set(np.linspace(0, seg_len, num=4, endpoint=False).astype(np.int64))
    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i in indices:
            # serialize the decoded frame as PNG bytes
            stream = io.BytesIO()
            frame.to_image().save(stream, format="png")
            stream.seek(0)
            frames.append(stream.getvalue())

    return frames
cllm/services/vqa/__init__.py
ADDED
File without changes
cllm/services/vqa/api.py
ADDED
@@ -0,0 +1,28 @@
import io
import os
from pathlib import Path
import requests
from PIL import Image
from cllm.services.utils import get_bytes_value

__ALL__ = ["vqa_blip"]


HOST = os.environ.get("CLLM_SERVICES_HOST", "localhost")
PORT = os.environ.get("CLLM_SERVICES_PORT", 10056)


def setup(host="localhost", port=10049):
    global HOST, PORT
    HOST = host
    PORT = port


def image_qa(image, text, endpoint="llava", **kwargs):
    host = kwargs.get("host", HOST)
    port = kwargs.get("port", PORT)
    url = f"http://{host}:{port}/{endpoint}"
    files = {"image": (image, get_bytes_value(image))}
    data = {"text": text}
    response = requests.post(url, files=files, data=data)
    return response.json()
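Illustrative call for the VQA endpoint above (a sketch; it assumes the llava-backed service is running on the default host/port and that kitchen.png is a local image):

from cllm.services.vqa.api import image_qa

answer = image_qa("kitchen.png", "How many chairs are in the picture?")
print(answer)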
cllm/utils.py
ADDED
@@ -0,0 +1,79 @@
import os
import functools
import signal
from pathlib import Path

RESOURCE_ROOT = os.environ.get("RESOURCE_ROOT", "./client_resources")


def get_real_path(path):
    if path is None:
        return None
    if RESOURCE_ROOT in path:
        return path
    return os.path.join(RESOURCE_ROOT, path)


def get_root_dir():
    return RESOURCE_ROOT


def md2plain(md):
    plain_text = md.replace("&nbsp;", " ")
    plain_text = plain_text.replace("<br>", "\n")
    plain_text = plain_text.replace(r"\<", "<")
    plain_text = plain_text.replace(r"\>", ">")
    return plain_text


def plain2md(plain_text: str):
    md_text = plain_text.replace("<", r"\<")
    md_text = md_text.replace(">", r"\>")
    md_text = md_text.replace("\n", "<br>")
    md_text = md_text.replace(" ", "&nbsp;")
    return md_text


def transform_msgs(history_msgs: list = []):
    if history_msgs is None:
        return []
    filtered_msg = []
    for item in history_msgs:
        if isinstance(item[0], str):
            item[0] = md2plain(item[0])
        if isinstance(item[1], str):
            item[1] = md2plain(item[1])
        if isinstance(item[1], str) and item[1].startswith(
            "The whole process will take some time, please be patient."
        ):
            item[1] = None

        filtered_msg.append(item)
    return filtered_msg


def timeout(sec):
    """
    timeout decorator
    :param sec: function raises TimeoutError after `sec` seconds
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapped_func(*args, **kwargs):
            def _handle_timeout(signum, frame):
                err_msg = f"Function {func.__name__} timed out after {sec} seconds"
                raise TimeoutError(err_msg)

            signal.signal(signal.SIGALRM, _handle_timeout)
            signal.alarm(sec)
            try:
                result = func(*args, **kwargs)
            finally:
                signal.alarm(0)
            return result

        return wrapped_func

    return decorator
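A short sketch of the timeout decorator in use (illustrative; note that it relies on signal.SIGALRM, so it only works on Unix-like systems and in the main thread):

import time
from cllm.utils import timeout

@timeout(2)
def slow_call():
    time.sleep(5)  # exceeds the 2-second budget
    return "done"

try:
    slow_call()
except TimeoutError as e:
    print(e)  # Function slow_call timed out after 2 seconds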
requirements.txt
ADDED
@@ -0,0 +1,14 @@
av==10.0.0
torch==2.0.1 --index-url https://download.pytorch.org/whl/cu118
torchvision==0.15.2 --index-url https://download.pytorch.org/whl/cu118
torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
openai==1.3.7
openai-whisper==20230918
fire==0.5.0
fastapi==0.104.*
numpy==1.25.2
pillow==10.0.1
langchain==0.0.348
transformers==4.34.1
moviepy==1.0.3