{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "authorship_tag": "ABX9TyPaNjrQ4/vWL00DhJIGcmSl", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "code", "source": [ "!pip install -U langchain_community -q\n", "!pip install -U langchain -q\n", "!pip install google-search-results -q\n", "!pip install langchainhub -q\n", "!pip install text_generation -q\n", "!pip install arxiv -q" ], "metadata": { "id": "W125o44IGdna", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "c746bf1e-0b97-4eab-c94d-a7f417980d8b" }, "execution_count": 1, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m241.2/241.2 kB\u001b[0m \u001b[31m11.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.5/56.5 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.4/55.4 kB\u001b[0m \u001b[31m4.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m815.9/815.9 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for google-search-results (setup.py) ... \u001b[?25l\u001b[?25hdone\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.1/81.1 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for sgmllib3k (setup.py) ... \u001b[?25l\u001b[?25hdone\n" ] } ] }, { "cell_type": "code", "source": [ "import os\n", "from google.colab import userdata\n", "os.environ[\"HUGGINGFACEHUB_API_TOKEN\"] = userdata.get('HUGGINGFACEHUB_API_TOKEN')\n", "os.environ[\"SERPAPI_API_KEY\"] = userdata.get('SERPAPI_API_KEY')" ], "metadata": { "id": "yXd9TTjoK439" }, "execution_count": 2, "outputs": [] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "sOrq-hGZGZ_r", "outputId": "15f1040e-2e50-40dc-f1ef-bdff72097228" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ " Why don't astronauts forget things in space?\n", "\n", "Because in space, they weightless information! 💡🚀\n", "\n", "(This one is an oldie but goodie!)\n" ] } ], "source": [ "# LangChain supports many other chat models. Here, we're using Ollama\n", "from langchain_community.chat_models import ChatOllama\n", "from langchain_core.output_parsers import StrOutputParser\n", "from langchain_core.prompts import ChatPromptTemplate\n", "\n", "# supports many more optional parameters. 
Hover over your `ChatOllama(...)`\n", "# class to view the latest available supported parameters\n", "llm = ChatOllama(\n", " model=\"mistral\",\n", " base_url=\"https://0013-35-201-206-176.ngrok-free.app\"\n", " )\n", "prompt = ChatPromptTemplate.from_template(\"Tell me a short joke about {topic}\")\n", "\n", "# using LangChain Expression Language (LCEL) chain syntax\n", "# learn more about LCEL at\n", "# https://python.langchain.com/docs/expression_language/why\n", "chain = prompt | llm | StrOutputParser()\n", "\n", "# for brevity, the response is printed to the terminal\n", "# You can use LangServe to deploy your application for\n", "# production\n", "print(chain.invoke({\"topic\": \"Space travel\"}))" ] }, { "cell_type": "code", "source": [ "from langchain.tools.retriever import create_retriever_tool\n", "from langchain_community.utilities import SerpAPIWrapper\n", "from langchain.retrievers import ArxivRetriever\n", "from langchain_core.tools import Tool\n", "\n", "retriever = ArxivRetriever(load_max_docs=2)\n", "\n", "tools = [create_retriever_tool(\n", " retriever,\n", " \"search arxiv's database for\",\n", " \"Use this to recommend the user a paper to read. Unless stated otherwise, please choose the most recent models.\",\n", " # \"Searches and returns excerpts from the 2022 State of the Union.\",\n", "),\n", "\n", "Tool(\n", " name=\"SerpAPI\",\n", " description=\"A low-cost Google Search API. Useful for when you need to answer questions about current events. Input should be a search query.\",\n", " func=SerpAPIWrapper().run,\n", ")\n", "\n", "]\n", "\n", "# tools = [tool]" ], "metadata": { "id": "tn5XMqCVeaQK" }, "execution_count": 8, "outputs": [] }, { "cell_type": "code", "source": [ "from langchain import hub\n", "from langchain.agents import AgentExecutor, load_tools\n", "from langchain.agents.format_scratchpad import format_log_to_str\n", "from langchain.agents.output_parsers import (\n", " ReActJsonSingleInputOutputParser,\n", ")\n", "from langchain.tools.render import render_text_description\n", "# from langchain_community.utilities import SerpAPIWrapper\n", "# from langchain.retrievers import ArxivRetriever\n", "\n", "# set up tools\n", "# tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n", "\n", "# tools = load_tools([\"serpapi\", \"llm-math\",create_retriever_tool], llm=llm)\n", "\n", "# set up the ReAct-style prompt\n", "prompt = hub.pull(\"hwchase17/react-json\")\n", "prompt = prompt.partial(\n", " tools=render_text_description(tools),\n", " tool_names=\", \".join([t.name for t in tools]),\n", ")\n", "\n", "chat_model = llm\n", "# define the agent\n", "chat_model_with_stop = chat_model.bind(stop=[\"\\nObservation\"])\n", "agent = (\n", " {\n", " \"input\": lambda x: x[\"input\"],\n", " \"agent_scratchpad\": lambda x: format_log_to_str(x[\"intermediate_steps\"]),\n", " }\n", " | prompt\n", " | chat_model_with_stop\n", " | ReActJsonSingleInputOutputParser()\n", ")\n", "\n", "# instantiate AgentExecutor\n", "agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True, handle_parsing_errors=True)\n", "\n", "# agent_executor.invoke(\n", "# {\n", "# \"input\": \"Who is the current holder of the speed skating world record on 500 meters? 
What is her current age raised to the 0.43 power?\"\n", "# }\n", "# )\n", "\n", "# agent_executor.invoke(\n", "# {\n", "# \"input\": \"what are large language models and why are they so expensive to run?\"\n", "# }\n", "# )\n", "\n", "agent_executor.invoke(\n", " {\n", " \"input\": \"How to generate videos from images using state-of-the-art machine learning models\"\n", " }\n", ")\n", "\n", "\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "VzcIqgcDKZuZ", "outputId": "99fe8de1-8539-4fe9-ae24-576aad0daee6" }, "execution_count": 9, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "\n", "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", "\u001b[32;1m\u001b[1;3m Thought: To find the most recent papers on generating videos from images using machine learning models, I will use the \"search arxiv's database for\" tool.\n", "\n", "Action:\n", "```json\n", "{\n", " \"action\": \"search arxiv's database for\",\n", " \"action_input\": \"generating videos from images machine learning\"\n", "}\n", "```\u001b[0m\u001b[36;1m\u001b[1;3mDiffusion models exhibited tremendous progress in image and video generation,\n", "exceeding GANs in quality and diversity. However, they are usually trained on\n", "very large datasets and are not naturally adapted to manipulate a given input\n", "image or video. In this paper we show how this can be resolved by training a\n", "diffusion model on a single input image or video. Our image/video-specific\n", "diffusion model (SinFusion) learns the appearance and dynamics of the single\n", "image or video, while utilizing the conditioning capabilities of diffusion\n", "models. It can solve a wide array of image/video-specific manipulation tasks.\n", "In particular, our model can learn from few frames the motion and dynamics of a\n", "single input video. It can then generate diverse new video samples of the same\n", "dynamic scene, extrapolate short videos into long ones (both forward and\n", "backward in time) and perform video upsampling. Most of these tasks are not\n", "realizable by current video-specific generation methods.\n", "\n", "In this work we deal with the problem of high-level event detection in video.\n", "Specifically, we study the challenging problems of i) learning to detect video\n", "events from solely a textual description of the event, without using any\n", "positive video examples, and ii) additionally exploiting very few positive\n", "training samples together with a small number of ``related'' videos. For\n", "learning only from an event's textual description, we first identify a general\n", "learning framework and then study the impact of different design choices for\n", "various stages of this framework. For additionally learning from example\n", "videos, when true positive training samples are scarce, we employ an extension\n", "of the Support Vector Machine that allows us to exploit ``related'' event\n", "videos by automatically introducing different weights for subsets of the videos\n", "in the overall training set. Experimental evaluations performed on the\n", "large-scale TRECVID MED 2014 video dataset provide insight on the effectiveness\n", "of the proposed methods.\n", "\n", "We study the problem of video-to-video synthesis, whose goal is to learn a\n", "mapping function from an input source video (e.g., a sequence of semantic\n", "segmentation masks) to an output photorealistic video that precisely depicts\n", "the content of the source video. 
While its image counterpart, the\n", "image-to-image synthesis problem, is a popular topic, the video-to-video\n", "synthesis problem is less explored in the literature. Without understanding\n", "temporal dynamics, directly applying existing image synthesis approaches to an\n", "input video often results in temporally incoherent videos of low visual\n", "quality. In this paper, we propose a novel video-to-video synthesis approach\n", "under the generative adversarial learning framework. Through carefully-designed\n", "generator and discriminator architectures, coupled with a spatio-temporal\n", "adversarial objective, we achieve high-resolution, photorealistic, temporally\n", "coherent video results on a diverse set of input formats including segmentation\n", "masks, sketches, and poses. Experiments on multiple benchmarks show the\n", "advantage of our method compared to strong baselines. In particular, our model\n", "is capable of synthesizing 2K resolution videos of street scenes up to 30\n", "seconds long, which significantly advances the state-of-the-art of video\n", "synthesis. Finally, we apply our approach to future video prediction,\n", "outperforming several state-of-the-art competing systems.\u001b[0m\u001b[32;1m\u001b[1;3m This paper discusses several recent studies on generating videos from images using machine learning models. One study introduces SinFusion, a diffusion model that learns the appearance and dynamics of a single image or video for various manipulation tasks. Another study deals with high-level event detection in videos, learning from textual descriptions and exploiting related videos. The third study proposes a novel video-to-video synthesis approach under the generative adversarial learning framework to achieve photorealistic, temporally coherent videos.\n", "\n", "Final Answer: There are recent studies that discuss generating videos from images using machine learning models. One study introduces SinFusion, a diffusion model for image/video-specific manipulation tasks. Another study deals with high-level event detection in videos using textual descriptions and related videos. The third study proposes a novel video-to-video synthesis approach under the generative adversarial learning framework to achieve photorealistic, temporally coherent videos.\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "{'input': 'How to generate videos from images using state-of-the-art machine learning models',\n", " 'output': 'There are recent studies that discuss generating videos from images using machine learning models. One study introduces SinFusion, a diffusion model for image/video-specific manipulation tasks. Another study deals with high-level event detection in videos using textual descriptions and related videos. The third study proposes a novel video-to-video synthesis approach under the generative adversarial learning framework to achieve photorealistic, temporally coherent videos.'}" ] }, "metadata": {}, "execution_count": 9 } ] },
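{ "cell_type": "markdown", "source": [ "The cell below is an optional, unexecuted sketch. Because `AgentExecutor` is a LangChain Runnable, `.stream()` should yield the agent's intermediate chunks (tool calls, tool observations, and the final answer) as it works, which can be handy for surfacing progress in a UI. The example question and the printed fields are illustrative assumptions, not part of the run recorded above." ], "metadata": {} },
{ "cell_type": "code", "source": [ "# Hedged sketch (not executed): stream the agent's intermediate chunks.\n", "# Chunks should carry \"actions\" (tool calls), \"steps\" (tool observations)\n", "# and finally \"output\"; the query below is only an illustrative example.\n", "for chunk in agent_executor.stream(\n", "    {\"input\": \"Recommend a recent paper on diffusion models for video generation\"}\n", "):\n", "    if \"actions\" in chunk:\n", "        for action in chunk[\"actions\"]:\n", "            print(f\"Calling tool {action.tool} with input: {action.tool_input}\")\n", "    elif \"steps\" in chunk:\n", "        for step in chunk[\"steps\"]:\n", "            print(f\"Tool returned: {str(step.observation)[:200]}...\")\n", "    elif \"output\" in chunk:\n", "        print(f\"Final answer: {chunk['output']}\")" ], "metadata": {}, "execution_count": null, "outputs": [] } ] }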