keeprunningway committed
Commit ad97eaf · verified · 1 Parent(s): 684cd29

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ docs/images/demo.png filter=lfs diff=lfs merge=lfs -text
+ docs/images/demo_en.png filter=lfs diff=lfs merge=lfs -text
+ docs/images/dingding.png filter=lfs diff=lfs merge=lfs -text
+ docs/images/guide.jpg filter=lfs diff=lfs merge=lfs -text
+ font/STHeitiMedium.ttc filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,6 @@
+ *.pyc
+ .DS_Store
+ *.DS_Store
+ ClipVideo/clipvideo/output
+ *__pycache__
+ *.spec
.gradio/certificate.pem ADDED
@@ -0,0 +1,31 @@
1
+ -----BEGIN CERTIFICATE-----
2
+ MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
3
+ TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
4
+ cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
5
+ WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
6
+ ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
7
+ MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
8
+ h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
9
+ 0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
10
+ A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
11
+ T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
12
+ B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
13
+ B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
14
+ KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
15
+ OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
16
+ jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
17
+ qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
18
+ rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
19
+ HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
20
+ hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
21
+ ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
22
+ 3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
23
+ NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
24
+ ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
25
+ TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
26
+ jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
27
+ oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
28
+ 4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
29
+ mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
30
+ emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
31
+ -----END CERTIFICATE-----
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Alibaba
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,177 @@
  ---
  title: FunClip
- emoji: 🌍
- colorFrom: blue
- colorTo: gray
  sdk: gradio
  sdk_version: 5.9.1
- app_file: app.py
- pinned: false
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
  title: FunClip
3
+ app_file: funclip/launch.py
4
  sdk: gradio
5
  sdk_version: 5.9.1
6
  ---
7
+ [![SVG Banners](https://svg-banners.vercel.app/api?type=rainbow&text1=FunClip%20%20🥒&width=800&height=210)](https://github.com/Akshay090/svg-banners)
8
 
9
+ ### <p align="center">「[简体中文](./README_zh.md) | English」</p>
10
+
11
+ **<p align="center"> ⚡ Open-source, accurate and easy-to-use video clipping tool </p>**
12
+ **<p align="center"> 🧠 Explore LLM based video clipping with FunClip </p>**
13
+
14
+ <p align="center"> <img src="docs/images/interface.jpg" width=444/></p>
15
+
16
+ <p align="center" class="trendshift">
17
+ <a href="https://trendshift.io/repositories/10126" target="_blank"><img src="https://trendshift.io/api/badge/repositories/10126" alt="alibaba-damo-academy%2FFunClip | Trendshift" style="width: 250px; height: 55px;" width="300" height="55"/></a>
18
+ </p>
19
+
20
+ <div align="center">
21
+ <h4>
22
+ <a href="#What's New"> What's New </a>
23
+ |<a href="#On Going"> On Going </a>
24
+ |<a href="#Install"> Install </a>
25
+ |<a href="#Usage"> Usage </a>
26
+ |<a href="#Community"> Community </a>
27
+ </h4>
28
+ </div>
29
+
30
+ **FunClip** is a fully open-source, locally deployed automated video clipping tool. It leverages Alibaba TONGYI speech lab's open-source [FunASR](https://github.com/alibaba-damo-academy/FunASR) Paraformer series models to perform speech recognition on videos. Then, users can freely choose text segments or speakers from the recognition results and click the clip button to obtain the video clip corresponding to the selected segments (Quick Experience [Modelscope⭐](https://modelscope.cn/studios/iic/funasr_app_clipvideo/summary) [HuggingFace🤗](https://huggingface.co/spaces/R1ckShi/FunClip)).
31
+
32
+ ## Highlights🎨
33
+
34
+ - 🔥Try AI clipping using LLM in FunClip now.
35
+ - FunClip integrates Alibaba's open-source industrial-grade model [Paraformer-Large](https://modelscope.cn/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), which is one of the best-performing open-source Chinese ASR models available, with over 13 million downloads on Modelscope. It can also accurately predict timestamps in an integrated manner.
36
+ - FunClip incorporates the hotword customization feature of [SeACo-Paraformer](https://modelscope.cn/models/iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), allowing users to specify entity words, names, etc. as hotwords during ASR to improve recognition of those terms (see the sketch after this list).
37
+ - FunClip integrates the [CAM++](https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary) speaker recognition model, enabling users to use the auto-recognized speaker ID as the target for trimming, to clip segments from a specific speaker.
38
+ - The functionalities are realized through Gradio interaction, offering simple installation and ease of use. It can also be deployed on a server and accessed via a browser.
39
+ - FunClip supports multi-segment free clipping and automatically returns full video SRT subtitles and target segment SRT subtitles, offering a simple and convenient user experience.
40
+
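+ As a rough illustration of hotword-customized recognition, the sketch below loads the same SeACo-Paraformer pipeline that FunClip builds in `funclip/launch.py` and passes hotwords at inference time. The audio file name and the hotword list are placeholders, and the `hotword` argument follows FunASR's documented usage rather than anything specific to this repository:
+ ```python
+ from funasr import AutoModel
+
+ # same model ids as funclip/launch.py (Chinese SeACo-Paraformer + VAD + punctuation)
+ model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+                   vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
+                   punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch")
+
+ # "my_audio.wav" and the space-separated hotwords are placeholders;
+ # hotwords bias recognition toward the listed entity words
+ res = model.generate(input="my_audio.wav", hotword="云栖大会 通义")
+ print(res[0]["text"])
+ ```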
41
+ <a name="What's New"></a>
42
+ ## What's New🚀
43
+ - 2024/06/12 FunClip now supports recognizing and clipping English audio files. Run `python funclip/launch.py -l en` to try it.
44
+ - 🔥2024/05/13 FunClip v2.0.0 now supports smart clipping with large language models, integrating models from the qwen series, GPT series, etc., providing default prompts. You can also explore and share tips for setting prompts, the usage is as follows:
45
+ 1. After the recognition, select the name of the large model and configure your own apikey;
46
+ 2. Click on the 'LLM Inference' button, and FunClip will automatically combine two prompts with the video's srt subtitles;
47
+ 3. Click on the 'AI Clip' button, and based on the output results of the large language model from the previous step, FunClip will extract the timestamps for clipping;
48
+ 4. You can also adjust the prompts to steer the large language model toward the results you want (a minimal Python sketch of this flow follows this list);
49
+ - 2024/05/09 FunClip updated to v1.1.0, including the following updates and fixes:
50
+ - Support configuration of output file directory, saving ASR intermediate results and video clipping intermediate files;
51
+ - UI upgrade (see the guide picture below): the video and audio clipping functions are now on the same page, and button positions were adjusted;
52
+ - Fixed a bug introduced by a FunASR interface upgrade, which had caused some serious clipping errors;
53
+ - Support configuring different start and end time offsets for each paragraph;
54
+ - Code update, etc;
55
+ - 2024/03/06 Fixed bugs in using FunClip from the command line.
56
+ - 2024/02/28 [FunASR](https://github.com/alibaba-damo-academy/FunASR) was updated to version 1.0; FunClip now uses FunASR 1.0 and SeACo-Paraformer to conduct ASR with hotword customization.
57
+ - 2023/10/17 Fixed a bug in choosing multiple periods, which used to return a video with the wrong length.
58
+ - 2023/10/10 FunClip now supports recognition with speaker diarization: choose 'Yes' under 'Recognize Speakers' to get recognition results with a speaker id for each sentence, then clip out the periods of one or more speakers (e.g. 'spk0' or 'spk0#spk3').
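+ A minimal Python sketch of the LLM clipping flow from the 2024/05/13 entry above, based on the helpers used in `funclip/launch.py` (run it from the `funclip/` directory, as that script is). The API key, the SRT file path, and the abridged prompts are placeholders:
+ ```python
+ from llm.qwen_api import call_qwen_model          # dashscope/qwen call used by FunClip
+ from utils.trans_utils import extract_timestamps   # parses "[start-end]" spans from the LLM output
+
+ system_prompt = "你是一个视频srt字幕分析剪辑器..."   # default system prompt, abridged here
+ user_prompt = "这是待裁剪的视频srt字幕:"
+ srt_text = open("output/total.srt", encoding="utf-8").read()  # placeholder path to the full-video SRT
+
+ # 'LLM Inference': combine the two prompts with the SRT and query the model
+ llm_res = call_qwen_model("YOUR_BAILIAN_APIKEY", "qwen-plus", user_prompt + "\n" + srt_text, system_prompt)
+
+ # 'AI Clip': pull the selected timestamps out of the LLM answer; FunClip then clips these spans
+ timestamp_list = extract_timestamps(llm_res)
+ print(timestamp_list)
+ ```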
59
+
60
+ <a name="On Going"></a>
61
+ ## On Going🌵
62
+
63
+ - [x] English support: ASR with Whisper plus timestamps requires a large amount of GPU memory, so timestamp prediction for the vanilla Paraformer English model was added in FunASR to achieve this instead.
64
+ - [x] FunClip continues to explore large language model based AI clipping; discussions about prompt settings, clipping strategies, etc. are welcome.
65
+ - [ ] Reverse period selection while clipping (keep everything except the chosen segments).
66
+ - [ ] Remove silent periods.
67
+
68
+ <a name="Install"></a>
69
+ ## Install🔨
70
+
71
+ ### Python env install
72
+
73
+ FunClip's basic functions rely only on a Python environment.
74
+ ```shell
75
+ # clone funclip repo
76
+ git clone https://github.com/alibaba-damo-academy/FunClip.git
77
+ cd FunClip
78
+ # install Python requirements
79
+ pip install -r ./requirements.txt
80
+ ```
81
+
82
+ ### ImageMagick install (optional)
83
+
84
+ If you want to clip video files with embedded subtitles:
85
+
86
+ 1. ffmpeg and ImageMagick are required
87
+
88
+ - On Ubuntu
89
+ ```shell
90
+ apt-get -y update && apt-get -y install ffmpeg imagemagick
91
+ sed -i 's/none/read,write/g' /etc/ImageMagick-6/policy.xml
92
+ ```
93
+ - On MacOS
94
+ ```shell
95
+ brew install imagemagick
96
+ sed -i 's/none/read,write/g' /usr/local/Cellar/imagemagick/7.1.1-8_1/etc/ImageMagick-7/policy.xml
97
+ ```
98
+ - On Windows
99
+
100
+ Download and install ImageMagick from https://imagemagick.org/script/download.php#windows
101
+
102
+ Find your Python install path and set `IMAGEMAGICK_BINARY` to your ImageMagick install path in `site-packages\moviepy\config_defaults.py` (see the Python sketch after this list)
103
+
104
+ 2. Download the font file to `font/`
105
+
106
+ ```shell
107
+ wget https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/STHeitiMedium.ttc -O font/STHeitiMedium.ttc
108
+ ```
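+ On Windows, instead of editing `config_defaults.py` by hand, one alternative (assuming moviepy 1.x) is to point moviepy at ImageMagick at runtime; the path below is only an example and must match your local install:
+ ```python
+ from moviepy.config import change_settings
+
+ # example path only; adjust to wherever magick.exe actually lives on your machine
+ change_settings({"IMAGEMAGICK_BINARY": r"C:\Program Files\ImageMagick-7.1.1-Q16\magick.exe"})
+ ```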
109
+ <a name="Usage"></a>
110
+ ## Use FunClip
111
+
112
+ ### A. Use FunClip as local Gradio Service
113
+ You can launch your own FunClip service, the same as the [Modelscope Space](https://modelscope.cn/studios/iic/funasr_app_clipvideo/summary), as follows:
114
+ ```shell
115
+ python funclip/launch.py
116
+ # '-l en' for English audio recognition
117
+ # '-p xxx' for setting port number
118
+ # '-s' for establishing a publicly shared service
119
+ ```
120
+ Then visit ```localhost:7860``` to open a Gradio interface like the one below, and use FunClip by following these steps:
121
+
122
+ - Step1: Upload your video file (or try the example videos below)
123
+ - Step2: Copy the text segments you need to 'Text to Clip'
124
+ - Step3: Adjust subtitle settings (if needed)
125
+ - Step4: Click 'Clip' or 'Clip and Generate Subtitles'
126
+
127
+ <img src="docs/images/guide.jpg"/>
128
+
129
+ Follow the guide below to explore LLM based clipping:
130
+
131
+ <img src="docs/images/LLM_guide.png" width=360/>
132
+
133
+ ### B. Experience FunClip in Modelscope
134
+
135
+ [FunClip@Modelscope Space⭐](https://modelscope.cn/studios/iic/funasr_app_clipvideo/summary)
136
+
137
+ [FunClip@HuggingFace Space🤗](https://huggingface.co/spaces/R1ckShi/FunClip)
138
+
139
+ ### C. Use FunClip in command line
140
+
141
+ FunClip also supports recognition and clipping from the command line (a Python sketch of the same two stages follows the block below):
142
+ ```shell
143
+ # step1: Recognize
144
+ python funclip/videoclipper.py --stage 1 \
145
+ --file examples/2022云栖大会_片段.mp4 \
146
+ --output_dir ./output
147
+ # now you can find recognition results and entire SRT file in ./output/
148
+ # step2: Clip
149
+ python funclip/videoclipper.py --stage 2 \
150
+ --file examples/2022云栖大会_片段.mp4 \
151
+ --output_dir ./output \
152
+ --dest_text '我们把它跟乡村振兴去结合起来,利用我们的设计的能力' \
153
+ --start_ost 0 \
154
+ --end_ost 100 \
155
+ --output_file './output/res.mp4'
156
+ ```
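+ The same two stages can also be driven from Python. This is a rough sketch based on the interfaces used in `funclip/launch.py` (run it from the `funclip/` directory); the example file, argument defaults, and exact return values are assumptions:
+ ```python
+ from funasr import AutoModel
+ from videoclipper import VideoClipper
+
+ # build the ASR pipeline and the clipper the same way launch.py does
+ funasr_model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+                          vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
+                          punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
+                          spk_model="damo/speech_campplus_sv_zh-cn_16k-common")
+ clipper = VideoClipper(funasr_model)
+
+ # stage 1: recognize ('No' = no speaker diarization, '' = no hotwords)
+ res_text, res_srt, state = clipper.video_recog("../examples/2022云栖大会_片段.mp4", 'No', '', output_dir="./output")
+
+ # stage 2: clip the sentences containing the target text, with a 100 ms end offset
+ clip_file, message, clip_srt = clipper.video_clip("我们把它跟乡村振兴去结合起来,利用我们的设计的能力",
+                                                   0, 100, state, output_dir="./output")
+ ```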
157
+
158
+ <a name="Community"></a>
159
+ ## Community Communication🍟
160
+
161
+ FunClip was first open-sourced by the FunASR team; any useful PR is welcome.
162
+
163
+ You can also scan the following DingTalk group or WeChat group QR code to join the community group for communication.
164
+
165
+ | DingTalk group | WeChat group |
166
+ |:-------------------------------------------------------------------:|:-----------------------------------------------------:|
167
+ | <div align="left"><img src="docs/images/dingding.png" width="250"/> | <img src="docs/images/wechat.png" width="215"/></div> |
168
+
169
+ ## Find Speech Models in FunASR
170
+
171
+ [FunASR](https://github.com/alibaba-damo-academy/FunASR) hopes to build a bridge between academic research and industrial applications on speech recognition. By supporting the training & finetuning of the industrial-grade speech recognition model released on ModelScope, researchers and developers can conduct research and production of speech recognition models more conveniently, and promote the development of speech recognition ecology. ASR for Fun!
172
+
173
+ 📚FunASR Paper: <a href="https://arxiv.org/abs/2305.11013"><img src="https://img.shields.io/badge/Arxiv-2305.11013-orange"></a>
174
+
175
+ 📚SeACo-Paraformer Paper: <a href="https://arxiv.org/abs/2308.03266"><img src="https://img.shields.io/badge/Arxiv-2308.03266-orange"></a>
176
+
177
+ 🌟Support FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR/stargazers'><img src='https://img.shields.io/github/stars/alibaba-damo-academy/FunASR.svg?style=social'></a>
README_zh.md ADDED
@@ -0,0 +1,169 @@
1
+ [![SVG Banners](https://svg-banners.vercel.app/api?type=rainbow&text1=FunClip%20%20🥒&width=800&height=210)](https://github.com/Akshay090/svg-banners)
2
+
3
+ ### <p align="center">「简体中文 | [English](./README.md)」</p>
4
+
5
+ **<p align="center"> ⚡ 开源、精准、方便的视频切片工具 </p>**
6
+ **<p align="center"> 🧠 通过FunClip探索基于大语言模型的视频剪辑 </p>**
7
+
8
+ <p align="center"> <img src="docs/images/interface.jpg" width=444/></p>
9
+
10
+ <p align="center" class="trendshift">
11
+ <a href="https://trendshift.io/repositories/10126" target="_blank"><img src="https://trendshift.io/api/badge/repositories/10126" alt="alibaba-damo-academy%2FFunClip | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
12
+ </p>
13
+
14
+ <div align="center">
15
+ <h4><a href="#近期更新"> 近期更新 </a>
16
+ |<a href="#施工中"> 施工中 </a>
17
+ |<a href="#安装环境"> 安装环境 </a>
18
+ |<a href="#使用方法"> 使用方法 </a>
19
+ |<a href="#社区交流"> 社区交流 </a>
20
+ </h4>
21
+ </div>
22
+
23
+ **FunClip**是一款完全开源、本地部署的自动化视频剪辑工具,通过调用阿里巴巴通义实验室开源的[FunASR](https://github.com/alibaba-damo-academy/FunASR) Paraformer系列模型进行视频的语音识别,随后用户可以自由选择识别结果中的文本片段或说话人,点击裁剪按钮即可获取对应片段的视频(快速体验 [Modelscope⭐](https://modelscope.cn/studios/iic/funasr_app_clipvideo/summary) [HuggingFace🤗](https://huggingface.co/spaces/R1ckShi/FunClip))。
24
+
25
+ ## 热点&特性🎨
26
+
27
+ - 🔥FunClip集成了多种大语言模型调用方式并提供了prompt配置接口,尝试通过大语言模型进行视频裁剪~
28
+ - FunClip集成了阿里巴巴开源的工业级模型[Paraformer-Large](https://modelscope.cn/models/iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary),是当前识别效果最优的开源中文ASR模型之一,Modelscope下载量1300w+次,并且能够一体化的准确预测时间戳。
29
+ - FunClip集成了[SeACo-Paraformer](https://modelscope.cn/models/iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)的热词定制化功能,在ASR过程中可以指定一些实体词、人名等作为热词,提升识别效果。
30
+ - FunClip集成了[CAM++](https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary)说话人识别模型,用户可以将自动识别出的说话人ID作为裁剪目标,将某一说话人的段落裁剪出来。
31
+ - 通过Gradio交互实现上述功能,安装简单使用方便,并且可以在服务端搭建服务通过浏览器使用。
32
+ - FunClip支持多段自由剪辑,并且会自动返回全视频SRT字幕、目标段落SRT字幕,使用简单方便。
33
+
34
+ 欢迎体验使用,欢迎提出关于字幕生成或语音识别的需求与宝贵建议~
35
+
36
+ <a name="近期更新"></a>
37
+ ## 近期更新🚀
38
+
39
+ - 2024/06/12 FunClip现在支持识别与裁剪英文视频,通过`python funclip/launch.py -l en`来启动英文版本服务。
40
+ - 🔥2024/05/13 FunClip v2.0.0加入大语言模型智能裁剪功能,集成qwen系列,gpt系列等模型,提供默认prompt,您也可以探索并分享prompt的设置技巧,使用方法如下:
41
+ 1. 在进行识别之后,选择大模型名称,配置你自己的apikey;
42
+ 2. 点击'LLM智能段落选择'按钮,FunClip将自动组合两个prompt与视频的srt字幕;
43
+ 3. 点击'LLM智能裁剪'按钮,基于前一步的大语言模型输出结果,FunClip将提取其中的时间戳进行裁剪;
44
+ 4. 您可以尝试改变prompt来借助大语言模型的能力来获取您想要的结果;
45
+ - 2024/05/09 FunClip更新至v1.1.0,包含如下更新与修复:
46
+ - 支持配置输出文件目录,保存ASR中间结果与视频裁剪中间文件;
47
+ - UI升级(见下方演示图例),视频与音频裁剪功能在同一页,按钮位置调整;
48
+ - 修复了由于FunASR接口升级引入的bug,该bug曾导致一些严重的剪辑错误;
49
+ - 支持为每一个段落配置不同的起止时间偏移;
50
+ - 代码优化等;
51
+ - 2024/03/06 命令行调用方式更新与问题修复,相关功能可以正常使用。
52
+ - 2024/02/28 FunClip升级到FunASR1.0模型调用方式,通过FunASR开源的SeACo-Paraformer模型在视频剪辑中进一步支持热词定制化功能。
53
+ - 2024/02/28 原FunASR-APP/ClipVideo更名为FunClip。
54
+
55
+ <a name="施工中"></a>
56
+ ## 施工中🌵
57
+
58
+ - [x] FunClip将会集成Whisper模型,以提供英文视频剪辑能力(Whisper模型的时间戳预测功能需要显存较大,我们在FunASR中添加了Paraformer英文模型的时间戳预测支持以允许FunClip支持英文识别裁剪)。
59
+ - [x] 集成大语言模型的能力,提供智能视频剪辑相关功能。大家可以基于FunClip探索使用大语言模型的视频剪辑~
60
+ - [ ] 给定文本段落,反向选取其他段落。
61
+ - [ ] 删除视频中无人说话的片段。
62
+
63
+ <a name="安装环境"></a>
64
+ ## 安装🔨
65
+
66
+ ### Python环境安装
67
+
68
+ FunClip的运行仅依赖于一个Python环境,若您是一个小白开发者,可以先了解下如何使用Python,pip等~
69
+ ```shell
70
+ # 克隆funclip仓库
71
+ git clone https://github.com/alibaba-damo-academy/FunClip.git
72
+ cd FunClip
73
+ # 安装相关Python依赖
74
+ pip install -r ./requirements.txt
75
+ ```
76
+
77
+ ### 安装imagemagick(可选)
78
+
79
+ 1. 如果你希望使用自动生成字幕的视频裁剪功能,需要安装imagemagick
80
+
81
+ - Ubuntu
82
+ ```shell
83
+ apt-get -y update && apt-get -y install ffmpeg imagemagick
84
+ sed -i 's/none/read,write/g' /etc/ImageMagick-6/policy.xml
85
+ ```
86
+ - MacOS
87
+ ```shell
88
+ brew install imagemagick
89
+ sed -i 's/none/read,write/g' /usr/local/Cellar/imagemagick/7.1.1-8_1/etc/ImageMagick-7/policy.xml
90
+ ```
91
+ - Windows
92
+
93
+ 首先下载并安装imagemagick https://imagemagick.org/script/download.php#windows
94
+
95
+ 然后确定您的Python安装位置,在其中的`site-packages\moviepy\config_defaults.py`文件中修改`IMAGEMAGICK_BINARY`为imagemagick的exe路径
96
+
97
+ 2. 下载你需要的字体文件,这里我们提供一个默认的黑体字体文件
98
+
99
+ ```shell
100
+ wget https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/STHeitiMedium.ttc -O font/STHeitiMedium.ttc
101
+ ```
102
+
103
+ <a name="使用方法"></a>
104
+ ## 使用FunClip
105
+
106
+ ### A.在本地启动Gradio服务
107
+
108
+ ```shell
109
+ python funclip/launch.py
110
+ # '-l en' for English audio recognize
111
+ # '-p xxx' for setting port number
112
+ # '-s True' for establishing service for public accessing
113
+ ```
114
+ 随后在浏览器中访问```localhost:7860```即可看到如下图所示的界面,按如下步骤即可进行视频剪辑
115
+ 1. 上传你的视频(或使用下方的视频用例)
116
+ 2. (可选)设置热词,设置文件输出路径(保存识别结果、视频等)
117
+ 3. 点击识别按钮获取识别结果,或点击识别+区分说话人在语音识别基础上识别说话人ID
118
+ 4. 将识别结果中的选段复制到对应位置,或者将说话人ID输入到对应为止
119
+ 5. (可选)配置剪辑参数,偏移量与字幕设置等
120
+ 6. 点击“裁剪”或“裁剪+字幕”按钮
121
+
122
+ <img src="docs/images/guide.jpg"/>
123
+
124
+ 使用大语言模型裁剪请参考如下教程
125
+
126
+ <img src="docs/images/LLM_guide.png" width=360/>
127
+
128
+ ### B.通过命令行调用使用FunClip的相关功能
129
+ ```shell
130
+ # 步骤一:识别
131
+ python funclip/videoclipper.py --stage 1 \
132
+ --file examples/2022云栖大会_片段.mp4 \
133
+ --output_dir ./output
134
+ # ./output中生成了识别结果与srt字幕等
135
+ # 步骤二:裁剪
136
+ python funclip/videoclipper.py --stage 2 \
137
+ --file examples/2022云栖大会_片段.mp4 \
138
+ --output_dir ./output \
139
+ --dest_text '我们把它跟乡村振兴去结合起来,利用我们的设计的能力' \
140
+ --start_ost 0 \
141
+ --end_ost 100 \
142
+ --output_file './output/res.mp4'
143
+ ```
144
+
145
+ ### C.通过创空间与Space体验FunClip
146
+
147
+ [FunClip@Modelscope创空间⭐](https://modelscope.cn/studios/iic/funasr_app_clipvideo/summary)
148
+
149
+ [FunClip@HuggingFace Space🤗](https://huggingface.co/spaces/R1ckShi/FunClip)
150
+
151
+
152
+ <a name="社区交流"></a>
153
+ ## 社区交流🍟
154
+
155
+ FunClip开源项目由FunASR社区维护,欢迎加入社区,交流与讨论,以及合作开发等。
156
+
157
+ | 钉钉群 | 微信群 |
158
+ |:-------------------------------------------------------------------:|:-----------------------------------------------------:|
159
+ | <div align="left"><img src="docs/images/dingding.png" width="250"/> | <img src="docs/images/wechat.png" width="215"/></div> |
160
+
161
+ ## 通过FunASR了解语音识别相关技术
162
+
163
+ [FunASR](https://github.com/alibaba-damo-academy/FunASR)是阿里巴巴通义实验室开源的端到端语音识别工具包,目前已经成为主流ASR工具包之一。其主要包括Python pipeline,SDK部署与海量开源工业ASR模型等。
164
+
165
+ 📚FunASR论文: <a href="https://arxiv.org/abs/2305.11013"><img src="https://img.shields.io/badge/Arxiv-2305.11013-orange"></a>
166
+
167
+ 📚SeACo-Paraformer论文:<a href="https://arxiv.org/abs/2308.03266"><img src="https://img.shields.io/badge/Arxiv-2308.03266-orange"></a>
168
+
169
+ ⭐支持FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR/stargazers'><img src='https://img.shields.io/github/stars/alibaba-damo-academy/FunASR.svg?style=social'></a>
docs/images/LLM_guide.png ADDED
docs/images/demo.png ADDED

Git LFS Details

  • SHA256: 59534f9032b63970d79c1a191708de9c0a4a2015756c71e76884fcab893f9773
  • Pointer size: 132 Bytes
  • Size of remote file: 2.14 MB
docs/images/demo_en.png ADDED

Git LFS Details

  • SHA256: 3285aee5690d7d9694c93f71292419058bcd832b8a70f5a6257128669fa2790d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.81 MB
docs/images/dingding.png ADDED

Git LFS Details

  • SHA256: 33c08d66924dc055c5160de8374ec4ed823c51ba6d92eb04dc3d5e5e4065a5f6
  • Pointer size: 132 Bytes
  • Size of remote file: 1.5 MB
docs/images/guide.jpg ADDED

Git LFS Details

  • SHA256: df0efc89b8762d1aaf36ca7506eb68fc91fca22d8afe48412460a76b7c9f36c2
  • Pointer size: 132 Bytes
  • Size of remote file: 1.3 MB
docs/images/interface.jpg ADDED
docs/images/wechat.png ADDED
font/STHeitiMedium.ttc ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8fa4a63e2cf500e98e64d4c73260daaba049306cf85dec9e3729bc285b7d645
3
+ size 55754164
funclip/__init__.py ADDED
File without changes
funclip/introduction.py ADDED
@@ -0,0 +1,39 @@
1
+ top_md_1 = ("""
2
+ <div align="center">
3
+ <div style="display:flex; gap: 0.25rem;" align="center">
4
+ FunClip: <a href='https://github.com/alibaba-damo-academy/FunClip'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
5
+ 🌟支持我们: <a href='https://github.com/alibaba-damo-academy/FunClip/stargazers'><img src='https://img.shields.io/github/stars/alibaba-damo-academy/FunClip.svg?style=social'></a>
6
+ </div>
7
+ </div>
8
+
9
+ 基于阿里巴巴通义实验室自研并开源的[FunASR](https://github.com/alibaba-damo-academy/FunASR)工具包及Paraformer系列模型及语音识别、端点检测、标点预测、时间戳预测、说话人区分、热词定制化开源链路
10
+
11
+ 准确识别,自由复制所需段落,或者设置说话人标识,一键裁剪、添加字幕
12
+
13
+ * Step1: 上传视频或音频文件(或使用下方的用例体验),点击 **<font color="#f7802b">识别</font>** 按钮
14
+ * Step2: 复制识别结果中所需的文字至右上方,或者右设置说话人标识,设置偏移与字幕配置(可选)
15
+ * Step3: 点击 **<font color="#f7802b">裁剪</font>** 按钮或 **<font color="#f7802b">裁剪并添加字幕</font>** 按钮获得结果
16
+
17
+ 🔥 FunClip现在集成了大语言模型智能剪辑功能,选择LLM模型进行体验吧~
18
+ """)
19
+
20
+ top_md_3 = ("""访问FunASR项目与论文能够帮助您深入了解ParaClipper中所使用的语音处理相关模型:
21
+ <div align="center">
22
+ <div style="display:flex; gap: 0.25rem;" align="center">
23
+ FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
24
+ FunASR Paper: <a href="https://arxiv.org/abs/2305.11013"><img src="https://img.shields.io/badge/Arxiv-2305.11013-orange"></a>
25
+ 🌟Star FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR/stargazers'><img src='https://img.shields.io/github/stars/alibaba-damo-academy/FunASR.svg?style=social'></a>
26
+ </div>
27
+ </div>
28
+ """)
29
+
30
+ top_md_4 = ("""我们在「LLM智能裁剪」模块中提供三种LLM调用方式,
31
+ 1. 选择阿里云百炼平台通过api调用qwen系列模型,此时需要您准备百炼平台的apikey,请访问[阿里云百炼](https://bailian.console.aliyun.com/#/home);
32
+ 2. 选择GPT开头的模型即为调用openai官方api,此时需要您自备sk与网络环境;
33
+ 3. [gpt4free](https://github.com/xtekky/gpt4free?tab=readme-ov-file)项目也被集成进FunClip,可以通过它免费调用gpt模型;
34
+
35
+ 其中方式1与方式2需要在界面中传入相应的apikey
36
+ 方式3而可能非常不稳定,返回时间可能很长或者结果获取失败,可以多多尝试或者自己准备sk使用方式1,2
37
+
38
+ 不要同时打开同一端口的多个界面,会导致文件上传非常缓慢或卡死,关闭其他界面即可解决
39
+ """)
funclip/launch.py ADDED
@@ -0,0 +1,310 @@
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
7
+ import os
8
+ import logging
9
+ import argparse
10
+ import gradio as gr
11
+ from funasr import AutoModel
12
+ from videoclipper import VideoClipper
13
+ from llm.openai_api import openai_call
14
+ from llm.qwen_api import call_qwen_model
15
+ from llm.g4f_openai_api import g4f_openai_call
16
+ from utils.trans_utils import extract_timestamps
17
+ from introduction import top_md_1, top_md_3, top_md_4
18
+
19
+
20
+ if __name__ == "__main__":
21
+ parser = argparse.ArgumentParser(description='argparse testing')
22
+ parser.add_argument('--lang', '-l', type=str, default = "zh", help="language")
23
+ parser.add_argument('--share', '-s', action='store_true', help="if to establish gradio share link")
24
+ parser.add_argument('--port', '-p', type=int, default=7860, help='port number')
25
+ parser.add_argument('--listen', action='store_true', help="if to listen to all hosts")
26
+ args = parser.parse_args()
27
+
28
+ if args.lang == 'zh':
29
+ funasr_model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
30
+ vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
31
+ punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
32
+ spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
33
+ )
34
+ else:
35
+ funasr_model = AutoModel(model="iic/speech_paraformer_asr-en-16k-vocab4199-pytorch",
36
+ vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
37
+ punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
38
+ spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
39
+ )
40
+ audio_clipper = VideoClipper(funasr_model)
41
+ audio_clipper.lang = args.lang
42
+
43
+ server_name='127.0.0.1'
44
+ if args.listen:
45
+ server_name = '0.0.0.0'
46
+
47
+
48
+
49
+ def audio_recog(audio_input, sd_switch, hotwords, output_dir):
50
+ return audio_clipper.recog(audio_input, sd_switch, None, hotwords, output_dir=output_dir)
51
+
52
+ def video_recog(video_input, sd_switch, hotwords, output_dir):
53
+ return audio_clipper.video_recog(video_input, sd_switch, hotwords, output_dir=output_dir)
54
+
55
+ def video_clip(dest_text, video_spk_input, start_ost, end_ost, state, output_dir):
56
+ return audio_clipper.video_clip(
57
+ dest_text, start_ost, end_ost, state, dest_spk=video_spk_input, output_dir=output_dir
58
+ )
59
+
60
+ def mix_recog(video_input, audio_input, hotwords, output_dir):
61
+ output_dir = output_dir.strip()
62
+ if not len(output_dir):
63
+ output_dir = None
64
+ else:
65
+ output_dir = os.path.abspath(output_dir)
66
+ audio_state, video_state = None, None
67
+ if video_input is not None:
68
+ res_text, res_srt, video_state = video_recog(
69
+ video_input, 'No', hotwords, output_dir=output_dir)
70
+ return res_text, res_srt, video_state, None
71
+ if audio_input is not None:
72
+ res_text, res_srt, audio_state = audio_recog(
73
+ audio_input, 'No', hotwords, output_dir=output_dir)
74
+ return res_text, res_srt, None, audio_state
75
+
76
+ def mix_recog_speaker(video_input, audio_input, hotwords, output_dir):
77
+ output_dir = output_dir.strip()
78
+ if not len(output_dir):
79
+ output_dir = None
80
+ else:
81
+ output_dir = os.path.abspath(output_dir)
82
+ audio_state, video_state = None, None
83
+ if video_input is not None:
84
+ res_text, res_srt, video_state = video_recog(
85
+ video_input, 'Yes', hotwords, output_dir=output_dir)
86
+ return res_text, res_srt, video_state, None
87
+ if audio_input is not None:
88
+ res_text, res_srt, audio_state = audio_recog(
89
+ audio_input, 'Yes', hotwords, output_dir=output_dir)
90
+ return res_text, res_srt, None, audio_state
91
+
92
+ def mix_clip(dest_text, video_spk_input, start_ost, end_ost, video_state, audio_state, output_dir):
93
+ output_dir = output_dir.strip()
94
+ if not len(output_dir):
95
+ output_dir = None
96
+ else:
97
+ output_dir = os.path.abspath(output_dir)
98
+ if video_state is not None:
99
+ clip_video_file, message, clip_srt = audio_clipper.video_clip(
100
+ dest_text, start_ost, end_ost, video_state, dest_spk=video_spk_input, output_dir=output_dir)
101
+ return clip_video_file, None, message, clip_srt
102
+ if audio_state is not None:
103
+ (sr, res_audio), message, clip_srt = audio_clipper.clip(
104
+ dest_text, start_ost, end_ost, audio_state, dest_spk=video_spk_input, output_dir=output_dir)
105
+ return None, (sr, res_audio), message, clip_srt
106
+
107
+ def video_clip_addsub(dest_text, video_spk_input, start_ost, end_ost, state, output_dir, font_size, font_color):
108
+ output_dir = output_dir.strip()
109
+ if not len(output_dir):
110
+ output_dir = None
111
+ else:
112
+ output_dir = os.path.abspath(output_dir)
113
+ return audio_clipper.video_clip(
114
+ dest_text, start_ost, end_ost, state,
115
+ font_size=font_size, font_color=font_color,
116
+ add_sub=True, dest_spk=video_spk_input, output_dir=output_dir
117
+ )
118
+
119
+ def llm_inference(system_content, user_content, srt_text, model, apikey):
120
+ SUPPORT_LLM_PREFIX = ['qwen', 'gpt', 'g4f', 'moonshot']
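+ # dispatch on the model-name prefix: 'qwen*' goes to dashscope, 'gpt*'/'moonshot*' to an OpenAI-compatible API, 'g4f-*' to gpt4free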
121
+ if model.startswith('qwen'):
122
+ return call_qwen_model(apikey, model, user_content+'\n'+srt_text, system_content)
123
+ if model.startswith('gpt') or model.startswith('moonshot'):
124
+ return openai_call(apikey, model, system_content, user_content+'\n'+srt_text)
125
+ elif model.startswith('g4f'):
126
+ model = "-".join(model.split('-')[1:])
127
+ return g4f_openai_call(model, system_content, user_content+'\n'+srt_text)
128
+ else:
129
+ logging.error("LLM name error, only {} are supported as LLM name prefix."
130
+ .format(SUPPORT_LLM_PREFIX))
131
+
132
+ def AI_clip(LLM_res, dest_text, video_spk_input, start_ost, end_ost, video_state, audio_state, output_dir):
133
+ timestamp_list = extract_timestamps(LLM_res)
134
+ output_dir = output_dir.strip()
135
+ if not len(output_dir):
136
+ output_dir = None
137
+ else:
138
+ output_dir = os.path.abspath(output_dir)
139
+ if video_state is not None:
140
+ clip_video_file, message, clip_srt = audio_clipper.video_clip(
141
+ dest_text, start_ost, end_ost, video_state,
142
+ dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list, add_sub=False)
143
+ return clip_video_file, None, message, clip_srt
144
+ if audio_state is not None:
145
+ (sr, res_audio), message, clip_srt = audio_clipper.clip(
146
+ dest_text, start_ost, end_ost, audio_state,
147
+ dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list, add_sub=False)
148
+ return None, (sr, res_audio), message, clip_srt
149
+
150
+ def AI_clip_subti(LLM_res, dest_text, video_spk_input, start_ost, end_ost, video_state, audio_state, output_dir):
151
+ timestamp_list = extract_timestamps(LLM_res)
152
+ output_dir = output_dir.strip()
153
+ if not len(output_dir):
154
+ output_dir = None
155
+ else:
156
+ output_dir = os.path.abspath(output_dir)
157
+ if video_state is not None:
158
+ clip_video_file, message, clip_srt = audio_clipper.video_clip(
159
+ dest_text, start_ost, end_ost, video_state,
160
+ dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list, add_sub=True)
161
+ return clip_video_file, None, message, clip_srt
162
+ if audio_state is not None:
163
+ (sr, res_audio), message, clip_srt = audio_clipper.clip(
164
+ dest_text, start_ost, end_ost, audio_state,
165
+ dest_spk=video_spk_input, output_dir=output_dir, timestamp_list=timestamp_list, add_sub=True)
166
+ return None, (sr, res_audio), message, clip_srt
167
+
168
+ # gradio interface
169
+ theme = gr.Theme.load("funclip/utils/theme.json")
170
+ with gr.Blocks(theme=theme) as funclip_service:
171
+ gr.Markdown(top_md_1)
172
+ # gr.Markdown(top_md_2)
173
+ gr.Markdown(top_md_3)
174
+ gr.Markdown(top_md_4)
175
+ video_state, audio_state = gr.State(), gr.State()
176
+ with gr.Row():
177
+ with gr.Column():
178
+ with gr.Row():
179
+ video_input = gr.Video(label="视频输入 | Video Input")
180
+ audio_input = gr.Audio(label="音频输入 | Audio Input")
181
+ with gr.Column():
182
+ gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E4%B8%BA%E4%BB%80%E4%B9%88%E8%A6%81%E5%A4%9A%E8%AF%BB%E4%B9%A6%EF%BC%9F%E8%BF%99%E6%98%AF%E6%88%91%E5%90%AC%E8%BF%87%E6%9C%80%E5%A5%BD%E7%9A%84%E7%AD%94%E6%A1%88-%E7%89%87%E6%AE%B5.mp4',
183
+ 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/2022%E4%BA%91%E6%A0%96%E5%A4%A7%E4%BC%9A_%E7%89%87%E6%AE%B52.mp4',
184
+ 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E4%BD%BF%E7%94%A8chatgpt_%E7%89%87%E6%AE%B5.mp4'],
185
+ [video_input],
186
+ label='示例视频 | Demo Video')
187
+ gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E8%AE%BF%E8%B0%88.mp4'],
188
+ [video_input],
189
+ label='多说话人示例视频 | Multi-speaker Demo Video')
190
+ gr.Examples(['https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ClipVideo/%E9%B2%81%E8%82%83%E9%87%87%E8%AE%BF%E7%89%87%E6%AE%B51.wav'],
191
+ [audio_input],
192
+ label="示例音频 | Demo Audio")
193
+ with gr.Column():
194
+ # with gr.Row():
195
+ # video_sd_switch = gr.Radio(["No", "Yes"], label="👥区分说话人 Get Speakers", value='No')
196
+ hotwords_input = gr.Textbox(label="🚒 热词 | Hotwords(可以为空,多个热词使用空格分隔,仅支持中文热词)")
197
+ output_dir = gr.Textbox(label="📁 文件输出路径 | File Output Dir (可以为空,Linux, mac系统可以稳定使用)", value=" ")
198
+ with gr.Row():
199
+ recog_button = gr.Button("👂 识别 | ASR", variant="primary")
200
+ recog_button2 = gr.Button("👂👫 识别+区分说话人 | ASR+SD")
201
+ video_text_output = gr.Textbox(label="✏️ 识别结果 | Recognition Result")
202
+ video_srt_output = gr.Textbox(label="📖 SRT字幕内容 | SRT Subtitles")
203
+ with gr.Column():
204
+ with gr.Tab("🧠 LLM智能裁剪 | LLM Clipping"):
205
+ with gr.Column():
206
+ prompt_head = gr.Textbox(label="Prompt System (按需更改,最好不要变动主体和要求)", value=("你是一个视频srt字幕分析剪辑器,输入视频的srt字幕,"
207
+ "分析其中的精彩且尽可能连续的片段并裁剪出来,输出四条以内的片段,将片段中在时间上连续的多个句子及它们的时间戳合并为一条,"
208
+ "注意确保文字与时间戳的正确匹配。输出需严格按照如下格式:1. [开始时间-结束时间] 文本,注意其中的连接符是“-”"))
209
+ prompt_head2 = gr.Textbox(label="Prompt User(不需要修改,会自动拼接左下角的srt字幕)", value=("这是待裁剪的视频srt字幕:"))
210
+ with gr.Column():
211
+ with gr.Row():
212
+ llm_model = gr.Dropdown(
213
+ choices=["qwen-plus",
214
+ "gpt-3.5-turbo",
215
+ "gpt-3.5-turbo-0125",
216
+ "gpt-4-turbo",
217
+ "g4f-gpt-3.5-turbo"],
218
+ value="qwen-plus",
219
+ label="LLM Model Name",
220
+ allow_custom_value=True)
221
+ apikey_input = gr.Textbox(label="APIKEY")
222
+ llm_button = gr.Button("LLM推理 | LLM Inference(首先进行识别,非g4f需配置对应apikey)", variant="primary")
223
+ llm_result = gr.Textbox(label="LLM Clipper Result")
224
+ with gr.Row():
225
+ llm_clip_button = gr.Button("🧠 LLM智能裁剪 | AI Clip", variant="primary")
226
+ llm_clip_subti_button = gr.Button("🧠 LLM智能裁剪+字幕 | AI Clip+Subtitles")
227
+ with gr.Tab("✂️ 根据文本/说话人裁剪 | Text/Speaker Clipping"):
228
+ video_text_input = gr.Textbox(label="✏️ 待裁剪文本 | Text to Clip (多段文本使用'#'连接)")
229
+ video_spk_input = gr.Textbox(label="✏️ 待裁剪说话人 | Speaker to Clip (多个说话人使用'#'连接)")
230
+ with gr.Row():
231
+ clip_button = gr.Button("✂️ 裁剪 | Clip", variant="primary")
232
+ clip_subti_button = gr.Button("✂️ 裁剪+字幕 | Clip+Subtitles")
233
+ with gr.Row():
234
+ video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪ 开始位置偏移 | Start Offset (ms)")
235
+ video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩ 结束位置偏移 | End Offset (ms)")
236
+ with gr.Row():
237
+ font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠 字幕字体大小 | Subtitle Font Size")
238
+ font_color = gr.Radio(["black", "white", "green", "red"], label="🌈 字幕颜色 | Subtitle Color", value='white')
239
+ # font = gr.Radio(["黑体", "Alibaba Sans"], label="字体 Font")
240
+ video_output = gr.Video(label="裁剪结果 | Video Clipped")
241
+ audio_output = gr.Audio(label="裁剪结果 | Audio Clipped")
242
+ clip_message = gr.Textbox(label="⚠️ 裁剪信息 | Clipping Log")
243
+ srt_clipped = gr.Textbox(label="📖 裁剪部分SRT字幕内容 | Clipped SRT Subtitles")
244
+
245
+ recog_button.click(mix_recog,
246
+ inputs=[video_input,
247
+ audio_input,
248
+ hotwords_input,
249
+ output_dir,
250
+ ],
251
+ outputs=[video_text_output, video_srt_output, video_state, audio_state])
252
+ recog_button2.click(mix_recog_speaker,
253
+ inputs=[video_input,
254
+ audio_input,
255
+ hotwords_input,
256
+ output_dir,
257
+ ],
258
+ outputs=[video_text_output, video_srt_output, video_state, audio_state])
259
+ clip_button.click(mix_clip,
260
+ inputs=[video_text_input,
261
+ video_spk_input,
262
+ video_start_ost,
263
+ video_end_ost,
264
+ video_state,
265
+ audio_state,
266
+ output_dir
267
+ ],
268
+ outputs=[video_output, audio_output, clip_message, srt_clipped])
269
+ clip_subti_button.click(video_clip_addsub,
270
+ inputs=[video_text_input,
271
+ video_spk_input,
272
+ video_start_ost,
273
+ video_end_ost,
274
+ video_state,
275
+ output_dir,
276
+ font_size,
277
+ font_color,
278
+ ],
279
+ outputs=[video_output, clip_message, srt_clipped])
280
+ llm_button.click(llm_inference,
281
+ inputs=[prompt_head, prompt_head2, video_srt_output, llm_model, apikey_input],
282
+ outputs=[llm_result])
283
+ llm_clip_button.click(AI_clip,
284
+ inputs=[llm_result,
285
+ video_text_input,
286
+ video_spk_input,
287
+ video_start_ost,
288
+ video_end_ost,
289
+ video_state,
290
+ audio_state,
291
+ output_dir,
292
+ ],
293
+ outputs=[video_output, audio_output, clip_message, srt_clipped])
294
+ llm_clip_subti_button.click(AI_clip_subti,
295
+ inputs=[llm_result,
296
+ video_text_input,
297
+ video_spk_input,
298
+ video_start_ost,
299
+ video_end_ost,
300
+ video_state,
301
+ audio_state,
302
+ output_dir,
303
+ ],
304
+ outputs=[video_output, audio_output, clip_message, srt_clipped])
305
+
306
+ # start gradio service in local or share
307
+ if args.listen:
308
+ funclip_service.launch(share=args.share, server_port=args.port, server_name=server_name, inbrowser=False)
309
+ else:
310
+ funclip_service.launch(share=args.share, server_port=args.port, server_name=server_name)
funclip/llm/demo_prompt.py ADDED
@@ -0,0 +1,272 @@
1
+ demo_prompt="""
2
+ 你是一个视频srt字幕剪辑工具,输入视频的srt字幕之后根据如下要求剪辑对应的片段并输出每个段落的开始与结束时间,
3
+ 剪辑出以下片段中最有意义的、尽可能连续的部分,按如下格式输出:1. [开始时间-结束时间] 文本,
4
+ 原始srt字幕如下:
5
+ 0
6
+ 00:00:00,50 --> 00:00:02,10
7
+ 读万卷书行万里路,
8
+ 1
9
+ 00:00:02,310 --> 00:00:03,990
10
+ 这里是读书三六九,
11
+ 2
12
+ 00:00:04,670 --> 00:00:07,990
13
+ 今天要和您分享的这篇文章是人民日报,
14
+ 3
15
+ 00:00:08,510 --> 00:00:09,730
16
+ 为什么要多读书?
17
+ 4
18
+ 00:00:10,90 --> 00:00:11,930
19
+ 这是我听过最好的答案,
20
+ 5
21
+ 00:00:12,310 --> 00:00:13,190
22
+ 经常有人问,
23
+ 6
24
+ 00:00:13,730 --> 00:00:14,690
25
+ 读了那么多书,
26
+ 7
27
+ 00:00:14,990 --> 00:00:17,250
28
+ 最终还不是要回到一座平凡的城,
29
+ 8
30
+ 00:00:17,610 --> 00:00:19,410
31
+ 打一份平凡的工组,
32
+ 9
33
+ 00:00:19,410 --> 00:00:20,670
34
+ 建一个平凡的家庭,
35
+ 10
36
+ 00:00:21,330 --> 00:00:25,960
37
+ 何苦折腾一个人读书的意义究竟是什么?
38
+ 11
39
+ 00:00:26,680 --> 00:00:30,80
40
+ 今天给大家分享人民日报推荐的八条理由,
41
+ 12
42
+ 00:00:30,540 --> 00:00:32,875
43
+ 告诉你人为什么要多读书?
44
+ 13
45
+ 00:00:34,690 --> 00:00:38,725
46
+ 一脚步丈量不到的地方文字可以。
47
+ 14
48
+ 00:00:40,300 --> 00:00:41,540
49
+ 钱钟书先生说过,
50
+ 15
51
+ 00:00:42,260 --> 00:00:43,140
52
+ 如果不读书,
53
+ 16
54
+ 00:00:43,520 --> 00:00:44,400
55
+ 行万里路,
56
+ 17
57
+ 00:00:44,540 --> 00:00:45,695
58
+ 也只是个邮差。
59
+ 18
60
+ 00:00:46,900 --> 00:00:47,320
61
+ 北京、
62
+ 19
63
+ 00:00:47,500 --> 00:00:47,980
64
+ 西安、
65
+ 20
66
+ 00:00:48,320 --> 00:00:51,200
67
+ 南京和洛阳少了学识的浸润,
68
+ 21
69
+ 00:00:51,600 --> 00:00:55,565
70
+ 他们只是一个个耳中熟悉又眼里陌生的地名。
71
+ 22
72
+ 00:00:56,560 --> 00:00:59,360
73
+ 故宫避暑山庄岱庙、
74
+ 23
75
+ 00:00:59,840 --> 00:01:02,920
76
+ 曲阜三孔有了文化照耀,
77
+ 24
78
+ 00:01:03,120 --> 00:01:05,340
79
+ 他们才不是被时间风化的标本。
80
+ 25
81
+ 00:01:05,820 --> 00:01:08,105
82
+ 而是活了成百上千年的生命,
83
+ 26
84
+ 00:01:09,650 --> 00:01:10,370
85
+ 不去读书,
86
+ 27
87
+ 00:01:10,670 --> 00:01:12,920
88
+ 就是一个邮差风景,
89
+ 28
90
+ 00:01:13,0 --> 00:01:13,835
91
+ 过眼就忘,
92
+ 29
93
+ 00:01:14,750 --> 00:01:17,365
94
+ 就算踏破铁鞋又有什么用处呢?
95
+ 30
96
+ 00:01:19,240 --> 00:01:22,380
97
+ 阅读不仅仅会让现实的旅行更加丰富,
98
+ 31
99
+ 00:01:23,120 --> 00:01:27,260
100
+ 更重要的是能让精神突破现实和身体的桎梏,
101
+ 32
102
+ 00:01:27,640 --> 00:01:29,985
103
+ 来一场灵魂长足的旅行。
104
+ 33
105
+ 00:01:31,850 --> 00:01:32,930
106
+ 听过这样一句话,
107
+ 34
108
+ 00:01:33,490 --> 00:01:35,190
109
+ 没有一艘非凡的船舰,
110
+ 35
111
+ 00:01:35,330 --> 00:01:36,430
112
+ 能像一册书籍,
113
+ 36
114
+ 00:01:36,690 --> 00:01:38,595
115
+ 把我们带到浩瀚的天地,
116
+ 37
117
+ 00:01:39,830 --> 00:01:42,685
118
+ 你无法到达的地方文字在你过去,
119
+ 38
120
+ 00:01:43,530 --> 00:01:45,750
121
+ 你无法经历的人生舒淇,
122
+ 39
123
+ 00:01:45,770 --> 00:01:46,595
124
+ 带你相遇。
125
+ 40
126
+ 00:01:47,640 --> 00:01:50,340
127
+ 那些读过的书会一本本充实,
128
+ 41
129
+ 00:01:50,340 --> 00:01:50,940
130
+ 你的内心,
131
+ 42
132
+ 00:01:51,640 --> 00:01:54,855
133
+ 让虚无单调的世界变得五彩斑斓。
134
+ 43
135
+ 00:01:55,930 --> 00:01:59,690
136
+ 那些书中的人物会在你深陷生活泥潭之时,
137
+ 44
138
+ 00:02:00,170 --> 00:02:01,190
139
+ 轻声的呼唤,
140
+ 45
141
+ 00:02:01,950 --> 00:02:03,270
142
+ 用他们心怀梦想、
143
+ 46
144
+ 00:02:03,630 --> 00:02:04,950
145
+ 不卑不亢的故事,
146
+ 47
147
+ 00:02:05,310 --> 00:02:07,90
148
+ 激励你抵御苦难,
149
+ 48
150
+ 00:02:07,430 --> 00:02:08,525
151
+ 勇往直前。
152
+ 49
153
+ 00:02:11,290 --> 00:02:11,695
154
+ 二、
155
+ 50
156
+ 00:02:12,440 --> 00:02:16,900
157
+ 读书的意义是使人虚心叫通达不固执、
158
+ 51
159
+ 00:02:17,200 --> 00:02:18,35
160
+ 不偏执。
161
+ 52
162
+ 00:02:20,290 --> 00:02:22,935
163
+ 读书越少的人越容易过得痛苦。
164
+ 53
165
+ 00:02:23,600 --> 00:02:24,400
166
+ 读书越多,
167
+ 54
168
+ 00:02:24,800 --> 00:02:26,185
169
+ 人才会越通透,
170
+ 55
171
+ 00:02:27,890 --> 00:02:30,30
172
+ 知乎上有位网友讲过自己的故事。
173
+ 56
174
+ 00:02:30,750 --> 00:02:31,310
175
+ 有一次,
176
+ 57
177
+ 00:02:31,530 --> 00:02:32,650
178
+ 他跟伴侣吵架,
179
+ 58
180
+ 00:02:33,190 --> 00:02:35,505
181
+ 气得连续好几个晚上没睡好,
182
+ 59
183
+ 00:02:36,360 --> 00:02:38,880
184
+ 直到他读到一本关于亲密关系的书。
185
+ 60
186
+ 00:02:39,500 --> 00:02:41,920
187
+ 书中有段关于夫妻关系的解读,
188
+ 61
189
+ 00:02:42,80 --> 00:02:43,100
190
+ 让他豁然开朗,
191
+ 62
192
+ 00:02:43,460 --> 00:02:47,170
193
+ 突然想明白了很多事气消了,
194
+ 63
195
+ 00:02:47,430 --> 00:02:48,410
196
+ 心情好了,
197
+ 64
198
+ 00:02:48,790 --> 00:02:50,194
199
+ 整个人也舒爽了。
200
+ 65
201
+ 00:02:51,780 --> 00:02:54,340
202
+ 一个人书读的不多见识,
203
+ 66
204
+ 00:02:54,380 --> 00:02:55,180
205
+ 难免受限,
206
+ 67
207
+ 00:02:55,720 --> 00:02:58,495
208
+ 结果就必须受着眼前世界的禁锢,
209
+ 68
210
+ 00:02:59,540 --> 00:03:00,740
211
+ 稍微遇到一点不顺,
212
+ 69
213
+ 00:03:00,940 --> 00:03:02,460
214
+ 就极易消极悲观,
215
+ 70
216
+ 00:03:02,900 --> 00:03:03,720
217
+ 郁郁寡欢,
218
+ 71
219
+ 00:03:04,140 --> 00:03:05,765
220
+ 让自己困在情绪里,
221
+ 72
222
+ 00:03:06,900 --> 00:03:09,760
223
+ 只有通过阅读才能看透人生真相,
224
+ 73
225
+ 00:03:10,300 --> 00:03:12,140
226
+ 收获为人处事的智慧,
227
+ 74
228
+ 00:03:12,480 --> 00:03:14,95
229
+ 把日子越过越好。
230
+ 75
231
+ 00:03:16,730 --> 00:03:17,890
232
+ 生活的艺术里说,
233
+ 76
234
+ 00:03:18,410 --> 00:03:20,30
235
+ 人一定要时时读书,
236
+ 77
237
+ 00:03:20,430 --> 00:03:22,915
238
+ 不然便会鄙令晚腐。
239
+ 78
240
+ 00:03:23,690 --> 00:03:28,730
241
+ 完剑俗剑生满身上一个人的落伍迂腐,
242
+ 79
243
+ 00:03:29,210 --> 00:03:31,205
244
+ 就是不肯实施读书所致。
245
+ 80
246
+ 00:03:33,10 --> 00:03:34,790
247
+ 只有在不断阅读的过程中,
248
+ 81
249
+ 00:03:34,990 --> 00:03:35,970
250
+ 修心养性,
251
+ 82
252
+ 00:03:36,430 --> 00:03:38,735
253
+ 才能摆脱我们的鄙俗和顽固。
254
+ 83
255
+ 00:03:39,920 --> 00:03:41,720
256
+ 这世间没有谁的生活,
257
+ 84
258
+ 00:03:41,800 --> 00:03:42,540
259
+ 没有烦恼,
260
+ 85
261
+ 00:03:43,140 --> 00:03:45,455
262
+ 唯读书是最好的解药。
263
+ 86
264
+ 00:03:47,730 --> 00:03:48,185
265
+ 三、
266
+ 87
267
+ 00:03:49,40 --> 00:03:50,720
268
+ 书中未必有黄金屋,
269
+ 88
270
+ 00:03:51,0 --> 00:03:52,595
271
+ 但一定有更好的自己。
272
+ """
funclip/llm/g4f_openai_api.py ADDED
@@ -0,0 +1,30 @@
1
+ from g4f.client import Client
2
+
3
+ if __name__ == '__main__':
4
+ from llm.demo_prompt import demo_prompt
5
+ client = Client()
6
+ response = client.chat.completions.create(
7
+ model="gpt-3.5-turbo",
8
+ messages=[{"role": "user", "content": "你好你的名字是什么"}],
9
+ )
10
+ print(response.choices[0].message.content)
11
+
12
+
13
+ def g4f_openai_call(model="gpt-3.5-turbo",
14
+ user_content="如何做西红柿炖牛腩?",
15
+ system_content=None):
16
+ client = Client()
17
+ if system_content is not None and len(system_content.strip()):
18
+ messages = [
19
+ {'role': 'system', 'content': system_content},
20
+ {'role': 'user', 'content': user_content}
21
+ ]
22
+ else:
23
+ messages = [
24
+ {'role': 'user', 'content': user_content}
25
+ ]
26
+ response = client.chat.completions.create(
27
+ model=model,
28
+ messages=messages,
29
+ )
30
+ return(response.choices[0].message.content)
funclip/llm/openai_api.py ADDED
@@ -0,0 +1,48 @@
1
+ import os
2
+ import logging
3
+ from openai import OpenAI
4
+
5
+
6
+ if __name__ == '__main__':
7
+ from llm.demo_prompt import demo_prompt
8
+ client = OpenAI(
9
+ # This is the default and can be omitted
10
+ api_key=os.environ.get("OPENAI_API_KEY"),
11
+ )
12
+
13
+ chat_completion = client.chat.completions.create(
14
+ messages=[
15
+ {
16
+ "role": "user",
17
+ "content": demo_prompt,
18
+ }
19
+ ],
20
+ model="gpt-3.5-turbo-0125",
21
+ )
22
+ print(chat_completion.choices[0].message.content)
23
+
24
+
25
+ def openai_call(apikey,
26
+ model="gpt-3.5-turbo",
27
+ user_content="如何做西红柿炖牛腩?",
28
+ system_content=None):
29
+ client = OpenAI(
30
+ # use the API key supplied by the caller
31
+ api_key=apikey,
32
+ )
33
+ if system_content is not None and len(system_content.strip()):
34
+ messages = [
35
+ {'role': 'system', 'content': system_content},
36
+ {'role': 'user', 'content': user_content}
37
+ ]
38
+ else:
39
+ messages = [
40
+ {'role': 'user', 'content': user_content}
41
+ ]
42
+
43
+ chat_completion = client.chat.completions.create(
44
+ messages=messages,
45
+ model=model,
46
+ )
47
+ logging.info("Openai model inference done.")
48
+ return chat_completion.choices[0].message.content
funclip/llm/qwen_api.py ADDED
@@ -0,0 +1,30 @@
1
+ import dashscope
2
+ from dashscope import Generation
3
+
4
+
5
+ def call_qwen_model(key=None,
6
+ model="qwen-plus",
7
+ user_content="如何做西红柿炖牛腩?",
8
+ system_content=None):
9
+ dashscope.api_key = key
10
+ if system_content is not None and len(system_content.strip()):
11
+ messages = [
12
+ {'role': 'system', 'content': system_content},
13
+ {'role': 'user', 'content': user_content}
14
+ ]
15
+ else:
16
+ messages = [
17
+ {'role': 'user', 'content': user_content}
18
+ ]
19
+ responses = Generation.call(model,
20
+ messages=messages,
21
+ result_format='message', # return the response in 'message' format
22
+ stream=False, # disable streaming output
23
+ incremental_output=False # no incremental output (only relevant when streaming)
24
+ )
25
+ print(responses)
26
+ return responses['output']['choices'][0]['message']['content']
27
+
28
+
29
+ if __name__ == '__main__':
30
+ call_qwen_model('YOUR_BAILIAN_APIKEY')
funclip/test/imagemagick_test.py ADDED
@@ -0,0 +1,18 @@
1
+ from moviepy.editor import *
2
+ from moviepy.video.tools.subtitles import SubtitlesClip, TextClip
3
+ from moviepy.editor import VideoFileClip, concatenate_videoclips
4
+ from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
5
+
6
+ generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=48, color='white')
7
+ subs = [((0, 2), 'sub1中文字幕'),
8
+ ((2, 4), 'subs2'),
9
+ ((4, 6), 'subs3'),
10
+ ((6, 8), 'subs4')]
11
+
12
+ subtitles = SubtitlesClip(subs, generator)
13
+
14
+ video = VideoFileClip("examples/2022云栖大会_片段.mp4")
15
+ video = video.subclip(0, 8)
16
+ video = CompositeVideoClip([video, subtitles.set_pos(('center','bottom'))])
17
+
18
+ video.write_videofile("test_output.mp4")
funclip/test/test.sh ADDED
@@ -0,0 +1,15 @@
1
+ # step1: Recognize
2
+ python videoclipper.py --stage 1 \
3
+ --file ../examples/2022云栖大会_片段.mp4 \
4
+ --sd_switch yes \
5
+ --output_dir ./output
6
+ # now you can find recognition results and entire SRT file in ./output/
7
+ # step2: Clip
8
+ python videoclipper.py --stage 2 \
9
+ --file ../examples/2022云栖大会_片段.mp4 \
10
+ --output_dir ./output \
11
+ --dest_text '所以这个是我们办这个奖的初心啊,我们也会一届一届的办下去' \
+ --start_ost 0 \
+ --end_ost 100 \
+ --output_file './output/res.mp4'
+ # to clip by speaker instead, replace --dest_text with: --dest_spk spk0
funclip/utils/argparse_tools.py ADDED
@@ -0,0 +1,88 @@
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ import argparse
7
+ from pathlib import Path
8
+
9
+ import yaml
10
+ import sys
11
+
12
+
13
+ class ArgumentParser(argparse.ArgumentParser):
14
+ """Simple implementation of ArgumentParser supporting config file
15
+
16
+ This class originated from https://github.com/bw2/ConfigArgParse,
17
+ but it differs from that project in several ways:
18
+
19
+ - Not supporting multiple config files
20
+ - Automatically adding "--config" as an option.
21
+ - Not supporting any formats other than yaml
22
+ - Not checking argument type
23
+
24
+ """
25
+
26
+ def __init__(self, *args, **kwargs):
27
+ super().__init__(*args, **kwargs)
28
+ self.add_argument("--config", help="Give config file in yaml format")
29
+
30
+ def parse_known_args(self, args=None, namespace=None):
31
+ # Once parsing for setting from "--config"
32
+ _args, _ = super().parse_known_args(args, namespace)
33
+ if _args.config is not None:
34
+ if not Path(_args.config).exists():
35
+ self.error(f"No such file: {_args.config}")
36
+
37
+ with open(_args.config, "r", encoding="utf-8") as f:
38
+ d = yaml.safe_load(f)
39
+ if not isinstance(d, dict):
40
+ self.error("Config file has non dict value: {_args.config}")
41
+
42
+ for key in d:
43
+ for action in self._actions:
44
+ if key == action.dest:
45
+ break
46
+ else:
47
+ self.error(f"unrecognized arguments: {key} (from {_args.config})")
48
+
49
+ # NOTE(kamo): Ignore "--config" from a config file
50
+ # NOTE(kamo): Unlike "configargparse", this module doesn't check type.
51
+ # i.e. We can set any type value regardless of argument type.
52
+ self.set_defaults(**d)
53
+ return super().parse_known_args(args, namespace)
54
+
55
+
56
+ def get_commandline_args():
57
+ extra_chars = [
58
+ " ",
59
+ ";",
60
+ "&",
61
+ "(",
62
+ ")",
63
+ "|",
64
+ "^",
65
+ "<",
66
+ ">",
67
+ "?",
68
+ "*",
69
+ "[",
70
+ "]",
71
+ "$",
72
+ "`",
73
+ '"',
74
+ "\\",
75
+ "!",
76
+ "{",
77
+ "}",
78
+ ]
79
+
80
+ # Escape the extra characters for shell
81
+ argv = [
82
+ arg.replace("'", "'\\''")
83
+ if all(char not in arg for char in extra_chars)
84
+ else "'" + arg.replace("'", "'\\''") + "'"
85
+ for arg in sys.argv
86
+ ]
87
+
88
+ return sys.executable + " " + " ".join(argv)
funclip/utils/subtitle_utils.py ADDED
@@ -0,0 +1,130 @@
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+ import re
6
+
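+ # convert a millisecond offset into an SRT-style "HH:MM:SS,ms" timestamp (the millisecond part is not zero-padded)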
7
+ def time_convert(ms):
8
+ ms = int(ms)
9
+ tail = ms % 1000
10
+ s = ms // 1000
11
+ mi = s // 60
12
+ s = s % 60
13
+ h = mi // 60
14
+ mi = mi % 60
15
+ h = "00" if h == 0 else str(h)
16
+ mi = "00" if mi == 0 else str(mi)
17
+ s = "00" if s == 0 else str(s)
18
+ tail = str(tail)
19
+ if len(h) == 1: h = '0' + h
20
+ if len(mi) == 1: mi = '0' + mi
21
+ if len(s) == 1: s = '0' + s
22
+ return "{}:{}:{},{}".format(h, mi, s, tail)
23
+
24
+ def str2list(text):
25
+ pattern = re.compile(r'[\u4e00-\u9fff]|[\w-]+', re.UNICODE)
26
+ elements = pattern.findall(text)
27
+ return elements
28
+
29
+ class Text2SRT():
30
+ def __init__(self, text, timestamp, offset=0):
31
+ self.token_list = text
32
+ self.timestamp = timestamp
33
+ start, end = timestamp[0][0] - offset, timestamp[-1][1] - offset
34
+ self.start_sec, self.end_sec = start, end
35
+ self.start_time = time_convert(start)
36
+ self.end_time = time_convert(end)
37
+ def text(self):
38
+ if isinstance(self.token_list, str):
39
+ return self.token_list
40
+ else:
41
+ res = ""
42
+ for word in self.token_list:
43
+ if '\u4e00' <= word <= '\u9fff':
44
+ res += word
45
+ else:
46
+ res += " " + word
47
+ return res.lstrip()
48
+ def srt(self, acc_ost=0.0):
49
+ return "{} --> {}\n{}\n".format(
50
+ time_convert(self.start_sec+acc_ost*1000),
51
+ time_convert(self.end_sec+acc_ost*1000),
52
+ self.text())
53
+ def time(self, acc_ost=0.0):
54
+ return (self.start_sec/1000+acc_ost, self.end_sec/1000+acc_ost)
55
+
56
+
57
+ def generate_srt(sentence_list):
58
+ srt_total = ''
59
+ for i, sent in enumerate(sentence_list):
60
+ t2s = Text2SRT(sent['text'], sent['timestamp'])
61
+ if 'spk' in sent:
62
+ srt_total += "{} spk{}\n{}".format(i, sent['spk'], t2s.srt())
63
+ else:
64
+ srt_total += "{}\n{}".format(i, t2s.srt())
65
+ return srt_total
66
+
67
+ def generate_srt_clip(sentence_list, start, end, begin_index=0, time_acc_ost=0.0):
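+ # Build SRT entries for the sentences overlapping [start, end] (in seconds), re-based to the clip start.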
68
+ start, end = int(start * 1000), int(end * 1000)
69
+ srt_total = ''
70
+ cc = 1 + begin_index
71
+ subs = []
72
+ for _, sent in enumerate(sentence_list):
73
+ if isinstance(sent['text'], str):
74
+ sent['text'] = str2list(sent['text'])
75
+ if sent['timestamp'][-1][1] <= start:
76
+ # print("CASE0")
77
+ continue
78
+ if sent['timestamp'][0][0] >= end:
79
+ # print("CASE4")
80
+ break
81
+ # parts in between
82
+ if (sent['timestamp'][-1][1] <= end and sent['timestamp'][0][0] > start) or (sent['timestamp'][-1][1] == end and sent['timestamp'][0][0] == start):
83
+ # print("CASE1"); import pdb; pdb.set_trace()
84
+ t2s = Text2SRT(sent['text'], sent['timestamp'], offset=start)
85
+ srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
86
+ subs.append((t2s.time(time_acc_ost), t2s.text()))
87
+ cc += 1
88
+ continue
89
+ if sent['timestamp'][0][0] <= start:
90
+ # print("CASE2"); import pdb; pdb.set_trace()
91
+ if not sent['timestamp'][-1][1] > end:
92
+ for j, ts in enumerate(sent['timestamp']):
93
+ if ts[1] > start:
94
+ break
95
+ _text = sent['text'][j:]
96
+ _ts = sent['timestamp'][j:]
97
+ else:
98
+ for j, ts in enumerate(sent['timestamp']):
99
+ if ts[1] > start:
100
+ _start = j
101
+ break
102
+ for j, ts in enumerate(sent['timestamp']):
103
+ if ts[1] > end:
104
+ _end = j
105
+ break
106
+ # _text = " ".join(sent['text'][_start:_end])
107
+ _text = sent['text'][_start:_end]
108
+ _ts = sent['timestamp'][_start:_end]
109
+ if len(_ts):
110
+ t2s = Text2SRT(_text, _ts, offset=start)
111
+ srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
112
+ subs.append((t2s.time(time_acc_ost), t2s.text()))
113
+ cc += 1
114
+ continue
115
+ if sent['timestamp'][-1][1] > end:
116
+ # print("CASE3"); import pdb; pdb.set_trace()
117
+ for j, ts in enumerate(sent['timestamp']):
118
+ if ts[1] > end:
119
+ break
120
+ _text = sent['text'][:j]
121
+ _ts = sent['timestamp'][:j]
122
+ if len(_ts):
123
+ t2s = Text2SRT(_text, _ts, offset=start)
124
+ srt_total += "{}\n{}".format(cc, t2s.srt(time_acc_ost))
125
+ subs.append(
126
+ (t2s.time(time_acc_ost), t2s.text())
127
+ )
128
+ cc += 1
129
+ continue
130
+ return srt_total, subs, cc
funclip/utils/theme.json ADDED
@@ -0,0 +1,333 @@
1
+ {
2
+ "theme": {
3
+ "_font": [
4
+ {
5
+ "__gradio_font__": true,
6
+ "name": "Montserrat",
7
+ "class": "google"
8
+ },
9
+ {
10
+ "__gradio_font__": true,
11
+ "name": "ui-sans-serif",
12
+ "class": "font"
13
+ },
14
+ {
15
+ "__gradio_font__": true,
16
+ "name": "system-ui",
17
+ "class": "font"
18
+ },
19
+ {
20
+ "__gradio_font__": true,
21
+ "name": "sans-serif",
22
+ "class": "font"
23
+ }
24
+ ],
25
+ "_font_mono": [
26
+ {
27
+ "__gradio_font__": true,
28
+ "name": "IBM Plex Mono",
29
+ "class": "google"
30
+ },
31
+ {
32
+ "__gradio_font__": true,
33
+ "name": "ui-monospace",
34
+ "class": "font"
35
+ },
36
+ {
37
+ "__gradio_font__": true,
38
+ "name": "Consolas",
39
+ "class": "font"
40
+ },
41
+ {
42
+ "__gradio_font__": true,
43
+ "name": "monospace",
44
+ "class": "font"
45
+ }
46
+ ],
47
+ "background_fill_primary": "*neutral_50",
48
+ "background_fill_primary_dark": "*neutral_950",
49
+ "background_fill_secondary": "*neutral_50",
50
+ "background_fill_secondary_dark": "*neutral_900",
51
+ "block_background_fill": "white",
52
+ "block_background_fill_dark": "*neutral_800",
53
+ "block_border_color": "*border_color_primary",
54
+ "block_border_color_dark": "*border_color_primary",
55
+ "block_border_width": "0px",
56
+ "block_border_width_dark": "0px",
57
+ "block_info_text_color": "*body_text_color_subdued",
58
+ "block_info_text_color_dark": "*body_text_color_subdued",
59
+ "block_info_text_size": "*text_sm",
60
+ "block_info_text_weight": "400",
61
+ "block_label_background_fill": "*primary_100",
62
+ "block_label_background_fill_dark": "*primary_600",
63
+ "block_label_border_color": "*border_color_primary",
64
+ "block_label_border_color_dark": "*border_color_primary",
65
+ "block_label_border_width": "1px",
66
+ "block_label_border_width_dark": "1px",
67
+ "block_label_margin": "*spacing_md",
68
+ "block_label_padding": "*spacing_sm *spacing_md",
69
+ "block_label_radius": "*radius_md",
70
+ "block_label_right_radius": "0 calc(*radius_lg - 1px) 0 calc(*radius_lg - 1px)",
71
+ "block_label_text_color": "*primary_500",
72
+ "block_label_text_color_dark": "*white",
73
+ "block_label_text_size": "*text_md",
74
+ "block_label_text_weight": "600",
75
+ "block_padding": "*spacing_xl calc(*spacing_xl + 2px)",
76
+ "block_radius": "*radius_lg",
77
+ "block_shadow": "none",
78
+ "block_shadow_dark": "none",
79
+ "block_title_background_fill": "*block_label_background_fill",
80
+ "block_title_background_fill_dark": "*block_label_background_fill",
81
+ "block_title_border_color": "none",
82
+ "block_title_border_color_dark": "none",
83
+ "block_title_border_width": "0px",
84
+ "block_title_border_width_dark": "0px",
85
+ "block_title_padding": "*block_label_padding",
86
+ "block_title_radius": "*block_label_radius",
87
+ "block_title_text_color": "*primary_500",
88
+ "block_title_text_color_dark": "*white",
89
+ "block_title_text_size": "*text_md",
90
+ "block_title_text_weight": "600",
91
+ "body_background_fill": "*background_fill_primary",
92
+ "body_background_fill_dark": "*background_fill_primary",
93
+ "body_text_color": "*neutral_800",
94
+ "body_text_color_dark": "*neutral_100",
95
+ "body_text_color_subdued": "*neutral_400",
96
+ "body_text_color_subdued_dark": "*neutral_400",
97
+ "body_text_size": "*text_md",
98
+ "body_text_weight": "400",
99
+ "border_color_accent": "*primary_300",
100
+ "border_color_accent_dark": "*neutral_600",
101
+ "border_color_primary": "*neutral_200",
102
+ "border_color_primary_dark": "*neutral_700",
103
+ "button_border_width": "*input_border_width",
104
+ "button_border_width_dark": "*input_border_width",
105
+ "button_cancel_background_fill": "*button_secondary_background_fill",
106
+ "button_cancel_background_fill_dark": "*button_secondary_background_fill",
107
+ "button_cancel_background_fill_hover": "*button_secondary_background_fill_hover",
108
+ "button_cancel_background_fill_hover_dark": "*button_secondary_background_fill_hover",
109
+ "button_cancel_border_color": "*button_secondary_border_color",
110
+ "button_cancel_border_color_dark": "*button_secondary_border_color",
111
+ "button_cancel_border_color_hover": "*button_cancel_border_color",
112
+ "button_cancel_border_color_hover_dark": "*button_cancel_border_color",
113
+ "button_cancel_text_color": "*button_secondary_text_color",
114
+ "button_cancel_text_color_dark": "*button_secondary_text_color",
115
+ "button_cancel_text_color_hover": "*button_cancel_text_color",
116
+ "button_cancel_text_color_hover_dark": "*button_cancel_text_color",
117
+ "button_large_padding": "*spacing_lg calc(2 * *spacing_lg)",
118
+ "button_large_radius": "*radius_lg",
119
+ "button_large_text_size": "*text_lg",
120
+ "button_large_text_weight": "600",
121
+ "button_primary_background_fill": "*primary_500",
122
+ "button_primary_background_fill_dark": "*primary_700",
123
+ "button_primary_background_fill_hover": "*primary_400",
124
+ "button_primary_background_fill_hover_dark": "*primary_500",
125
+ "button_primary_border_color": "*primary_200",
126
+ "button_primary_border_color_dark": "*primary_600",
127
+ "button_primary_border_color_hover": "*button_primary_border_color",
128
+ "button_primary_border_color_hover_dark": "*button_primary_border_color",
129
+ "button_primary_text_color": "white",
130
+ "button_primary_text_color_dark": "white",
131
+ "button_primary_text_color_hover": "*button_primary_text_color",
132
+ "button_primary_text_color_hover_dark": "*button_primary_text_color",
133
+ "button_secondary_background_fill": "white",
134
+ "button_secondary_background_fill_dark": "*neutral_600",
135
+ "button_secondary_background_fill_hover": "*neutral_100",
136
+ "button_secondary_background_fill_hover_dark": "*primary_500",
137
+ "button_secondary_border_color": "*neutral_200",
138
+ "button_secondary_border_color_dark": "*neutral_600",
139
+ "button_secondary_border_color_hover": "*button_secondary_border_color",
140
+ "button_secondary_border_color_hover_dark": "*button_secondary_border_color",
141
+ "button_secondary_text_color": "*neutral_800",
142
+ "button_secondary_text_color_dark": "white",
143
+ "button_secondary_text_color_hover": "*button_secondary_text_color",
144
+ "button_secondary_text_color_hover_dark": "*button_secondary_text_color",
145
+ "button_shadow": "*shadow_drop_lg",
146
+ "button_shadow_active": "*shadow_inset",
147
+ "button_shadow_hover": "*shadow_drop_lg",
148
+ "button_small_padding": "*spacing_sm calc(2 * *spacing_sm)",
149
+ "button_small_radius": "*radius_lg",
150
+ "button_small_text_size": "*text_md",
151
+ "button_small_text_weight": "400",
152
+ "button_transition": "background-color 0.2s ease",
153
+ "checkbox_background_color": "*background_fill_primary",
154
+ "checkbox_background_color_dark": "*neutral_800",
155
+ "checkbox_background_color_focus": "*checkbox_background_color",
156
+ "checkbox_background_color_focus_dark": "*checkbox_background_color",
157
+ "checkbox_background_color_hover": "*checkbox_background_color",
158
+ "checkbox_background_color_hover_dark": "*checkbox_background_color",
159
+ "checkbox_background_color_selected": "*primary_600",
160
+ "checkbox_background_color_selected_dark": "*primary_700",
161
+ "checkbox_border_color": "*neutral_100",
162
+ "checkbox_border_color_dark": "*neutral_600",
163
+ "checkbox_border_color_focus": "*primary_500",
164
+ "checkbox_border_color_focus_dark": "*primary_600",
165
+ "checkbox_border_color_hover": "*neutral_300",
166
+ "checkbox_border_color_hover_dark": "*neutral_600",
167
+ "checkbox_border_color_selected": "*primary_600",
168
+ "checkbox_border_color_selected_dark": "*primary_700",
169
+ "checkbox_border_radius": "*radius_sm",
170
+ "checkbox_border_width": "1px",
171
+ "checkbox_border_width_dark": "*input_border_width",
172
+ "checkbox_check": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3cpath d='M12.207 4.793a1 1 0 010 1.414l-5 5a1 1 0 01-1.414 0l-2-2a1 1 0 011.414-1.414L6.5 9.086l4.293-4.293a1 1 0 011.414 0z'/%3e%3c/svg%3e\")",
173
+ "checkbox_label_background_fill": "*button_secondary_background_fill",
174
+ "checkbox_label_background_fill_dark": "*button_secondary_background_fill",
175
+ "checkbox_label_background_fill_hover": "*button_secondary_background_fill_hover",
176
+ "checkbox_label_background_fill_hover_dark": "*button_secondary_background_fill_hover",
177
+ "checkbox_label_background_fill_selected": "*primary_500",
178
+ "checkbox_label_background_fill_selected_dark": "*primary_600",
179
+ "checkbox_label_border_color": "*border_color_primary",
180
+ "checkbox_label_border_color_dark": "*border_color_primary",
181
+ "checkbox_label_border_color_hover": "*checkbox_label_border_color",
182
+ "checkbox_label_border_color_hover_dark": "*checkbox_label_border_color",
183
+ "checkbox_label_border_width": "*input_border_width",
184
+ "checkbox_label_border_width_dark": "*input_border_width",
185
+ "checkbox_label_gap": "*spacing_lg",
186
+ "checkbox_label_padding": "*spacing_md calc(2 * *spacing_md)",
187
+ "checkbox_label_shadow": "*shadow_drop_lg",
188
+ "checkbox_label_text_color": "*body_text_color",
189
+ "checkbox_label_text_color_dark": "*body_text_color",
190
+ "checkbox_label_text_color_selected": "white",
191
+ "checkbox_label_text_color_selected_dark": "*checkbox_label_text_color",
192
+ "checkbox_label_text_size": "*text_md",
193
+ "checkbox_label_text_weight": "400",
194
+ "checkbox_shadow": "none",
195
+ "color_accent": "*primary_500",
196
+ "color_accent_soft": "*primary_50",
197
+ "color_accent_soft_dark": "*neutral_700",
198
+ "container_radius": "*radius_lg",
199
+ "embed_radius": "*radius_lg",
200
+ "error_background_fill": "#fee2e2",
201
+ "error_background_fill_dark": "*background_fill_primary",
202
+ "error_border_color": "#fecaca",
203
+ "error_border_color_dark": "*border_color_primary",
204
+ "error_border_width": "1px",
205
+ "error_border_width_dark": "1px",
206
+ "error_text_color": "#ef4444",
207
+ "error_text_color_dark": "#ef4444",
208
+ "font": "'Montserrat', 'ui-sans-serif', 'system-ui', sans-serif",
209
+ "font_mono": "'IBM Plex Mono', 'ui-monospace', 'Consolas', monospace",
210
+ "form_gap_width": "0px",
211
+ "input_background_fill": "white",
212
+ "input_background_fill_dark": "*neutral_700",
213
+ "input_background_fill_focus": "*secondary_500",
214
+ "input_background_fill_focus_dark": "*secondary_600",
215
+ "input_background_fill_hover": "*input_background_fill",
216
+ "input_background_fill_hover_dark": "*input_background_fill",
217
+ "input_border_color": "*neutral_50",
218
+ "input_border_color_dark": "*border_color_primary",
219
+ "input_border_color_focus": "*secondary_300",
220
+ "input_border_color_focus_dark": "*neutral_700",
221
+ "input_border_color_hover": "*input_border_color",
222
+ "input_border_color_hover_dark": "*input_border_color",
223
+ "input_border_width": "0px",
224
+ "input_border_width_dark": "0px",
225
+ "input_padding": "*spacing_xl",
226
+ "input_placeholder_color": "*neutral_400",
227
+ "input_placeholder_color_dark": "*neutral_500",
228
+ "input_radius": "*radius_lg",
229
+ "input_shadow": "*shadow_drop",
230
+ "input_shadow_dark": "*shadow_drop",
231
+ "input_shadow_focus": "*shadow_drop_lg",
232
+ "input_shadow_focus_dark": "*shadow_drop_lg",
233
+ "input_text_size": "*text_md",
234
+ "input_text_weight": "400",
235
+ "layout_gap": "*spacing_xxl",
236
+ "link_text_color": "*secondary_600",
237
+ "link_text_color_active": "*secondary_600",
238
+ "link_text_color_active_dark": "*secondary_500",
239
+ "link_text_color_dark": "*secondary_500",
240
+ "link_text_color_hover": "*secondary_700",
241
+ "link_text_color_hover_dark": "*secondary_400",
242
+ "link_text_color_visited": "*secondary_500",
243
+ "link_text_color_visited_dark": "*secondary_600",
244
+ "loader_color": "*color_accent",
245
+ "loader_color_dark": "*color_accent",
246
+ "name": "base",
247
+ "neutral_100": "#f3f4f6",
248
+ "neutral_200": "#e5e7eb",
249
+ "neutral_300": "#d1d5db",
250
+ "neutral_400": "#9ca3af",
251
+ "neutral_50": "#f9fafb",
252
+ "neutral_500": "#6b7280",
253
+ "neutral_600": "#4b5563",
254
+ "neutral_700": "#374151",
255
+ "neutral_800": "#1f2937",
256
+ "neutral_900": "#111827",
257
+ "neutral_950": "#0b0f19",
258
+ "panel_background_fill": "*background_fill_secondary",
259
+ "panel_background_fill_dark": "*background_fill_secondary",
260
+ "panel_border_color": "*border_color_primary",
261
+ "panel_border_color_dark": "*border_color_primary",
262
+ "panel_border_width": "1px",
263
+ "panel_border_width_dark": "1px",
264
+ "primary_100": "#e0e7ff",
265
+ "primary_200": "#c7d2fe",
266
+ "primary_300": "#a5b4fc",
267
+ "primary_400": "#818cf8",
268
+ "primary_50": "#eef2ff",
269
+ "primary_500": "#6366f1",
270
+ "primary_600": "#4f46e5",
271
+ "primary_700": "#4338ca",
272
+ "primary_800": "#3730a3",
273
+ "primary_900": "#312e81",
274
+ "primary_950": "#2b2c5e",
275
+ "prose_header_text_weight": "600",
276
+ "prose_text_size": "*text_md",
277
+ "prose_text_weight": "400",
278
+ "radio_circle": "url(\"data:image/svg+xml,%3csvg viewBox='0 0 16 16' fill='white' xmlns='http://www.w3.org/2000/svg'%3e%3ccircle cx='8' cy='8' r='3'/%3e%3c/svg%3e\")",
279
+ "radius_lg": "6px",
280
+ "radius_md": "4px",
281
+ "radius_sm": "2px",
282
+ "radius_xl": "8px",
283
+ "radius_xs": "1px",
284
+ "radius_xxl": "12px",
285
+ "radius_xxs": "1px",
286
+ "secondary_100": "#ecfccb",
287
+ "secondary_200": "#d9f99d",
288
+ "secondary_300": "#bef264",
289
+ "secondary_400": "#a3e635",
290
+ "secondary_50": "#f7fee7",
291
+ "secondary_500": "#84cc16",
292
+ "secondary_600": "#65a30d",
293
+ "secondary_700": "#4d7c0f",
294
+ "secondary_800": "#3f6212",
295
+ "secondary_900": "#365314",
296
+ "secondary_950": "#2f4e14",
297
+ "section_header_text_size": "*text_md",
298
+ "section_header_text_weight": "400",
299
+ "shadow_drop": "0 1px 4px 0 rgb(0 0 0 / 0.1)",
300
+ "shadow_drop_lg": "0 2px 5px 0 rgb(0 0 0 / 0.1)",
301
+ "shadow_inset": "rgba(0,0,0,0.05) 0px 2px 4px 0px inset",
302
+ "shadow_spread": "6px",
303
+ "shadow_spread_dark": "1px",
304
+ "slider_color": "*primary_500",
305
+ "slider_color_dark": "*primary_600",
306
+ "spacing_lg": "6px",
307
+ "spacing_md": "4px",
308
+ "spacing_sm": "2px",
309
+ "spacing_xl": "9px",
310
+ "spacing_xs": "1px",
311
+ "spacing_xxl": "12px",
312
+ "spacing_xxs": "1px",
313
+ "stat_background_fill": "*primary_300",
314
+ "stat_background_fill_dark": "*primary_500",
315
+ "table_border_color": "*neutral_300",
316
+ "table_border_color_dark": "*neutral_700",
317
+ "table_even_background_fill": "white",
318
+ "table_even_background_fill_dark": "*neutral_950",
319
+ "table_odd_background_fill": "*neutral_50",
320
+ "table_odd_background_fill_dark": "*neutral_900",
321
+ "table_radius": "*radius_lg",
322
+ "table_row_focus": "*color_accent_soft",
323
+ "table_row_focus_dark": "*color_accent_soft",
324
+ "text_lg": "16px",
325
+ "text_md": "14px",
326
+ "text_sm": "12px",
327
+ "text_xl": "22px",
328
+ "text_xs": "10px",
329
+ "text_xxl": "26px",
330
+ "text_xxs": "9px"
331
+ },
332
+ "version": "0.0.1"
333
+ }
funclip/utils/trans_utils.py ADDED
@@ -0,0 +1,132 @@
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ import os
7
+ import re
8
+ import numpy as np
9
+
10
+ PUNC_LIST = [',', '。', '!', '?', '、', ',', '.', '?', '!']
11
+
12
+ def pre_proc(text):
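+ # Drop punctuation and surround CJK characters with spaces so every character becomes its own token.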
13
+ res = ''
14
+ for i in range(len(text)):
15
+ if text[i] in PUNC_LIST:
16
+ continue
17
+ if '\u4e00' <= text[i] <= '\u9fff':
18
+ if len(res) and res[-1] != " ":
19
+ res += ' ' + text[i]+' '
20
+ else:
21
+ res += text[i]+' '
22
+ else:
23
+ res += text[i]
24
+ if res[-1] == ' ':
25
+ res = res[:-1]
26
+ return res
27
+
28
+ def proc(raw_text, timestamp, dest_text, lang='zh'):
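+ # Locate every occurrence of dest_text in raw_text and return its time spans; timestamps are in ms, *16 converts them to 16 kHz sample indices.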
29
+ # simple matching
30
+ ld = len(dest_text.split())
31
+ mi, ts = [], []
32
+ offset = 0
33
+ while True:
34
+ fi = raw_text.find(dest_text, offset, len(raw_text))
35
+ ti = raw_text[:fi].count(' ')
36
+ if fi == -1:
37
+ break
38
+ offset = fi + ld
39
+ mi.append(fi)
40
+ ts.append([timestamp[ti][0]*16, timestamp[ti+ld-1][1]*16])
41
+ return ts
42
+
43
+
44
+ def proc_spk(dest_spk, sd_sentences):
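+ # Collect the time spans of sentences spoken by the requested speaker (dest_spk looks like "spk0").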
45
+ ts = []
46
+ for d in sd_sentences:
47
+ d_start = d['timestamp'][0][0]
48
+ d_end = d['timestamp'][-1][1]
49
+ spkid = dest_spk[3:]  # strip the "spk" prefix to get the numeric id
50
+ if str(d['spk']) == spkid and d_end - d_start > 999:  # keep segments longer than ~1 s
51
+ ts.append([d_start*16, d_end*16])
52
+ return ts
53
+
54
+ def generate_vad_data(data, sd_sentences, sr=16000):
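+ # Slice the mono waveform into one segment per diarized sentence, keeping start/end in seconds.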
55
+ assert len(data.shape) == 1
56
+ vad_data = []
57
+ for d in sd_sentences:
58
+ d_start = round(d['ts_list'][0][0]/1000, 2)
59
+ d_end = round(d['ts_list'][-1][1]/1000, 2)
60
+ vad_data.append([d_start, d_end, data[int(d_start * sr):int(d_end * sr)]])
61
+ return vad_data
62
+
63
+ def write_state(output_dir, state):
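+ # Persist the recognition results as plain-text files so that stage 2 can reload them later.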
64
+ for key in ['/recog_res_raw', '/timestamp', '/sentences']:#, '/sd_sentences']:
65
+ with open(output_dir+key, 'w') as fout:
66
+ fout.write(str(state[key[1:]]))
67
+ if 'sd_sentences' in state:
68
+ with open(output_dir+'/sd_sentences', 'w') as fout:
69
+ fout.write(str(state['sd_sentences']))
70
+
71
+ def load_state(output_dir):
72
+ state = {}
73
+ with open(output_dir+'/recog_res_raw') as fin:
74
+ line = fin.read()
75
+ state['recog_res_raw'] = line
76
+ with open(output_dir+'/timestamp') as fin:
77
+ line = fin.read()
78
+ state['timestamp'] = eval(line)
79
+ with open(output_dir+'/sentences') as fin:
80
+ line = fin.read()
81
+ state['sentences'] = eval(line)
82
+ if os.path.exists(output_dir+'/sd_sentences'):
83
+ with open(output_dir+'/sd_sentences') as fin:
84
+ line = fin.read()
85
+ state['sd_sentences'] = eval(line)
86
+ return state
87
+
88
+ def convert_pcm_to_float(data):
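+ # Normalize integer PCM samples to float64 in roughly [-1.0, 1.0); float input is returned as float64 unchanged.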
89
+ if data.dtype == np.float64:
90
+ return data
91
+ elif data.dtype == np.float32:
92
+ return data.astype(np.float64)
93
+ elif data.dtype == np.int16:
94
+ bit_depth = 16
95
+ elif data.dtype == np.int32:
96
+ bit_depth = 32
97
+ elif data.dtype == np.int8:
98
+ bit_depth = 8
99
+ else:
100
+ raise ValueError("Unsupported audio data type")
101
+
102
+ # Now handle the integer types
103
+ max_int_value = float(2 ** (bit_depth - 1))
104
+ if bit_depth == 8:
105
+ data = data - 128
106
+ return (data.astype(np.float64) / max_int_value)
107
+
108
+ def convert_time_to_millis(time_str):
109
+ # Format: [hours:minutes:seconds,milliseconds]
110
+ hours, minutes, seconds, milliseconds = map(int, re.split('[:,]', time_str))
111
+ return (hours * 3600 + minutes * 60 + seconds) * 1000 + milliseconds
112
+
113
+ def extract_timestamps(input_text):
114
+ # Find all timestamp pairs with a regular expression
115
+ timestamps = re.findall(r'\[(\d{2}:\d{2}:\d{2},\d{2,3})\s*-\s*(\d{2}:\d{2}:\d{2},\d{2,3})\]', input_text)
116
+ times_list = []
117
+ print(timestamps)
118
+ # Convert every matched start/end timestamp to milliseconds
119
+ for start_time, end_time in timestamps:
120
+ start_millis = convert_time_to_millis(start_time)
121
+ end_millis = convert_time_to_millis(end_time)
122
+ times_list.append([start_millis, end_millis])
123
+
124
+ return times_list
125
+
126
+
127
+ if __name__ == '__main__':
128
+ text = ("1. [00:00:00,500-00:00:05,850] 在我们的设计普惠当中,有一个我经常津津乐道的项目叫寻找远方的美好。"
129
+ "2. [00:00:07,120-00:00:12,940] 啊,在这样一个我们叫寻美在这样的一个项目当中,我们把它跟乡村振兴去结合起来,利用我们的设计的能力。"
130
+ "3. [00:00:13,240-00:00:25,620] 问我们自身员工的设设计能力,我们设计生态伙伴的能力,帮助乡村振兴当中,要希望把他的产品推向市场,把他的农产品把他加工产品推向市场的这样的伙伴做一件事情,")
131
+
132
+ print(extract_timestamps(text))
funclip/videoclipper.py ADDED
@@ -0,0 +1,444 @@
1
+ #!/usr/bin/env python3
2
+ # -*- encoding: utf-8 -*-
3
+ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunClip). All Rights Reserved.
4
+ # MIT License (https://opensource.org/licenses/MIT)
5
+
6
+ import re
7
+ import os
8
+ import sys
9
+ import copy
10
+ import librosa
11
+ import logging
12
+ import argparse
13
+ import numpy as np
14
+ import soundfile as sf
15
+ from moviepy.editor import *
16
+ import moviepy.editor as mpy
17
+ from moviepy.video.tools.subtitles import SubtitlesClip, TextClip
18
+ from moviepy.editor import VideoFileClip, concatenate_videoclips
19
+ from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip  # import the class, not the submodule
20
+ from utils.subtitle_utils import generate_srt, generate_srt_clip
21
+ from utils.argparse_tools import ArgumentParser, get_commandline_args
22
+ from utils.trans_utils import pre_proc, proc, write_state, load_state, proc_spk, convert_pcm_to_float
23
+
24
+
25
+ class VideoClipper():
26
+ def __init__(self, funasr_model):
27
+ logging.warning("Initializing VideoClipper.")
28
+ self.funasr_model = funasr_model
29
+ self.GLOBAL_COUNT = 0
+ self.lang = 'zh'  # default language; callers overwrite this after construction
30
+
31
+ def recog(self, audio_input, sd_switch='no', state=None, hotwords="", output_dir=None):
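+ # Run FunASR recognition (optionally with speaker diarization) on (sample_rate, samples) and return text, SRT and the updated state.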
32
+ if state is None:
33
+ state = {}
34
+ sr, data = audio_input
35
+
36
+ # Convert to float64 consistently (includes data type checking)
37
+ data = convert_pcm_to_float(data)
38
+
39
+ # assert sr == 16000, "16kHz sample rate required, {} given.".format(sr)
40
+ if sr != 16000: # resample with librosa
41
+ data = librosa.resample(data, orig_sr=sr, target_sr=16000)
42
+ if len(data.shape) == 2: # multi-channel wav input
43
+ logging.warning("Input wav shape: {}, only first channel reserved.".format(data.shape))
44
+ data = data[:,0]
45
+ state['audio_input'] = (sr, data)
46
+ if sd_switch.lower() == 'yes':  # accept "yes" in any casing
47
+ rec_result = self.funasr_model.generate(data,
48
+ return_spk_res=True,
49
+ return_raw_text=True,
50
+ is_final=True,
51
+ output_dir=output_dir,
52
+ hotword=hotwords,
53
+ pred_timestamp=self.lang=='en',
54
+ en_post_proc=self.lang=='en',
55
+ cache={})
56
+ res_srt = generate_srt(rec_result[0]['sentence_info'])
57
+ state['sd_sentences'] = rec_result[0]['sentence_info']
58
+ else:
59
+ rec_result = self.funasr_model.generate(data,
60
+ return_spk_res=False,
61
+ sentence_timestamp=True,
62
+ return_raw_text=True,
63
+ is_final=True,
64
+ hotword=hotwords,
65
+ output_dir=output_dir,
66
+ pred_timestamp=self.lang=='en',
67
+ en_post_proc=self.lang=='en',
68
+ cache={})
69
+ res_srt = generate_srt(rec_result[0]['sentence_info'])
70
+ state['recog_res_raw'] = rec_result[0]['raw_text']
71
+ state['timestamp'] = rec_result[0]['timestamp']
72
+ state['sentences'] = rec_result[0]['sentence_info']
73
+ res_text = rec_result[0]['text']
74
+ return res_text, res_srt, state
75
+
76
+ def clip(self, dest_text, start_ost, end_ost, state, dest_spk=None, output_dir=None, timestamp_list=None):
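+ # Cut the recognized audio by target text, speaker or explicit timestamps; offsets are in ms, audio indices in 16 kHz samples.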
77
+ # get from state
78
+ audio_input = state['audio_input']
79
+ recog_res_raw = state['recog_res_raw']
80
+ timestamp = state['timestamp']
81
+ sentences = state['sentences']
82
+ sr, data = audio_input
83
+ data = data.astype(np.float64)
84
+
85
+ if timestamp_list is None:
86
+ all_ts = []
87
+ if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
88
+ for _dest_text in dest_text.split('#'):
89
+ if '[' in _dest_text:
90
+ match = re.search(r'\[(\d+),\s*(\d+)\]', _dest_text)
91
+ if match:
92
+ offset_b, offset_e = map(int, match.groups())
93
+ log_append = ""
94
+ else:
95
+ offset_b, offset_e = 0, 0
96
+ log_append = "(Bracket detected in dest_text but offset time matching failed)"
97
+ _dest_text = _dest_text[:_dest_text.find('[')]
98
+ else:
99
+ log_append = ""
100
+ offset_b, offset_e = 0, 0
+ match = None  # no bracketed offsets in this sub-sentence
101
+ _dest_text = pre_proc(_dest_text)
102
+ ts = proc(recog_res_raw, timestamp, _dest_text)
103
+ for _ts in ts: all_ts.append([_ts[0]+offset_b*16, _ts[1]+offset_e*16])
104
+ if len(ts) > 1 and match:
105
+ log_append += '(offsets detected but the sub-sentence matched {} periods in the audio, \
106
+ offsets are applied to all of them)'.format(len(ts))
107
+ else:
108
+ for _dest_spk in dest_spk.split('#'):
109
+ ts = proc_spk(_dest_spk, state['sd_sentences'])
110
+ for _ts in ts: all_ts.append(_ts)
111
+ log_append = ""
112
+ else:
113
+ all_ts = timestamp_list
114
+ ts = all_ts
115
+ # ts.sort()
116
+ srt_index = 0
117
+ clip_srt = ""
118
+ if len(ts):
119
+ start, end = ts[0]
120
+ start = min(max(0, start+start_ost*16), len(data))
121
+ end = min(max(0, end+end_ost*16), len(data))
122
+ res_audio = data[start:end]
123
+ start_end_info = "from {} to {}".format(start/16000, end/16000)
124
+ srt_clip, _, srt_index = generate_srt_clip(sentences, start/16000.0, end/16000.0, begin_index=srt_index)
125
+ clip_srt += srt_clip
126
+ for _ts in ts[1:]: # multiple sentence input or multiple output matched
127
+ start, end = _ts
128
+ start = min(max(0, start+start_ost*16), len(data))
129
+ end = min(max(0, end+end_ost*16), len(data))
130
+ start_end_info += ", from {} to {}".format(start, end)
131
+ res_audio = np.concatenate([res_audio, data[start+start_ost*16:end+end_ost*16]], -1)
132
+ srt_clip, _, srt_index = generate_srt_clip(sentences, start/16000.0, end/16000.0, begin_index=srt_index-1)
133
+ clip_srt += srt_clip
134
+ if len(ts):
135
+ message = "{} periods found in the speech: ".format(len(ts)) + start_end_info + log_append
136
+ else:
137
+ message = "No period found in the speech, return raw speech. You may check the recognition result and try other destination text."
138
+ res_audio = data
139
+ return (sr, res_audio), message, clip_srt
140
+
141
+ def video_recog(self, video_filename, sd_switch='no', hotwords="", output_dir=None):
142
+ video = mpy.VideoFileClip(video_filename)
143
+ # Extract the base name, add '_clip.mp4', and 'wav'
144
+ if output_dir is not None:
145
+ os.makedirs(output_dir, exist_ok=True)
146
+ _, base_name = os.path.split(video_filename)
147
+ base_name, _ = os.path.splitext(base_name)
148
+ clip_video_file = base_name + '_clip.mp4'
149
+ audio_file = base_name + '.wav'
150
+ audio_file = os.path.join(output_dir, audio_file)
151
+ else:
152
+ base_name, _ = os.path.splitext(video_filename)
153
+ clip_video_file = base_name + '_clip.mp4'
154
+ audio_file = base_name + '.wav'
155
+ video.audio.write_audiofile(audio_file)
156
+ wav = librosa.load(audio_file, sr=16000)[0]
157
+ # delete the audio file after processing
158
+ if os.path.exists(audio_file):
159
+ os.remove(audio_file)
160
+ state = {
161
+ 'video_filename': video_filename,
162
+ 'clip_video_file': clip_video_file,
163
+ 'video': video,
164
+ }
165
+ # res_text, res_srt = self.recog((16000, wav), state)
166
+ return self.recog((16000, wav), sd_switch, state, hotwords, output_dir)
167
+
168
+ def video_clip(self,
169
+ dest_text,
170
+ start_ost,
171
+ end_ost,
172
+ state,
173
+ font_size=32,
174
+ font_color='white',
175
+ add_sub=False,
176
+ dest_spk=None,
177
+ output_dir=None,
178
+ timestamp_list=None):
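+ # Same matching logic as clip(), but cuts the video with moviepy and optionally burns in subtitles.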
179
+ # get from state
180
+ recog_res_raw = state['recog_res_raw']
181
+ timestamp = state['timestamp']
182
+ sentences = state['sentences']
183
+ video = state['video']
184
+ clip_video_file = state['clip_video_file']
185
+ video_filename = state['video_filename']
186
+
187
+ if timestamp_list is None:
188
+ all_ts = []
189
+ if dest_spk is None or dest_spk == '' or 'sd_sentences' not in state:
190
+ for _dest_text in dest_text.split('#'):
191
+ if '[' in _dest_text:
192
+ match = re.search(r'\[(\d+),\s*(\d+)\]', _dest_text)
193
+ if match:
194
+ offset_b, offset_e = map(int, match.groups())
195
+ log_append = ""
196
+ else:
197
+ offset_b, offset_e = 0, 0
198
+ log_append = "(Bracket detected in dest_text but offset time matching failed)"
199
+ _dest_text = _dest_text[:_dest_text.find('[')]
200
+ else:
201
+ offset_b, offset_e = 0, 0
202
+ log_append = ""
203
+ # import pdb; pdb.set_trace()
204
+ _dest_text = pre_proc(_dest_text)
205
+ ts = proc(recog_res_raw, timestamp, _dest_text.lower())
206
+ for _ts in ts: all_ts.append([_ts[0]+offset_b*16, _ts[1]+offset_e*16])
207
+ if len(ts) > 1 and match:
208
+ log_append += '(offsets detected but the sub-sentence matched {} periods in the audio, \
209
+ offsets are applied to all of them)'.format(len(ts))
210
+ else:
211
+ for _dest_spk in dest_spk.split('#'):
212
+ ts = proc_spk(_dest_spk, state['sd_sentences'])
213
+ for _ts in ts: all_ts.append(_ts)
214
+ else: # AI clip pass timestamp as input directly
215
+ all_ts = [[i[0]*16.0, i[1]*16.0] for i in timestamp_list]
216
+
217
+ srt_index = 0
218
+ time_acc_ost = 0.0
219
+ ts = all_ts
220
+ # ts.sort()
221
+ clip_srt = ""
222
+ if len(ts):
223
+ if self.lang == 'en' and isinstance(sentences, str):
224
+ sentences = sentences.split()
225
+ start, end = ts[0][0] / 16000, ts[0][1] / 16000
226
+ srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index, time_acc_ost=time_acc_ost)
227
+ start, end = start+start_ost/1000.0, end+end_ost/1000.0
228
+ video_clip = video.subclip(start, end)
229
+ start_end_info = "from {} to {}".format(start, end)
230
+ clip_srt += srt_clip
231
+ if add_sub:
232
+ generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
233
+ subtitles = SubtitlesClip(subs, generator)
234
+ video_clip = CompositeVideoClip([video_clip, subtitles.set_pos(('center','bottom'))])
235
+ concate_clip = [video_clip]
236
+ time_acc_ost += end - start  # offsets were already folded into start/end above
237
+ for _ts in ts[1:]:
238
+ start, end = _ts[0] / 16000, _ts[1] / 16000
239
+ srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index-1, time_acc_ost=time_acc_ost)
240
+ if not len(subs):
241
+ continue
242
+ chi_subs = []
243
+ sub_starts = subs[0][0][0]
244
+ for sub in subs:
245
+ chi_subs.append(((sub[0][0]-sub_starts, sub[0][1]-sub_starts), sub[1]))
246
+ start, end = start+start_ost/1000.0, end+end_ost/1000.0
247
+ _video_clip = video.subclip(start, end)
248
+ start_end_info += ", from {} to {}".format(str(start)[:5], str(end)[:5])
249
+ clip_srt += srt_clip
250
+ if add_sub:
251
+ generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
252
+ subtitles = SubtitlesClip(chi_subs, generator)
253
+ _video_clip = CompositeVideoClip([_video_clip, subtitles.set_pos(('center','bottom'))])
254
+ # _video_clip.write_videofile("debug.mp4", audio_codec="aac")
255
+ concate_clip.append(copy.copy(_video_clip))
256
+ time_acc_ost += end - start  # offsets were already folded into start/end above
257
+ message = "{} periods found in the audio: ".format(len(ts)) + start_end_info
258
+ logging.warning("Concating...")
259
+ if len(concate_clip) > 1:
260
+ video_clip = concatenate_videoclips(concate_clip)
261
+ # clip_video_file = clip_video_file[:-4] + '_no{}.mp4'.format(self.GLOBAL_COUNT)
262
+ if output_dir is not None:
263
+ os.makedirs(output_dir, exist_ok=True)
264
+ _, file_with_extension = os.path.split(clip_video_file)
265
+ clip_video_file_name, _ = os.path.splitext(file_with_extension)
266
+ print(output_dir, clip_video_file)
267
+ clip_video_file = os.path.join(output_dir, "{}_no{}.mp4".format(clip_video_file_name, self.GLOBAL_COUNT))
268
+ temp_audio_file = os.path.join(output_dir, "{}_tempaudio_no{}.mp4".format(clip_video_file_name, self.GLOBAL_COUNT))
269
+ else:
270
+ clip_video_file = clip_video_file[:-4] + '_no{}.mp4'.format(self.GLOBAL_COUNT)
271
+ temp_audio_file = clip_video_file[:-4] + '_tempaudio_no{}.mp4'.format(self.GLOBAL_COUNT)
272
+ video_clip.write_videofile(clip_video_file, audio_codec="aac", temp_audiofile=temp_audio_file)
273
+ self.GLOBAL_COUNT += 1
274
+ else:
275
+ clip_video_file = video_filename
276
+ message = "No period found in the audio, return raw speech. You may check the recognition result and try other destination text."
277
+ srt_clip = ''
278
+ return clip_video_file, message, clip_srt
279
+
280
+
281
+ def get_parser():
282
+ parser = ArgumentParser(
283
+ description="ClipVideo Argument",
284
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter,
285
+ )
286
+ parser.add_argument(
287
+ "--stage",
288
+ type=int,
289
+ choices=(1, 2),
290
+ help="Stage, 0 for recognizing and 1 for clipping",
291
+ required=True
292
+ )
293
+ parser.add_argument(
294
+ "--file",
295
+ type=str,
296
+ default=None,
297
+ help="Input file path",
298
+ required=True
299
+ )
300
+ parser.add_argument(
301
+ "--sd_switch",
302
+ type=str,
303
+ choices=("no", "yes"),
304
+ default="no",
305
+ help="Turn on the speaker diarization or not",
306
+ )
307
+ parser.add_argument(
308
+ "--output_dir",
309
+ type=str,
310
+ default='./output',
311
+ help="Output files path",
312
+ )
313
+ parser.add_argument(
314
+ "--dest_text",
315
+ type=str,
316
+ default=None,
317
+ help="Destination text string for clipping",
318
+ )
319
+ parser.add_argument(
320
+ "--dest_spk",
321
+ type=str,
322
+ default=None,
323
+ help="Destination spk id for clipping",
324
+ )
325
+ parser.add_argument(
326
+ "--start_ost",
327
+ type=int,
328
+ default=0,
329
+ help="Offset time in ms at beginning for clipping"
330
+ )
331
+ parser.add_argument(
332
+ "--end_ost",
333
+ type=int,
334
+ default=0,
335
+ help="Offset time in ms at ending for clipping"
336
+ )
337
+ parser.add_argument(
338
+ "--output_file",
339
+ type=str,
340
+ default=None,
341
+ help="Output file path"
342
+ )
343
+ parser.add_argument(
344
+ "--lang",
345
+ type=str,
346
+ default='zh',
347
+ help="language"
348
+ )
349
+ return parser
350
+
351
+
352
+ def runner(stage, file, sd_switch, output_dir, dest_text, dest_spk, start_ost, end_ost, output_file, config=None, lang='zh'):
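+ # Stage 1: recognize the input and cache results under output_dir; stage 2: clip audio/video using the cached results.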
353
+ audio_suffixs = ['.wav','.mp3','.aac','.m4a','.flac']
354
+ video_suffixs = ['.mp4','.avi','.mkv','.flv','.mov','.webm','.ts','.mpeg']
355
+ _,ext = os.path.splitext(file)
356
+ if ext.lower() in audio_suffixs:
357
+ mode = 'audio'
358
+ elif ext.lower() in video_suffixs:
359
+ mode = 'video'
360
+ else:
361
+ logging.error("Unsupported file format: {}\n\nplease choise one of the following: {}".format(file),audio_suffixs+video_suffixs)
362
+ sys.exit(1) # exit if the file is not supported
363
+ while output_dir.endswith('/'):
364
+ output_dir = output_dir[:-1]
365
+ if not os.path.exists(output_dir):
366
+ os.mkdir(output_dir)
367
+ if stage == 1:
368
+ from funasr import AutoModel
369
+ # initialize funasr automodel
370
+ logging.warning("Initializing modelscope asr pipeline.")
371
+ if lang == 'zh':
372
+ funasr_model = AutoModel(model="iic/speech_seaco_paraformer_large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
373
+ vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
374
+ punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
375
+ spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
376
+ )
377
+ audio_clipper = VideoClipper(funasr_model)
378
+ audio_clipper.lang = 'zh'
379
+ elif lang == 'en':
380
+ funasr_model = AutoModel(model="iic/speech_paraformer_asr-en-16k-vocab4199-pytorch",
381
+ vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
382
+ punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
383
+ spk_model="damo/speech_campplus_sv_zh-cn_16k-common",
384
+ )
385
+ audio_clipper = VideoClipper(funasr_model)
386
+ audio_clipper.lang = 'en'
387
+ if mode == 'audio':
388
+ logging.warning("Recognizing audio file: {}".format(file))
389
+ wav, sr = librosa.load(file, sr=16000)
390
+ res_text, res_srt, state = audio_clipper.recog((sr, wav), sd_switch)
391
+ if mode == 'video':
392
+ logging.warning("Recognizing video file: {}".format(file))
393
+ res_text, res_srt, state = audio_clipper.video_recog(file, sd_switch)
394
+ total_srt_file = output_dir + '/total.srt'
395
+ with open(total_srt_file, 'w') as fout:
396
+ fout.write(res_srt)
397
+ logging.warning("Write total subtitle to {}".format(total_srt_file))
398
+ write_state(output_dir, state)
399
+ logging.warning("Recognition successed. You can copy the text segment from below and use stage 2.")
400
+ print(res_text)
401
+ if stage == 2:
402
+ audio_clipper = VideoClipper(None)
+ audio_clipper.lang = lang  # video_clip() reads self.lang, so set it for stage 2 as well
403
+ if mode == 'audio':
404
+ state = load_state(output_dir)
405
+ wav, sr = librosa.load(file, sr=16000)
406
+ state['audio_input'] = (sr, wav)
407
+ (sr, audio), message, srt_clip = audio_clipper.clip(dest_text, start_ost, end_ost, state, dest_spk=dest_spk)
408
+ if output_file is None:
409
+ output_file = output_dir + '/result.wav'
410
+ clip_srt_file = output_file[:-3] + 'srt'
411
+ logging.warning(message)
412
+ sf.write(output_file, audio, 16000)
413
+ assert output_file.endswith('.wav'), "output_file must end with '.wav'"
414
+ logging.warning("Save clipped wav file to {}".format(output_file))
415
+ with open(clip_srt_file, 'w') as fout:
416
+ fout.write(srt_clip)
417
+ logging.warning("Write clipped subtitle to {}".format(clip_srt_file))
418
+ if mode == 'video':
419
+ state = load_state(output_dir)
420
+ state['video_filename'] = file
421
+ if output_file is None:
422
+ state['clip_video_file'] = file[:-4] + '_clip.mp4'
423
+ else:
424
+ state['clip_video_file'] = output_file
425
+ clip_srt_file = state['clip_video_file'][:-3] + 'srt'
426
+ state['video'] = mpy.VideoFileClip(file)
427
+ clip_video_file, message, srt_clip = audio_clipper.video_clip(dest_text, start_ost, end_ost, state, dest_spk=dest_spk)
428
+ logging.warning("Clipping Log: {}".format(message))
429
+ logging.warning("Save clipped mp4 file to {}".format(clip_video_file))
430
+ with open(clip_srt_file, 'w') as fout:
431
+ fout.write(srt_clip)
432
+ logging.warning("Write clipped subtitle to {}".format(clip_srt_file))
433
+
434
+
435
+ def main(cmd=None):
436
+ print(get_commandline_args(), file=sys.stderr)
437
+ parser = get_parser()
438
+ args = parser.parse_args(cmd)
439
+ kwargs = vars(args)
440
+ runner(**kwargs)
441
+
442
+
443
+ if __name__ == '__main__':
444
+ main()
gradio.yaml ADDED
@@ -0,0 +1 @@
1
+ app: funclip/launch.py
requirements.txt ADDED
@@ -0,0 +1,14 @@
1
+ librosa
2
+ soundfile
3
+ scikit-learn>=1.3.2
4
+ funasr>=1.1.2
5
+ moviepy==1.0.3
6
+ numpy==1.26.4
7
+ gradio
8
+ modelscope
9
+ torch>=1.13
10
+ torchaudio
11
+ openai
12
+ g4f
13
+ dashscope
14
+ curl_cffi