Spaces:
Running
Running
File size: 14,793 Bytes
06a7653 cd8c136 4118c0b 007f297 4118c0b b60e79d 06a7653 14e4e0b 65d6d34 14e4e0b 03e89ce b60e79d b69555a 21c6539 007f297 21c6539 b5215aa 4118c0b c8c9dcb 4118c0b c556685 3b16e75 4118c0b 16c4623 c49e1e8 c556685 c49e1e8 3b16e75 c49e1e8 ec1175e 06a7653 d461eef 06a7653 d461eef 06a7653 8322f72 8388447 8322f72 8388447 8322f72 ec1175e 06a7653 2bb2194 06a7653 2bb2194 06a7653 03e89ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 |
"use client";
import { Typography } from "@material-tailwind/react";
import AboutCard from "@/components/about-card";
import React from 'react';
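// Template for a new EVENT_INFO entry — copy it, fill in every field, and add
// it to the array below:
// {
//   title: "Title",
//   description: "",
//   subTitle: "",
//   imageName: "paper12.png",
//   paper_links: "",
// },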
/**
 * Static catalogue of the papers/projects shown on the page.
 *
 * Each entry is spread as props into one `AboutCard`, so the key names are
 * part of that component's contract and must not be renamed here.
 *
 * Fields (all strings):
 * - `title`       — paper or project title.
 * - `description` — short summary text.
 * - `subTitle`    — slash-separated topic tags.
 * - `imageName`   — media file name (both `.png` and `.mp4` occur); how it is
 *                   resolved to a URL is decided by `AboutCard` — not visible here.
 * - `paper_links` — a single URL, despite the plural name.
 *
 * Every string literal is kept on one line on purpose: a backslash line
 * continuation joins the two source lines with no separator, which silently
 * produced text such as "Avatar:Ultra" and "PeRFlow.into".
 */
const EVENT_INFO = [
  {
    title: "Speech Research",
    description: "This page lists some speech related research at Microsoft Research Asia, conducted by the team led by Xu Tan. The research topics cover text to speech, singing voice synthesis, music generation, automatic speech recognition, etc. Some research are open-sourced via NeuralSpeech and Muzic.",
    subTitle: "Speech/Audio/Voice Clone",
    imageName: "speechresearch.png",
    paper_links: "https://speechresearch.github.io/",
  },
  {
    title: "NaturalSpeech 3: Zero-Shot Speech Synthesis with Factorized Codec and Diffusion Models",
    description: "FACodec is a core component of the advanced text-to-speech (TTS) model NaturalSpeech 3. FACodec converts complex speech waveform into disentangled subspaces representing speech attributes of content, prosody, timbre, and acoustic details and reconstruct high-quality speech waveform from these attributes. FACodec decomposes complex speech into subspaces representing different attributes, thus simplifying the modeling of speech representation. Hf-Demo : https://huggingface.co/spaces/amphion/naturalspeech3_facodec",
    subTitle: "Zero-Shot Voice Clone / Audio",
    imageName: "zero-shot.png",
    paper_links: "https://arxiv.org/pdf/2403.03100.pdf",
  },
  {
    title: "ResAdapter : Domain Consistent Resolution Adapter for Diffusion Models",
    description: "Overview of ResAdapter. Left: Pipeline of ResAdapter. It is based on the frozen model (e.g., SD or SDXL) learns resolution priors from mixed-resolution datasets, which can be integrated into any personalized model to generate multi-resolution images. Right: Architecture comparison between ResAdapter and the vanilla LoRA. ResAdapter is only inserted to downsampler and upsampler, and unfreezes the group normalization of resnet blocks. Page:https://res-adapter.github.io/",
    subTitle: "Lora/ResAdapter/Resolution",
    imageName: "paper20.png",
    paper_links: "https://arxiv.org/pdf/2403.02084.pdf",
  },
  {
    title: "PhysGaussian: Physics-Integrated 3D Gaussians for Generative Dynamics CVPR 2024",
    description: "PhysGaussian is a pioneering unified simulation-rendering pipeline that generates physics-based dynamics and photo-realistic renderings simultaneously and seamlessly. Page:https://xpandora.github.io/PhysGaussian/",
    subTitle: "NeRF/Physics/3D Reconstruction",
    imageName: "paper19.mp4",
    paper_links: "https://arxiv.org/pdf/2311.12198.pdf",
  },
  {
    title: "GaussianAvatars: Photorealistic Head Avatars with Rigged 3D Gaussians",
    description: "GaussianAvatars combine dynamic 3D Gaussian splats with a parametric morphable face model for photorealistic avatars. Their method excels in animation control, showcasing superior performance in reenactments from driving videos, surpassing existing techniques.",
    subTitle: "Gaussian/Head Avatar",
    imageName: "paper18.mp4",
    paper_links: "https://arxiv.org/pdf/2312.02069.pdf",
  },
  {
    // Fixed: the continuation used to join "Avatar:" and "Ultra" with no space.
    title: "Gaussian Head Avatar: Ultra High-fidelity Head Avatar via Dynamic Gaussians",
    description: "The Gaussian Head Avatar method combines controllable 3D Gaussians and MLP-based deformation fields to achieve high-fidelity head avatar modeling, outperforming existing sparse-view methods. It ensures fine-grained dynamic details and expression accuracy, achieving ultra high-fidelity rendering quality at 2K resolution",
    subTitle: "Gaussian/Head Avatar",
    imageName: "paper17.mp4",
    paper_links: "https://arxiv.org/pdf/2312.03029.pdf",
  },
  {
    title: "Tuning-Free Noise Rectification:for High Fidelity Image-to-Video Generation",
    description: "Noise Rectification is a simple but effective method for image-to-video generation in open domains, and is tuning-free and plug-and-play. Below are several comparisons between method and other methods.",
    subTitle: "Noise Control/Image-to-Video Generation",
    imageName: "paper16.mp4",
    paper_links: "https://arxiv.org/pdf/2403.02827.pdf",
  },
  {
    title: "ConsistI2V: Enhancing Visual Consistency for Image-to-Video Generation",
    description: "Image-to-video (I2V) generation aims to use the initial frame (alongside a text prompt) to create a video sequence. A grand challenge in I2V generation is to maintain visual consistency throughout the video: existing methods often struggle to preserve the integrity of the subject, background, and style from the first frame, as well as ensure a fluid and logical progression within the video narrative ",
    subTitle: "Image2Video Generation/Consistency",
    imageName: "paper15.png",
    paper_links: "https://arxiv.org/pdf/2402.04324.pdf",
  },
  {
    title: "GEA: Reconstructing Expressive 3D Gaussian Avatar from Monocular Video",
    description: "A novel method utilizing 3D Gaussians for creating expressive 3D avatars achieves state-of-the-art performance in photorealistic novel view synthesis. It features accurate pose estimation, attention-aware networks, and an iterative re-initialization strategy for high-fidelity reconstructions and fine-grained control over body and hand poses. Project-Page Project page: https://3d-aigc.github.io/GEA/",
    subTitle: "Gaussian/Nerf/3D Reconstruction",
    imageName: "paper14.mp4",
    paper_links: "https://arxiv.org/pdf/2402.16607.pdf",
  },
  {
    title: "PeRFlow: Piecewise Rectified Flow as Universal Plug-and-Play Accelerator",
    // Fixed: the sentence had lost its subject ("Specifically, are computed
    // by ...") and read "PeRFlow.into"; the dropped "ΔW" symbol is restored.
    // NOTE(review): wording follows the upstream PeRFlow README — confirm.
    description: "PeRFlow trains piecewise-linear rectified flow models for fast sampling. These models can be initialized from pretrained diffusion models, such as Stable Diffusion (SD). The obtained weights of PeRFlow serve as a general accelerator module which is compatible with various fine-tuned stylized SD models as well as SD-based generation/editing pipelines. Specifically, ΔW are computed by the PeRFlow's weights minus the pretrained SD. One can fuse the PeRFlow ΔW into various SD pipelines for (conditional) image generation/editing to enable high-quality few-step inference.",
    subTitle: "Finetune LORAs / Diffusion Models / PeRFlow",
    imageName: "perflow-v1.mp4",
    paper_links: "https://piecewise-rectified-flow.github.io/",
  },
  {
    title: "Deformable One-shot Face Stylization via DINO Semantic Guidance",
    description: "This paper presents a novel approach to one-shot face stylization, focusing on appearance and structure. They use a self-supervised vision transformer, DINO-ViT, and integrate spatial transformers into StyleGAN for deformation-aware stylization. Innovative constraints and style-mixing enhance deformability and efficiency, demonstrating superiority over existing methods through extensive comparisons. Code is available at https://github.com/zichongc/DoesFS. ",
    subTitle: "GANS/StyleGAN/Deformable Stylization",
    imageName: "paper13.png",
    paper_links: "https://arxiv.org/pdf/2403.00459.pdf",
  },
  {
    title: "Pix2Gif: Motion-Guided Diffusion for GIF Generation",
    description: "Pix2Gif introduces a novel approach to image-to-GIF generation using text and motion prompts. Their model utilizes motion-guided warping and perceptual loss to ensure content consistency. Pretrained on curated data, it effectively translates prompts into coherent GIFs, demonstrated through extensive experiments. Page:https://hiteshk03.github.io/Pix2Gif/",
    subTitle: "Text2Video/Animation/Diffusion",
    // NOTE(review): "paper111.png" breaks the paperN.png numbering of the
    // neighbouring entries (paper10, paper12) — confirm it is not a typo for
    // "paper11.png".
    imageName: "paper111.png",
    paper_links: "https://arxiv.org/pdf/2403.04634.pdf",
  },
  {
    title: "PixArt-Σ: Weak-to-Strong Training of Diffusion Transformer for 4K Text-to-Image Generation",
    description: "PixArt-Σ is a cutting-edge Diffusion Transformer model that generates 4K images with superior fidelity and alignment to text prompts. It achieves this through high-quality training data and efficient token compression, resulting in smaller model size and superior image quality compared to existing models. Project-Page: https://pixart-alpha.github.io/PixArt-sigma-project/",
    // Fixed: was "Speech/Talking Face Generation", copy-pasted from the next entry.
    subTitle: "Text-to-Image/Diffusion Transformer",
    imageName: "paper12.png",
    paper_links: "https://arxiv.org/pdf/2403.04692.pdf",
  },
  {
    title: "EmoSpeaker: One-shot Fine-grained Emotion-Controlled Talking Face Generation",
    description: "The proposal introduces EmoSpeaker, a method enhancing emotional expression in generated facial animations. It employs a visual attribute-guided audio decoupler, fine-grained emotion coefficient prediction, and intensity control to improve emotional quality and lip synchronization. Experimental results show superiority over existing methods. Project-Page: https://peterfanfan.github.io/EmoSpeaker/",
    subTitle: "Speech/Talking Face Generation",
    imageName: "paper10.png",
    paper_links: "https://arxiv.org/pdf/2402.01422.pdf",
  },
  {
    title: "AVI-Talking: Learning Audio-Visual Instructions for Expressive 3D Talking Face Generation",
    description: "AVI-Talking, a system for creating lifelike talking faces that match speech with expressive facial movements. Using advanced language models, it generates instructions for facial details based on speech, resulting in realistic and emotionally consistent animations.",
    subTitle: "Speech/LLMs/Talking Head",
    imageName: "paper9.png",
    paper_links: "https://arxiv.org/pdf/2402.16124.pdf",
  },
  {
    title: "REAL3D-PORTRAIT: ONE-SHOT REALISTIC 3D TALKING PORTRAIT SYNTHESIS",
    description: "Real3D-Portrait addresses limitations in one-shot 3D talking portrait generation by enhancing reconstruction accuracy, stable animation, and realism. It employs a large image-to-plane model, efficient motion adapter, and head-torso-background super-resolution model for realistic videos, alongside a generalizable audio-to-motion model for audio-driven animation.",
    subTitle: "Talking Head/Face Generation/Lipsync/Nerf",
    imageName: "paper8.png",
    paper_links: "https://arxiv.org/pdf/2401.08503.pdf",
  },
  {
    title: "Resolution-Agnostic Neural Compression for High-Fidelity Portrait Video Conferencing via Implicit Radiance Fields",
    description: "A novel low bandwidth neural compression approach for high-fidelity portrait video conferencing is proposed. Dynamic neural radiance fields reconstruct talking heads with expression features, enabling ultra-low bandwidth transmission and high fidelity portrait rendering via volume rendering.",
    subTitle: "Talking Head/Face Generation/Lipsync/Nerf",
    imageName: "paper7.png",
    paper_links: "https://arxiv.org/pdf/2402.16599.pdf",
  },
  {
    title: " Learning Dynamic Tetrahedra for High-Quality Talking Head Synthesis",
    description: "The paper introduces DynTet, a novel hybrid representation combining neural networks and dynamic meshes for accurate facial avatar generation. It addresses artifacts and jitters in implicit methods like NeRF, achieving fidelity, lip synchronization, and real-time performance. Code is available. https://github.com/zhangzc21/DynTet",
    subTitle: "Talking Head/Face Generation/Lipsync",
    imageName: "paper6.png",
    paper_links: "https://arxiv.org/pdf/2402.17364.pdf",
  },
  {
    title: "EMO: Emote Portrait Alive - Generating Expressive Portrait Videos with Audio2Video Diffusion Model under Weak Conditions",
    description: "EMO, a pioneering framework for generating lifelike talking head videos by directly synthesizing video from audio inputs. Unlike traditional methods, EMO bypasses 3D models, ensuring seamless transitions and maintaining identity. Experimental results show superior expressiveness and realism, even in singing videos.",
    subTitle: "Talking Head/Face Generation/Lipsync",
    imageName: "paper5.png",
    paper_links: "https://arxiv.org/pdf/2402.17485.pdf",
  },
  {
    title: "Lips Are Lying: Spotting the Temporal Inconsistency between Audio and Visual in Lip-Syncing DeepFakes",
    description: " DeepFake can be bifurcated into entertainment applications like face swapping and illicit uses such as lipsyncing fraud",
    subTitle: "Lipsync",
    imageName: "paper1.png",
    paper_links: "https://arxiv.org/pdf/2401.15668.pdf",
  },
  {
    title: "FaceChain-ImagineID: Freely Crafting High-Fidelity Diverse Talking Faces from Disentangled Audio",
    description: "This paper proposes a method for generating diverse and synchronized talking faces from a single audio input. It tackles challenges by decoupling identity, content, and emotion from audio and maintaining diversity and consistency. The method involves Progressive Audio Disentanglement and Controllable Coherent Frame generation.",
    subTitle: "Lipsync",
    imageName: "paper2.png",
    paper_links: "https://arxiv.org/pdf/2403.01901.pdf",
  },
  {
    title: "G4G: A Generic Framework for High Fidelity Talking Face Generation with Fine-grained Intra-modal Alignment",
    description: "This paper addresses the challenge of generating high-fidelity talking faces with synchronized lip movements for arbitrary audio. They propose G4G, a framework enhancing audio-image alignment using diagonal matrices and multi-scale supervision, achieving competitive results.",
    subTitle: "Lipsync",
    imageName: "paper3.png",
    paper_links: "https://arxiv.org/pdf/2402.18122.pdf",
  },
  {
    title: "Context-aware Talking Face Video Generation",
    description: "This paper introduces a method for generating multi-person talking face videos considering contextual interactions. It utilizes facial landmarks to control video generation stages, achieving synchronized and coherent results surpassing baselines.",
    subTitle: "Talking Head/Face Generation",
    imageName: "paper4.png",
    paper_links: "https://arxiv.org/pdf/2402.18092.pdf",
  },
];
export function AboutEvent() {
return (
<section className="container mx-auto flex flex-col items-center px-1 py-2">
<Typography variant="h6" className="text-center mb-2" color="orange" placeholder="">
Every Day Update
</Typography>
<Typography variant="h6" className="text-center mb-2" color="blue-gray" placeholder="">
Gen AI Top Papers and Research
</Typography>
<Typography variant="lead"
className="mt-2 lg:max-w-4xl mb-8 w-full text-center font-normal !text-gray-500" color="blue-gray" placeholder="">
Contribute to the AI community by sharing your insights and expertise
</Typography>
<div className="mt-8 w-full grid grid-cols-1 md:grid-cols-3 gap-4 ">
{EVENT_INFO.map((props, idx) => (
<AboutCard key={idx} {...props} />
))}
</div>
</section>
);
}
export default AboutEvent;
|