"use client";

import React from "react";
import { Typography } from "@material-tailwind/react";
import AboutCard from "@/components/about-card";

// {
//   title: "Title",
//   description: "",
//   subTitle: "",
//   imageName : "paper12.png",
//   paper_links :""
// },


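// A minimal sketch of the shape each EVENT_INFO entry is assumed to follow,
// written as JSDoc so it holds whether this file is .jsx or .tsx. The type
// name "EventInfo" is hypothetical; AboutCard's actual prop types live in
// "@/components/about-card".
/**
 * @typedef {Object} EventInfo
 * @property {string} title        Paper or project title
 * @property {string} description  Short summary shown on the card
 * @property {string} subTitle     Topic tags, e.g. "Lipsync"
 * @property {string} imageName    Image/video asset rendered by AboutCard
 * @property {string} paper_links  URL of the paper or project page
 */
/** @type {EventInfo[]} */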
const EVENT_INFO = [

{
   title: "Speech Research",
   description: "This page lists some speech related research at Microsoft Research Asia, conducted by the team led by Xu Tan. The research topics cover text to speech, singing voice synthesis, music generation, automatic speech recognition, etc. Some research are open-sourced via NeuralSpeech and Muzic.",
   subTitle: "Speech/Audio/Voice Clone",
   imageName : "speechresearch.png",
  paper_links :"https://speechresearch.github.io/"
 },

 {
   title: "NaturalSpeech 3: Zero-Shot Speech Synthesis \
with Factorized Codec and Diffusion Models",
   description: "FACodec is a core component of the advanced text-to-speech (TTS) model NaturalSpeech 3. FACodec converts complex speech waveform into disentangled subspaces representing speech attributes of content, prosody, timbre, and acoustic details and reconstruct high-quality speech waveform from these attributes. FACodec decomposes complex speech into subspaces representing different attributes, thus simplifying the modeling of speech representation. Hf-Demo : https://huggingface.co/spaces/amphion/naturalspeech3_facodec",
   subTitle: "Zero-Shot Voice Clone / Audio",
   imageName : "zero-shot.png",
   paper_links :"https://arxiv.org/pdf/2403.03100.pdf"
 },

  {
  title: "ResAdapter : Domain Consistent Resolution Adapter for Diffusion Models",
  description: "Overview of ResAdapter. Left: Pipeline of ResAdapter. It is based on the frozen model (e.g., SD or SDXL) learns resolution priors from mixed-resolution datasets, which can be integrated into any personalized model to generate multi-resolution images. Right: Architecture comparison between ResAdapter and the vanilla LoRA. ResAdapter is only inserted to downsampler and upsampler, and unfreezes the group normalization of resnet blocks. Page:https://res-adapter.github.io/",
  subTitle: "Lora/ResAdapter/Resolution",
  imageName : "paper20.png",
  paper_links :"https://arxiv.org/pdf/2403.02084.pdf"
},

{
  title: "PhysGaussian: Physics-Integrated 3D Gaussians for Generative Dynamics \
  CVPR 2024",
  description: "PhysGaussian is a pioneering unified simulation-rendering pipeline that generates physics-based dynamics and photo-realistic renderings simultaneously and seamlessly. Page:https://xpandora.github.io/PhysGaussian/",
  subTitle: "NeRF/Physics/3D Reconstruction",
  imageName : "paper19.mp4",
  paper_links :"https://arxiv.org/pdf/2311.12198.pdf"
},

{
  title: "GaussianAvatars: Photorealistic Head Avatars with Rigged 3D Gaussians",
  description: "GaussianAvatars combine dynamic 3D Gaussian splats with a parametric morphable face model for photorealistic avatars. Their method excels in animation control, showcasing superior performance in reenactments from driving videos, surpassing existing techniques.",
  subTitle: "Gaussian/Head Avatar",
  imageName : "paper18.mp4",
  paper_links :"https://arxiv.org/pdf/2312.02069.pdf"
},
  {
  title: "Gaussian Head Avatar:\
  Ultra High-fidelity Head Avatar via Dynamic Gaussians",
  description: "The Gaussian Head Avatar method combines controllable 3D Gaussians and MLP-based deformation fields to achieve high-fidelity head avatar modeling, outperforming existing sparse-view methods. It ensures fine-grained dynamic details and expression accuracy, achieving ultra high-fidelity rendering quality at 2K resolution",
  subTitle: "Gaussian/Head Avatar",
  imageName : "paper17.mp4",
  paper_links :"https://arxiv.org/pdf/2312.03029.pdf"
},
  {
  title: "Tuning-Free Noise Rectification:for High Fidelity Image-to-Video Generation",
  description: "Noise Rectification is a simple but effective method for image-to-video generation in open domains, and is tuning-free and plug-and-play. Below are several comparisons between  method and other methods.",
  subTitle: "Noise Control/Image-to-Video Generation",
  imageName : "paper16.mp4",
  paper_links :"https://arxiv.org/pdf/2403.02827.pdf"
},
  {
  title: "ConsistI2V: Enhancing Visual Consistency for Image-to-Video Generation",
  description: "Image-to-video (I2V) generation aims to use the \
  initial frame (alongside a text prompt) to create a \
  video sequence. A grand challenge in I2V generation is to maintain visual consistency throughout \
  the video: existing methods often struggle to preserve the integrity of the subject, background, and \
  style from the first frame, as well as ensure a fluid \
  and logical progression within the video narrative ",
  subTitle: "Image2Video Generation/Consistency",
  imageName : "paper15.png",
  paper_links :"https://arxiv.org/pdf/2402.04324.pdf"
},

  
  {
  title: "GEA: Reconstructing Expressive 3D Gaussian Avatar from Monocular Video",
  description: "A novel method utilizing 3D Gaussians for creating expressive 3D avatars achieves state-of-the-art performance in photorealistic novel view synthesis. It features accurate pose estimation, attention-aware networks, and an iterative re-initialization strategy for high-fidelity reconstructions and fine-grained control over body and hand poses. Project-Page Project page: \
  https://3d-aigc.github.io/GEA/",
  subTitle: "Gaussian/Nerf/3D Reconstruction",
  imageName : "paper14.mp4",
  paper_links :"https://arxiv.org/pdf/2402.16607.pdf"
},

{
  title: "PeRFlow: Piecewise Rectified Flow as Universal Plug-and-Play Accelerator",
  description: "PeRFlow trains piecewise-linear rectified flow models for fast sampling. These models can be initialized from pretrained diffusion models, such as Stable Diffusion (SD). The obtained weights of PeRFlow serve as a general accelerator module which is compatible with various fine-tuned stylized SD models as well as SD-based generation/editing pipelines. Specifically, \
  are computed by the PeRFlow's weights minus the pretrained SD. One can fuse the PeRFlow.\
  into various SD pipelines for (conditional) image generation/editing to enable high-quality few-step inference.",
  subTitle: "Finetune LORAs / Diffusion Models / PeRFlow",
  imageName : "perflow-v1.mp4",
  paper_links :"https://piecewise-rectified-flow.github.io/"
},

  {
  title: "Deformable One-shot Face Stylization via DINO Semantic Guidance",
  description: "This paper presents a novel approach to one-shot face stylization, focusing on appearance and structure. They use a self-supervised vision transformer, DINO-ViT, and integrate spatial transformers into StyleGAN for deformation-aware stylization. Innovative constraints and style-mixing enhance deformability and efficiency, demonstrating superiority over existing methods through extensive comparisons. Code is available at https://github.com/zichongc/DoesFS. ",
  subTitle: "GANS/StyleGAN/Deformable Stylization",
  imageName : "paper13.png",
  paper_links :"https://arxiv.org/pdf/2403.00459.pdf"

},
 
  {
    title: "Pix2Gif: Motion-Guided Diffusion for GIF Generation",
    description: "Pix2Gif introduces a novel approach to image-to-GIF generation using text and motion prompts. Their model utilizes motion-guided warping and perceptual loss to ensure content consistency. Pretrained on curated data, it effectively translates prompts into coherent GIFs, demonstrated through extensive experiments. Page:https://hiteshk03.github.io/Pix2Gif/",
    subTitle: "Text2Video/Animation/Diffusion",
    imageName : "paper111.png",
    paper_links :"https://arxiv.org/pdf/2403.04634.pdf"
  
  },
  

  
{
  title: "PixArt-Σ: Weak-to-Strong Training of Diffusion \
  Transformer for 4K Text-to-Image Generation",
  description: "PixArt-Σ is a cutting-edge Diffusion Transformer model that generates 4K images with superior fidelity and alignment to text prompts. It achieves this through high-quality training data and efficient token compression, resulting in smaller model size and superior image quality compared to existing models. Project-Page: https://pixart-alpha.github.io/PixArt-sigma-project/",
  subTitle: "Speech/Talking Face Generation",
  imageName : "paper12.png",
  paper_links :"https://arxiv.org/pdf/2403.04692.pdf"

},
  
  {
    title: "EmoSpeaker: One-shot Fine-grained \
    Emotion-Controlled Talking Face Generation",
    description: "The proposal introduces EmoSpeaker, a method enhancing emotional expression in generated facial animations. It employs a visual attribute-guided audio decoupler, fine-grained emotion coefficient prediction, and intensity control to improve emotional quality and lip synchronization. Experimental results show superiority over existing methods. Project-Page: https://peterfanfan.github.io/EmoSpeaker/",
    subTitle: "Speech/Talking Face Generation",
    imageName : "paper10.png",
    paper_links :"https://arxiv.org/pdf/2402.01422.pdf"
  
  },

  {
    title: "AVI-Talking: Learning Audio-Visual Instructions for \
    Expressive 3D Talking Face Generation",
    description: "AVI-Talking, a system for creating lifelike talking faces that match speech with expressive facial movements. Using advanced language models, it generates instructions for facial details based on speech, resulting in realistic and emotionally consistent animations.",
    subTitle: "Speech/LLMs/Talking Head",
    imageName : "paper9.png",
    paper_links :"https://arxiv.org/pdf/2402.16124.pdf"
  
  },
  
{
  title: "REAL3D-PORTRAIT: ONE-SHOT REALISTIC 3D \
  TALKING PORTRAIT SYNTHESIS",
  description: "Real3D-Portrait addresses limitations in one-shot 3D talking portrait generation by enhancing reconstruction accuracy, stable animation, and realism. It employs a large image-to-plane model, efficient motion adapter, and head-torso-background super-resolution model for realistic videos, alongside a generalizable audio-to-motion model for audio-driven animation.",
  subTitle: "Talking Head/Face Generation/Lipsync/Nerf",
  imageName : "paper8.png",
  paper_links :"https://arxiv.org/pdf/2401.08503.pdf"

},
 
{
  title: "Resolution-Agnostic Neural Compression for \
  High-Fidelity Portrait Video Conferencing via \
  Implicit Radiance Fields",
  description: "A novel low bandwidth neural compression approach for high-fidelity portrait video conferencing is proposed. Dynamic neural radiance fields reconstruct talking heads with expression features, enabling ultra-low bandwidth transmission and high fidelity portrait rendering via volume rendering.",
  subTitle: "Talking Head/Face Generation/Lipsync/Nerf",
  imageName : "paper7.png",
  paper_links :"https://arxiv.org/pdf/2402.16599.pdf"

},
  {
    title: " Learning Dynamic Tetrahedra for High-Quality Talking Head Synthesis",
    description: "The paper introduces DynTet, a novel hybrid representation combining neural networks and dynamic meshes for accurate facial avatar generation. It addresses artifacts and jitters in implicit methods like NeRF, achieving fidelity, lip synchronization, and real-time performance. Code is available. https://github.com/zhangzc21/DynTet",
    subTitle: "Talking Head/Face Generation/Lipsync",
    imageName : "paper6.png",
    paper_links :"https://arxiv.org/pdf/2402.17364.pdf"
  
  },
  {
    title: "EMO: Emote Portrait Alive - Generating \
  Expressive Portrait Videos with Audio2Video \
  Diffusion Model under Weak Conditions",
    description: "EMO, a pioneering framework for generating lifelike talking head videos by directly synthesizing video from audio inputs. Unlike traditional methods, EMO bypasses 3D models, ensuring seamless transitions and maintaining identity. Experimental results show superior expressiveness and realism, even in singing videos.",
    subTitle: "Talking Head/Face Generation/Lipsync",
    imageName : "paper5.png",
    paper_links :"https://arxiv.org/pdf/2402.17485.pdf"
  
  },
  {
    title: "Lips Are Lying: Spotting the Temporal Inconsistency between Audio and Visual in Lip-Syncing DeepFakes",
    description:
    "DeepFake technology can be bifurcated into entertainment applications such as face swapping and illicit uses such as lip-syncing fraud.",
    subTitle: "Lipsync",
    imageName : "paper1.png",
    paper_links :"https://arxiv.org/pdf/2401.15668.pdf"
  },
  {
    title: "FaceChain-ImagineID: Freely Crafting High-Fidelity Diverse Talking Faces from Disentangled Audio",
    description:
    "This paper proposes a method for generating diverse and synchronized talking faces from a single audio input. It tackles challenges by decoupling identity, content, and emotion from audio and maintaining diversity and consistency. The method involves Progressive Audio Disentanglement and Controllable Coherent Frame generation.",
    subTitle: "Lipsync",
    imageName : "paper2.png",
    paper_links :"https://arxiv.org/pdf/2403.01901.pdf"
  },

{
  title: "G4G: A Generic Framework for High Fidelity Talking Face Generation with Fine-grained Intra-modal Alignment",
  description: "This paper addresses the challenge of generating high-fidelity talking faces with synchronized lip movements for arbitrary audio. They propose G4G, a framework enhancing audio-image alignment using diagonal matrices and multi-scale supervision, achieving competitive results.",
  subTitle: "Lipsync",
  imageName : "paper3.png",
  paper_links :"https://arxiv.org/pdf/2402.18122.pdf"
},
{
  title: "Context-aware Talking Face Video Generation",
  description: "This paper introduces a method for generating multi-person talking face videos considering contextual interactions. It utilizes facial landmarks to control video generation stages, achieving synchronized and coherent results surpassing baselines.",
  subTitle: "Talking Head/Face Generation",
  imageName : "paper4.png",
  paper_links :"https://arxiv.org/pdf/2402.18092.pdf"
},


];

export function AboutEvent() {
  return (

    <section className="container mx-auto flex flex-col items-center px-1 py-2">
      

      <Typography variant="h6" className="text-center mb-2" color="orange" placeholder="">
        Updated Every Day
      </Typography>
      <Typography variant="h6" className="text-center mb-2"  color="blue-gray" placeholder="">
        Gen AI Top Papers and Research
      </Typography>

      <Typography  variant="lead"
        className="mt-2 lg:max-w-4xl mb-8 w-full text-center font-normal !text-gray-500"  color="blue-gray" placeholder="">
        Contribute to the AI community by sharing your insights and expertise
      </Typography>

      <div className="mt-8 w-full grid grid-cols-1 md:grid-cols-3 gap-4 ">
        {EVENT_INFO.map((props, idx) => (
          <AboutCard key={idx} {...props} />
        ))}
      </div>

    </section>
  );
}

export default AboutEvent;
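
// Usage sketch (hypothetical path; assumes a Next.js App Router page and that
// this component is exported from "@/components/about-event"):
//
//   // app/page.tsx
//   import AboutEvent from "@/components/about-event";
//
//   export default function Home() {
//     return <AboutEvent />;
//   }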