diff --git a/app.py b/app.py
index 6fdd8e7efdecf9f87b64f846be0ffe8444053a7b..83b90a7d0ea4961aa0105d469735d7a59eca7756 100644
--- a/app.py
+++ b/app.py
@@ -1,10 +1,22 @@
 import gradio as gr
 from gradio.components import Dropdown
+
 import cv2 as cv
 import torch
 from torchvision import transforms
 from DeePixBiS.Model import DeePixBiS
+import yaml
+import numpy as np
+import pandas as pd
+from skimage.io import imread, imsave
+# from tddfa.TDDFA import TDDFA
+from tddfa.utils.depth import depth
+from tddfa.TDDFA_ONNX import TDDFA_ONNX
+
+import os
+os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
+os.environ['OMP_NUM_THREADS'] = '4'
 
 labels = ['Live', 'Spoof']
 thresh = 0.45
@@ -25,6 +37,8 @@ model = DeePixBiS(pretrained=False)
 model.load_state_dict(torch.load('./DeePixBiS/DeePixBiS.pth'))
 model.eval()
 
+cfg = yaml.load(open('tddfa/configs/mb1_120x120.yml'), Loader=yaml.SafeLoader)
+tddfa = TDDFA_ONNX(gpu_mode=False, **cfg)
 
 def find_largest_face(faces):
     largest_face = None
@@ -37,21 +51,21 @@ def find_largest_face(faces):
             largest_face = (x, y, w, h)
     return largest_face
 
-
 def inference(img, model_name):
     confidences = {}
-    if model_name == 'DeePixBiS':
-        grey = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
-        faces = faceClassifier.detectMultiScale(
-            grey, scaleFactor=1.1, minNeighbors=4)
-        face = find_largest_face(faces)
-
-        if face is not None:
-            x, y, w, h = face
-            faceRegion = img[y:y + h, x:x + w]
-            faceRegion = cv.cvtColor(faceRegion, cv.COLOR_BGR2RGB)
-            faceRegion = tfms(faceRegion)
-            faceRegion = faceRegion.unsqueeze(0)
+    grey = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
+    faces = faceClassifier.detectMultiScale(
+        grey, scaleFactor=1.1, minNeighbors=4)
+    face = find_largest_face(faces)
+
+    if face is not None:
+        x, y, w, h = face
+        faceRegion = img[y:y + h, x:x + w]
+        faceRegion = cv.cvtColor(faceRegion, cv.COLOR_BGR2RGB)
+        faceRegion = tfms(faceRegion)
+        faceRegion = faceRegion.unsqueeze(0)
+
+        if model_name == 'DeePixBiS':
             mask, binary = model.forward(faceRegion)
             res = torch.mean(mask).item()
             if res < thresh:
@@ -61,11 +75,25 @@ def inference(img, model_name):
             else:
                 cls = 'Real'
                 color = (0, 255, 0)
-            label = f'{cls} {res:.2f}'
-            cv.rectangle(img, (x, y), (x + w, y + h), color, 2)
-            cv.putText(img, label, (x, y + h + 30),
+
+        else:
+            dense_flag = True
+            boxes = list(face)
+            boxes.append(1)
+            param_lst, roi_box_lst = tddfa(img, [boxes])
+
+            ver_lst = tddfa.recon_vers(param_lst, roi_box_lst, dense_flag=dense_flag)
+            img = depth(img, ver_lst, tddfa.tri, with_bg_flag=False)
+            cls = 'Other'
+            res = 0.5
+            color = (0, 0, 255)
+
+        label = f'{cls} {res:.2f}'
+        confidences = {label: res}
+        cv.rectangle(img, (x, y), (x + w, y + h), color, 2)
+        cv.putText(img, label, (x, y + h + 30),
                    cv.FONT_HERSHEY_COMPLEX, 1, color)
-            confidences = {label: res}
+
     return img, confidences
diff --git a/requirements.txt b/requirements.txt
index 76ef8c4ac4697e707ffd69eb7e0936eab9b9e8ca..ac74889e181f3a68801cdd2e535f7f0fe3528b4a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,10 @@
 torch
 torchvision
 numpy
 opencv-python
+pyyaml
+onnx
+scikit-image  # imported as `skimage`
+scipy
+onnxruntime
+cython
 --index-url=https://download.pytorch.org/whl/cpu --extra-index-url=https://pypi.org/simple
\ No newline at end of file
diff --git a/tddfa/Sim3DR/.gitignore b/tddfa/Sim3DR/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..60cd06f3aa00ecc33b1e33a7074f4c641713adcd
--- /dev/null
+++ b/tddfa/Sim3DR/.gitignore
@@ -0,0 +1,8 @@
+.DS_Store
+cmake-build-debug/
+.idea/
+build/
+*.so
+data/
+
+lib/rasterize.cpp
\ No newline at end of file
diff --git a/tddfa/Sim3DR/Sim3DR.py b/tddfa/Sim3DR/Sim3DR.py
new file mode 100644
index 0000000000000000000000000000000000000000..d734dad8cd85638ea4a781dd31cd2bec7aec3ebc
--- /dev/null
+++ b/tddfa/Sim3DR/Sim3DR.py
@@ -0,0 +1,29 @@
+# coding: utf-8
+
+from . import _init_paths
+import numpy as np
+import Sim3DR_Cython
+
+
+def get_normal(vertices, triangles):
+    normal = np.zeros_like(vertices, dtype=np.float32)
+    Sim3DR_Cython.get_normal(normal, vertices, triangles, vertices.shape[0], triangles.shape[0])
+    return normal
+
+
+def rasterize(vertices, triangles, colors, bg=None,
+              height=None, width=None, channel=None,
+              reverse=False):
+    if bg is not None:
+        height, width, channel = bg.shape
+    else:
+        assert height is not None and width is not None and channel is not None
+        bg = np.zeros((height, width, channel), dtype=np.uint8)
+
+    buffer = np.zeros((height, width), dtype=np.float32) - 1e8
+
+    if colors.dtype != np.float32:
+        colors = colors.astype(np.float32)
+    Sim3DR_Cython.rasterize(bg, vertices, triangles, colors, buffer, triangles.shape[0], height, width, channel,
+                            reverse=reverse)
+    return bg
diff --git a/tddfa/Sim3DR/__init__.py b/tddfa/Sim3DR/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..672abec77ca2f1b73ab9bd9cca579dba1193acee
--- /dev/null
+++ b/tddfa/Sim3DR/__init__.py
@@ -0,0 +1,4 @@
+# coding: utf-8
+
+from .Sim3DR import get_normal, rasterize
+from .lighting import RenderPipeline
diff --git a/tddfa/Sim3DR/_init_paths.py b/tddfa/Sim3DR/_init_paths.py
new file mode 100644
index 0000000000000000000000000000000000000000..45c30fc2a0f5a6f6dd2191a3193b72f0a32a9274
--- /dev/null
+++ b/tddfa/Sim3DR/_init_paths.py
@@ -0,0 +1,14 @@
+# coding: utf-8
+
+import os.path as osp
+import sys
+
+
+def add_path(path):
+    if path not in sys.path:
+        sys.path.insert(0, path)
+
+
+this_dir = osp.dirname(__file__)
+lib_path = osp.join(this_dir, '.')
+add_path(lib_path)
diff --git a/tddfa/Sim3DR/build_sim3dr.sh b/tddfa/Sim3DR/build_sim3dr.sh
new file mode 100755
index 0000000000000000000000000000000000000000..97c5182275a35f3ba29d0aa94e15dbed4f927690
--- /dev/null
+++ b/tddfa/Sim3DR/build_sim3dr.sh
@@ -0,0 +1 @@
+python3 setup.py build_ext --inplace
\ No newline at end of file
diff --git a/tddfa/Sim3DR/lib/rasterize.h b/tddfa/Sim3DR/lib/rasterize.h
new file mode 100644
index 0000000000000000000000000000000000000000..de7a19c098c2719ab1557643b2e113b4565ede02
--- /dev/null
+++ b/tddfa/Sim3DR/lib/rasterize.h
@@ -0,0 +1,115 @@
+#ifndef MESH_CORE_HPP_
+#define MESH_CORE_HPP_
+
+#include <stdio.h>
+#include <cmath>
+#include <algorithm>
+#include <string>
+#include <iostream>
+#include <fstream>
+
+using namespace std;
+
+class Point3D {
+public:
+    float x;
+    float y;
+    float z;
+
+public:
+    Point3D() : x(0.f), y(0.f), z(0.f) {}
+    Point3D(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
+
+    void initialize(float x_, float y_, float z_){
+        this->x = x_; this->y = y_; this->z = z_;
+    }
+
+    Point3D cross(Point3D &p){
+        Point3D c;
+        c.x = this->y * p.z - this->z * p.y;
+        c.y = this->z * p.x - this->x * p.z;
+        c.z = this->x * p.y - this->y * p.x;
+        return c;
+    }
+
+    float dot(Point3D &p) {
+        return this->x * p.x + this->y * p.y + this->z * p.z;
+    }
+
+    Point3D operator-(const Point3D &p) {
+        Point3D np;
+        np.x = this->x - p.x;
+        np.y = this->y - p.y;
+        np.z = this->z - p.z;
+        return np;
+    }
+
+};
+
+class Point {
+public:
+    float x;
+    float y;
+
+public:
+    Point() : x(0.f), y(0.f) {}
+    Point(float x_, float y_) : x(x_), y(y_) {}
+    float dot(Point p) {
+        return this->x * p.x + this->y * p.y;
+    }
+
+    Point operator-(const Point &p) {
+        Point np;
+        np.x = this->x - p.x;
+        np.y = this->y - p.y;
+        return np;
+    }
+
+    Point operator+(const Point &p) {
+        Point np;
+        np.x = this->x + p.x;
+        np.y = this->y + p.y;
+        return np;
+    }
+
+    Point operator*(float s) {
+        Point np;
+        np.x = s * this->x;
+        np.y = s * this->y;
+        return np;
+    }
+};
+
+
+bool is_point_in_tri(Point p, Point p0, Point p1, Point p2);
+
+void get_point_weight(float *weight, Point p, Point p0, Point p1, Point p2);
+
+void _get_tri_normal(float *tri_normal, float *vertices, int *triangles, int ntri, bool norm_flg);
+
+void _get_ver_normal(float *ver_normal, float *tri_normal, int *triangles, int nver, int ntri);
+
+void _get_normal(float *ver_normal, float *vertices, int *triangles, int nver, int ntri);
+
+void _rasterize_triangles(
+        float *vertices, int *triangles, float *depth_buffer, int *triangle_buffer, float *barycentric_weight,
+        int ntri, int h, int w);
+
+void _rasterize(
+        unsigned char *image, float *vertices, int *triangles, float *colors,
+        float *depth_buffer, int ntri, int h, int w, int c, float alpha, bool reverse);
+
+void _render_texture_core(
+        float *image, float *vertices, int *triangles,
+        float *texture, float *tex_coords, int *tex_triangles,
+        float *depth_buffer,
+        int nver, int tex_nver, int ntri,
+        int h, int w, int c,
+        int tex_h, int tex_w, int tex_c,
+        int mapping_type);
+
+void _write_obj_with_colors_texture(string filename, string mtl_name,
+                                    float *vertices, int *triangles, float *colors, float *uv_coords,
+                                    int nver, int ntri, int ntexver);
+
+#endif
\ No newline at end of file
diff --git a/tddfa/Sim3DR/lib/rasterize.pyx b/tddfa/Sim3DR/lib/rasterize.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..3fb88bb6274461c82c10465fb2fc42cecc8db241
--- /dev/null
+++ b/tddfa/Sim3DR/lib/rasterize.pyx
@@ -0,0 +1,134 @@
+import numpy as np
+cimport numpy as np
+# from libcpp.string cimport string
+cimport cython
+from libcpp cimport bool
+
+# from cpython import bool
+
+# use the Numpy-C-API from Cython
+np.import_array()
+
+# cdefine the signature of our c function
+cdef extern from "rasterize.h":
+    void _rasterize_triangles(
+            float*vertices, int*triangles, float*depth_buffer, int*triangle_buffer, float*barycentric_weight,
+            int ntri, int h, int w
+    )
+
+    void _rasterize(
+            unsigned char*image, float*vertices, int*triangles, float*colors, float*depth_buffer,
+            int ntri, int h, int w, int c, float alpha, bool reverse
+    )
+
+    # void _render_texture_core(
+    #         float* image, float* vertices, int* triangles,
+    #         float* texture, float* tex_coords, int* tex_triangles,
+    #         float* depth_buffer,
+    #         int nver, int tex_nver, int ntri,
+    #         int h, int w, int c,
+    #         int tex_h, int tex_w, int tex_c,
+    #         int mapping_type)
+
+    void _get_tri_normal(float *tri_normal, float *vertices, int *triangles, int nver, bool norm_flg)
+    void _get_ver_normal(float *ver_normal, float*tri_normal, int*triangles, int nver, int ntri)
+    void _get_normal(float *ver_normal, float *vertices, int *triangles, int nver, int ntri)
+
+
+    # void _write_obj_with_colors_texture(string filename, string mtl_name,
+    #                                     float* vertices, int* triangles, float* colors, float* uv_coords,
+    #                                     int nver, int ntri, int ntexver)
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def get_tri_normal(np.ndarray[float, ndim=2, mode="c"] tri_normal not None,
+                   np.ndarray[float, ndim=2, mode = "c"] vertices not None,
+                   np.ndarray[int, ndim=2, mode="c"] triangles not None,
+                   int ntri, bool norm_flg = False):
+    _get_tri_normal(<float*> np.PyArray_DATA(tri_normal),
+                    <float*> np.PyArray_DATA(vertices),
+                    <int*> np.PyArray_DATA(triangles), ntri, norm_flg)
+
+@cython.boundscheck(False)  # turn off bounds-checking for entire function
+@cython.wraparound(False)  # turn off negative index wrapping for entire function
+def get_ver_normal(np.ndarray[float, ndim=2, mode = "c"] ver_normal not None,
+                   np.ndarray[float, ndim=2, mode = "c"] tri_normal not None,
+                   np.ndarray[int, ndim=2, mode="c"] triangles not None,
+                   int nver, int ntri):
+    _get_ver_normal(
+        <float*> np.PyArray_DATA(ver_normal), <float*> np.PyArray_DATA(tri_normal), <int*> np.PyArray_DATA(triangles),
+        nver, ntri)
+
+@cython.boundscheck(False)  # turn off bounds-checking for entire function
+@cython.wraparound(False)  # turn off negative index wrapping for entire function
+def get_normal(np.ndarray[float, ndim=2, mode = "c"] ver_normal not None,
+               np.ndarray[float, ndim=2, mode = "c"] vertices not None,
+               np.ndarray[int, ndim=2, mode="c"] triangles not None,
+               int nver, int ntri):
+    _get_normal(
+        <float*> np.PyArray_DATA(ver_normal), <float*> np.PyArray_DATA(vertices), <int*> np.PyArray_DATA(triangles),
+        nver, ntri)
+
+
+@cython.boundscheck(False)  # turn off bounds-checking for entire function
+@cython.wraparound(False)  # turn off negative index wrapping for entire function
+def rasterize_triangles(
+        np.ndarray[float, ndim=2, mode = "c"] vertices not None,
+        np.ndarray[int, ndim=2, mode="c"] triangles not None,
+        np.ndarray[float, ndim=2, mode = "c"] depth_buffer not None,
+        np.ndarray[int, ndim=2, mode = "c"] triangle_buffer not None,
+        np.ndarray[float, ndim=2, mode = "c"] barycentric_weight not None,
+        int ntri, int h, int w
+):
+    _rasterize_triangles(
+        <float*> np.PyArray_DATA(vertices), <int*> np.PyArray_DATA(triangles),
+        <float*> np.PyArray_DATA(depth_buffer), <int*> np.PyArray_DATA(triangle_buffer),
+        <float*> np.PyArray_DATA(barycentric_weight),
+        ntri, h, w)
+
+@cython.boundscheck(False)  # turn off bounds-checking for entire function
+@cython.wraparound(False)  # turn off negative index wrapping for entire function
+def rasterize(np.ndarray[unsigned char, ndim=3, mode = "c"] image not None,
+              np.ndarray[float, ndim=2, mode = "c"] vertices not None,
+              np.ndarray[int, ndim=2, mode="c"] triangles not None,
+              np.ndarray[float, ndim=2, mode = "c"] colors not None,
+              np.ndarray[float, ndim=2, mode = "c"] depth_buffer not None,
+              int ntri, int h, int w, int c, float alpha = 1, bool reverse = False
+              ):
+    _rasterize(
+        <unsigned char*> np.PyArray_DATA(image), <float*> np.PyArray_DATA(vertices),
+        <int*> np.PyArray_DATA(triangles),
+        <float*> np.PyArray_DATA(colors),
+        <float*> np.PyArray_DATA(depth_buffer),
+        ntri, h, w, c, alpha, reverse)
+
+# def render_texture_core(np.ndarray[float, ndim=3, mode = "c"] image not None,
+#                         np.ndarray[float, ndim=2, mode = "c"] vertices not None,
+#                         np.ndarray[int, ndim=2, mode="c"] triangles not None,
+#                         np.ndarray[float, ndim=3, mode = "c"] texture not None,
+#                         np.ndarray[float, ndim=2, mode = "c"] tex_coords not None,
+#                         np.ndarray[int, ndim=2, mode="c"] tex_triangles not None,
+#                         np.ndarray[float, ndim=2, mode = "c"] depth_buffer not None,
+#                         int nver, int tex_nver, int ntri,
+#                         int h, int w, int c,
+#                         int tex_h, int tex_w, int tex_c,
+#                         int mapping_type
+#                         ):
+#     _render_texture_core(
+#         np.PyArray_DATA(image), np.PyArray_DATA(vertices), np.PyArray_DATA(triangles),
+#         np.PyArray_DATA(texture), np.PyArray_DATA(tex_coords), np.PyArray_DATA(tex_triangles),
+#         np.PyArray_DATA(depth_buffer),
+#         nver, tex_nver, ntri,
+#         h, w, c,
+#         tex_h, tex_w, tex_c,
+#         mapping_type)
+#
+# def write_obj_with_colors_texture_core(string filename, string mtl_name,
+#                                        np.ndarray[float, ndim=2, mode = "c"] vertices not None,
+#                                        np.ndarray[int, ndim=2, mode="c"] triangles not None,
+#                                        np.ndarray[float, ndim=2, mode = "c"] colors not None,
+#                                        np.ndarray[float, ndim=2, mode = "c"] uv_coords not None,
+#                                        int nver, int ntri, int ntexver
+#                                        ):
+#     _write_obj_with_colors_texture(filename, mtl_name,
+#                                    np.PyArray_DATA(vertices), np.PyArray_DATA(triangles), np.PyArray_DATA(colors), np.PyArray_DATA(uv_coords),
+#                                    nver, ntri, ntexver)
diff --git a/tddfa/Sim3DR/lib/rasterize_kernel.cpp b/tddfa/Sim3DR/lib/rasterize_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c714f96ab078dcd35cb33a157c2c5b8614c9a0c9
--- /dev/null
+++ b/tddfa/Sim3DR/lib/rasterize_kernel.cpp
@@ -0,0 +1,499 @@
+/*
+ Author: Yao Feng
+ Modified by Jianzhu Guo
+
+ Functions that cannot be optimized by vectorization in Python:
+ 1. rasterization (needs to process each triangle)
+ 2. normal of each vertex (uses the one-ring, needs to process each vertex)
+ 3. write obj (it seems this could be vectorized? anyway, writing it in C++ is simple,
+    so the function is also added here --> however, why is the C++ version still slow?)
+*/
+
+#include "rasterize.h"
+
+
+/* Judge whether point p lies inside the triangle (p0, p1, p2)
+Method:
+    http://blackpawn.com/texts/pointinpoly/
+Args:
+    p: [x, y]
+    p0, p1, p2: the three 2D vertices of the triangle
+Returns:
+    bool: true if the point is inside the triangle
+*/
+bool is_point_in_tri(Point p, Point p0, Point p1, Point p2) {
+    // vectors
+    Point v0, v1, v2;
+    v0 = p2 - p0;
+    v1 = p1 - p0;
+    v2 = p - p0;
+
+    // dot products
+    float dot00 = v0.dot(v0); //v0.x * v0.x + v0.y * v0.y //np.dot(v0.T, v0)
+    float dot01 = v0.dot(v1); //v0.x * v1.x + v0.y * v1.y //np.dot(v0.T, v1)
+    float dot02 = v0.dot(v2); //v0.x * v2.x + v0.y * v2.y //np.dot(v0.T, v2)
+    float dot11 = v1.dot(v1); //v1.x * v1.x + v1.y * v1.y //np.dot(v1.T, v1)
+    float dot12 = v1.dot(v2); //v1.x * v2.x + v1.y * v2.y //np.dot(v1.T, v2)
+
+    // barycentric coordinates
+    float inverDeno;
+    if (dot00 * dot11 - dot01 * dot01 == 0)
+        inverDeno = 0;
+    else
+        inverDeno = 1 / (dot00 * dot11 - dot01 * dot01);
+
+    float u = (dot11 * dot02 - dot01 * dot12) * inverDeno;
+    float v = (dot00 * dot12 - dot01 * dot02) * inverDeno;
+
+    // check if the point is in the triangle
+    return (u >= 0) && (v >= 0) && (u + v < 1);
+}
+
+void get_point_weight(float *weight, Point p, Point p0, Point p1, Point p2) {
+    // vectors
+    Point v0, v1, v2;
+    v0 = p2 - p0;
+    v1 = p1 - p0;
+    v2 = p - p0;
+
+    // dot products
+    float dot00 = v0.dot(v0); //v0.x * v0.x + v0.y * v0.y //np.dot(v0.T, v0)
+    float dot01 = v0.dot(v1); //v0.x * v1.x + v0.y * v1.y //np.dot(v0.T, v1)
+    float dot02 = v0.dot(v2); //v0.x * v2.x + v0.y * v2.y //np.dot(v0.T, v2)
+    float dot11 = v1.dot(v1); //v1.x * v1.x + v1.y * v1.y //np.dot(v1.T, v1)
+    float dot12 = v1.dot(v2); //v1.x * v2.x + v1.y * v2.y //np.dot(v1.T, v2)
+
+    // barycentric coordinates
+    float inverDeno;
+    if (dot00 * dot11 - dot01 * dot01 == 0)
+        inverDeno = 0;
+    else
+        inverDeno = 1 / (dot00 * dot11 - dot01 * dot01);
+
+    float u = (dot11 * dot02 - dot01 * dot12) * inverDeno;
+    float v = (dot00 * dot12 - dot01 * dot02) * inverDeno;
+
+    // weight
+    weight[0] = 1 - u - v;
+    weight[1] = v;
+    weight[2] = u;
+}
+
+/*
+ * Get normals of triangles.
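+ * Each triangle normal is the cross product of two edge vectors taken in
+ * counter-clockwise order, n = (p1 - p0) x (p2 - p0); it is divided by its
+ * length only when norm_flg is set.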
+ */
+void _get_tri_normal(float *tri_normal, float *vertices, int *triangles, int ntri, bool norm_flg) {
+    int tri_p0_ind, tri_p1_ind, tri_p2_ind;
+    float v1x, v1y, v1z, v2x, v2y, v2z;
+
+    for (int i = 0; i < ntri; i++) {
+        tri_p0_ind = triangles[3 * i];
+        tri_p1_ind = triangles[3 * i + 1];
+        tri_p2_ind = triangles[3 * i + 2];
+
+        // counter clockwise order
+        v1x = vertices[3 * tri_p1_ind] - vertices[3 * tri_p0_ind];
+        v1y = vertices[3 * tri_p1_ind + 1] - vertices[3 * tri_p0_ind + 1];
+        v1z = vertices[3 * tri_p1_ind + 2] - vertices[3 * tri_p0_ind + 2];
+
+        v2x = vertices[3 * tri_p2_ind] - vertices[3 * tri_p0_ind];
+        v2y = vertices[3 * tri_p2_ind + 1] - vertices[3 * tri_p0_ind + 1];
+        v2z = vertices[3 * tri_p2_ind + 2] - vertices[3 * tri_p0_ind + 2];
+
+        if (norm_flg) {
+            float c1 = v1y * v2z - v1z * v2y;
+            float c2 = v1z * v2x - v1x * v2z;
+            float c3 = v1x * v2y - v1y * v2x;
+            float det = sqrt(c1 * c1 + c2 * c2 + c3 * c3);
+            if (det <= 0) det = 1e-6;
+            tri_normal[3 * i] = c1 / det;
+            tri_normal[3 * i + 1] = c2 / det;
+            tri_normal[3 * i + 2] = c3 / det;
+        } else {
+            tri_normal[3 * i] = v1y * v2z - v1z * v2y;
+            tri_normal[3 * i + 1] = v1z * v2x - v1x * v2z;
+            tri_normal[3 * i + 2] = v1x * v2y - v1y * v2x;
+        }
+    }
+}
+
+/*
+ * Get normal vector of vertices using triangle normals
+ */
+void _get_ver_normal(float *ver_normal, float *tri_normal, int *triangles, int nver, int ntri) {
+    int tri_p0_ind, tri_p1_ind, tri_p2_ind;
+
+    for (int i = 0; i < ntri; i++) {
+        tri_p0_ind = triangles[3 * i];
+        tri_p1_ind = triangles[3 * i + 1];
+        tri_p2_ind = triangles[3 * i + 2];
+
+        for (int j = 0; j < 3; j++) {
+            ver_normal[3 * tri_p0_ind + j] += tri_normal[3 * i + j];
+            ver_normal[3 * tri_p1_ind + j] += tri_normal[3 * i + j];
+            ver_normal[3 * tri_p2_ind + j] += tri_normal[3 * i + j];
+        }
+    }
+
+    // normalizing
+    float nx, ny, nz, det;
+    for (int i = 0; i < nver; ++i) {
+        nx = ver_normal[3 * i];
+        ny = ver_normal[3 * i + 1];
+        nz = ver_normal[3 * i + 2];
+
+        det = sqrt(nx * nx + ny * ny + nz * nz);
+        if (det <= 0) det = 1e-6;
+        ver_normal[3 * i] = nx / det;
+        ver_normal[3 * i + 1] = ny / det;
+        ver_normal[3 * i + 2] = nz / det;
+    }
+}
+
+/*
+ * Directly get normal of vertices, which can be regarded as a combination of _get_tri_normal and _get_ver_normal
+ */
+void _get_normal(float *ver_normal, float *vertices, int *triangles, int nver, int ntri) {
+    int tri_p0_ind, tri_p1_ind, tri_p2_ind;
+    float v1x, v1y, v1z, v2x, v2y, v2z;
+
+    // get tri_normal
+//    float tri_normal[3 * ntri];
+    float *tri_normal;
+    tri_normal = new float[3 * ntri];
+    for (int i = 0; i < ntri; i++) {
+        tri_p0_ind = triangles[3 * i];
+        tri_p1_ind = triangles[3 * i + 1];
+        tri_p2_ind = triangles[3 * i + 2];
+
+        // counter clockwise order
+        v1x = vertices[3 * tri_p1_ind] - vertices[3 * tri_p0_ind];
+        v1y = vertices[3 * tri_p1_ind + 1] - vertices[3 * tri_p0_ind + 1];
+        v1z = vertices[3 * tri_p1_ind + 2] - vertices[3 * tri_p0_ind + 2];
+
+        v2x = vertices[3 * tri_p2_ind] - vertices[3 * tri_p0_ind];
+        v2y = vertices[3 * tri_p2_ind + 1] - vertices[3 * tri_p0_ind + 1];
+        v2z = vertices[3 * tri_p2_ind + 2] - vertices[3 * tri_p0_ind + 2];
+
+
+        tri_normal[3 * i] = v1y * v2z - v1z * v2y;
+        tri_normal[3 * i + 1] = v1z * v2x - v1x * v2z;
+        tri_normal[3 * i + 2] = v1x * v2y - v1y * v2x;
+
+    }
+
+    // get ver_normal
+    for (int i = 0; i < ntri; i++) {
+        tri_p0_ind = triangles[3 * i];
+        tri_p1_ind = triangles[3 * i + 1];
+        tri_p2_ind = triangles[3 * i + 2];
+
+        for (int j = 0; j < 3; j++) {
+            ver_normal[3 * tri_p0_ind + j] += tri_normal[3 * i + j];
+            ver_normal[3 * tri_p1_ind + j] += tri_normal[3 * i + j];
+            ver_normal[3 * tri_p2_ind + j] += tri_normal[3 * i + j];
+        }
+    }
+
+    // normalizing
+    float nx, ny, nz, det;
+    for (int i = 0; i < nver; ++i) {
+        nx = ver_normal[3 * i];
+        ny = ver_normal[3 * i + 1];
+        nz = ver_normal[3 * i + 2];
+
+        det = sqrt(nx * nx + ny * ny + nz * nz);
+        if (det <= 0) det = 1e-6;
+        ver_normal[3 * i] = nx / det;
+        ver_normal[3 * i + 1] = ny / det;
+        ver_normal[3 * i + 2] = nz / det;
+    }
+
+    delete[] tri_normal;
+}
+
+// rasterization by Z-Buffer with optimization
+// Complexity: < ntri * h * w * c
+void _rasterize(
+        unsigned char *image, float *vertices, int *triangles, float *colors, float *depth_buffer,
+        int ntri, int h, int w, int c, float alpha, bool reverse) {
+    int x, y, k;
+    int tri_p0_ind, tri_p1_ind, tri_p2_ind;
+    Point p0, p1, p2, p;
+    int x_min, x_max, y_min, y_max;
+    float p_depth, p0_depth, p1_depth, p2_depth;
+    float p_color, p0_color, p1_color, p2_color;
+    float weight[3];
+
+    for (int i = 0; i < ntri; i++) {
+        tri_p0_ind = triangles[3 * i];
+        tri_p1_ind = triangles[3 * i + 1];
+        tri_p2_ind = triangles[3 * i + 2];
+
+        p0.x = vertices[3 * tri_p0_ind];
+        p0.y = vertices[3 * tri_p0_ind + 1];
+        p0_depth = vertices[3 * tri_p0_ind + 2];
+        p1.x = vertices[3 * tri_p1_ind];
+        p1.y = vertices[3 * tri_p1_ind + 1];
+        p1_depth = vertices[3 * tri_p1_ind + 2];
+        p2.x = vertices[3 * tri_p2_ind];
+        p2.y = vertices[3 * tri_p2_ind + 1];
+        p2_depth = vertices[3 * tri_p2_ind + 2];
+
+        x_min = max((int) ceil(min(p0.x, min(p1.x, p2.x))), 0);
+        x_max = min((int) floor(max(p0.x, max(p1.x, p2.x))), w - 1);
+
+        y_min = max((int) ceil(min(p0.y, min(p1.y, p2.y))), 0);
+        y_max = min((int) floor(max(p0.y, max(p1.y, p2.y))), h - 1);
+
+        if (x_max < x_min || y_max < y_min) {
+            continue;
+        }
+
+        for (y = y_min; y <= y_max; y++) {
+            for (x = x_min; x <= x_max; x++) {
+                p.x = float(x);
+                p.y = float(y);
+
+                // call get_point_weight once; the barycentric weights double as the inside test
+                get_point_weight(weight, p, p0, p1, p2);
+
+                // the point lies inside the triangle iff all three weights are positive
+                if (weight[2] > 0 && weight[1] > 0 && weight[0] > 0) {
+                    p_depth = weight[0] * p0_depth + weight[1] * p1_depth + weight[2] * p2_depth;
+
+                    if ((p_depth > depth_buffer[y * w + x])) {
+                        for (k = 0; k < c; k++) {
+                            p0_color = colors[c * tri_p0_ind + k];
+                            p1_color = colors[c * tri_p1_ind + k];
+                            p2_color = colors[c * tri_p2_ind + k];
+
+                            p_color = weight[0] * p0_color + weight[1] * p1_color + weight[2] * p2_color;
+                            if (reverse) {
+                                image[(h - 1 - y) * w * c + x * c + k] = (unsigned char) (
+                                        (1 - alpha) * image[(h - 1 - y) * w * c + x * c + k] + alpha * 255 * p_color);
+//                                image[(h - 1 - y) * w * c + x * c + k] = (unsigned char) (255 * p_color);
+                            } else {
+                                image[y * w * c + x * c + k] = (unsigned char) (
+                                        (1 - alpha) * image[y * w * c + x * c + k] + alpha * 255 * p_color);
+//                                image[y * w * c + x * c + k] = (unsigned char) (255 * p_color);
+                            }
+                        }
+
+                        depth_buffer[y * w + x] = p_depth;
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+void _rasterize_triangles(
+        float *vertices, int *triangles, float *depth_buffer, int *triangle_buffer, float *barycentric_weight,
+        int ntri, int h, int w) {
+    int i;
+    int x, y, k;
+    int tri_p0_ind, tri_p1_ind, tri_p2_ind;
+    Point p0, p1, p2, p;
+    int x_min, x_max, y_min, y_max;
+    float p_depth, p0_depth, p1_depth, p2_depth;
+    float weight[3];
+
+    for (i = 0; i < ntri; i++) {
+        tri_p0_ind = triangles[3 * i];
+        tri_p1_ind = triangles[3 * i + 1];
+        tri_p2_ind = triangles[3 * i + 2];
+
+        p0.x = vertices[3 * tri_p0_ind];
+        p0.y = vertices[3 * tri_p0_ind + 1];
+        p0_depth = vertices[3 * tri_p0_ind + 2];
+        p1.x = vertices[3 * tri_p1_ind];
+        p1.y = vertices[3 * tri_p1_ind + 1];
+        p1_depth = vertices[3 * tri_p1_ind + 2];
+        p2.x = vertices[3 * tri_p2_ind];
+        p2.y = vertices[3 * tri_p2_ind + 1];
+        p2_depth = vertices[3 * tri_p2_ind + 2];
+
+        x_min = max((int) ceil(min(p0.x, min(p1.x, p2.x))), 0);
+        x_max = min((int) floor(max(p0.x, max(p1.x, p2.x))), w - 1);
+
+        y_min = max((int) ceil(min(p0.y, min(p1.y, p2.y))), 0);
+        y_max = min((int) floor(max(p0.y, max(p1.y, p2.y))), h - 1);
+
+        if (x_max < x_min || y_max < y_min) {
+            continue;
+        }
+
+        for (y = y_min; y <= y_max; y++) //h
+        {
+            for (x = x_min; x <= x_max; x++) //w
+            {
+                p.x = x;
+                p.y = y;
+//                if (p.x < 2 || p.x > w - 3 || p.y < 2 || p.y > h - 3 || is_point_in_tri(p, p0, p1, p2)) {
+                if (is_point_in_tri(p, p0, p1, p2)) {
+                    get_point_weight(weight, p, p0, p1, p2);
+                    p_depth = weight[0] * p0_depth + weight[1] * p1_depth + weight[2] * p2_depth;
+
+                    if ((p_depth > depth_buffer[y * w + x])) {
+                        depth_buffer[y * w + x] = p_depth;
+                        triangle_buffer[y * w + x] = i;
+                        for (k = 0; k < 3; k++) {
+                            barycentric_weight[y * w * 3 + x * 3 + k] = weight[k];
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+// Depth-buffer algorithm
+// https://blog.csdn.net/Jurbo/article/details/75007260
+void _render_texture_core(
+        float *image, float *vertices, int *triangles,
+        float *texture, float *tex_coords, int *tex_triangles,
+        float *depth_buffer,
+        int nver, int tex_nver, int ntri,
+        int h, int w, int c,
+        int tex_h, int tex_w, int tex_c,
+        int mapping_type) {
+    int i;
+    int x, y, k;
+    int tri_p0_ind, tri_p1_ind, tri_p2_ind;
+    int tex_tri_p0_ind, tex_tri_p1_ind, tex_tri_p2_ind;
+    Point p0, p1, p2, p;
+    Point tex_p0, tex_p1, tex_p2, tex_p;
+    int x_min, x_max, y_min, y_max;
+    float weight[3];
+    float p_depth, p0_depth, p1_depth, p2_depth;
+    float xd, yd;
+    float ul, ur, dl, dr;
+    for (i = 0; i < ntri; i++) {
+        // mesh
+        tri_p0_ind = triangles[3 * i];
+        tri_p1_ind = triangles[3 * i + 1];
+        tri_p2_ind = triangles[3 * i + 2];
+
+        p0.x = vertices[3 * tri_p0_ind];
+        p0.y = vertices[3 * tri_p0_ind + 1];
+        p0_depth = vertices[3 * tri_p0_ind + 2];
+        p1.x = vertices[3 * tri_p1_ind];
+        p1.y = vertices[3 * tri_p1_ind + 1];
+        p1_depth = vertices[3 * tri_p1_ind + 2];
+        p2.x = vertices[3 * tri_p2_ind];
+        p2.y = vertices[3 * tri_p2_ind + 1];
+        p2_depth = vertices[3 * tri_p2_ind + 2];
+
+        // texture
+        tex_tri_p0_ind = tex_triangles[3 * i];
+        tex_tri_p1_ind = tex_triangles[3 * i + 1];
+        tex_tri_p2_ind = tex_triangles[3 * i + 2];
+
+        tex_p0.x = tex_coords[3 * tex_tri_p0_ind];
+        tex_p0.y = tex_coords[3 * tex_tri_p0_ind + 1];
+        tex_p1.x = tex_coords[3 * tex_tri_p1_ind];
+        tex_p1.y = tex_coords[3 * tex_tri_p1_ind + 1];
+        tex_p2.x = tex_coords[3 * tex_tri_p2_ind];
+        tex_p2.y = tex_coords[3 * tex_tri_p2_ind + 1];
+
+
+        x_min = max((int) ceil(min(p0.x, min(p1.x, p2.x))), 0);
+        x_max = min((int) floor(max(p0.x, max(p1.x, p2.x))), w - 1);
+
+        y_min = max((int) ceil(min(p0.y, min(p1.y, p2.y))), 0);
+        y_max = min((int) floor(max(p0.y, max(p1.y, p2.y))), h - 1);
+
+
+        if (x_max < x_min || y_max < y_min) {
+            continue;
+        }
+
+        for (y = y_min; y <= y_max; y++) //h
+        {
+            for (x = x_min; x <= x_max; x++) //w
+            {
+                p.x = x;
+                p.y = y;
+                if (p.x < 2 || p.x > w - 3 || p.y < 2 || p.y > h - 3 || is_point_in_tri(p, p0, p1, p2)) {
+                    get_point_weight(weight, p, p0, p1, p2);
+                    p_depth = weight[0] * p0_depth + weight[1] * p1_depth + weight[2] * p2_depth;
+
+                    if ((p_depth > depth_buffer[y * w + x])) {
+                        // -- color from texture
+                        // cal weight in mesh tri
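+                        // The barycentric weights that locate p inside the screen-space
+                        // triangle are reused to interpolate a texture coordinate, which is
+                        // then sampled nearest-neighbour or bilinearly (mapping_type below).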
+                        get_point_weight(weight, p, p0, p1, p2);
+                        // cal coord in texture
+                        tex_p = tex_p0 * weight[0] + tex_p1 * weight[1] + tex_p2 * weight[2];
+                        tex_p.x = max(min(tex_p.x, float(tex_w - 1)), float(0));
+                        tex_p.y = max(min(tex_p.y, float(tex_h - 1)), float(0));
+
+                        yd = tex_p.y - floor(tex_p.y);
+                        xd = tex_p.x - floor(tex_p.x);
+                        for (k = 0; k < c; k++) {
+                            if (mapping_type == 0)// nearest
+                            {
+                                image[y * w * c + x * c + k] = texture[int(round(tex_p.y)) * tex_w * tex_c +
+                                                                       int(round(tex_p.x)) * tex_c + k];
+                            } else//bilinear interp
+                            {
+                                ul = texture[(int) floor(tex_p.y) * tex_w * tex_c + (int) floor(tex_p.x) * tex_c + k];
+                                ur = texture[(int) floor(tex_p.y) * tex_w * tex_c + (int) ceil(tex_p.x) * tex_c + k];
+                                dl = texture[(int) ceil(tex_p.y) * tex_w * tex_c + (int) floor(tex_p.x) * tex_c + k];
+                                dr = texture[(int) ceil(tex_p.y) * tex_w * tex_c + (int) ceil(tex_p.x) * tex_c + k];
+
+                                image[y * w * c + x * c + k] =
+                                        ul * (1 - xd) * (1 - yd) + ur * xd * (1 - yd) + dl * (1 - xd) * yd +
+                                        dr * xd * yd;
+                            }
+
+                        }
+
+                        depth_buffer[y * w + x] = p_depth;
+                    }
+                }
+            }
+        }
+    }
+}
+
+
+// ------------------------------------------------- write
+// obj write
+// Ref: https://github.com/patrikhuber/eos/blob/master/include/eos/core/Mesh.hpp
+void _write_obj_with_colors_texture(string filename, string mtl_name,
+                                    float *vertices, int *triangles, float *colors, float *uv_coords,
+                                    int nver, int ntri, int ntexver) {
+    int i;
+
+    ofstream obj_file(filename);
+
+    // first line of the obj file: the mtl name
+    obj_file << "mtllib " << mtl_name << endl;
+
+    // write vertices
+    for (i = 0; i < nver; ++i) {
+        obj_file << "v " << vertices[3 * i] << " " << vertices[3 * i + 1] << " " << vertices[3 * i + 2] << " "
+                 << colors[3 * i] << " " << colors[3 * i + 1] << " " << colors[3 * i + 2] << endl;
+    }
+
+    // write uv coordinates
+    for (i = 0; i < ntexver; ++i) {
+        //obj_file << "vt " << uv_coords[2*i] << " " << (1 - uv_coords[2*i + 1]) << endl;
+        obj_file << "vt " << uv_coords[2 * i] << " " << uv_coords[2 * i + 1] << endl;
+    }
+
+    obj_file << "usemtl FaceTexture" << endl;
+    // write triangles
+    for (i = 0; i < ntri; ++i) {
+        // obj_file << "f " << triangles[3*i] << "/" << triangles[3*i] << " " << triangles[3*i + 1] << "/" << triangles[3*i + 1] << " " << triangles[3*i + 2] << "/" << triangles[3*i + 2] << endl;
+        obj_file << "f " << triangles[3 * i + 2] << "/" << triangles[3 * i + 2] << " " << triangles[3 * i + 1] << "/"
+                 << triangles[3 * i + 1] << " " << triangles[3 * i] << "/" << triangles[3 * i] << endl;
+    }
+
+}
\ No newline at end of file
diff --git a/tddfa/Sim3DR/lighting.py b/tddfa/Sim3DR/lighting.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f4a3520b36056a09a80b9eb1a09a8bdd159ee94
--- /dev/null
+++ b/tddfa/Sim3DR/lighting.py
@@ -0,0 +1,79 @@
+# coding: utf-8
+
+import numpy as np
+from .Sim3DR import get_normal, rasterize
+
+_norm = lambda arr: arr / np.sqrt(np.sum(arr ** 2, axis=1))[:, None]
+
+
+def norm_vertices(vertices):
+    vertices -= vertices.min(0)[None, :]
+    vertices /= vertices.max()
+    vertices *= 2
+    vertices -= vertices.max(0)[None, :] / 2
+    return vertices
+
+
+def convert_type(obj):
+    if isinstance(obj, tuple) or isinstance(obj, list):
+        return np.array(obj, dtype=np.float32)[None, :]
+    return obj
+
+
+class RenderPipeline(object):
+    def __init__(self, **kwargs):
+        self.intensity_ambient = convert_type(kwargs.get('intensity_ambient', 0.3))
+        self.intensity_directional = convert_type(kwargs.get('intensity_directional', 0.6))
+        self.intensity_specular = convert_type(kwargs.get('intensity_specular', 0.1))
+        self.specular_exp = kwargs.get('specular_exp', 5)
+        self.color_ambient = convert_type(kwargs.get('color_ambient', (1, 1, 1)))
+        self.color_directional = convert_type(kwargs.get('color_directional', (1, 1, 1)))
+        self.light_pos = convert_type(kwargs.get('light_pos', (0, 0, 5)))
+        self.view_pos = convert_type(kwargs.get('view_pos', (0, 0, 5)))
+
+    def update_light_pos(self, light_pos):
+        self.light_pos = convert_type(light_pos)
+
+    def __call__(self, vertices, triangles, bg, texture=None):
+        # 1. normals
+        normal = get_normal(vertices, triangles)
+
+        # 2. lighting
+        light = np.zeros_like(vertices, dtype=np.float32)
+        # ambient component
+        if self.intensity_ambient > 0:
+            light += self.intensity_ambient * self.color_ambient
+
+        vertices_n = norm_vertices(vertices.copy())
+        if self.intensity_directional > 0:
+            # diffuse component
+            direction = _norm(self.light_pos - vertices_n)
+            cos = np.sum(normal * direction, axis=1)[:, None]
+            # cos = np.clip(cos, 0, 1)  # todo: check below
+            light += self.intensity_directional * (self.color_directional * np.clip(cos, 0, 1))
+
+            # specular component
+            if self.intensity_specular > 0:
+                v2v = _norm(self.view_pos - vertices_n)
+                reflection = 2 * cos * normal - direction
+                spe = np.sum((v2v * reflection) ** self.specular_exp, axis=1)[:, None]
+                spe = np.where(cos != 0, np.clip(spe, 0, 1), np.zeros_like(spe))
+                light += self.intensity_specular * self.color_directional * np.clip(spe, 0, 1)
+        light = np.clip(light, 0, 1)
+
+        # 3. rasterization, [0, 1]
+        if texture is None:
+            render_img = rasterize(vertices, triangles, light, bg=bg)
+            return render_img
+        else:
+            texture *= light
+            render_img = rasterize(vertices, triangles, texture, bg=bg)
+            return render_img
+
+
+def main():
+    pass
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tddfa/Sim3DR/readme.md b/tddfa/Sim3DR/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..bec8938a5ed0ef6eda66bfc8fc9cf73e887133fd
--- /dev/null
+++ b/tddfa/Sim3DR/readme.md
@@ -0,0 +1,8 @@
+## Sim3DR
+This is a simple 3D renderer, written in C++ and Cython.
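+
+A minimal usage sketch (illustrative only; it assumes the extension has been built as
+described below, and uses random geometry just to exercise the call):
+
+```python
+import numpy as np
+from Sim3DR import rasterize
+
+# toy inputs: float32 (N, 3) vertices in pixel space (z is depth),
+# int32 (M, 3) triangles, per-vertex colors in [0, 1]
+vertices = (np.random.rand(100, 3) * 120).astype(np.float32)
+triangles = np.random.randint(0, 100, (50, 3)).astype(np.int32)
+colors = np.random.rand(100, 3).astype(np.float32)
+
+img = rasterize(vertices, triangles, colors, height=120, width=120, channel=3)
+print(img.shape)  # (120, 120, 3), uint8
+```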
+
+### Build Sim3DR
+
+```shell script
+python3 setup.py build_ext --inplace
+```
\ No newline at end of file
diff --git a/tddfa/Sim3DR/setup.py b/tddfa/Sim3DR/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae1614f3e314b49fa7983626b6a575670a395ddf
--- /dev/null
+++ b/tddfa/Sim3DR/setup.py
@@ -0,0 +1,19 @@
+'''
+python setup.py build_ext -i
+to compile
+'''
+
+from distutils.core import setup, Extension
+from Cython.Build import cythonize
+from Cython.Distutils import build_ext
+import numpy
+
+setup(
+    name='Sim3DR_Cython',  # not the package name
+    cmdclass={'build_ext': build_ext},
+    ext_modules=[Extension("Sim3DR_Cython",
+                           sources=["lib/rasterize.pyx", "lib/rasterize_kernel.cpp"],
+                           language='c++',
+                           include_dirs=[numpy.get_include()],
+                           extra_compile_args=["-std=c++11"])],
+)
diff --git a/tddfa/Sim3DR/tests/.gitignore b/tddfa/Sim3DR/tests/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..567609b1234a9b8806c5a05da6c866e480aa148d
--- /dev/null
+++ b/tddfa/Sim3DR/tests/.gitignore
@@ -0,0 +1 @@
+build/
diff --git a/tddfa/Sim3DR/tests/CMakeLists.txt b/tddfa/Sim3DR/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6fbc312395691a93b27ba68d0002eb8626789c5e
--- /dev/null
+++ b/tddfa/Sim3DR/tests/CMakeLists.txt
@@ -0,0 +1,13 @@
+cmake_minimum_required(VERSION 2.8)
+
+set(TARGET test)
+project(${TARGET})
+
+#find_package( OpenCV REQUIRED )
+#include_directories( ${OpenCV_INCLUDE_DIRS} )
+
+#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -O3")
+include_directories(../lib)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -std=c++11")
+add_executable(${TARGET} test.cpp ../lib/rasterize_kernel.cpp io.cpp)
+target_include_directories(${TARGET} PRIVATE ${PROJECT_SOURCE_DIR})
diff --git a/tddfa/Sim3DR/tests/io.cpp b/tddfa/Sim3DR/tests/io.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cd7b115f1cef9e66eeb7381767cb45478ec0695b
--- /dev/null
+++ b/tddfa/Sim3DR/tests/io.cpp
@@ -0,0 +1,89 @@
+#include "io.h"
+
+//void load_obj(const string obj_fp, float* vertices, float* colors, float* triangles){
+//    string line;
+//    ifstream in(obj_fp);
+//
+//    if(in.is_open()){
+//        while (getline(in, line)){
+//            stringstream ss(line);
+//
+//            char t; // type: v, f
+//            ss >> t;
+//            if (t == 'v'){
+//
+//            }
+//        }
+//    }
+//}
+
+void load_obj(const char *obj_fp, float *vertices, float *colors, int *triangles, int nver, int ntri) {
+    FILE *fp;
+    fp = fopen(obj_fp, "r");
+
+    char t; // type: v or f
+    if (fp != nullptr) {
+        for (int i = 0; i < nver; ++i) {
+            fscanf(fp, "%c", &t);
+            for (int j = 0; j < 3; ++j)
+                fscanf(fp, " %f", &vertices[3 * i + j]);
+            for (int j = 0; j < 3; ++j)
+                fscanf(fp, " %f", &colors[3 * i + j]);
+            fscanf(fp, "\n");
+        }
+//        fscanf(fp, "%c", &t);
+        for (int i = 0; i < ntri; ++i) {
+            fscanf(fp, "%c", &t);
+            for (int j = 0; j < 3; ++j) {
+                fscanf(fp, " %d", &triangles[3 * i + j]);
+                triangles[3 * i + j] -= 1;
+            }
+            fscanf(fp, "\n");
+        }
+
+        fclose(fp);
+    }
+}
+
+void load_ply(const char *ply_fp, float *vertices, int *triangles, int nver, int ntri) {
+    FILE *fp;
+    fp = fopen(ply_fp, "r");
+
+//    char s[256];
+    char t;
+    if (fp != nullptr) {
+//        for (int i = 0; i < 9; ++i)
+//            fscanf(fp, "%s", s);
+        for (int i = 0; i < nver; ++i)
+            fscanf(fp, "%f %f %f\n", &vertices[3 * i], &vertices[3 * i + 1], &vertices[3 * i + 2]);
+
+        for (int i = 0; i < ntri; ++i)
+            fscanf(fp, "%c %d %d %d\n", &t, &triangles[3 * i], &triangles[3 * i + 1], &triangles[3 * i + 2]);
+
+        fclose(fp);
+    }
+}
+
+void write_ppm(const char *filename, unsigned char *img, int h, int w, int c) {
+    FILE *fp;
+    //open file for output
+    fp = fopen(filename, "wb");
+    if (!fp) {
+        fprintf(stderr, "Unable to open file '%s'\n", filename);
+        exit(1);
+    }
+
+    //write the header file
+    //image format
+    fprintf(fp, "P6\n");
+
+    //image size
+    fprintf(fp, "%d %d\n", w, h);
+
+    // rgb component depth
+    fprintf(fp, "%d\n", MAX_PXL_VALUE);
+
+    // pixel data
+    fwrite(img, sizeof(unsigned char), size_t(h * w * c), fp);
+    fclose(fp);
+}
\ No newline at end of file
diff --git a/tddfa/Sim3DR/tests/io.h b/tddfa/Sim3DR/tests/io.h
new file mode 100644
index 0000000000000000000000000000000000000000..1f6d3eecf8392a1d5440c28f54434b6d0bfc6c75
--- /dev/null
+++ b/tddfa/Sim3DR/tests/io.h
@@ -0,0 +1,20 @@
+#ifndef IO_H_
+#define IO_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <fstream>
+#include <sstream>
+
+using namespace std;
+
+#define MAX_PXL_VALUE 255
+
+void load_obj(const char* obj_fp, float* vertices, float* colors, int* triangles, int nver, int ntri);
+void load_ply(const char* ply_fp, float* vertices, int* triangles, int nver, int ntri);
+
+
+void write_ppm(const char *filename, unsigned char *img, int h, int w, int c);
+
+#endif
\ No newline at end of file
diff --git a/tddfa/Sim3DR/tests/test.cpp b/tddfa/Sim3DR/tests/test.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..74273681f14110ddcb06217c1fe5c702a4052227
--- /dev/null
+++ b/tddfa/Sim3DR/tests/test.cpp
@@ -0,0 +1,172 @@
+/*
+ * Testing cases
+ */
+
+#include <iostream>
+#include <time.h>
+#include "rasterize.h"
+#include "io.h"
+
+void test_isPointInTri() {
+    Point p0(0, 0);
+    Point p1(1, 0);
+    Point p2(1, 1);
+
+    Point p(0.2, 0.2);
+
+    if (is_point_in_tri(p, p0, p1, p2))
+        std::cout << "In";
+    else
+        std::cout << "Out";
+    std::cout << std::endl;
+}
+
+void test_getPointWeight() {
+    Point p0(0, 0);
+    Point p1(1, 0);
+    Point p2(1, 1);
+
+    Point p(0.2, 0.2);
+
+    float weight[3];
+    get_point_weight(weight, p, p0, p1, p2);
+    std::cout << weight[0] << " " << weight[1] << " " << weight[2] << std::endl;
+}
+
+void test_get_tri_normal() {
+    float tri_normal[3];
+//    float vertices[9] = {1, 0, 0, 0, 0, 0, 0, 1, 0};
+    float vertices[9] = {1, 1.1, 0, 0, 0, 0, 0, 0.6, 0.7};
+    int triangles[3] = {0, 1, 2};
+    int ntri = 1;
+
+    _get_tri_normal(tri_normal, vertices, triangles, ntri, true);
+
+    for (int i = 0; i < 3; ++i)
+        std::cout << tri_normal[i] << ", ";
+    std::cout << std::endl;
+}
+
+void test_load_obj() {
+    const char *fp = "../data/vd005_mesh.obj";
+    int nver = 35709;
+    int ntri = 70789;
+
+    auto *vertices = new float[3 * nver];
+    auto *colors = new float[3 * nver];
+    auto *triangles = new int[3 * ntri];
+    load_obj(fp, vertices, colors, triangles, nver, ntri);
+
+    delete[] vertices;
+    delete[] colors;
+    delete[] triangles;
+}
+
+void test_render() {
+    // 1. loading obj
+//    const char *fp = "/Users/gjz/gjzprojects/Sim3DR/data/vd005_mesh.obj";
+    const char *fp = "/Users/gjz/gjzprojects/Sim3DR/data/face1.obj";
+    int nver = 35709; //53215; //35709;
+    int ntri = 70789; //105840;//70789;
+
+    auto *vertices = new float[3 * nver];
+    auto *colors = new float[3 * nver];
+    auto *triangles = new int[3 * ntri];
+    load_obj(fp, vertices, colors, triangles, nver, ntri);
+
+    // 2. rendering
+    int h = 224, w = 224, c = 3;
+
+    // enlarging
+    int scale = 4;
+    h *= scale;
+    w *= scale;
+    for (int i = 0; i < nver * 3; ++i) vertices[i] *= scale;
+
+    auto *image = new unsigned char[h * w * c]();
+    auto *depth_buffer = new float[h * w]();
+
+    for (int i = 0; i < h * w; ++i) depth_buffer[i] = -999999;
+
+    clock_t t;
+    t = clock();
+
+    _rasterize(image, vertices, triangles, colors, depth_buffer, ntri, h, w, c, 1.0f, true);
+    t = clock() - t;
+    double time_taken = ((double) t) / CLOCKS_PER_SEC; // in seconds
+    printf("Render took %f seconds to execute \n", time_taken);
+
+
+//    auto *image_char = new u_char[h * w * c]();
+//    for (int i = 0; i < h * w * c; ++i)
+//        image_char[i] = u_char(255 * image[i]);
+    write_ppm("res.ppm", image, h, w, c);
+
+//    delete[] image_char;
+    delete[] vertices;
+    delete[] colors;
+    delete[] triangles;
+    delete[] image;
+    delete[] depth_buffer;
+}
+
+void test_light() {
+    // 1. loading obj
+    const char *fp = "/Users/gjz/gjzprojects/Sim3DR/data/emma_input_0_noheader.ply";
+    int nver = 53215; //35709;
+    int ntri = 105840; //70789;
+
+    auto *vertices = new float[3 * nver];
+    auto *colors = new float[3 * nver];
+    auto *triangles = new int[3 * ntri];
+    load_ply(fp, vertices, triangles, nver, ntri);
+
+    // 2. rendering
+//    int h = 1901, w = 3913, c = 3;
+    int h = 2000, w = 4000, c = 3;
+
+    // enlarging
+//    int scale = 1;
+//    h *= scale;
+//    w *= scale;
+//    for (int i = 0; i < nver * 3; ++i) vertices[i] *= scale;
+
+    auto *image = new unsigned char[h * w * c]();
+    auto *depth_buffer = new float[h * w]();
+
+    for (int i = 0; i < h * w; ++i) depth_buffer[i] = -999999;
+    for (int i = 0; i < 3 * nver; ++i) colors[i] = 0.8;
+
+    clock_t t;
+    t = clock();
+
+    _rasterize(image, vertices, triangles, colors, depth_buffer, ntri, h, w, c, 1.0f, true);
+    t = clock() - t;
+    double time_taken = ((double) t) / CLOCKS_PER_SEC; // in seconds
+    printf("Render took %f seconds to execute \n", time_taken);
+
+
+//    auto *image_char = new u_char[h * w * c]();
+//    for (int i = 0; i < h * w * c; ++i)
+//        image_char[i] = u_char(255 * image[i]);
+    write_ppm("emma.ppm", image, h, w, c);
+
+//    delete[] image_char;
+    delete[] vertices;
+    delete[] colors;
+    delete[] triangles;
+    delete[] image;
+    delete[] depth_buffer;
+}
+
+int main(int argc, char *argv[]) {
+//    std::cout << "Hello CMake!" << std::endl;
+
+//    test_isPointInTri();
+//    test_getPointWeight();
+//    test_get_tri_normal();
+//    test_load_obj();
+//    test_render();
+    test_light();
+    return 0;
+}
\ No newline at end of file
diff --git a/tddfa/TDDFA.py b/tddfa/TDDFA.py
new file mode 100644
index 0000000000000000000000000000000000000000..970294e4fc2e470dd3bf4ceb9c63efe719b13fa9
--- /dev/null
+++ b/tddfa/TDDFA.py
@@ -0,0 +1,143 @@
+# coding: utf-8
+
+__author__ = 'cleardusk'
+
+import os.path as osp
+import time
+import numpy as np
+import cv2
+import torch
+from torchvision.transforms import Compose
+import torch.backends.cudnn as cudnn
+
+from tddfa import models
+from tddfa.bfm import BFMModel
+from tddfa.utils.io import _load
+from tddfa.utils.functions import (
+    crop_img, parse_roi_box_from_bbox, parse_roi_box_from_landmark,
+)
+from tddfa.utils.tddfa_util import (
+    load_model, _parse_param, similar_transform,
+    ToTensorGjz, NormalizeGjz
+)
+
+make_abs_path = lambda fn: osp.join(osp.dirname(osp.realpath(__file__)), fn)
+
+
+class TDDFA(object):
+    """TDDFA: Three-D Dense Face Alignment (TDDFA)"""
+
+    def __init__(self, **kvs):
+        torch.set_grad_enabled(False)
+
+        # load BFM
+        self.bfm = BFMModel(
+            bfm_fp=kvs.get('bfm_fp', make_abs_path('configs/bfm_noneck_v3.pkl')),
+            shape_dim=kvs.get('shape_dim', 40),
+            exp_dim=kvs.get('exp_dim', 10)
+        )
+        self.tri = self.bfm.tri
+
+        # config
+        self.gpu_mode = kvs.get('gpu_mode', False)
+        self.gpu_id = kvs.get('gpu_id', 0)
+        self.size = kvs.get('size', 120)
+
+        param_mean_std_fp = kvs.get(
+            'param_mean_std_fp', make_abs_path(f'configs/param_mean_std_62d_{self.size}x{self.size}.pkl')
+        )
+
+        # load model, default output is dimension with length 62 = 12(pose) + 40(shape) + 10(expression)
+        model = getattr(models, kvs.get('arch'))(
+            num_classes=kvs.get('num_params', 62),
+            widen_factor=kvs.get('widen_factor', 1),
+            size=self.size,
+            mode=kvs.get('mode', 'small')
+        )
+        model = load_model(model, kvs.get('checkpoint_fp'))
+
+        if self.gpu_mode:
+            cudnn.benchmark = True
+            model = model.cuda(device=self.gpu_id)
+
+        self.model = model
+        self.model.eval()  # eval mode, fix BN
+
+        # data normalization
+        transform_normalize = NormalizeGjz(mean=127.5, std=128)
+        transform_to_tensor = ToTensorGjz()
+        transform = Compose([transform_to_tensor, transform_normalize])
+        self.transform = transform
+
+        # params normalization config
+        r = _load(param_mean_std_fp)
+        self.param_mean = r.get('mean')
+        self.param_std = r.get('std')
+
+        # print('param_mean and param_srd', self.param_mean, self.param_std)
+
+    def __call__(self, img_ori, objs, **kvs):
+        """The main call of TDDFA: given an image and boxes / landmarks, return 3DMM params and roi_box
+        :param img_ori: the input image
+        :param objs: the list of boxes or landmarks
+        :param kvs: options
+        :return: param list and roi_box list
+        """
+        # Crop image, forward to get the param
+        param_lst = []
+        roi_box_lst = []
+
+        crop_policy = kvs.get('crop_policy', 'box')
+        for obj in objs:
+            if crop_policy == 'box':
+                # by face box
+                roi_box = parse_roi_box_from_bbox(obj)
+            elif crop_policy == 'landmark':
+                # by landmarks
+                roi_box = parse_roi_box_from_landmark(obj)
+            else:
+                raise ValueError(f'Unknown crop policy {crop_policy}')
+
+            roi_box_lst.append(roi_box)
+            img = crop_img(img_ori, roi_box)
+            img = cv2.resize(img, dsize=(self.size, self.size), interpolation=cv2.INTER_LINEAR)
+            inp = self.transform(img).unsqueeze(0)
+
+            if self.gpu_mode:
+                inp = inp.cuda(device=self.gpu_id)
+
+            if kvs.get('timer_flag', False):
+                end = time.time()
+                param = self.model(inp)
+                elapse = f'Inference: {(time.time() - end) * 1000:.1f}ms'
+                print(elapse)
+            else:
+                param = self.model(inp)
+
+            param = param.squeeze().cpu().numpy().flatten().astype(np.float32)
+            param = param * self.param_std + self.param_mean  # re-scale
+            # print('output', param)
+            param_lst.append(param)
+
+        return param_lst, roi_box_lst
+
+    def recon_vers(self, param_lst, roi_box_lst, **kvs):
+        dense_flag = kvs.get('dense_flag', False)
+        size = self.size
+
+        ver_lst = []
+        for param, roi_box in zip(param_lst, roi_box_lst):
+            if dense_flag:
+                R, offset, alpha_shp, alpha_exp = _parse_param(param)
+                pts3d = R @ (self.bfm.u + self.bfm.w_shp @ alpha_shp + self.bfm.w_exp @ alpha_exp). \
+                    reshape(3, -1, order='F') + offset
+                pts3d = similar_transform(pts3d, roi_box, size)
+            else:
+                R, offset, alpha_shp, alpha_exp = _parse_param(param)
+                pts3d = R @ (self.bfm.u_base + self.bfm.w_shp_base @ alpha_shp + self.bfm.w_exp_base @ alpha_exp). \
+                    reshape(3, -1, order='F') + offset
+                pts3d = similar_transform(pts3d, roi_box, size)
+
+            ver_lst.append(pts3d)
+
+        return ver_lst
diff --git a/tddfa/TDDFA_ONNX.py b/tddfa/TDDFA_ONNX.py
new file mode 100644
index 0000000000000000000000000000000000000000..75fc00691b6d5f89f87439f51073a40c7cd9d132
--- /dev/null
+++ b/tddfa/TDDFA_ONNX.py
@@ -0,0 +1,118 @@
+# coding: utf-8
+
+__author__ = 'cleardusk'
+
+import os.path as osp
+import numpy as np
+import cv2
+import onnxruntime
+
+from tddfa.utils.onnx import convert_to_onnx
+from tddfa.utils.io import _load
+from tddfa.utils.functions import (
+    crop_img, parse_roi_box_from_bbox, parse_roi_box_from_landmark,
+)
+from tddfa.utils.tddfa_util import _parse_param, similar_transform
+from tddfa.bfm.bfm import BFMModel
+from tddfa.bfm.bfm_onnx import convert_bfm_to_onnx
+
+make_abs_path = lambda fn: osp.join(osp.dirname(osp.realpath(__file__)), fn)
+
+
+class TDDFA_ONNX(object):
+    """TDDFA_ONNX: the ONNX version of Three-D Dense Face Alignment (TDDFA)"""
+
+    def __init__(self, **kvs):
+        # torch.set_grad_enabled(False)
+
+        # load onnx version of BFM
+        bfm_fp = kvs.get('bfm_fp', make_abs_path('configs/bfm_noneck_v3.pkl'))
+        bfm_onnx_fp = bfm_fp.replace('.pkl', '.onnx')
+        if not osp.exists(bfm_onnx_fp):
+            convert_bfm_to_onnx(
+                bfm_onnx_fp,
+                shape_dim=kvs.get('shape_dim', 40),
+                exp_dim=kvs.get('exp_dim', 10)
+            )
+        self.bfm_session = onnxruntime.InferenceSession(bfm_onnx_fp, None)
+
+        # load for optimization
+        bfm = BFMModel(bfm_fp, shape_dim=kvs.get('shape_dim', 40), exp_dim=kvs.get('exp_dim', 10))
+        self.tri = bfm.tri
+        self.u_base, self.w_shp_base, self.w_exp_base = bfm.u_base, bfm.w_shp_base, bfm.w_exp_base
+
+        # config
+        self.gpu_mode = kvs.get('gpu_mode', False)
+        self.gpu_id = kvs.get('gpu_id', 0)
+        self.size = kvs.get('size', 120)
+
+        param_mean_std_fp = kvs.get(
+            'param_mean_std_fp', make_abs_path(f'configs/param_mean_std_62d_{self.size}x{self.size}.pkl')
+        )
+
+        onnx_fp = kvs.get('onnx_fp', kvs.get('checkpoint_fp').replace('.pth', '.onnx'))
+
+        # convert to onnx online if it does not exist yet
+        if onnx_fp is None or not osp.exists(onnx_fp):
+            print(f'{onnx_fp} does not exist, try to convert the `.pth` version to `.onnx` online')
+            onnx_fp = convert_to_onnx(**kvs)
+
+        self.session = onnxruntime.InferenceSession(onnx_fp, None)
+
+        # params normalization config
+        r = _load(param_mean_std_fp)
+        self.param_mean = r.get('mean')
+        self.param_std = r.get('std')
+
+    def __call__(self, img_ori, objs, **kvs):
+        # Crop image, forward to get the param
+        param_lst = []
+        roi_box_lst = []
+
+        crop_policy = kvs.get('crop_policy', 'box')
+        for obj in objs:
+            if crop_policy == 'box':
+                # by face box
+                roi_box = parse_roi_box_from_bbox(obj)
+            elif crop_policy == 'landmark':
+                # by landmarks
+                roi_box = parse_roi_box_from_landmark(obj)
+            else:
+                raise ValueError(f'Unknown crop policy {crop_policy}')
+
+            roi_box_lst.append(roi_box)
+            img = crop_img(img_ori, roi_box)
+            img = cv2.resize(img, dsize=(self.size, self.size), interpolation=cv2.INTER_LINEAR)
+            img = img.astype(np.float32).transpose(2, 0, 1)[np.newaxis, ...]
+            img = (img - 127.5) / 128.
+
+            inp_dct = {'input': img}
+
+            param = self.session.run(None, inp_dct)[0]
+            param = param.flatten().astype(np.float32)
+            param = param * self.param_std + self.param_mean  # re-scale
+            param_lst.append(param)
+
+        return param_lst, roi_box_lst
+
+    def recon_vers(self, param_lst, roi_box_lst, **kvs):
+        dense_flag = kvs.get('dense_flag', False)
+        size = self.size
+
+        ver_lst = []
+        for param, roi_box in zip(param_lst, roi_box_lst):
+            R, offset, alpha_shp, alpha_exp = _parse_param(param)
+            if dense_flag:
+                inp_dct = {
+                    'R': R, 'offset': offset, 'alpha_shp': alpha_shp, 'alpha_exp': alpha_exp
+                }
+                pts3d = self.bfm_session.run(None, inp_dct)[0]
+                pts3d = similar_transform(pts3d, roi_box, size)
+            else:
+                pts3d = R @ (self.u_base + self.w_shp_base @ alpha_shp + self.w_exp_base @ alpha_exp). \
+                    reshape(3, -1, order='F') + offset
+                pts3d = similar_transform(pts3d, roi_box, size)
+
+            ver_lst.append(pts3d)
+
+        return ver_lst
diff --git a/tddfa/bfm/.gitignore b/tddfa/bfm/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..d0f1ed02b725c4d1bfbd7715b72adeb7d2d020e1
--- /dev/null
+++ b/tddfa/bfm/.gitignore
@@ -0,0 +1 @@
+*.ply
diff --git a/tddfa/bfm/__init__.py b/tddfa/bfm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d16cb66031567401786aaa7a7e38c5f818c8290a
--- /dev/null
+++ b/tddfa/bfm/__init__.py
@@ -0,0 +1 @@
+from .bfm import BFMModel
\ No newline at end of file
diff --git a/tddfa/bfm/bfm.py b/tddfa/bfm/bfm.py
new file mode 100644
index 0000000000000000000000000000000000000000..be2a3896f36fb297701a11ec0986f693b61c07ce
--- /dev/null
+++ b/tddfa/bfm/bfm.py
@@ -0,0 +1,40 @@
+# coding: utf-8
+
+__author__ = 'cleardusk'
+
+import sys
+
+sys.path.append('..')
+
+import os.path as osp
+import numpy as np
+from tddfa.utils.io import _load
+
+make_abs_path = lambda fn: osp.join(osp.dirname(osp.realpath(__file__)), fn)
+
+
+def _to_ctype(arr):
+    if not arr.flags.c_contiguous:
+        return arr.copy(order='C')
+    return arr
+
+
+class BFMModel(object):
+    def __init__(self, bfm_fp, shape_dim=40, exp_dim=10):
+        bfm = _load(bfm_fp)
+        self.u = bfm.get('u').astype(np.float32)  # fix bug
+        self.w_shp = bfm.get('w_shp').astype(np.float32)[..., :shape_dim]
+        self.w_exp = bfm.get('w_exp').astype(np.float32)[..., :exp_dim]
+        if osp.split(bfm_fp)[-1] == 'bfm_noneck_v3.pkl':
+            self.tri = _load(make_abs_path('../configs/tri.pkl'))  # this tri/face is re-built for bfm_noneck_v3
+        else:
+            self.tri = bfm.get('tri')
+
+        self.tri = _to_ctype(self.tri.T).astype(np.int32)
+        self.keypoints = bfm.get('keypoints').astype(np.int64)  # fix bug; np.long was removed in recent NumPy
+        w = np.concatenate((self.w_shp, self.w_exp), axis=1)
+        self.w_norm = np.linalg.norm(w, axis=0)
+
+        self.u_base = self.u[self.keypoints].reshape(-1, 1)
+        self.w_shp_base = self.w_shp[self.keypoints]
+        self.w_exp_base = self.w_exp[self.keypoints]
diff --git a/tddfa/bfm/bfm_onnx.py b/tddfa/bfm/bfm_onnx.py
new file mode 100644
index 0000000000000000000000000000000000000000..70b17a44aa06b5d99246ac0f6681c81f3a22bf2d
--- /dev/null
+++ b/tddfa/bfm/bfm_onnx.py
@@ -0,0 +1,98 @@
+# coding: utf-8
+
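+# The decoder below reconstructs a dense face mesh from 3DMM coefficients as
+#   pts3d = R @ (u + W @ alpha) + offset
+# where u is the mean shape, W stacks the shape and expression bases, and
+# alpha = [alpha_shp; alpha_exp]; exporting it to ONNX lets recon_vers run
+# through onnxruntime instead of torch at inference time.
+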
+__author__ = 'cleardusk'
+
+import sys
+
+sys.path.append('..')
+
+import os.path as osp
+import numpy as np
+import torch
+import torch.nn as nn
+
+from tddfa.utils.io import _load, _numpy_to_cuda, _numpy_to_tensor
+
+make_abs_path = lambda fn: osp.join(osp.dirname(osp.realpath(__file__)), fn)
+
+
+def _to_ctype(arr):
+    if not arr.flags.c_contiguous:
+        return arr.copy(order='C')
+    return arr
+
+
+def _load_tri(bfm_fp):
+    if osp.split(bfm_fp)[-1] == 'bfm_noneck_v3.pkl':
+        tri = _load(make_abs_path('../configs/tri.pkl'))  # this tri/face is re-built for bfm_noneck_v3
+    else:
+        tri = _load(bfm_fp).get('tri')
+
+    tri = _to_ctype(tri.T).astype(np.int32)
+    return tri
+
+
+class BFMModel_ONNX(nn.Module):
+    """BFM serves as a decoder"""
+
+    def __init__(self, bfm_fp, shape_dim=40, exp_dim=10):
+        super(BFMModel_ONNX, self).__init__()
+
+        _to_tensor = _numpy_to_tensor
+
+        # load bfm
+        bfm = _load(bfm_fp)
+
+        u = _to_tensor(bfm.get('u').astype(np.float32))
+        self.u = u.view(-1, 3).transpose(1, 0)
+        w_shp = _to_tensor(bfm.get('w_shp').astype(np.float32)[..., :shape_dim])
+        w_exp = _to_tensor(bfm.get('w_exp').astype(np.float32)[..., :exp_dim])
+        w = torch.cat((w_shp, w_exp), dim=1)
+        self.w = w.view(-1, 3, w.shape[-1]).contiguous().permute(1, 0, 2)
+
+        # self.u = _to_tensor(bfm.get('u').astype(np.float32))  # fix bug
+        # w_shp = _to_tensor(bfm.get('w_shp').astype(np.float32)[..., :shape_dim])
+        # w_exp = _to_tensor(bfm.get('w_exp').astype(np.float32)[..., :exp_dim])
+        # self.w = torch.cat((w_shp, w_exp), dim=1)
+
+        # self.keypoints = bfm.get('keypoints').astype(np.long)  # fix bug
+        # self.u_base = self.u[self.keypoints].reshape(-1, 1)
+        # self.w_shp_base = self.w_shp[self.keypoints]
+        # self.w_exp_base = self.w_exp[self.keypoints]
+
+    def forward(self, *inps):
+        R, offset, alpha_shp, alpha_exp = inps
+        alpha = torch.cat((alpha_shp, alpha_exp))
+        # pts3d = R @ (self.u + self.w_shp.matmul(alpha_shp) + self.w_exp.matmul(alpha_exp)). \
+        #     view(-1, 3).transpose(1, 0) + offset
+        # pts3d = R @ (self.u + self.w.matmul(alpha)).view(-1, 3).transpose(1, 0) + offset
+        pts3d = R @ (self.u + self.w.matmul(alpha).squeeze()) + offset
+        return pts3d
+
+
+def convert_bfm_to_onnx(bfm_onnx_fp, shape_dim=40, exp_dim=10):
+    # print(shape_dim, exp_dim)
+    bfm_fp = bfm_onnx_fp.replace('.onnx', '.pkl')
+    bfm_decoder = BFMModel_ONNX(bfm_fp=bfm_fp, shape_dim=shape_dim, exp_dim=exp_dim)
+    bfm_decoder.eval()
+
+    # dummy_input = torch.randn(12 + shape_dim + exp_dim)
+    dummy_input = torch.randn(3, 3), torch.randn(3, 1), torch.randn(shape_dim, 1), torch.randn(exp_dim, 1)
+    R, offset, alpha_shp, alpha_exp = dummy_input
+    torch.onnx.export(
+        bfm_decoder,
+        (R, offset, alpha_shp, alpha_exp),
+        bfm_onnx_fp,
+        input_names=['R', 'offset', 'alpha_shp', 'alpha_exp'],
+        output_names=['output'],
+        dynamic_axes={
+            'alpha_shp': [0],
+            'alpha_exp': [0],
+        },
+        do_constant_folding=True
+    )
+    print(f'Convert {bfm_fp} to {bfm_onnx_fp} done.')
+
+
+if __name__ == '__main__':
+    convert_bfm_to_onnx('../configs/bfm_noneck_v3.onnx')
diff --git a/tddfa/bfm/readme.md b/tddfa/bfm/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..3a40fc41818f63045c3e03babbe4f1b9bfc2e3dd
--- /dev/null
+++ b/tddfa/bfm/readme.md
@@ -0,0 +1,23 @@
+## Statement
+
+The modified BFM2009 face model in `../configs/bfm_noneck_v3.pkl` is only for academic use.
+For commercial use, you need to apply for a commercial license; some references are below:
+
+[1] https://faces.dmi.unibas.ch/bfm/?nav=1-0&id=basel_face_model
+
+[2] https://faces.dmi.unibas.ch/bfm/bfm2019.html
+
+If your work benefits from this repo, please cite
+
+    @PROCEEDINGS{bfm09,
+      title={A 3D Face Model for Pose and Illumination Invariant Face Recognition},
+      author={P. Paysan and R. Knothe and B. Amberg
+              and S. Romdhani and T. Vetter},
+      journal={Proceedings of the 6th IEEE International Conference on Advanced Video and Signal based Surveillance (AVSS)
+               for Security, Safety and Monitoring in Smart Environments},
+      organization={IEEE},
+      year={2009},
+      address = {Genova, Italy},
+    }
+
\ No newline at end of file
diff --git a/tddfa/build.sh b/tddfa/build.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3dbd41c68e9c682766d08e3f71f4627483cf381d
--- /dev/null
+++ b/tddfa/build.sh
@@ -0,0 +1,7 @@
+cd tddfa/Sim3DR
+sh ./build_sim3dr.sh
+cd ../..
+
+cd tddfa/utils/asset
+gcc -shared -Wall -O3 render.c -o render.so -fPIC
+cd ../../..
\ No newline at end of file
diff --git a/tddfa/configs/.gitignore b/tddfa/configs/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..5f6ba3bd84ea6eb09369cc3bf46c41ac823d906c
--- /dev/null
+++ b/tddfa/configs/.gitignore
@@ -0,0 +1,3 @@
+*.pkl
+*.yml
+*.onnx
\ No newline at end of file
diff --git a/tddfa/configs/BFM_UV.mat b/tddfa/configs/BFM_UV.mat
new file mode 100644
index 0000000000000000000000000000000000000000..fa205443b62307647358facf6fd92b13166d0d86
Binary files /dev/null and b/tddfa/configs/BFM_UV.mat differ
diff --git a/tddfa/configs/indices.npy b/tddfa/configs/indices.npy
new file mode 100644
index 0000000000000000000000000000000000000000..8dc0fbd5fcd3f46528ce0c8cb40fd5fa8432bed4
Binary files /dev/null and b/tddfa/configs/indices.npy differ
diff --git a/tddfa/configs/ncc_code.npy b/tddfa/configs/ncc_code.npy
new file mode 100644
index 0000000000000000000000000000000000000000..b1411ff3a716a1d87d43c865525007e6d65fd2f0
Binary files /dev/null and b/tddfa/configs/ncc_code.npy differ
diff --git a/tddfa/configs/readme.md b/tddfa/configs/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..b6930cd18576f8e1b5c729c1f8729972c9d6067d
--- /dev/null
+++ b/tddfa/configs/readme.md
@@ -0,0 +1,3 @@
+## The simplified version of BFM
+
+`bfm_noneck_v3_slim.pkl`: [Google Drive](https://drive.google.com/file/d/1iK5lD49E_gCn9voUjWDPj2ItGKvM10GI/view?usp=sharing) or [Baidu Drive](https://pan.baidu.com/s/1C_SzYBOG3swZA_EjxpXlAw) (Password: p803)
\ No newline at end of file
diff --git a/tddfa/models/__init__.py b/tddfa/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e86ed62ed5073038e6616c35a22021f44987c19
--- /dev/null
+++ b/tddfa/models/__init__.py
@@ -0,0 +1,3 @@
+from .mobilenet_v1 import *
+from .mobilenet_v3 import *
+from .resnet import *
\ No newline at end of file
diff --git a/tddfa/models/mobilenet_v1.py b/tddfa/models/mobilenet_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c9baf5103865bfbe981db02d0f754469068c9f3
--- /dev/null
+++ b/tddfa/models/mobilenet_v1.py
@@ -0,0 +1,163 @@
+# coding: utf-8
+
+from __future__ import division
+
+"""
+Creates a MobileNet model as defined in:
+Andrew G. Howard, Menglong Zhu, Bo Chen, et al. (2017).
+MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications.
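+
+A depthwise separable block (DepthWiseBlock below) factorizes a KxK convolution
+into a per-channel KxK depthwise convolution followed by a 1x1 pointwise one,
+cutting multiply-adds by roughly a factor of K^2 (about 8-9x for K=3).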
+Copyright (c) Yang Lu, 2017
+
+Modified by cleardusk
+"""
+import math
+import torch.nn as nn
+
+__all__ = ['MobileNet', 'mobilenet']
+
+
+# __all__ = ['mobilenet_2', 'mobilenet_1', 'mobilenet_075', 'mobilenet_05', 'mobilenet_025']
+
+
+class DepthWiseBlock(nn.Module):
+    def __init__(self, inplanes, planes, stride=1, prelu=False):
+        super(DepthWiseBlock, self).__init__()
+        inplanes, planes = int(inplanes), int(planes)
+        self.conv_dw = nn.Conv2d(inplanes, inplanes, kernel_size=3, padding=1, stride=stride, groups=inplanes,
+                                 bias=False)
+        self.bn_dw = nn.BatchNorm2d(inplanes)
+        self.conv_sep = nn.Conv2d(inplanes, planes, kernel_size=1, stride=1, padding=0, bias=False)
+        self.bn_sep = nn.BatchNorm2d(planes)
+        if prelu:
+            self.relu = nn.PReLU()
+        else:
+            self.relu = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        out = self.conv_dw(x)
+        out = self.bn_dw(out)
+        out = self.relu(out)
+
+        out = self.conv_sep(out)
+        out = self.bn_sep(out)
+        out = self.relu(out)
+
+        return out
+
+
+class MobileNet(nn.Module):
+    def __init__(self, widen_factor=1.0, num_classes=1000, prelu=False, input_channel=3):
+        """ Constructor
+        Args:
+            widen_factor: width multiplier applied to each layer's channel count
+            num_classes: number of classes
+        """
+        super(MobileNet, self).__init__()
+
+        block = DepthWiseBlock
+        self.conv1 = nn.Conv2d(input_channel, int(32 * widen_factor), kernel_size=3, stride=2, padding=1,
+                               bias=False)
+
+        self.bn1 = nn.BatchNorm2d(int(32 * widen_factor))
+        if prelu:
+            self.relu = nn.PReLU()
+        else:
+            self.relu = nn.ReLU(inplace=True)
+
+        self.dw2_1 = block(32 * widen_factor, 64 * widen_factor, prelu=prelu)
+        self.dw2_2 = block(64 * widen_factor, 128 * widen_factor, stride=2, prelu=prelu)
+
+        self.dw3_1 = block(128 * widen_factor, 128 * widen_factor, prelu=prelu)
+        self.dw3_2 = block(128 * widen_factor, 256 * widen_factor, stride=2, prelu=prelu)
+
+        self.dw4_1 = block(256 * widen_factor, 256 * widen_factor, prelu=prelu)
+        self.dw4_2 = block(256 * widen_factor, 512 * widen_factor, stride=2, prelu=prelu)
+
+        self.dw5_1 = block(512 * widen_factor, 512 * widen_factor, prelu=prelu)
+        self.dw5_2 = block(512 * widen_factor, 512 * widen_factor, prelu=prelu)
+        self.dw5_3 = block(512 * widen_factor, 512 * widen_factor, prelu=prelu)
+        self.dw5_4 = block(512 * widen_factor, 512 * widen_factor, prelu=prelu)
+        self.dw5_5 = block(512 * widen_factor, 512 * widen_factor, prelu=prelu)
+        self.dw5_6 = block(512 * widen_factor, 1024 * widen_factor, stride=2, prelu=prelu)
+
+        self.dw6 = block(1024 * widen_factor, 1024 * widen_factor, prelu=prelu)
+
+        self.avgpool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Linear(int(1024 * widen_factor), num_classes)
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+
+        x = self.dw2_1(x)
+        x = self.dw2_2(x)
+        x = self.dw3_1(x)
+        x = self.dw3_2(x)
+        x = self.dw4_1(x)
+        x = self.dw4_2(x)
+        x = self.dw5_1(x)
+        x = self.dw5_2(x)
+        x = self.dw5_3(x)
+        x = self.dw5_4(x)
+        x = self.dw5_5(x)
+        x = self.dw5_6(x)
+        x = self.dw6(x)
+
+        x = self.avgpool(x)
+        x = x.view(x.size(0), -1)
+        x = self.fc(x)
+
+        return x
+
+
+def mobilenet(**kwargs):
+    """
+    Construct MobileNet.
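+    Example (a minimal sketch; these kwargs match the defaults used below):
+        >>> model = mobilenet(widen_factor=1.0, num_classes=62)
+    Supported widen_factor settings: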
+ widen_factor=1.0 for mobilenet_1 + widen_factor=0.75 for mobilenet_075 + widen_factor=0.5 for mobilenet_05 + widen_factor=0.25 for mobilenet_025 + """ + # widen_factor = 1.0, num_classes = 1000 + # model = MobileNet(widen_factor=widen_factor, num_classes=num_classes) + # return model + + model = MobileNet( + widen_factor=kwargs.get('widen_factor', 1.0), + num_classes=kwargs.get('num_classes', 62) + ) + return model + + +def mobilenet_2(num_classes=62, input_channel=3): + model = MobileNet(widen_factor=2.0, num_classes=num_classes, input_channel=input_channel) + return model + + +def mobilenet_1(num_classes=62, input_channel=3): + model = MobileNet(widen_factor=1.0, num_classes=num_classes, input_channel=input_channel) + return model + + +def mobilenet_075(num_classes=62, input_channel=3): + model = MobileNet(widen_factor=0.75, num_classes=num_classes, input_channel=input_channel) + return model + + +def mobilenet_05(num_classes=62, input_channel=3): + model = MobileNet(widen_factor=0.5, num_classes=num_classes, input_channel=input_channel) + return model + + +def mobilenet_025(num_classes=62, input_channel=3): + model = MobileNet(widen_factor=0.25, num_classes=num_classes, input_channel=input_channel) + return model diff --git a/tddfa/models/mobilenet_v3.py b/tddfa/models/mobilenet_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..e5eaf7f1c13821402e07a9dd46a35b9a9d39b859 --- /dev/null +++ b/tddfa/models/mobilenet_v3.py @@ -0,0 +1,246 @@ +# coding: utf-8 + + +import torch.nn as nn +import torch.nn.functional as F + +__all__ = ['MobileNetV3', 'mobilenet_v3'] + + +def conv_bn(inp, oup, stride, conv_layer=nn.Conv2d, norm_layer=nn.BatchNorm2d, nlin_layer=nn.ReLU): + return nn.Sequential( + conv_layer(inp, oup, 3, stride, 1, bias=False), + norm_layer(oup), + nlin_layer(inplace=True) + ) + + +def conv_1x1_bn(inp, oup, conv_layer=nn.Conv2d, norm_layer=nn.BatchNorm2d, nlin_layer=nn.ReLU): + return nn.Sequential( + conv_layer(inp, oup, 1, 1, 0, bias=False), + norm_layer(oup), + nlin_layer(inplace=True) + ) + + +class Hswish(nn.Module): + def __init__(self, inplace=True): + super(Hswish, self).__init__() + self.inplace = inplace + + def forward(self, x): + return x * F.relu6(x + 3., inplace=self.inplace) / 6. + + +class Hsigmoid(nn.Module): + def __init__(self, inplace=True): + super(Hsigmoid, self).__init__() + self.inplace = inplace + + def forward(self, x): + return F.relu6(x + 3., inplace=self.inplace) / 6. + + +class SEModule(nn.Module): + def __init__(self, channel, reduction=4): + super(SEModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction, bias=False), + nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel, bias=False), + Hsigmoid() + # nn.Sigmoid() + ) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + return x * y.expand_as(x) + + +class Identity(nn.Module): + def __init__(self, channel): + super(Identity, self).__init__() + + def forward(self, x): + return x + + +def make_divisible(x, divisible_by=8): + import numpy as np + return int(np.ceil(x * 1. 
/ divisible_by) * divisible_by) + + +class MobileBottleneck(nn.Module): + def __init__(self, inp, oup, kernel, stride, exp, se=False, nl='RE'): + super(MobileBottleneck, self).__init__() + assert stride in [1, 2] + assert kernel in [3, 5] + padding = (kernel - 1) // 2 + self.use_res_connect = stride == 1 and inp == oup + + conv_layer = nn.Conv2d + norm_layer = nn.BatchNorm2d + if nl == 'RE': + nlin_layer = nn.ReLU # or ReLU6 + elif nl == 'HS': + nlin_layer = Hswish + else: + raise NotImplementedError + if se: + SELayer = SEModule + else: + SELayer = Identity + + self.conv = nn.Sequential( + # pw + conv_layer(inp, exp, 1, 1, 0, bias=False), + norm_layer(exp), + nlin_layer(inplace=True), + # dw + conv_layer(exp, exp, kernel, stride, padding, groups=exp, bias=False), + norm_layer(exp), + SELayer(exp), + nlin_layer(inplace=True), + # pw-linear + conv_layer(exp, oup, 1, 1, 0, bias=False), + norm_layer(oup), + ) + + def forward(self, x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + +class MobileNetV3(nn.Module): + def __init__(self, widen_factor=1.0, num_classes=141, num_landmarks=136, input_size=120, mode='small'): + super(MobileNetV3, self).__init__() + input_channel = 16 + last_channel = 1280 + if mode == 'large': + # refer to Table 1 in paper + mobile_setting = [ + # k, exp, c, se, nl, s, + [3, 16, 16, False, 'RE', 1], + [3, 64, 24, False, 'RE', 2], + [3, 72, 24, False, 'RE', 1], + [5, 72, 40, True, 'RE', 2], + [5, 120, 40, True, 'RE', 1], + [5, 120, 40, True, 'RE', 1], + [3, 240, 80, False, 'HS', 2], + [3, 200, 80, False, 'HS', 1], + [3, 184, 80, False, 'HS', 1], + [3, 184, 80, False, 'HS', 1], + [3, 480, 112, True, 'HS', 1], + [3, 672, 112, True, 'HS', 1], + [5, 672, 160, True, 'HS', 2], + [5, 960, 160, True, 'HS', 1], + [5, 960, 160, True, 'HS', 1], + ] + elif mode == 'small': + # refer to Table 2 in paper + mobile_setting = [ + # k, exp, c, se, nl, s, + [3, 16, 16, True, 'RE', 2], + [3, 72, 24, False, 'RE', 2], + [3, 88, 24, False, 'RE', 1], + [5, 96, 40, True, 'HS', 2], + [5, 240, 40, True, 'HS', 1], + [5, 240, 40, True, 'HS', 1], + [5, 120, 48, True, 'HS', 1], + [5, 144, 48, True, 'HS', 1], + [5, 288, 96, True, 'HS', 2], + [5, 576, 96, True, 'HS', 1], + [5, 576, 96, True, 'HS', 1], + ] + else: + raise NotImplementedError + + # building first layer + assert input_size % 32 == 0 + last_channel = make_divisible(last_channel * widen_factor) if widen_factor > 1.0 else last_channel + self.features = [conv_bn(3, input_channel, 2, nlin_layer=Hswish)] + # self.classifier = [] + + # building mobile blocks + for k, exp, c, se, nl, s in mobile_setting: + output_channel = make_divisible(c * widen_factor) + exp_channel = make_divisible(exp * widen_factor) + self.features.append(MobileBottleneck(input_channel, output_channel, k, s, exp_channel, se, nl)) + input_channel = output_channel + + # building last several layers + if mode == 'large': + last_conv = make_divisible(960 * widen_factor) + self.features.append(conv_1x1_bn(input_channel, last_conv, nlin_layer=Hswish)) + self.features.append(nn.AdaptiveAvgPool2d(1)) + self.features.append(nn.Conv2d(last_conv, last_channel, 1, 1, 0)) + self.features.append(Hswish(inplace=True)) + elif mode == 'small': + last_conv = make_divisible(576 * widen_factor) + self.features.append(conv_1x1_bn(input_channel, last_conv, nlin_layer=Hswish)) + # self.features.append(SEModule(last_conv)) # refer to paper Table2, but I think this is a mistake + self.features.append(nn.AdaptiveAvgPool2d(1)) + 
self.features.append(nn.Conv2d(last_conv, last_channel, 1, 1, 0))
+            self.features.append(Hswish(inplace=True))
+        else:
+            raise NotImplementedError
+
+        # make it nn.Sequential
+        self.features = nn.Sequential(*self.features)
+
+        # self.fc_param = nn.Linear(int(last_channel), num_classes)
+        self.fc = nn.Linear(int(last_channel), num_classes)
+        # self.fc_lm = nn.Linear(int(last_channel), num_landmarks)
+
+        # building classifier
+        # self.classifier = nn.Sequential(
+        #     nn.Dropout(p=dropout),  # refer to paper section 6
+        #     nn.Linear(last_channel, n_class),
+        # )
+
+        self._initialize_weights()
+
+    def forward(self, x):
+        x = self.features(x)
+        x_share = x.mean(3).mean(2)
+
+        # x = self.classifier(x)
+        # print(x_share.shape)
+        # xp = self.fc_param(x_share)  # param
+        # xl = self.fc_lm(x_share)  # lm
+
+        xp = self.fc(x_share)  # param
+
+        return xp  # , xl
+
+    def _initialize_weights(self):
+        # weight initialization
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out')
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, 0, 0.01)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+
+
+def mobilenet_v3(**kwargs):
+    model = MobileNetV3(
+        widen_factor=kwargs.get('widen_factor', 1.0),
+        num_classes=kwargs.get('num_classes', 62),
+        num_landmarks=kwargs.get('num_landmarks', 136),
+        input_size=kwargs.get('size', 128),
+        mode=kwargs.get('mode', 'small')
+    )
+
+    return model
diff --git a/tddfa/models/resnet.py b/tddfa/models/resnet.py
new file mode 100755
index 0000000000000000000000000000000000000000..87d87273c04bfe3f13295452de46600343e22bcf
--- /dev/null
+++ b/tddfa/models/resnet.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+# coding: utf-8
+
+import torch.nn as nn
+
+__all__ = ['ResNet', 'resnet22']
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    "3x3 convolution with padding"
+    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+                     padding=1, bias=False)
+
+
+class BasicBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(BasicBlock, self).__init__()
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class ResNet(nn.Module):
+    """Another structure, as used in caffe-resnet25"""
+
+    def __init__(self, block, layers, num_classes=62, num_landmarks=136, input_channel=3, fc_flg=False):
+        self.inplanes = 64
+        super(ResNet, self).__init__()
+        self.conv1 = nn.Conv2d(input_channel, 32, kernel_size=5, stride=2, padding=2, bias=False)
+        self.bn1 = nn.BatchNorm2d(32)  # 32 is the number of input channels
+        self.relu1 = nn.ReLU(inplace=True)
+
+        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(64)
+        self.relu2 = nn.ReLU(inplace=True)
+
+        # self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+        self.layer1 = self._make_layer(block, 128, layers[0], stride=2)
+        self.layer2 = self._make_layer(block, 256, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 512, layers[2], stride=2)
+
+        self.conv_param = nn.Conv2d(512, num_classes, 1)
+        # self.conv_lm = nn.Conv2d(512, num_landmarks, 1)
+        self.avgpool = nn.AdaptiveAvgPool2d(1)
+        # self.fc = nn.Linear(512 * block.expansion, num_classes)
+        self.fc_flg = fc_flg
+
+        # parameter initialization
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                # 1.
+                # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                # m.weight.data.normal_(0, math.sqrt(2. / n))
+
+                # 2. kaiming normal
+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def _make_layer(self, block, planes, blocks, stride=1):
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, planes * block.expansion,
+                          kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(planes * block.expansion),
+            )
+
+        layers = []
+        layers.append(block(self.inplanes, planes, stride, downsample))
+        self.inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            layers.append(block(self.inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu1(x)
+
+        x = self.conv2(x)
+        x = self.bn2(x)
+        x = self.relu2(x)
+
+        # x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+
+        # if self.fc_flg:
+        #     x = self.avgpool(x)
+        #     x = x.view(x.size(0), -1)
+        #     x = self.fc(x)
+        # else:
+        xp = self.conv_param(x)
+        xp = self.avgpool(xp)
+        xp = xp.view(xp.size(0), -1)
+
+        # xl = self.conv_lm(x)
+        # xl = self.avgpool(xl)
+        # xl = xl.view(xl.size(0), -1)
+
+        return xp  # , xl
+
+
+def resnet22(**kwargs):
+    model = ResNet(
+        BasicBlock,
+        [3, 4, 3],
+        num_landmarks=kwargs.get('num_landmarks', 136),
+        input_channel=kwargs.get('input_channel', 3),
+        fc_flg=False
+    )
+    return model
+
+
+def main():
+    pass
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tddfa/utils/__init__.py b/tddfa/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tddfa/utils/asset/.gitignore b/tddfa/utils/asset/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..140f8cf80f2c88e66c141b1c4074b92b29fde4e6
--- /dev/null
+++ b/tddfa/utils/asset/.gitignore
@@ -0,0 +1 @@
+*.so
diff --git a/tddfa/utils/asset/build_render_ctypes.sh b/tddfa/utils/asset/build_render_ctypes.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6927a8e89973279a643fb10f0d82b71feaabc23f
--- /dev/null
+++ b/tddfa/utils/asset/build_render_ctypes.sh
@@ -0,0 +1 @@
+gcc -shared -Wall -O3 render.c -o render.so -fPIC
\ No newline at end of file
diff --git a/tddfa/utils/asset/render.c b/tddfa/utils/asset/render.c
new file mode 100644
index 0000000000000000000000000000000000000000..3e7279d988284a4c471bece47ffc315a932fba8d
--- /dev/null
+++ b/tddfa/utils/asset/render.c
@@ -0,0 +1,233 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#define max(x, y) (((x) > (y)) ? (x) : (y))
+#define min(x, y) (((x) < (y)) ?
(x) : (y)) +#define clip(_x, _min, _max) min(max(_x, _min), _max) + +struct Tuple3D +{ + float x; + float y; + float z; +}; + +void _render(const int *triangles, + const int ntri, + const float *light, + const float *directional, + const float *ambient, + const float *vertices, + const int nver, + unsigned char *image, + const int h, const int w) +{ + int tri_p0_ind, tri_p1_ind, tri_p2_ind; + int color_index; + float dot00, dot01, dot11, dot02, dot12; + float cos_sum, det; + + struct Tuple3D p0, p1, p2; + struct Tuple3D v0, v1, v2; + struct Tuple3D p, start, end; + + struct Tuple3D ver_max = {-1.0e8, -1.0e8, -1.0e8}; + struct Tuple3D ver_min = {1.0e8, 1.0e8, 1.0e8}; + struct Tuple3D ver_mean = {0.0, 0.0, 0.0}; + + float *ver_normal = (float *)calloc(3 * nver, sizeof(float)); + float *colors = (float *)malloc(3 * nver * sizeof(float)); + float *depth_buffer = (float *)calloc(h * w, sizeof(float)); + + for (int i = 0; i < ntri; i++) + { + tri_p0_ind = triangles[3 * i]; + tri_p1_ind = triangles[3 * i + 1]; + tri_p2_ind = triangles[3 * i + 2]; + + // counter clockwise order + start.x = vertices[tri_p1_ind] - vertices[tri_p0_ind]; + start.y = vertices[tri_p1_ind + 1] - vertices[tri_p0_ind + 1]; + start.z = vertices[tri_p1_ind + 2] - vertices[tri_p0_ind + 2]; + + end.x = vertices[tri_p2_ind] - vertices[tri_p0_ind]; + end.y = vertices[tri_p2_ind + 1] - vertices[tri_p0_ind + 1]; + end.z = vertices[tri_p2_ind + 2] - vertices[tri_p0_ind + 2]; + + p.x = start.y * end.z - start.z * end.y; + p.y = start.z * end.x - start.x * end.z; + p.z = start.x * end.y - start.y * end.x; + + ver_normal[tri_p0_ind] += p.x; + ver_normal[tri_p1_ind] += p.x; + ver_normal[tri_p2_ind] += p.x; + + ver_normal[tri_p0_ind + 1] += p.y; + ver_normal[tri_p1_ind + 1] += p.y; + ver_normal[tri_p2_ind + 1] += p.y; + + ver_normal[tri_p0_ind + 2] += p.z; + ver_normal[tri_p1_ind + 2] += p.z; + ver_normal[tri_p2_ind + 2] += p.z; + } + + for (int i = 0; i < nver; ++i) + { + p.x = ver_normal[3 * i]; + p.y = ver_normal[3 * i + 1]; + p.z = ver_normal[3 * i + 2]; + + det = sqrt(p.x * p.x + p.y * p.y + p.z * p.z); + if (det <= 0) + det = 1e-6; + + ver_normal[3 * i] /= det; + ver_normal[3 * i + 1] /= det; + ver_normal[3 * i + 2] /= det; + + ver_mean.x += p.x; + ver_mean.y += p.y; + ver_mean.z += p.z; + + ver_max.x = max(ver_max.x, p.x); + ver_max.y = max(ver_max.y, p.y); + ver_max.z = max(ver_max.z, p.z); + + ver_min.x = min(ver_min.x, p.x); + ver_min.y = min(ver_min.y, p.y); + ver_min.z = min(ver_min.z, p.z); + } + + ver_mean.x /= nver; + ver_mean.y /= nver; + ver_mean.z /= nver; + + for (int i = 0; i < nver; ++i) + { + colors[3 * i] = vertices[3 * i]; + colors[3 * i + 1] = vertices[3 * i + 1]; + colors[3 * i + 2] = vertices[3 * i + 2]; + + colors[3 * i] -= ver_mean.x; + colors[3 * i] /= ver_max.x - ver_min.x; + + colors[3 * i + 1] -= ver_mean.y; + colors[3 * i + 1] /= ver_max.y - ver_min.y; + + colors[3 * i + 2] -= ver_mean.z; + colors[3 * i + 2] /= ver_max.z - ver_min.z; + + p.x = light[0] - colors[3 * i]; + p.y = light[1] - colors[3 * i + 1]; + p.z = light[2] - colors[3 * i + 2]; + + det = sqrt(p.x * p.x + p.y * p.y + p.z * p.z); + if (det <= 0) + det = 1e-6; + + colors[3 * i] = p.x / det; + colors[3 * i + 1] = p.y / det; + colors[3 * i + 2] = p.z / det; + + colors[3 * i] *= ver_normal[3 * i]; + colors[3 * i + 1] *= ver_normal[3 * i + 1]; + colors[3 * i + 2] *= ver_normal[3 * i + 2]; + + cos_sum = colors[3 * i] + colors[3 * i + 1] + colors[3 * i + 2]; + + colors[3 * i] = clip(cos_sum * directional[0] + ambient[0], 0, 1); + colors[3 * i + 
1] = clip(cos_sum * directional[1] + ambient[1], 0, 1); + colors[3 * i + 2] = clip(cos_sum * directional[2] + ambient[2], 0, 1); + } + + for (int i = 0; i < ntri; ++i) + { + tri_p0_ind = triangles[3 * i]; + tri_p1_ind = triangles[3 * i + 1]; + tri_p2_ind = triangles[3 * i + 2]; + + p0.x = vertices[tri_p0_ind]; + p0.y = vertices[tri_p0_ind + 1]; + p0.z = vertices[tri_p0_ind + 2]; + + p1.x = vertices[tri_p1_ind]; + p1.y = vertices[tri_p1_ind + 1]; + p1.z = vertices[tri_p1_ind + 2]; + + p2.x = vertices[tri_p2_ind]; + p2.y = vertices[tri_p2_ind + 1]; + p2.z = vertices[tri_p2_ind + 2]; + + start.x = max(ceil(min(p0.x, min(p1.x, p2.x))), 0); + end.x = min(floor(max(p0.x, max(p1.x, p2.x))), w - 1); + + start.y = max(ceil(min(p0.y, min(p1.y, p2.y))), 0); + end.y = min(floor(max(p0.y, max(p1.y, p2.y))), h - 1); + + if (end.x < start.x || end.y < start.y) + continue; + + v0.x = p2.x - p0.x; + v0.y = p2.y - p0.y; + v1.x = p1.x - p0.x; + v1.y = p1.y - p0.y; + + // dot products np.dot(v0.T, v0) + dot00 = v0.x * v0.x + v0.y * v0.y; + dot01 = v0.x * v1.x + v0.y * v1.y; + dot11 = v1.x * v1.x + v1.y * v1.y; + + // barycentric coordinates + start.z = dot00 * dot11 - dot01 * dot01; + if (start.z != 0) + start.z = 1 / start.z; + + for (p.y = start.y; p.y <= end.y; p.y += 1.0) + { + for (p.x = start.x; p.x <= end.x; p.x += 1.0) + { + v2.x = p.x - p0.x; + v2.y = p.y - p0.y; + + dot02 = v0.x * v2.x + v0.y * v2.y; + dot12 = v1.x * v2.x + v1.y * v2.y; + + v2.z = (dot11 * dot02 - dot01 * dot12) * start.z; + v1.z = (dot00 * dot12 - dot01 * dot02) * start.z; + v0.z = 1 - v2.z - v1.z; + + // judge is_point_in_tri by below line of code + if (v2.z > 0 && v1.z > 0 && v0.z > 0) + { + p.z = v0.z * p0.z + v1.z * p1.z + v2.z * p2.z; + color_index = p.y * w + p.x; + + if (p.z > depth_buffer[color_index]) + { + end.z = v0.z * colors[tri_p0_ind]; + end.z += v1.z * colors[tri_p1_ind]; + end.z += v2.z * colors[tri_p2_ind]; + image[3 * color_index] = end.z * 255; + + end.z = v0.z * colors[tri_p0_ind + 1]; + end.z += v1.z * colors[tri_p1_ind + 1]; + end.z += v2.z * colors[tri_p2_ind + 1]; + image[3 * color_index + 1] = end.z * 255; + + end.z = v0.z * colors[tri_p0_ind + 2]; + end.z += v1.z * colors[tri_p1_ind + 2]; + end.z += v2.z * colors[tri_p2_ind + 2]; + image[3 * color_index + 2] = end.z * 255; + + depth_buffer[color_index] = p.z; + } + } + } + } + } + + free(depth_buffer); + free(colors); + free(ver_normal); +} diff --git a/tddfa/utils/depth.py b/tddfa/utils/depth.py new file mode 100644 index 0000000000000000000000000000000000000000..63001f2463b1d3321820eb662fd95020ba40b76c --- /dev/null +++ b/tddfa/utils/depth.py @@ -0,0 +1,43 @@ +# coding: utf-8 + +__author__ = 'cleardusk' + +import sys + +sys.path.append('..') + +import cv2 +import numpy as np + +from tddfa.Sim3DR import rasterize +from tddfa.utils.functions import plot_image +from .tddfa_util import _to_ctype + + +def depth(img, ver_lst, tri, show_flag=False, wfp=None, with_bg_flag=True): + if with_bg_flag: + overlap = img.copy() + else: + overlap = np.zeros_like(img) + + for ver_ in ver_lst: + ver = _to_ctype(ver_.T) # transpose + + z = ver[:, 2] + z_min, z_max = min(z), max(z) + + z = (z - z_min) / (z_max - z_min) + + # expand + z = np.repeat(z[:, np.newaxis], 3, axis=1) + + overlap = rasterize(ver, tri, z, bg=overlap) + + if wfp is not None: + cv2.imwrite(wfp, overlap) + # print(f'Save visualization result to {wfp}') + + if show_flag: + plot_image(overlap) + + return overlap diff --git a/tddfa/utils/functions.py b/tddfa/utils/functions.py new file mode 100644 index 
0000000000000000000000000000000000000000..970c668d19b5518b56a8230523016b07e54fec0f
--- /dev/null
+++ b/tddfa/utils/functions.py
@@ -0,0 +1,182 @@
+# coding: utf-8
+
+__author__ = 'cleardusk'
+
+import numpy as np
+import cv2
+from math import sqrt
+import matplotlib.pyplot as plt
+
+RED = (0, 0, 255)
+GREEN = (0, 255, 0)
+BLUE = (255, 0, 0)
+
+
+def get_suffix(filename):
+    """a.jpg -> .jpg (the suffix is returned with its leading dot)"""
+    pos = filename.rfind('.')
+    if pos == -1:
+        return ''
+    return filename[pos:]
+
+
+def crop_img(img, roi_box):
+    h, w = img.shape[:2]
+
+    sx, sy, ex, ey = [int(round(_)) for _ in roi_box]
+    dh, dw = ey - sy, ex - sx
+    if len(img.shape) == 3:
+        res = np.zeros((dh, dw, 3), dtype=np.uint8)
+    else:
+        res = np.zeros((dh, dw), dtype=np.uint8)
+    if sx < 0:
+        sx, dsx = 0, -sx
+    else:
+        dsx = 0
+
+    if ex > w:
+        ex, dex = w, dw - (ex - w)
+    else:
+        dex = dw
+
+    if sy < 0:
+        sy, dsy = 0, -sy
+    else:
+        dsy = 0
+
+    if ey > h:
+        ey, dey = h, dh - (ey - h)
+    else:
+        dey = dh
+
+    res[dsy:dey, dsx:dex] = img[sy:ey, sx:ex]
+    return res
+
+
+def calc_hypotenuse(pts):
+    bbox = [min(pts[0, :]), min(pts[1, :]), max(pts[0, :]), max(pts[1, :])]
+    center = [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]
+    radius = max(bbox[2] - bbox[0], bbox[3] - bbox[1]) / 2
+    bbox = [center[0] - radius, center[1] - radius, center[0] + radius, center[1] + radius]
+    llength = sqrt((bbox[2] - bbox[0]) ** 2 + (bbox[3] - bbox[1]) ** 2)
+    return llength / 3
+
+
+def parse_roi_box_from_landmark(pts):
+    """calc roi box from landmark"""
+    bbox = [min(pts[0, :]), min(pts[1, :]), max(pts[0, :]), max(pts[1, :])]
+    center = [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]
+    radius = max(bbox[2] - bbox[0], bbox[3] - bbox[1]) / 2
+    bbox = [center[0] - radius, center[1] - radius, center[0] + radius, center[1] + radius]
+
+    llength = sqrt((bbox[2] - bbox[0]) ** 2 + (bbox[3] - bbox[1]) ** 2)
+    center_x = (bbox[2] + bbox[0]) / 2
+    center_y = (bbox[3] + bbox[1]) / 2
+
+    roi_box = [0] * 4
+    roi_box[0] = center_x - llength / 2
+    roi_box[1] = center_y - llength / 2
+    roi_box[2] = roi_box[0] + llength
+    roi_box[3] = roi_box[1] + llength
+
+    return roi_box
+
+
+def parse_roi_box_from_bbox(bbox):
+    left, top, right, bottom = bbox[:4]
+    old_size = (right - left + bottom - top) / 2
+    center_x = right - (right - left) / 2.0
+    center_y = bottom - (bottom - top) / 2.0 + old_size * 0.14
+    size = int(old_size * 1.58)
+
+    roi_box = [0] * 4
+    roi_box[0] = center_x - size / 2
+    roi_box[1] = center_y - size / 2
+    roi_box[2] = roi_box[0] + size
+    roi_box[3] = roi_box[1] + size
+
+    return roi_box
+
+
+def plot_image(img):
+    height, width = img.shape[:2]
+    plt.figure(figsize=(12, height / width * 12))
+
+    plt.subplots_adjust(left=0, right=1, top=1, bottom=0)
+    plt.axis('off')
+
+    plt.imshow(img[..., ::-1])
+    plt.show()
+
+
+def draw_landmarks(img, pts, style='fancy', wfp=None, show_flag=False, **kwargs):
+    """Draw landmarks using matplotlib"""
+    height, width = img.shape[:2]
+    plt.figure(figsize=(12, height / width * 12))
+    plt.imshow(img[..., ::-1])
+    plt.subplots_adjust(left=0, right=1, top=1, bottom=0)
+    plt.axis('off')
+
+    dense_flag = kwargs.get('dense_flag')
+
+    if not isinstance(pts, (tuple, list)):
+        pts = [pts]
+    for i in range(len(pts)):
+        if dense_flag:
+            plt.plot(pts[i][0, ::6], pts[i][1, ::6], 'o', markersize=0.4, color='c', alpha=0.7)
+        else:
+            alpha = 0.8
+            markersize = 4
+            lw = 1.5
+            color = kwargs.get('color', 'w')
+            markeredgecolor = kwargs.get('markeredgecolor', 'black')
+
+            nums = [0, 17, 22, 27, 31, 36, 42, 48, 60, 68]
+
+            # close eyes and mouths
+
plot_close = lambda i1, i2: plt.plot([pts[i][0, i1], pts[i][0, i2]], [pts[i][1, i1], pts[i][1, i2]], + color=color, lw=lw, alpha=alpha - 0.1) + plot_close(41, 36) + plot_close(47, 42) + plot_close(59, 48) + plot_close(67, 60) + + for ind in range(len(nums) - 1): + l, r = nums[ind], nums[ind + 1] + plt.plot(pts[i][0, l:r], pts[i][1, l:r], color=color, lw=lw, alpha=alpha - 0.1) + + plt.plot(pts[i][0, l:r], pts[i][1, l:r], marker='o', linestyle='None', markersize=markersize, + color=color, + markeredgecolor=markeredgecolor, alpha=alpha) + if wfp is not None: + plt.savefig(wfp, dpi=150) + print(f'Save visualization result to {wfp}') + + if show_flag: + plt.show() + + +def cv_draw_landmark(img_ori, pts, box=None, color=GREEN, size=1): + img = img_ori.copy() + n = pts.shape[1] + if n <= 106: + for i in range(n): + cv2.circle(img, (int(round(pts[0, i])), int(round(pts[1, i]))), size, color, -1) + else: + sep = 1 + for i in range(0, n, sep): + cv2.circle(img, (int(round(pts[0, i])), int(round(pts[1, i]))), size, color, 1) + + if box is not None: + left, top, right, bottom = np.round(box).astype(np.int32) + left_top = (left, top) + right_top = (right, top) + right_bottom = (right, bottom) + left_bottom = (left, bottom) + cv2.line(img, left_top, right_top, BLUE, 1, cv2.LINE_AA) + cv2.line(img, right_top, right_bottom, BLUE, 1, cv2.LINE_AA) + cv2.line(img, right_bottom, left_bottom, BLUE, 1, cv2.LINE_AA) + cv2.line(img, left_bottom, left_top, BLUE, 1, cv2.LINE_AA) + + return img + \ No newline at end of file diff --git a/tddfa/utils/io.py b/tddfa/utils/io.py new file mode 100644 index 0000000000000000000000000000000000000000..31eeaac864e1f6704a616f82e001794486aef053 --- /dev/null +++ b/tddfa/utils/io.py @@ -0,0 +1,64 @@ +# coding: utf-8 + +__author__ = 'cleardusk' + +import os +import numpy as np +import torch +import pickle + + +def mkdir(d): + os.makedirs(d, exist_ok=True) + + +def _get_suffix(filename): + """a.jpg -> jpg""" + pos = filename.rfind('.') + if pos == -1: + return '' + return filename[pos + 1:] + + +def _load(fp): + suffix = _get_suffix(fp) + if suffix == 'npy': + return np.load(fp) + elif suffix == 'pkl': + return pickle.load(open(fp, 'rb')) + + +def _dump(wfp, obj): + suffix = _get_suffix(wfp) + if suffix == 'npy': + np.save(wfp, obj) + elif suffix == 'pkl': + pickle.dump(obj, open(wfp, 'wb')) + else: + raise Exception('Unknown Type: {}'.format(suffix)) + + +def _load_tensor(fp, mode='cpu'): + if mode.lower() == 'cpu': + return torch.from_numpy(_load(fp)) + elif mode.lower() == 'gpu': + return torch.from_numpy(_load(fp)).cuda() + + +def _tensor_to_cuda(x): + if x.is_cuda: + return x + else: + return x.cuda() + + +def _load_gpu(fp): + return torch.from_numpy(_load(fp)).cuda() + + +_load_cpu = _load +_numpy_to_tensor = lambda x: torch.from_numpy(x) +_tensor_to_numpy = lambda x: x.numpy() +_numpy_to_cuda = lambda x: _tensor_to_cuda(torch.from_numpy(x)) +_cuda_to_tensor = lambda x: x.cpu() +_cuda_to_numpy = lambda x: x.cpu().numpy() diff --git a/tddfa/utils/onnx.py b/tddfa/utils/onnx.py new file mode 100644 index 0000000000000000000000000000000000000000..b1df8b8544db939436f31369a1e9da46ba1d8a4e --- /dev/null +++ b/tddfa/utils/onnx.py @@ -0,0 +1,40 @@ +# coding: utf-8 + +__author__ = 'cleardusk' + +import sys + +sys.path.append('..') + +import torch +import tddfa.models +from tddfa.utils.tddfa_util import load_model + + +def convert_to_onnx(**kvs): + # 1. 
load model
+    size = kvs.get('size', 120)
+    model = getattr(tddfa.models, kvs.get('arch'))(
+        num_classes=kvs.get('num_params', 62),
+        widen_factor=kvs.get('widen_factor', 1),
+        size=size,
+        mode=kvs.get('mode', 'small')
+    )
+    checkpoint_fp = kvs.get('checkpoint_fp')
+    model = load_model(model, checkpoint_fp)
+    model.eval()
+
+    # 2. convert
+    batch_size = 1
+    dummy_input = torch.randn(batch_size, 3, size, size)
+    wfp = checkpoint_fp.replace('.pth', '.onnx')
+    torch.onnx.export(
+        model,
+        (dummy_input, ),
+        wfp,
+        input_names=['input'],
+        output_names=['output'],
+        do_constant_folding=True
+    )
+    print(f'Convert {checkpoint_fp} to {wfp} done.')
+    return wfp
diff --git a/tddfa/utils/pncc.py b/tddfa/utils/pncc.py
new file mode 100644
index 0000000000000000000000000000000000000000..16e6b307eb2f9444e1aeb709175013b749285d31
--- /dev/null
+++ b/tddfa/utils/pncc.py
@@ -0,0 +1,64 @@
+# coding: utf-8
+
+__author__ = 'cleardusk'
+
+import sys
+
+sys.path.append('..')
+
+import cv2
+import numpy as np
+import os.path as osp
+
+from tddfa.Sim3DR import rasterize
+from tddfa.utils.functions import plot_image
+from tddfa.utils.io import _load, _dump
+from tddfa.utils.tddfa_util import _to_ctype
+
+make_abs_path = lambda fn: osp.join(osp.dirname(osp.realpath(__file__)), fn)
+
+
+def calc_ncc_code():
+    from bfm import bfm
+
+    # formula: ncc_d = ( u_d - min(u_d) ) / ( max(u_d) - min(u_d) ), d = {r, g, b}
+    u = bfm.u
+    u = u.reshape(3, -1, order='F')
+
+    for i in range(3):
+        u[i] = (u[i] - u[i].min()) / (u[i].max() - u[i].min())
+
+    _dump('../configs/ncc_code.npy', u)
+
+
+def pncc(img, ver_lst, tri, show_flag=False, wfp=None, with_bg_flag=True):
+    ncc_code = _load(make_abs_path('../configs/ncc_code.npy'))
+
+    if with_bg_flag:
+        overlap = img.copy()
+    else:
+        overlap = np.zeros_like(img)
+
+    # rendering pncc
+    for ver_ in ver_lst:
+        ver = _to_ctype(ver_.T)  # transpose
+        overlap = rasterize(ver, tri, ncc_code.T, bg=overlap)  # m x 3
+
+    if wfp is not None:
+        cv2.imwrite(wfp, overlap)
+        print(f'Save visualization result to {wfp}')
+
+    if show_flag:
+        plot_image(overlap)
+
+    return overlap
+
+
+def main():
+    # `configs/ncc_code.npy` is generated by the `calc_ncc_code` function
+    # calc_ncc_code()
+    pass
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tddfa/utils/pose.py b/tddfa/utils/pose.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bfd3daa88814323dac9f5f58ad2c1a11bf1b14f
--- /dev/null
+++ b/tddfa/utils/pose.py
@@ -0,0 +1,141 @@
+# coding: utf-8
+
+"""
+Reference: https://github.com/YadiraF/PRNet/blob/master/utils/estimate_pose.py
+
+Calculate pose from the output 3DMM parameters; you can also try solvePnP to perform the estimation.
+"""
+
+__author__ = 'cleardusk'
+
+import cv2
+import numpy as np
+from math import cos, sin, atan2, asin, sqrt
+
+from .functions import calc_hypotenuse, plot_image
+
+
+def P2sRt(P):
+    """ decomposing camera matrix P.
+    Args:
+        P: (3, 4). Affine Camera Matrix.
+    Returns:
+        s: scale factor.
+        R: (3, 3). rotation matrix.
+        t3d: (3,). 3d translation.
+    """
+    t3d = P[:, 3]
+    R1 = P[0:1, :3]
+    R2 = P[1:2, :3]
+    s = (np.linalg.norm(R1) + np.linalg.norm(R2)) / 2.0
+    r1 = R1 / np.linalg.norm(R1)
+    r2 = R2 / np.linalg.norm(R2)
+    r3 = np.cross(r1, r2)
+
+    R = np.concatenate((r1, r2, r3), 0)
+    return s, R, t3d
+
+
+def matrix2angle(R):
+    """ compute three Euler angles from a Rotation Matrix.
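+    e.g., matrix2angle(np.eye(3)) returns (0.0, 0.0, 0.0), since the identity matrix encodes no rotation.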
Ref: http://www.gregslabaugh.net/publications/euler.pdf + refined by: https://stackoverflow.com/questions/43364900/rotation-matrix-to-euler-angles-with-opencv + todo: check and debug + Args: + R: (3,3). rotation matrix + Returns: + x: yaw + y: pitch + z: roll + """ + if R[2, 0] > 0.998: + z = 0 + x = np.pi / 2 + y = z + atan2(-R[0, 1], -R[0, 2]) + elif R[2, 0] < -0.998: + z = 0 + x = -np.pi / 2 + y = -z + atan2(R[0, 1], R[0, 2]) + else: + x = asin(R[2, 0]) + y = atan2(R[2, 1] / cos(x), R[2, 2] / cos(x)) + z = atan2(R[1, 0] / cos(x), R[0, 0] / cos(x)) + + return x, y, z + + +def calc_pose(param): + P = param[:12].reshape(3, -1) # camera matrix + s, R, t3d = P2sRt(P) + P = np.concatenate((R, t3d.reshape(3, -1)), axis=1) # without scale + pose = matrix2angle(R) + pose = [p * 180 / np.pi for p in pose] + + return P, pose + + +def build_camera_box(rear_size=90): + point_3d = [] + rear_depth = 0 + point_3d.append((-rear_size, -rear_size, rear_depth)) + point_3d.append((-rear_size, rear_size, rear_depth)) + point_3d.append((rear_size, rear_size, rear_depth)) + point_3d.append((rear_size, -rear_size, rear_depth)) + point_3d.append((-rear_size, -rear_size, rear_depth)) + + front_size = int(4 / 3 * rear_size) + front_depth = int(4 / 3 * rear_size) + point_3d.append((-front_size, -front_size, front_depth)) + point_3d.append((-front_size, front_size, front_depth)) + point_3d.append((front_size, front_size, front_depth)) + point_3d.append((front_size, -front_size, front_depth)) + point_3d.append((-front_size, -front_size, front_depth)) + point_3d = np.array(point_3d, dtype=np.float32).reshape(-1, 3) + + return point_3d + + +def plot_pose_box(img, P, ver, color=(40, 255, 0), line_width=2): + """ Draw a 3D box as annotation of pose. + Ref:https://github.com/yinguobing/head-pose-estimation/blob/master/pose_estimator.py + Args: + img: the input image + P: (3, 4). Affine Camera Matrix. 
+        ver: (2, 68) or (3, 68).
+    """
+    llength = calc_hypotenuse(ver)
+    point_3d = build_camera_box(llength)
+    # Map to 2d image points
+    point_3d_homo = np.hstack((point_3d, np.ones([point_3d.shape[0], 1])))  # n x 4
+    point_2d = point_3d_homo.dot(P.T)[:, :2]
+
+    point_2d[:, 1] = - point_2d[:, 1]
+    point_2d[:, :2] = point_2d[:, :2] - np.mean(point_2d[:4, :2], 0) + np.mean(ver[:2, :27], 1)
+    point_2d = np.int32(point_2d.reshape(-1, 2))
+
+    # Draw all the lines
+    cv2.polylines(img, [point_2d], True, color, line_width, cv2.LINE_AA)
+    cv2.line(img, tuple(point_2d[1]), tuple(
+        point_2d[6]), color, line_width, cv2.LINE_AA)
+    cv2.line(img, tuple(point_2d[2]), tuple(
+        point_2d[7]), color, line_width, cv2.LINE_AA)
+    cv2.line(img, tuple(point_2d[3]), tuple(
+        point_2d[8]), color, line_width, cv2.LINE_AA)
+
+    return img
+
+
+def viz_pose(img, param_lst, ver_lst, show_flag=False, wfp=None):
+    for param, ver in zip(param_lst, ver_lst):
+        P, pose = calc_pose(param)
+        img = plot_pose_box(img, P, ver)
+        # print(P[:, :3])
+        print(f'yaw: {pose[0]:.1f}, pitch: {pose[1]:.1f}, roll: {pose[2]:.1f}')
+
+    if wfp is not None:
+        cv2.imwrite(wfp, img)
+        print(f'Save visualization result to {wfp}')
+
+    if show_flag:
+        plot_image(img)
+
+    return img
diff --git a/tddfa/utils/render.py b/tddfa/utils/render.py
new file mode 100644
index 0000000000000000000000000000000000000000..3239b3add34c5a48a2516f2833d3c1732e7dbb62
--- /dev/null
+++ b/tddfa/utils/render.py
@@ -0,0 +1,52 @@
+# coding: utf-8
+
+__author__ = 'cleardusk'
+
+import sys
+
+sys.path.append('..')
+
+import cv2
+import numpy as np
+
+from tddfa.Sim3DR import RenderPipeline
+from tddfa.utils.functions import plot_image
+from .tddfa_util import _to_ctype
+
+cfg = {
+    'intensity_ambient': 0.3,
+    'color_ambient': (1, 1, 1),
+    'intensity_directional': 0.6,
+    'color_directional': (1, 1, 1),
+    'intensity_specular': 0.1,
+    'specular_exp': 5,
+    'light_pos': (0, 0, 5),
+    'view_pos': (0, 0, 5)
+}
+
+render_app = RenderPipeline(**cfg)
+
+
+def render(img, ver_lst, tri, alpha=0.6, show_flag=False, wfp=None, with_bg_flag=True):
+    if with_bg_flag:
+        overlap = img.copy()
+    else:
+        overlap = np.zeros_like(img)
+
+    for ver_ in ver_lst:
+        ver = _to_ctype(ver_.T)  # transpose
+        overlap = render_app(ver, tri, overlap)
+
+    if with_bg_flag:
+        res = cv2.addWeighted(img, 1 - alpha, overlap, alpha, 0)
+    else:
+        res = overlap
+
+    if wfp is not None:
+        cv2.imwrite(wfp, res)
+        print(f'Save visualization result to {wfp}')
+
+    if show_flag:
+        plot_image(res)
+
+    return res
diff --git a/tddfa/utils/render_ctypes.py b/tddfa/utils/render_ctypes.py
new file mode 100644
index 0000000000000000000000000000000000000000..d945d43052236e12b568aba4591c97570af97b05
--- /dev/null
+++ b/tddfa/utils/render_ctypes.py
@@ -0,0 +1,89 @@
+# coding: utf-8
+
+"""
+Borrowed from https://github.com/1996scarlet/Dense-Head-Pose-Estimation/blob/main/service/CtypesMeshRender.py
+
+To use this renderer, you should build the C library first:
+```
+cd utils/asset
+gcc -shared -Wall -O3 render.c -o render.so -fPIC
+cd ../..
+```
+"""
+
+import sys
+
+sys.path.append('..')
+
+import os.path as osp
+import cv2
+import numpy as np
+import ctypes
+from tddfa.utils.functions import plot_image
+
+make_abs_path = lambda fn: osp.join(osp.dirname(osp.realpath(__file__)), fn)
+
+
+class TrianglesMeshRender(object):
+    def __init__(
+            self,
+            clibs,
+            light=(0, 0, 5),
+            direction=(0.6, 0.6, 0.6),
+            ambient=(0.3, 0.3, 0.3)
+    ):
+        if not osp.exists(clibs):
+            raise Exception(f'{clibs} not found, please build it first by running '
+                            f'"gcc -shared -Wall -O3 render.c -o render.so -fPIC" in the utils/asset directory')
+
+        self._clibs = ctypes.CDLL(clibs)
+
+        self._light = np.array(light, dtype=np.float32)
+        self._light = np.ctypeslib.as_ctypes(self._light)
+
+        self._direction = np.array(direction, dtype=np.float32)
+        self._direction = np.ctypeslib.as_ctypes(self._direction)
+
+        self._ambient = np.array(ambient, dtype=np.float32)
+        self._ambient = np.ctypeslib.as_ctypes(self._ambient)
+
+    def __call__(self, vertices, triangles, bg):
+        # Attention: render.c indexes the flat vertex array directly, so vertex indices are pre-multiplied by 3 here
+        self.triangles = np.ctypeslib.as_ctypes(3 * triangles)
+        self.tri_nums = triangles.shape[0]
+
+        self._clibs._render(
+            self.triangles, self.tri_nums,
+            self._light, self._direction, self._ambient,
+            np.ctypeslib.as_ctypes(vertices),
+            vertices.shape[0],
+            np.ctypeslib.as_ctypes(bg),
+            bg.shape[0], bg.shape[1]
+        )
+
+
+render_app = TrianglesMeshRender(clibs=make_abs_path('asset/render.so'))
+
+
+def render(img, ver_lst, tri, alpha=0.6, show_flag=False, wfp=None, with_bg_flag=True):
+    if with_bg_flag:
+        overlap = img.copy()
+    else:
+        overlap = np.zeros_like(img)
+
+    for ver_ in ver_lst:
+        ver = np.ascontiguousarray(ver_.T)  # transpose
+        render_app(ver, tri, bg=overlap)
+
+    if with_bg_flag:
+        res = cv2.addWeighted(img, 1 - alpha, overlap, alpha, 0)
+    else:
+        res = overlap
+
+    if wfp is not None:
+        cv2.imwrite(wfp, res)
+        print(f'Save visualization result to {wfp}')
+
+    if show_flag:
+        plot_image(res)
+
+    return res
diff --git a/tddfa/utils/serialization.py b/tddfa/utils/serialization.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b9209c4b795668d4a63ff2cb965ce5fae135aca
--- /dev/null
+++ b/tddfa/utils/serialization.py
@@ -0,0 +1,146 @@
+# coding: utf-8
+
+__author__ = 'cleardusk'
+
+import numpy as np
+
+from .tddfa_util import _to_ctype
+from .functions import get_suffix
+
+header_temp = """ply
+format ascii 1.0
+element vertex {}
+property float x
+property float y
+property float z
+element face {}
+property list uchar int vertex_indices
+end_header
+"""
+
+
+def ser_to_ply_single(ver_lst, tri, height, wfp, reverse=True):
+    suffix = get_suffix(wfp)
+
+    for i, ver in enumerate(ver_lst):
+        wfp_new = wfp.replace(suffix, f'_{i + 1}{suffix}')
+
+        n_vertex = ver.shape[1]
+        n_face = tri.shape[0]
+        header = header_temp.format(n_vertex, n_face)
+
+        with open(wfp_new, 'w') as f:
+            f.write(header + '\n')
+            for i in range(n_vertex):
+                x, y, z = ver[:, i]
+                if reverse:
+                    f.write(f'{x:.2f} {height-y:.2f} {z:.2f}\n')
+                else:
+                    f.write(f'{x:.2f} {y:.2f} {z:.2f}\n')
+            for i in range(n_face):
+                idx1, idx2, idx3 = tri[i]  # m x 3
+                if reverse:
+                    f.write(f'3 {idx3} {idx2} {idx1}\n')
+                else:
+                    f.write(f'3 {idx1} {idx2} {idx3}\n')
+
+        print(f'Dump to {wfp_new}')
+
+
+def ser_to_ply_multiple(ver_lst, tri, height, wfp, reverse=True):
+    n_ply = len(ver_lst)  # count ply
+
+    if n_ply <= 0:
+        return
+
+    n_vertex = ver_lst[0].shape[1]
+    n_face = tri.shape[0]
+    header = header_temp.format(n_vertex * n_ply, n_face * n_ply)
+
+    with open(wfp, 'w') as f:
+        f.write(header + '\n')
+
+        for i in range(n_ply):
+            ver = ver_lst[i]
+            for j in range(n_vertex):
+                x, y, z = ver[:, j]
+                if reverse:
+                    f.write(f'{x:.2f} {height - y:.2f} {z:.2f}\n')
+                else:
+                    f.write(f'{x:.2f} {y:.2f} {z:.2f}\n')
+
+        for i in range(n_ply):
+            offset = i * n_vertex
+            for j in range(n_face):
+                idx1, idx2, idx3 = tri[j]  # m x 3
+                if reverse:
+                    f.write(f'3 {idx3 + offset} {idx2 + offset} {idx1 + offset}\n')
+                else:
+                    f.write(f'3 {idx1 + offset} {idx2 + offset} {idx3 + offset}\n')
+
+    print(f'Dump to {wfp}')
+
+
+def get_colors(img, ver):
+    h, w, _ = img.shape
+    ver[0, :] = np.minimum(np.maximum(ver[0, :], 0), w - 1)  # x
+    ver[1, :] = np.minimum(np.maximum(ver[1, :], 0), h - 1)  # y
+    ind = np.round(ver).astype(np.int32)
+    colors = img[ind[1, :], ind[0, :], :] / 255.  # n x 3
+
+    return colors.copy()
+
+
+def ser_to_obj_single(img, ver_lst, tri, height, wfp):
+    suffix = get_suffix(wfp)
+
+    n_face = tri.shape[0]
+    for i, ver in enumerate(ver_lst):
+        colors = get_colors(img, ver)
+
+        n_vertex = ver.shape[1]
+
+        wfp_new = wfp.replace(suffix, f'_{i + 1}{suffix}')
+
+        with open(wfp_new, 'w') as f:
+            for i in range(n_vertex):
+                x, y, z = ver[:, i]
+                f.write(
+                    f'v {x:.2f} {height - y:.2f} {z:.2f} {colors[i, 2]:.2f} {colors[i, 1]:.2f} {colors[i, 0]:.2f}\n')
+            for i in range(n_face):
+                idx1, idx2, idx3 = tri[i]  # m x 3
+                f.write(f'f {idx3 + 1} {idx2 + 1} {idx1 + 1}\n')
+
+        print(f'Dump to {wfp_new}')
+
+
+def ser_to_obj_multiple(img, ver_lst, tri, height, wfp):
+    n_obj = len(ver_lst)  # count obj
+
+    if n_obj <= 0:
+        return
+
+    n_vertex = ver_lst[0].shape[1]
+    n_face = tri.shape[0]
+
+    with open(wfp, 'w') as f:
+        for i in range(n_obj):
+            ver = ver_lst[i]
+            colors = get_colors(img, ver)
+
+            for j in range(n_vertex):
+                x, y, z = ver[:, j]
+                f.write(
+                    f'v {x:.2f} {height - y:.2f} {z:.2f} {colors[j, 2]:.2f} {colors[j, 1]:.2f} {colors[j, 0]:.2f}\n')
+
+        for i in range(n_obj):
+            offset = i * n_vertex
+            for j in range(n_face):
+                idx1, idx2, idx3 = tri[j]  # m x 3
+                f.write(f'f {idx3 + 1 + offset} {idx2 + 1 + offset} {idx1 + 1 + offset}\n')
+
+    print(f'Dump to {wfp}')
+
+
+ser_to_ply = ser_to_ply_multiple  # or ser_to_ply_single
+ser_to_obj = ser_to_obj_multiple  # or ser_to_obj_single
diff --git a/tddfa/utils/tddfa_util.py b/tddfa/utils/tddfa_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..68eee7b7ee17b2208e1c0a1bfe27cfe495811ece
--- /dev/null
+++ b/tddfa/utils/tddfa_util.py
@@ -0,0 +1,102 @@
+# coding: utf-8
+
+__author__ = 'cleardusk'
+
+import sys
+
+sys.path.append('..')
+
+import argparse
+import numpy as np
+import torch
+
+
+def _to_ctype(arr):
+    if not arr.flags.c_contiguous:
+        return arr.copy(order='C')
+    return arr
+
+
+def str2bool(v):
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
+        return False
+    else:
+        raise argparse.ArgumentTypeError('Boolean value expected')
+
+
+def load_model(model, checkpoint_fp):
+    checkpoint = torch.load(checkpoint_fp, map_location=lambda storage, loc: storage)['state_dict']
+    model_dict = model.state_dict()
+    # because the model was trained on multiple GPUs, the 'module.' prefix should be removed
+    for k in checkpoint.keys():
+        kc = k.replace('module.', '')
+        if kc in model_dict.keys():
+            model_dict[kc] = checkpoint[k]
+        if kc in ['fc_param.bias', 'fc_param.weight']:
+            model_dict[kc.replace('_param', '')] = checkpoint[k]
+
+    model.load_state_dict(model_dict)
+    return model
+
+
+class ToTensorGjz(object):
+    def __call__(self, pic):
+        if isinstance(pic, np.ndarray):
+            img = torch.from_numpy(pic.transpose((2, 0, 1)))
+            return img.float()
+
+    def __repr__(self):
+        return self.__class__.__name__ + '()'
+
+
+class NormalizeGjz(object):
+    def __init__(self, mean, std):
+        self.mean = mean
+        self.std = std
+
+    def __call__(self, tensor):
+        tensor.sub_(self.mean).div_(self.std)
+        return tensor
+
+
+def similar_transform(pts3d, roi_box, size):
+    pts3d[0, :] -= 1  # for Python compatibility
+    pts3d[2, :] -= 1
+    pts3d[1, :] = size - pts3d[1, :]
+
+    sx, sy, ex, ey = roi_box
+    scale_x = (ex - sx) / size
+    scale_y = (ey - sy) / size
+    pts3d[0, :] = pts3d[0, :] * scale_x + sx
+    pts3d[1, :] = pts3d[1, :] * scale_y + sy
+    s = (scale_x + scale_y) / 2
+    pts3d[2, :] *= s
+    pts3d[2, :] -= np.min(pts3d[2, :])
+    return np.array(pts3d, dtype=np.float32)
+
+
+def _parse_param(param):
+    """matrix pose form
+    param: shape=(trans_dim+shape_dim+exp_dim,), i.e., 62 = 12 + 40 + 10
+    """
+
+    # pre-defined templates for parameter
+    n = param.shape[0]
+    if n == 62:
+        trans_dim, shape_dim, exp_dim = 12, 40, 10
+    elif n == 72:
+        trans_dim, shape_dim, exp_dim = 12, 40, 20
+    elif n == 141:
+        trans_dim, shape_dim, exp_dim = 12, 100, 29
+    else:
+        raise Exception('Undefined template param parsing rule')
+
+    R_ = param[:trans_dim].reshape(3, -1)
+    R = R_[:, :3]
+    offset = R_[:, -1].reshape(3, 1)
+    alpha_shp = param[trans_dim:trans_dim + shape_dim].reshape(-1, 1)
+    alpha_exp = param[trans_dim + shape_dim:].reshape(-1, 1)
+
+    return R, offset, alpha_shp, alpha_exp
diff --git a/tddfa/utils/uv.py b/tddfa/utils/uv.py
new file mode 100644
index 0000000000000000000000000000000000000000..84d326fa8ab78d2624ade994c5fe530d47184f13
--- /dev/null
+++ b/tddfa/utils/uv.py
@@ -0,0 +1,100 @@
+# coding: utf-8
+
+__author__ = 'cleardusk'
+
+import sys
+
+sys.path.append('..')
+
+import cv2
+import numpy as np
+import os.path as osp
+import scipy.io as sio
+
+from tddfa.Sim3DR import rasterize
+from tddfa.utils.functions import plot_image
+from tddfa.utils.io import _load
+from tddfa.utils.tddfa_util import _to_ctype
+
+make_abs_path = lambda fn: osp.join(osp.dirname(osp.realpath(__file__)), fn)
+
+
+def load_uv_coords(fp):
+    C = sio.loadmat(fp)
+    uv_coords = C['UV'].copy(order='C').astype(np.float32)
+    return uv_coords
+
+
+def process_uv(uv_coords, uv_h=256, uv_w=256):
+    uv_coords[:, 0] = uv_coords[:, 0] * (uv_w - 1)
+    uv_coords[:, 1] = uv_coords[:, 1] * (uv_h - 1)
+    uv_coords[:, 1] = uv_h - uv_coords[:, 1] - 1
+    uv_coords = np.hstack((uv_coords, np.zeros((uv_coords.shape[0], 1), dtype=np.float32)))  # add z
+    return uv_coords
+
+
+g_uv_coords = load_uv_coords(make_abs_path('../configs/BFM_UV.mat'))
+indices = _load(make_abs_path('../configs/indices.npy'))  # todo: handle bfm_slim
+g_uv_coords = g_uv_coords[indices, :]
+
+
+def get_colors(img, ver):
+    # nearest-neighbor sampling
+    [h, w, _] = img.shape
+    ver[0, :] = np.minimum(np.maximum(ver[0, :], 0), w - 1)  # x
+    ver[1, :] = np.minimum(np.maximum(ver[1, :], 0), h - 1)  # y
+    ind = np.round(ver).astype(np.int32)
+    colors = img[ind[1, :], ind[0, :], :]  # n x 3
+
+    return colors
+
+
+def bilinear_interpolate(img, x, y):
+    """
+    https://stackoverflow.com/questions/12729228/simple-efficient-bilinear-interpolation-of-images-in-numpy-and-python
+    """
+    x0 = np.floor(x).astype(np.int32)
+    x1 = x0 + 1
+    y0 = np.floor(y).astype(np.int32)
+    y1 = y0 + 1
+
+    x0 = np.clip(x0, 0, img.shape[1] - 1)
+    x1 = np.clip(x1, 0, img.shape[1] - 1)
+    y0 = np.clip(y0, 0, img.shape[0] - 1)
+    y1 = np.clip(y1, 0, img.shape[0] - 1)
+
+    i_a = img[y0, x0]
+    i_b = img[y1, x0]
+    i_c = img[y0, x1]
+    i_d = img[y1, x1]
+
+    wa = (x1 - x) * (y1 - y)
+    wb = (x1 - x) * (y - y0)
+    wc = (x - x0) * (y1 - y)
+    wd = (x - x0) * (y - y0)
+
+    return wa[..., np.newaxis] * i_a + wb[..., np.newaxis] * i_b + wc[..., np.newaxis] * i_c + wd[..., np.newaxis] * i_d
+
+
+def uv_tex(img, ver_lst, tri, uv_h=256, uv_w=256, uv_c=3, show_flag=False, wfp=None):
+    uv_coords = process_uv(g_uv_coords.copy(), uv_h=uv_h, uv_w=uv_w)
+
+    res_lst = []
+    for ver_ in ver_lst:
+        ver = _to_ctype(ver_.T)  # transpose to m x 3
+        colors = bilinear_interpolate(img, ver[:, 0], ver[:, 1]) / 255.
+        # `rasterize` here serves as texture sampling; it may need optimization
+        res = rasterize(uv_coords, tri, colors, height=uv_h, width=uv_w, channel=uv_c)
+        res_lst.append(res)
+
+    # concat if there is more than one image
+    res = np.concatenate(res_lst, axis=1) if len(res_lst) > 1 else res_lst[0]
+
+    if wfp is not None:
+        cv2.imwrite(wfp, res)
+        print(f'Save visualization result to {wfp}')
+
+    if show_flag:
+        plot_image(res)
+
+    return res
diff --git a/tddfa/weights/.gitignore b/tddfa/weights/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..0990b541fb47d1bd41e3103498c72e4d3190b0dc
--- /dev/null
+++ b/tddfa/weights/.gitignore
@@ -0,0 +1,3 @@
+checkpoints/
+*.pth
+*.onnx
\ No newline at end of file
diff --git a/tddfa/weights/readme.md b/tddfa/weights/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..9329d0c40a84535e6f74228c7e3b938e68945f7e
--- /dev/null
+++ b/tddfa/weights/readme.md
@@ -0,0 +1,8 @@
+## Pre-converted ONNX models
+
+| Model | Link |
+| :-: | :-: |
+| `mb1_120x120.onnx` | [Google Drive](https://drive.google.com/file/d/1YpO1KfXvJHRmCBkErNa62dHm-CUjsoIk/view?usp=sharing) or [Baidu Drive](https://pan.baidu.com/s/1qpQBd5KOS0-5lD6jZKXZ-Q) (Password: cqbx) |
+| `mb05_120x120.onnx` | [Google Drive](https://drive.google.com/file/d/1orJFiZPshmp7jmCx_D0tvIEtPYtnFvHS/view?usp=sharing) or [Baidu Drive](https://pan.baidu.com/s/1sRaBOA5wHu6PFS1Qd-TBFA) (Password: 8qst) |
+| `resnet22.onnx` | [Google Drive](https://drive.google.com/file/d/1rRyrd7Ar-QYTi1hRHOYHspT8PTyXQ5ds/view?usp=sharing) or [Baidu Drive](https://pan.baidu.com/s/1Nzkw7Ie_5trKvi1JYxymJA) (Password: 1op6) |
+| `resnet22.pth` | [Google Drive](https://drive.google.com/file/d/1dh7JZgkj1IaO4ZcSuBOBZl2suT9EPedV/view?usp=sharing) or [Baidu Drive](https://pan.baidu.com/s/1IS7ncVxhw0f955ySg67Y4A) (Password: lv1a) |
\ No newline at end of file
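+
+## Loading a converted model (sketch)
+
+A minimal, non-authoritative sketch of running one of the ONNX models above with `onnxruntime` (already listed in `requirements.txt`). The `1x3x120x120` input layout and the 62-dim output (12 pose + 40 shape + 10 expression parameters) are assumptions based on the `mb1_120x120` naming and `utils/tddfa_util._parse_param`:
+
+```python
+import numpy as np
+import onnxruntime
+
+# Load the pre-converted 3DMM parameter regressor on CPU.
+session = onnxruntime.InferenceSession('mb1_120x120.onnx', providers=['CPUExecutionProvider'])
+
+# One normalized 120x120 RGB crop in NCHW layout (dummy data here).
+inp = np.random.rand(1, 3, 120, 120).astype(np.float32)
+(param,) = session.run(None, {session.get_inputs()[0].name: inp})
+print(param.shape)  # expected: (1, 62)
+```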