Spaces:

ma-xu
/

LIVE

Runtime error

App Files Files Community

Xu Ma committed on Apr 24, 2022

Commit

be11144

1 Parent(s): 6afe7e5

update

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

DiffVG/CMakeLists.txt +140 -0
DiffVG/aabb.h +67 -0
DiffVG/atomic.cpp +27 -0
DiffVG/atomic.h +139 -0
DiffVG/cdf.h +29 -0
DiffVG/cmake/FindTensorFlow.cmake +34 -0
DiffVG/cmake/FindThrust.cmake +40 -0
DiffVG/color.cpp +25 -0
DiffVG/color.h +63 -0
DiffVG/compute_distance.h +949 -0
DiffVG/cuda_utils.h +53 -0
DiffVG/diffvg.cpp +1792 -0
DiffVG/diffvg.h +156 -0
DiffVG/edge_query.h +7 -0
DiffVG/filter.h +106 -0
DiffVG/matrix.h +544 -0
DiffVG/painterly_rendering.py +223 -0
DiffVG/parallel.cpp +273 -0
DiffVG/parallel.h +91 -0
DiffVG/pcg.h +40 -0
DiffVG/poetry.lock +0 -0
DiffVG/ptr.h +23 -0
DiffVG/pybind11/.appveyor.yml +37 -0
DiffVG/pybind11/.cmake-format.yaml +73 -0
DiffVG/pybind11/.github/CONTRIBUTING.md +171 -0
DiffVG/pybind11/.github/ISSUE_TEMPLATE/bug-report.md +28 -0
DiffVG/pybind11/.github/ISSUE_TEMPLATE/config.yml +5 -0
DiffVG/pybind11/.github/ISSUE_TEMPLATE/feature-request.md +16 -0
DiffVG/pybind11/.github/ISSUE_TEMPLATE/question.md +21 -0
DiffVG/pybind11/.github/workflows/ci.yml +359 -0
DiffVG/pybind11/.github/workflows/configure.yml +78 -0
DiffVG/pybind11/.github/workflows/format.yml +19 -0
DiffVG/pybind11/.gitignore +41 -0
DiffVG/pybind11/.gitmodules +3 -0
DiffVG/pybind11/.pre-commit-config.yaml +44 -0
DiffVG/pybind11/.readthedocs.yml +3 -0
DiffVG/pybind11/CMakeLists.txt +271 -0
DiffVG/pybind11/LICENSE +29 -0
DiffVG/pybind11/MANIFEST.in +2 -0
DiffVG/pybind11/README.md +143 -0
DiffVG/pybind11/docs/Doxyfile +22 -0
DiffVG/pybind11/docs/_static/theme_overrides.css +11 -0
DiffVG/pybind11/docs/advanced/cast/chrono.rst +81 -0
DiffVG/pybind11/docs/advanced/cast/custom.rst +91 -0
DiffVG/pybind11/docs/advanced/cast/eigen.rst +310 -0
DiffVG/pybind11/docs/advanced/cast/functional.rst +109 -0
DiffVG/pybind11/docs/advanced/cast/index.rst +41 -0
DiffVG/pybind11/docs/advanced/cast/overview.rst +165 -0
DiffVG/pybind11/docs/advanced/cast/stl.rst +240 -0
DiffVG/pybind11/docs/advanced/cast/strings.rst +305 -0

DiffVG/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,140 @@

+# Top-level build script for the diffvg Python extension module.
+cmake_minimum_required(VERSION 3.12)
+project(diffvg VERSION 0.0.1 DESCRIPTION "Differentiable Vector Graphics")
+# Let find_package() pick up the find modules shipped in the cmake/ directory.
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/")
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+# The minimum accepted Python version differs per platform: 3.6 on Windows,
+# 3.7 everywhere else.
+if(WIN32)
+    find_package(Python 3.6 COMPONENTS Development REQUIRED)
+else()
+    find_package(Python 3.7 COMPONENTS Development REQUIRED)
+endif()
+# pybind11 is built from the copy vendored in the pybind11/ subdirectory.
+add_subdirectory(pybind11)
+# DIFFVG_CUDA selects between the CUDA code path (default) and a CPU-only
+# build that only needs the Thrust headers.
+option(DIFFVG_CUDA "Build diffvg with GPU code path?" ON)
+if(DIFFVG_CUDA)
+    message(STATUS "Build with CUDA support")
+    find_package(CUDA 10 REQUIRED)
+    set(CMAKE_CUDA_STANDARD 11)
+    if(NOT WIN32)
+        # Hack: for some reason the line above doesn't work on some Linux systems.
+        set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -std=c++11")
+        #set(CUDA_NVCC_FLAGS_DEBUG "-g -G")
+    endif()
+else()
+    message(STATUS "Build without CUDA support")
+    find_package(Thrust REQUIRED)
+endif()
+# include_directories(${CMAKE_SOURCE_DIR}/pybind11/include)
+# NOTE(review): PYTHON_INCLUDE_PATH is used here before find_package(PythonLibs)
+# runs on the next line, so this first call presumably adds nothing on a fresh
+# configure -- confirm and drop if so.
+include_directories(${PYTHON_INCLUDE_PATH})
+find_package(PythonLibs REQUIRED)
+include_directories(${PYTHON_INCLUDE_PATH})
+include_directories(${PYTHON_INCLUDE_DIRS})
+include_directories(pybind11/include)
+if(DIFFVG_CUDA)
+    # NOTE(review): CUDA_LIBRARIES looks like a list of libraries rather than
+    # directories; verify that passing it to link_directories() is intended.
+    link_directories(${CUDA_LIBRARIES})
+else()
+    include_directories(${THRUST_INCLUDE_DIR})
+endif()
+# Directory-wide compiler flags: optimized build with debug info, applied to
+# every target defined after this point.
+if(NOT MSVC)
+  # These compile definitions are not meaningful for MSVC
+  add_compile_options(-Wall -g -O3 -fvisibility=hidden -Wno-unknown-pragmas)
+else()
+  add_compile_options(/Wall /Zi)
+  add_link_options(/DEBUG)
+endif()
+# Without CUDA, THRUST_DEVICE_SYSTEM is defined to THRUST_DEVICE_SYSTEM_CPP for
+# every translation unit.
+if(NOT DIFFVG_CUDA)
+    add_compile_options("-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CPP")
+endif()
+# Headers and sources that make up the diffvg module.
+set(SRCS atomic.h
+         color.h
+         cdf.h
+         cuda_utils.h
+         diffvg.h
+         edge_query.h
+         filter.h
+         matrix.h
+         parallel.h
+         pcg.h
+         ptr.h
+         sample_boundary.h
+         scene.h
+         shape.h
+         solve.h
+         vector.h
+         within_distance.h
+         winding_number.h
+         atomic.cpp
+         color.cpp
+         diffvg.cpp
+         parallel.cpp
+         scene.cpp
+         shape.cpp)
+# The module is built through FindCUDA's cuda_add_library() when CUDA is
+# enabled and through plain add_library() otherwise. In both cases it is a
+# MODULE library, i.e. a plugin that is loaded at run time, not linked against.
+if(DIFFVG_CUDA)
+    add_compile_definitions(COMPILE_WITH_CUDA)
+    # Mark diffvg.cpp and scene.cpp as CUDA sources even though they carry a
+    # .cpp extension, so cuda_add_library() hands them to the CUDA compiler.
+    set_source_files_properties(
+        diffvg.cpp
+        scene.cpp
+        PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ)
+    cuda_add_library(diffvg MODULE ${SRCS})
+else()
+    add_library(diffvg MODULE ${SRCS})
+endif()
+if(APPLE)
+    # The "-undefined dynamic_lookup" flag is a hack for systems with
+    # multiple Python installations. If we link against one particular
+    # Python version here and the module is later imported by a different
+    # Python version, the likely result is a segmentation fault.
+    # The solution for macOS, as mentioned in
+    # https://github.com/pybind/pybind11/blob/master/tools/pybind11Tools.cmake
+    # is to not link against the Python library at all and to let the
+    # symbols be resolved when the module is loaded.
+    set(DYNAMIC_LOOKUP "-undefined dynamic_lookup")
+endif()
+# DYNAMIC_LOOKUP is only set inside the APPLE branch above; elsewhere it
+# expands to nothing.
+target_link_libraries(diffvg ${DYNAMIC_LOOKUP})
+# On Windows the module links pybind11's interface target and takes the
+# Python-specific file prefix/suffix.
+if(WIN32)
+    # See: https://pybind11.readthedocs.io/en/master/compiling.html#advanced-interface-library-target
+    target_link_libraries(diffvg pybind11::module)
+    set_target_properties(diffvg PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}"
+                                            SUFFIX "${PYTHON_MODULE_EXTENSION}")
+endif()
+# Build with the install RPATH already in place. The RPATH is relative to the
+# module itself: $ORIGIN on Linux-like systems, @loader_path on macOS.
+set_target_properties(diffvg PROPERTIES SKIP_BUILD_RPATH FALSE)
+set_target_properties(diffvg PROPERTIES BUILD_WITH_INSTALL_RPATH TRUE)
+if(UNIX AND NOT APPLE)
+    set_target_properties(diffvg PROPERTIES INSTALL_RPATH "$ORIGIN")
+elseif(APPLE)
+    set_target_properties(diffvg PROPERTIES INSTALL_RPATH "@loader_path")
+endif()
+set_property(TARGET diffvg PROPERTY CXX_STANDARD 11)
+# Drop the default "lib" prefix from the output file name.
+# NOTE(review): this also overwrites the PREFIX assigned in the WIN32 branch
+# above -- confirm that is intended.
+set_target_properties(diffvg PROPERTIES PREFIX "")
+# Still enable assertion in release mode: strip the NDEBUG definition (in both
+# its MSVC "/D" and GCC/Clang "-D" spellings) from the optimized configurations.
+# Each variable is rewritten from its own value, so the C flags are no longer
+# overwritten with the C++ flags.
+foreach(diffvg_flags_var
+        CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_CXX_FLAGS_RELWITHDEBINFO
+        CMAKE_C_FLAGS_RELEASE
+        CMAKE_C_FLAGS_RELWITHDEBINFO)
+    string(REPLACE "/DNDEBUG" "" ${diffvg_flags_var} "${${diffvg_flags_var}}")
+    string(REPLACE "-DNDEBUG" "" ${diffvg_flags_var} "${${diffvg_flags_var}}")
+endforeach()
+# The TensorFlow custom ops are optional and are never configured on Windows.
+if(NOT WIN32)
+    find_package(TensorFlow)
+    if(TensorFlow_FOUND)
+        add_subdirectory(pydiffvg_tensorflow/custom_ops)
+    else()
+        # STATUS is the message() mode for informational output; "INFO" is not
+        # a mode keyword and used to be printed as part of the text.
+        message(STATUS "Building without TensorFlow support (not found)")
+    endif()
+endif()

DiffVG/aabb.h ADDED Viewed

	@@ -0,0 +1,67 @@

+#pragma once
+#include "diffvg.h"
+#include "cuda_utils.h"
+#include "vector.h"
+#include "matrix.h"
+// 2D axis-aligned bounding box, stored as its minimum and maximum corners.
+struct AABB {
+    // By default p_min is built from infinity<float>() and p_max from
+    // -infinity<float>(), so the min/max in merge() replace both corners as
+    // soon as a point or box is merged in.
+    DEVICE
+    inline AABB(const Vector2f &p_min = Vector2f{infinity<float>(), infinity<float>()},
+                const Vector2f &p_max = Vector2f{-infinity<float>(), -infinity<float>()})
+        : p_min(p_min), p_max(p_max) {}
+    Vector2f p_min, p_max;
+};
+// Grows `box` so that it also contains the point `p`; returns the new box.
+DEVICE
+inline
+AABB merge(const AABB &box, const Vector2f &p) {
+    const Vector2f lo{min(p.x, box.p_min.x), min(p.y, box.p_min.y)};
+    const Vector2f hi{max(p.x, box.p_max.x), max(p.y, box.p_max.y)};
+    return AABB{lo, hi};
+}
+// Returns the box spanning both `box0` and `box1` (component-wise min of the
+// minimum corners, component-wise max of the maximum corners).
+DEVICE
+inline
+AABB merge(const AABB &box0, const AABB &box1) {
+    const Vector2f lo{min(box0.p_min.x, box1.p_min.x), min(box0.p_min.y, box1.p_min.y)};
+    const Vector2f hi{max(box0.p_max.x, box1.p_max.x), max(box0.p_max.y, box1.p_max.y)};
+    return AABB{lo, hi};
+}
+// True when `p` lies inside `box`; points on the boundary count as inside.
+DEVICE
+inline
+bool inside(const AABB &box, const Vector2f &p) {
+    const bool in_x = p.x >= box.p_min.x && p.x <= box.p_max.x;
+    const bool in_y = p.y >= box.p_min.y && p.y <= box.p_max.y;
+    return in_x && in_y;
+}
+// True when `p` lies inside `box` after the box has been padded by `radius`
+// on every side; the padded boundary counts as inside.
+DEVICE
+inline
+bool inside(const AABB &box, const Vector2f &p, float radius) {
+    const bool in_x = p.x >= box.p_min.x - radius && p.x <= box.p_max.x + radius;
+    const bool in_y = p.y >= box.p_min.y - radius && p.y <= box.p_max.y + radius;
+    return in_x && in_y;
+}
+// Returns a copy of `box` pushed outwards by `width` along both axes.
+DEVICE
+inline
+AABB enlarge(const AABB &box, float width) {
+    const Vector2f lo{box.p_min.x - width, box.p_min.y - width};
+    const Vector2f hi{box.p_max.x + width, box.p_max.y + width};
+    return AABB{lo, hi};
+}
+// Returns the axis-aligned box enclosing the four corners of `box` after each
+// corner has been mapped through `xform` with xform_pt().
+DEVICE
+inline
+AABB transform(const Matrix3x3f &xform, const AABB &box) {
+    const Vector2f corners[4] = {
+        Vector2f{box.p_min.x, box.p_min.y},
+        Vector2f{box.p_min.x, box.p_max.y},
+        Vector2f{box.p_max.x, box.p_min.y},
+        Vector2f{box.p_max.x, box.p_max.y}};
+    // Start from the default (empty) box and grow it corner by corner.
+    auto bounds = AABB();
+    for (int i = 0; i < 4; i++) {
+        bounds = merge(bounds, xform_pt(xform, corners[i]));
+    }
+    return bounds;
+}
+// True when `pt` lies inside `box` padded by `r` on every side. This is the
+// same test as inside(box, pt, r), so it simply forwards to it.
+DEVICE
+inline
+bool within_distance(const AABB &box, const Vector2f &pt, float r) {
+    return inside(box, pt, r);
+}

DiffVG/atomic.cpp ADDED Viewed

	@@ -0,0 +1,27 @@

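+// A hacky solution to get around the Ellipse include: <windows.h> is only
+// included in this file, presumably because it declares a symbol named
+// Ellipse that would clash with the project's own -- TODO confirm.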
+#ifdef WIN32
+#include <windows.h>
+#include <cstdint>
+// Atomically adds `source` to `target` with a compare-and-swap retry loop and
+// returns the value `target` held before the addition.
+float win_atomic_add(float &target, float source) {
+	// The unions expose the float's bits as an int for the integer CAS below.
+	union { int i; float f; } old_val;
+	union { int i; float f; } new_val;
+	do {
+		old_val.f = target;
+		new_val.f = old_val.f + (float)source;
+		// Retry while the value found in `target` differs from the snapshot.
+	} while (InterlockedCompareExchange((LONG*)&target, (LONG)new_val.i, (LONG)old_val.i) != old_val.i);
+	return old_val.f;
+}
+// Double-precision counterpart of the float overload: atomically adds `source`
+// to `target` with a 64-bit compare-and-swap retry loop and returns the value
+// `target` held before the addition.
+double win_atomic_add(double &target, double source) {
+	// The unions expose the double's bits as an int64_t for the 64-bit CAS below.
+	union { int64_t i; double f; } old_val;
+	union { int64_t i; double f; } new_val;
+	do {
+		old_val.f = target;
+		new_val.f = old_val.f + (double)source;
+		// Retry while the value found in `target` differs from the snapshot.
+	} while (InterlockedCompareExchange64((LONG64*)&target, (LONG64)new_val.i, (LONG64)old_val.i) != old_val.i);
+	return old_val.f;
+}
+#endif

DiffVG/atomic.h ADDED Viewed

	@@ -0,0 +1,139 @@

+#pragma once
+#include "diffvg.h"
+#include "vector.h"
+#include "matrix.h"
+// https://stackoverflow.com/questions/39274472/error-function-atomicadddouble-double-has-already-been-defined
+// Fallback atomicAdd(double*, double) built on atomicCAS. Because of the empty
+// #if branch, it is only compiled when __CUDA_ARCH__ is defined and below 600.
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+#else
+// Adds `val` to `*address` and returns the value stored there beforehand.
+static inline DEVICE double atomicAdd(double *address, double val) {
+    unsigned long long int* address_as_ull = (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+    // Adding zero skips the CAS loop entirely.
+    if (val == 0.0)
+        return __longlong_as_double(old);
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val +__longlong_as_double(assumed)));
+        // Retry until the value read back matches the one the sum was based on.
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
+#endif
+#ifndef WIN32
+    // Atomically performs `target += source` and returns the value `target`
+    // held before the addition.
+    //
+    // In device code (__CUDA_ARCH__ defined) this forwards to atomicAdd. In
+    // host code it retries a compare-and-swap until no other thread modified
+    // `target` between the read and the write.
+    template <typename T0, typename T1>
+    DEVICE
+    inline T0 atomic_add_(T0 &target, T1 source) {
+    #ifdef __CUDA_ARCH__
+        return atomicAdd(&target, (T0)source);
+    #else
+        T0 old_val;
+        T0 new_val;
+        do {
+            old_val = target;
+            new_val = old_val + source;
+            // A weak exchange (4th argument) may fail spuriously; the loop
+            // retries. The memory orders use the constants defined for the
+            // __atomic builtins, so this header does not depend on <atomic>
+            // or on how a given C++ standard spells std::memory_order.
+        } while (!__atomic_compare_exchange(&target, &old_val, &new_val, true,
+            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
+        return old_val;
+    #endif
+    }
+    // Atomically adds `source` to the float `target`; returns the value
+    // `target` held before the addition.
+    DEVICE
+    inline
+    float atomic_add(float &target, float source) {
+        return atomic_add_<float, float>(target, source);
+    }
+    // Atomically adds `source` to the double `target`; returns the value
+    // `target` held before the addition.
+    DEVICE
+    inline
+    double atomic_add(double &target, double source) {
+        return atomic_add_<double, double>(target, source);
+    }
+#else
+	// Host-side implementations for Windows; only declared here.
+	float win_atomic_add(float &target, float source);
+	double win_atomic_add(double &target, double source);
+    // Atomically adds `source` to `target`: atomicAdd in device code,
+    // win_atomic_add in host code.
+    DEVICE
+    static float atomic_add(float &target, float source) {
+    #ifdef __CUDA_ARCH__
+        return atomicAdd(&target, source);
+    #else
+		return win_atomic_add(target, source);
+    #endif
+    }
+    // Double-precision counterpart of the overload above.
+    DEVICE
+    static double atomic_add(double &target, double source) {
+    #ifdef __CUDA_ARCH__
+        return atomicAdd(&target, (double)source);
+    #else
+		return win_atomic_add(target, source);
+    #endif
+    }
+#endif
+// Pointer overload: converts `source` to T0 and atomically adds it to the
+// value `target` points at, returning whatever the reference overload returns.
+template <typename T0, typename T1>
+DEVICE
+inline T0 atomic_add(T0 *target, T1 source) {
+    T0 &destination = *target;
+    const T0 increment = (T0)source;
+    return atomic_add(destination, increment);
+}
+// Atomically adds each component of `source` to the matching component of
+// `target`. Every component is updated by its own atomic operation; the
+// returned vector is a plain copy of `target` taken afterwards.
+template <typename T0, typename T1>
+DEVICE
+inline TVector2<T0> atomic_add(TVector2<T0> &target, const TVector2<T1> &source) {
+    for (int i = 0; i < 2; i++) {
+        atomic_add(target[i], source[i]);
+    }
+    return target;
+}
+// Atomically adds the two components of `source` to target[0] and target[1],
+// converting each component to T0 first.
+template <typename T0, typename T1>
+DEVICE
+inline void atomic_add(T0 *target, const TVector2<T1> &source) {
+    for (int i = 0; i < 2; i++) {
+        atomic_add(target[i], (T0)source[i]);
+    }
+}
+// Atomically adds each component of `source` to the matching component of
+// `target`. Every component is updated by its own atomic operation; the
+// returned vector is a plain copy of `target` taken afterwards.
+template <typename T0, typename T1>
+DEVICE
+inline TVector3<T0> atomic_add(TVector3<T0> &target, const TVector3<T1> &source) {
+    for (int i = 0; i < 3; i++) {
+        atomic_add(target[i], source[i]);
+    }
+    return target;
+}
+// Atomically adds the three components of `source` to target[0..2],
+// converting each component to T0 first.
+template <typename T0, typename T1>
+DEVICE
+inline void atomic_add(T0 *target, const TVector3<T1> &source) {
+    for (int i = 0; i < 3; i++) {
+        atomic_add(target[i], (T0)source[i]);
+    }
+}
+// Atomically adds each component of `source` to the matching component of
+// `target`. Every component is updated by its own atomic operation; the
+// returned vector is a plain copy of `target` taken afterwards.
+template <typename T0, typename T1>
+DEVICE
+inline TVector4<T0> atomic_add(TVector4<T0> &target, const TVector4<T1> &source) {
+    for (int i = 0; i < 4; i++) {
+        atomic_add(target[i], source[i]);
+    }
+    return target;
+}
+// Atomically adds the four components of `source` to target[0..3],
+// converting each component to T0 first.
+template <typename T0, typename T1>
+DEVICE
+inline void atomic_add(T0 *target, const TVector4<T1> &source) {
+    for (int i = 0; i < 4; i++) {
+        atomic_add(target[i], (T0)source[i]);
+    }
+}
+// Atomically adds the 3x3 matrix `source` to the nine values at `target`;
+// element (i, j) goes to target[3 * i + j], converted to T0 first.
+template <typename T0, typename T1>
+DEVICE
+inline void atomic_add(T0 *target, const TMatrix3x3<T1> &source) {
+    for (int flat = 0; flat < 9; flat++) {
+        const int row = flat / 3;
+        const int col = flat % 3;
+        atomic_add(target[flat], (T0)source(row, col));
+    }
+}

DiffVG/cdf.h ADDED Viewed

	@@ -0,0 +1,29 @@

+#pragma once
+#include "diffvg.h"
+// Picks an entry of a discrete distribution given by its CDF.
+//
+// cdf         - array of `num_entries` values, searched as a sorted sequence.
+// num_entries - number of entries in `cdf`.
+// u           - the value to look up.
+// updated_u   - optional output: `u` rescaled relative to the selected entry,
+//               (u - cdf[lb - 1]) / (cdf[lb] - cdf[lb - 1]), or u / cdf[0]
+//               for the first entry.
+//               NOTE(review): the divisor is zero when two consecutive CDF
+//               values are equal (or cdf[0] is 0) -- confirm callers avoid it.
+//
+// Returns the first index whose CDF value is greater than `u`, or
+// num_entries - 1 when there is none among the first num_entries - 1 entries.
+//
+// Declared inline, like the other header-defined functions in this project,
+// so the header can be included from several translation units without
+// multiple-definition link errors.
+DEVICE
+inline
+int sample(const float *cdf, int num_entries, float u, float *updated_u = nullptr) {
+    // Binary search the cdf
+    auto lb = 0;
+    auto len = num_entries - 1 - lb;
+    while (len > 0) {
+        auto half_len = len / 2;
+        auto mid = lb + half_len;
+        assert(mid >= 0 && mid < num_entries);
+        if (u < cdf[mid]) {
+            len = half_len;
+        } else {
+            lb = mid + 1;
+            len = len - half_len - 1;
+        }
+    }
+    lb = clamp(lb, 0, num_entries - 1);
+    if (updated_u != nullptr) {
+        if (lb > 0) {
+            *updated_u = (u - cdf[lb - 1]) / (cdf[lb] - cdf[lb - 1]);
+        } else {
+            *updated_u = u / cdf[lb];
+        }
+    }
+    return lb;
+}

DiffVG/cmake/FindTensorFlow.cmake ADDED Viewed

	@@ -0,0 +1,34 @@

+# https://github.com/PatWie/tensorflow-cmake/blob/master/cmake/modules/FindTensorFlow.cmake
+#
+# Locates TensorFlow by asking the `python` executable on PATH. Sets:
+#   TensorFlow_FOUND       - TRUE when both the library and include dir exist.
+#   TensorFlow_VERSION     - tf.__version__
+#   TensorFlow_ABI         - tf.__cxx11_abi_flag__
+#   TensorFlow_INCLUDE_DIR - tf.sysconfig.get_include()
+#   TensorFlow_LIBRARY     - library found under tf.sysconfig.get_lib()
+
+# Start from a defined "not found" state so callers never see an unset result
+# when the probe below fails.
+set(TensorFlow_FOUND FALSE)
+# The script prints four lines (version, ABI flag, include dir, library dir)
+# and exits with status 1 when TensorFlow cannot be imported.
+execute_process(
+    COMMAND python -c "exec(\"try:\\n  import tensorflow as tf; print(tf.__version__); print(tf.__cxx11_abi_flag__);print(tf.sysconfig.get_include()); print(tf.sysconfig.get_lib())\\nexcept ImportError:\\n  exit(1)\")"
+    OUTPUT_VARIABLE TF_INFORMATION_STRING
+    OUTPUT_STRIP_TRAILING_WHITESPACE
+    RESULT_VARIABLE retcode)
+if("${retcode}" STREQUAL "0")
+    # Quoted so the output is passed as one argument even if it is empty or
+    # contains a semicolon.
+    string(REPLACE "\n" ";" TF_INFORMATION_LIST "${TF_INFORMATION_STRING}")
+    list(GET TF_INFORMATION_LIST 0 TF_DETECTED_VERSION)
+    list(GET TF_INFORMATION_LIST 1 TF_DETECTED_ABI)
+    list(GET TF_INFORMATION_LIST 2 TF_DETECTED_INCLUDE_DIR)
+    list(GET TF_INFORMATION_LIST 3 TF_DETECTED_LIBRARY_DIR)
+    if(WIN32)
+        find_library(TF_DETECTED_LIBRARY NAMES _pywrap_tensorflow_internal PATHS
+            "${TF_DETECTED_LIBRARY_DIR}/python")
+    else()
+        # For some reason my tensorflow doesn't have a .so file
+        list(APPEND CMAKE_FIND_LIBRARY_SUFFIXES .so.1)
+        list(APPEND CMAKE_FIND_LIBRARY_SUFFIXES .so.2)
+        find_library(TF_DETECTED_LIBRARY NAMES tensorflow_framework PATHS
+            "${TF_DETECTED_LIBRARY_DIR}")
+    endif()
+    set(TensorFlow_VERSION "${TF_DETECTED_VERSION}")
+    set(TensorFlow_ABI "${TF_DETECTED_ABI}")
+    set(TensorFlow_INCLUDE_DIR "${TF_DETECTED_INCLUDE_DIR}")
+    set(TensorFlow_LIBRARY "${TF_DETECTED_LIBRARY}")
+    if(TensorFlow_LIBRARY AND TensorFlow_INCLUDE_DIR)
+        set(TensorFlow_FOUND TRUE)
+    else()
+        set(TensorFlow_FOUND FALSE)
+    endif()
+endif()

DiffVG/cmake/FindThrust.cmake ADDED Viewed

	@@ -0,0 +1,40 @@

+##=============================================================================
+##
+##  Copyright (c) Kitware, Inc.
+##  All rights reserved.
+##  See LICENSE.txt for details.
+##
+##  This software is distributed WITHOUT ANY WARRANTY; without even
+##  the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+##  PURPOSE.  See the above copyright notice for more information.
+##
+##  Copyright 2012 Sandia Corporation.
+##  Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+##  the U.S. Government retains certain rights in this software.
+##
+##=============================================================================
+#
+# FindThrust
+#
+# This module finds the Thrust header files and extracts their version.  It
+# sets the following variables.
+#
+# THRUST_INCLUDE_DIR -  Include directory for thrust header files.  (All header
+#                       files will actually be in the thrust subdirectory.)
+# THRUST_VERSION -      Version of thrust in the form "major.minor.patch".
+#
+# Look for thrust/version.h in the usual CUDA install locations, in the include
+# directories reported by FindCUDA (if any), and in a thrust checkout next to
+# or above the current directory.
+find_path(THRUST_INCLUDE_DIR
+	HINTS /usr/include/cuda
+	      /usr/local/include
+	      /usr/local/cuda/include
+	      ${CUDA_INCLUDE_DIRS}
+	      ./thrust
+	      ../thrust
+	NAMES thrust/version.h
+)
+# NOTE(review): only THRUST_FOUND is derived below; nothing visible in this
+# module assigns the THRUST_VERSION variable advertised in the header.
+if (THRUST_INCLUDE_DIR)
+  set(THRUST_FOUND TRUE)
+endif ()

DiffVG/color.cpp ADDED Viewed

	@@ -0,0 +1,25 @@

+#include "color.h"
+// Copies this gradient's stops into caller-provided buffers: `num_stops`
+// offsets into `stop_offsets` and 4 * `num_stops` floats into `stop_colors`.
+void LinearGradient::copy_to(ptr<float> stop_offsets,
+                             ptr<float> stop_colors) const {
+    float *dst_offsets = stop_offsets.get();
+    float *dst_colors = stop_colors.get();
+    const int num_color_floats = 4 * num_stops;
+    int idx = 0;
+    while (idx < num_stops) {
+        dst_offsets[idx] = this->stop_offsets[idx];
+        ++idx;
+    }
+    idx = 0;
+    while (idx < num_color_floats) {
+        dst_colors[idx] = this->stop_colors[idx];
+        ++idx;
+    }
+}
+// Copies this gradient's stops into caller-provided buffers: `num_stops`
+// offsets into `stop_offsets` and 4 * `num_stops` floats into `stop_colors`.
+void RadialGradient::copy_to(ptr<float> stop_offsets,
+                             ptr<float> stop_colors) const {
+    float *dst_offsets = stop_offsets.get();
+    float *dst_colors = stop_colors.get();
+    const int num_color_floats = 4 * num_stops;
+    int idx = 0;
+    while (idx < num_stops) {
+        dst_offsets[idx] = this->stop_offsets[idx];
+        ++idx;
+    }
+    idx = 0;
+    while (idx < num_color_floats) {
+        dst_colors[idx] = this->stop_colors[idx];
+        ++idx;
+    }
+}

DiffVG/color.h ADDED Viewed

	@@ -0,0 +1,63 @@

+#pragma once
+#include "diffvg.h"
+#include "vector.h"
+#include "ptr.h"
+// Tag naming the kind of color description; the enumerators match the names
+// of the Constant, LinearGradient and RadialGradient structs in this header.
+enum class ColorType {
+    Constant,
+    LinearGradient,
+    RadialGradient
+};
+// A single color stored as a 4-component vector.
+struct Constant {
+    Vector4f color;
+    // Returns this object's address wrapped in an untyped ptr<void>.
+    ptr<void> get_ptr() {
+        return ptr<void>(this);
+    }
+};
+// Gradient described by a begin point, an end point and a list of color stops.
+struct LinearGradient {
+    // Stores the raw pointers held by `stop_offsets` and `stop_colors`; the
+    // stop data itself is not copied here.
+    LinearGradient(const Vector2f &begin,
+                   const Vector2f &end,
+                   int num_stops,
+                   ptr<float> stop_offsets,
+                   ptr<float> stop_colors)
+        : begin(begin), end(end), num_stops(num_stops),
+          stop_offsets(stop_offsets.get()), stop_colors(stop_colors.get()) {}
+    // Returns this object's address wrapped in an untyped ptr<void>.
+    ptr<void> get_ptr() {
+        return ptr<void>(this);
+    }
+    // Copies the stop offsets and stop colors into the given buffers
+    // (defined in color.cpp).
+    void copy_to(ptr<float> stop_offset,
+                 ptr<float> stop_colors) const;
+    Vector2f begin, end;
+    int num_stops;
+    float *stop_offsets; // num_stops entries are read by copy_to
+    float *stop_colors; // rgba
+};
+// Gradient described by a center, a 2D radius and a list of color stops.
+struct RadialGradient {
+    // Stores the raw pointers held by `stop_offsets` and `stop_colors`; the
+    // stop data itself is not copied here.
+    RadialGradient(const Vector2f &center,
+                   const Vector2f &radius,
+                   int num_stops,
+                   ptr<float> stop_offsets,
+                   ptr<float> stop_colors)
+        : center(center), radius(radius), num_stops(num_stops),
+          stop_offsets(stop_offsets.get()), stop_colors(stop_colors.get()) {}
+    // Returns this object's address wrapped in an untyped ptr<void>.
+    ptr<void> get_ptr() {
+        return ptr<void>(this);
+    }
+    // Copies the stop offsets and stop colors into the given buffers
+    // (defined in color.cpp).
+    void copy_to(ptr<float> stop_offset,
+                 ptr<float> stop_colors) const;
+    Vector2f center, radius;
+    int num_stops;
+    float *stop_offsets; // num_stops entries are read by copy_to
+    float *stop_colors; // rgba
+};

DiffVG/compute_distance.h ADDED Viewed

	@@ -0,0 +1,949 @@

+#pragma once
+#include "diffvg.h"
+#include "edge_query.h"
+#include "scene.h"
+#include "shape.h"
+#include "solve.h"
+#include "vector.h"
+#include <cassert>
+// Describes where on a path the closest point was found; filled in by the
+// Path overload of closest_point().
+struct ClosestPointPathInfo {
+    int base_point_id; // index of the path segment that holds the closest point
+    int point_id;      // index of that segment's first point in path.points
+    float t_root;      // curve parameter of the closest point on that segment
+};
+// Writes to `*result` the point reached by stepping `circle.radius` from the
+// center along the normalized direction towards `pt`.
+// NOTE(review): this overload always returns false, whereas the Path overload
+// tracks a `found` flag -- confirm with the callers what the return value is
+// meant to signal here.
+DEVICE
+inline
+bool closest_point(const Circle &circle, const Vector2f &pt,
+                   Vector2f *result) {
+    auto direction = normalize(pt - circle.center);
+    *result = circle.center + circle.radius * direction;
+    return false;
+}
+DEVICE
+inline
+bool closest_point(const Path &path, const BVHNode *bvh_nodes, const Vector2f &pt, float max_radius,
+                   ClosestPointPathInfo *path_info,
+                   Vector2f *result) {
+    auto min_dist = max_radius;
+    auto ret_pt = Vector2f{0, 0};
+    auto found = false;
+    auto num_segments = path.num_base_points;
+    constexpr auto max_bvh_size = 128;
+    int bvh_stack[max_bvh_size];
+    auto stack_size = 0;
+    bvh_stack[stack_size++] = 2 * num_segments - 2;
+    while (stack_size > 0) {
+        const BVHNode &node = bvh_nodes[bvh_stack[--stack_size]];
+        if (node.child1 < 0) {
+            // leaf
+            auto base_point_id = node.child0;
+            auto point_id = - node.child1 - 1;
+            assert(base_point_id < num_segments);
+            assert(point_id < path.num_points);
+            auto dist = 0.f;
+            auto closest_pt = Vector2f{0, 0};
+            auto t_root = 0.f;
+            if (path.num_control_points[base_point_id] == 0) {
+                // Straight line
+                auto i0 = point_id;
+                auto i1 = (point_id + 1) % path.num_points;
+                auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+                auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+                // project pt to line
+                auto t = dot(pt - p0, p1 - p0) / dot(p1 - p0, p1 - p0);
+                if (t < 0) {
+                    dist = distance(p0, pt);
+                    closest_pt = p0;
+                    t_root = 0;
+                } else if (t > 1) {
+                    dist = distance(p1, pt);
+                    closest_pt = p1;
+                    t_root = 1;
+                } else {
+                    dist = distance(p0 + t * (p1 - p0), pt);
+                    closest_pt = p0 + t * (p1 - p0);
+                    t_root = t;
+                }
+            } else if (path.num_control_points[base_point_id] == 1) {
+                // Quadratic Bezier curve
+                auto i0 = point_id;
+                auto i1 = point_id + 1;
+                auto i2 = (point_id + 2) % path.num_points;
+                auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+                auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+                auto p2 = Vector2f{path.points[2 * i2], path.points[2 * i2 + 1]};
+                if (path.use_distance_approx) {
+                    closest_pt = quadratic_closest_pt_approx(p0, p1, p2, pt, &t_root);
+                    dist = distance(closest_pt, pt);
+                } else {
+                    auto eval = [&](float t) -> Vector2f {
+                        auto tt = 1 - t;
+                        return (tt*tt)*p0 + (2*tt*t)*p1 + (t*t)*p2;
+                    };
+                    auto pt0 = eval(0);
+                    auto pt1 = eval(1);
+                    auto dist0 = distance(pt0, pt);
+                    auto dist1 = distance(pt1, pt);
+                    {
+                        dist = dist0;
+                        closest_pt = pt0;
+                        t_root = 0;
+                    }
+                    if (dist1 < dist) {
+                        dist = dist1;
+                        closest_pt = pt1;
+                        t_root = 1;
+                    }
+                    // The curve is (1-t)^2p0 + 2(1-t)tp1 + t^2p2
+                    // = (p0-2p1+p2)t^2+(-2p0+2p1)t+p0 = q
+                    // Want to solve (q - pt) dot q' = 0
+                    // q' = (p0-2p1+p2)t + (-p0+p1)
+                    // Expanding (p0-2p1+p2)^2 t^3 +
+                    //           3(p0-2p1+p2)(-p0+p1) t^2 +
+                    //           (2(-p0+p1)^2+(p0-2p1+p2)(p0-pt))t +
+                    //           (-p0+p1)(p0-pt) = 0
+                    auto A = sum((p0-2*p1+p2)*(p0-2*p1+p2));
+                    auto B = sum(3*(p0-2*p1+p2)*(-p0+p1));
+                    auto C = sum(2*(-p0+p1)*(-p0+p1)+(p0-2*p1+p2)*(p0-pt));
+                    auto D = sum((-p0+p1)*(p0-pt));
+                    float t[3];
+                    int num_sol = solve_cubic(A, B, C, D, t);
+                    for (int j = 0; j < num_sol; j++) {
+                        if (t[j] >= 0 && t[j] <= 1) {
+                            auto p = eval(t[j]);
+                            auto distp = distance(p, pt);
+                            if (distp < dist) {
+                                dist = distp;
+                                closest_pt = p;
+                                t_root = t[j];
+                            }
+                        }
+                    }
+                }
+            } else if (path.num_control_points[base_point_id] == 2) {
+                // Cubic Bezier curve
+                auto i0 = point_id;
+                auto i1 = point_id + 1;
+                auto i2 = point_id + 2;
+                auto i3 = (point_id + 3) % path.num_points;
+                auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+                auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+                auto p2 = Vector2f{path.points[2 * i2], path.points[2 * i2 + 1]};
+                auto p3 = Vector2f{path.points[2 * i3], path.points[2 * i3 + 1]};
+                auto eval = [&](float t) -> Vector2f {
+                    auto tt = 1 - t;
+                    return (tt*tt*tt)*p0 + (3*tt*tt*t)*p1 + (3*tt*t*t)*p2 + (t*t*t)*p3;
+                };
+                auto pt0 = eval(0);
+                auto pt1 = eval(1);
+                auto dist0 = distance(pt0, pt);
+                auto dist1 = distance(pt1, pt);
+                {
+                    dist = dist0;
+                    closest_pt = pt0;
+                    t_root = 0;
+                }
+                if (dist1 < dist) {
+                    dist = dist1;
+                    closest_pt = pt1;
+                    t_root = 1;
+                }
+                // The curve is (1 - t)^3 p0 + 3 * (1 - t)^2 t p1 + 3 * (1 - t) t^2 p2 + t^3 p3
+                // = (-p0+3p1-3p2+p3) t^3 + (3p0-6p1+3p2) t^2 + (-3p0+3p1) t + p0
+                // Want to solve (q - pt) dot q' = 0
+                // q' = 3*(-p0+3p1-3p2+p3)t^2 + 2*(3p0-6p1+3p2)t + (-3p0+3p1)
+                // Expanding
+                // 3*(-p0+3p1-3p2+p3)^2 t^5
+                // 5*(-p0+3p1-3p2+p3)(3p0-6p1+3p2) t^4
+                // 4*(-p0+3p1-3p2+p3)(-3p0+3p1) + 2*(3p0-6p1+3p2)^2 t^3
+                // 3*(3p0-6p1+3p2)(-3p0+3p1) + 3*(-p0+3p1-3p2+p3)(p0-pt) t^2
+                // (-3p0+3p1)^2+2(p0-pt)(3p0-6p1+3p2) t
+                // (p0-pt)(-3p0+3p1)
+                double A = 3*sum((-p0+3*p1-3*p2+p3)*(-p0+3*p1-3*p2+p3));
+                double B = 5*sum((-p0+3*p1-3*p2+p3)*(3*p0-6*p1+3*p2));
+                double C = 4*sum((-p0+3*p1-3*p2+p3)*(-3*p0+3*p1)) + 2*sum((3*p0-6*p1+3*p2)*(3*p0-6*p1+3*p2));
+                double D = 3*(sum((3*p0-6*p1+3*p2)*(-3*p0+3*p1)) + sum((-p0+3*p1-3*p2+p3)*(p0-pt)));
+                double E = sum((-3*p0+3*p1)*(-3*p0+3*p1)) + 2*sum((p0-pt)*(3*p0-6*p1+3*p2));
+                double F = sum((p0-pt)*(-3*p0+3*p1));
+                // normalize the polynomial
+                B /= A;
+                C /= A;
+                D /= A;
+                E /= A;
+                F /= A;
+                // Isolator Polynomials:
+                // https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.133.2233&rep=rep1&type=pdf
+                //                                       x/5 + B/25
+                //                                    /-----------------------------------------------------
+                // 5x^4 + 4B x^3 + 3C x^2 + 2D x + E /   x^5 +    B x^4 +       C x^3 +      D x^2 +      E x + F
+                //                                       x^5 + 4B/5 x^4 +    3C/5 x^3 +   2D/5 x^2 +    E/5 x
+                //                                      ----------------------------------------------------
+                //                                              B/5 x^4 +    2C/5 x^3 +   3D/5 x^2 +   4E/5 x + F
+                //                                              B/5 x^4 + 4B^2/25 x^3 + 3BC/25 x^2 + 2BD/25 x + BE/25
+                //                                      ----------------------------------------------------
+                //                                     (2C/5 - 4B^2/25)x^3 + (3D/5-3BC/25)x^2 + (4E/5-2BD/25) + (F-BE/25)
+                auto p1A = ((2 / 5.f) * C - (4 / 25.f) * B * B);
+                auto p1B = ((3 / 5.f) * D - (3 / 25.f) * B * C);
+                auto p1C = ((4 / 5.f) * E - (2 / 25.f) * B * D);
+                auto p1D = F - B * E / 25.f;
+                // auto q1A = 1 / 5.f;
+                // auto q1B = B / 25.f;
+                // x/5 + B/25 = 0
+                // x = -B/5
+                auto q_root = -B/5.f;
+                double p_roots[3];
+                int num_sol = solve_cubic(p1A, p1B, p1C, p1D, p_roots);
+                float intervals[4];
+                if (q_root >= 0 && q_root <= 1) {
+                    intervals[0] = q_root;
+                }
+                for (int j = 0; j < num_sol; j++) {
+                    intervals[j + 1] = p_roots[j];
+                }
+                auto num_intervals = 1 + num_sol;
+                // sort intervals
+                for (int j = 1; j < num_intervals; j++) {
+                    for (int k = j; k > 0 && intervals[k - 1] > intervals[k]; k--) {
+                        auto tmp = intervals[k];
+                        intervals[k] = intervals[k - 1];
+                        intervals[k - 1] = tmp;
+                    }
+                }
+                auto eval_polynomial = [&] (double t) {
+                    return t*t*t*t*t+
+                           B*t*t*t*t+
+                           C*t*t*t+
+                           D*t*t+
+                           E*t+
+                           F;
+                };
+                auto eval_polynomial_deriv = [&] (double t) {
+                    return 5*t*t*t*t+
+                           4*B*t*t*t+
+                           3*C*t*t+
+                           2*D*t+
+                           E;
+                };
+                auto lower_bound = 0.f;
+                for (int j = 0; j < num_intervals + 1; j++) {
+                    if (j < num_intervals && intervals[j] < 0.f) {
+                        continue;
+                    }
+                    auto upper_bound = j < num_intervals ?
+                        min(intervals[j], 1.f) : 1.f;
+                    auto lb = lower_bound;
+                    auto ub = upper_bound;
+                    auto lb_eval = eval_polynomial(lb);
+                    auto ub_eval = eval_polynomial(ub);
+                    if (lb_eval * ub_eval > 0) {
+                        // Doesn't have root
+                        continue;
+                    }
+                    if (lb_eval > ub_eval) {
+                        swap_(lb, ub);
+                    }
+                    auto t = 0.5f * (lb + ub);
+                    auto num_iter = 20;
+                    for (int it = 0; it < num_iter; it++) {
+                        if (!(t >= lb && t <= ub)) {
+                            t = 0.5f * (lb + ub);
+                        }
+                        auto value = eval_polynomial(t);
+                        if (fabs(value) < 1e-5f || it == num_iter - 1) {
+                            break;
+                        }
+                        // The derivative may not be entirely accurate,
+                        // but the bisection is going to handle this
+                        if (value > 0.f) {
+                            ub = t;
+                        } else {
+                            lb = t;
+                        }
+                        auto derivative = eval_polynomial_deriv(t);
+                        t -= value / derivative;
+                    }
+                    auto p = eval(t);
+                    auto distp = distance(p, pt);
+                    if (distp < dist) {
+                        dist = distp;
+                        closest_pt = p;
+                        t_root = t;
+                    }
+                    if (upper_bound >= 1.f) {
+                        break;
+                    }
+                    lower_bound = upper_bound;
+                }
+            } else {
+                assert(false);
+            }
+            if (dist < min_dist) {
+                min_dist = dist;
+                ret_pt = closest_pt;
+                path_info->base_point_id = base_point_id;
+                path_info->point_id = point_id;
+                path_info->t_root = t_root;
+                found = true;
+            }
+        } else {
+            assert(node.child0 >= 0 && node.child1 >= 0);
+            const AABB &b0 = bvh_nodes[node.child0].box;
+            if (within_distance(b0, pt, min_dist)) {
+                bvh_stack[stack_size++] = node.child0;
+            }
+            const AABB &b1 = bvh_nodes[node.child1].box;
+            if (within_distance(b1, pt, min_dist)) {
+                bvh_stack[stack_size++] = node.child1;
+            }
+            assert(stack_size <= max_bvh_size);
+        }
+    }
+    if (found) {
+        assert(path_info->base_point_id < num_segments);
+    }
+    *result = ret_pt;
+    return found;
+}
+// Finds the point on the outline of an axis-aligned rectangle that is closest
+// to pt, by testing the four edges (left, top, right, bottom) in turn.
+//   rect   - rectangle described by its p_min / p_max corners
+//   pt     - query point, in the same space as rect
+//   result - receives the closest point on the outline
+// Always returns true.
+DEVICE
+inline
+bool closest_point(const Rect &rect, const Vector2f &pt,
+                   Vector2f *result) {
+    auto min_dist = 0.f;
+    auto closest_pt = Vector2f{0, 0};
+    // Considers the segment [p0, p1]; `first` forces the candidate to be
+    // accepted so that min_dist / closest_pt are initialized by the first edge.
+    auto update = [&](const Vector2f &p0, const Vector2f &p1, bool first) {
+        // project pt to line
+        auto edge = p1 - p0;
+        auto len_sq = dot(edge, edge);
+        // A zero-length edge (degenerate rectangle) would give 0/0 = NaN here;
+        // treat it as the single point p0 instead.
+        auto t = len_sq > 0 ? dot(pt - p0, edge) / len_sq : 0.f;
+        if (t < 0) {
+            // Projection falls before p0: the nearest point is the endpoint p0.
+            auto d = distance(p0, pt);
+            if (first || d < min_dist) {
+                min_dist = d;
+                closest_pt = p0;
+            }
+        } else if (t > 1) {
+            // Projection falls past p1: the nearest point is the endpoint p1.
+            auto d = distance(p1, pt);
+            if (first || d < min_dist) {
+                min_dist = d;
+                closest_pt = p1;
+            }
+        } else {
+            // Projection lies on the segment: the nearest point is the
+            // projection itself (not the endpoint p0).
+            auto p = p0 + t * edge;
+            auto d = distance(p, pt);
+            if (first || d < min_dist) {
+                min_dist = d;
+                closest_pt = p;
+            }
+        }
+    };
+    auto left_top = rect.p_min;
+    auto right_top = Vector2f{rect.p_max.x, rect.p_min.y};
+    auto left_bottom = Vector2f{rect.p_min.x, rect.p_max.y};
+    auto right_bottom = rect.p_max;
+    update(left_top, left_bottom, true);
+    update(left_top, right_top, false);
+    update(right_top, right_bottom, false);
+    update(left_bottom, right_bottom, false);
+    *result = closest_pt;
+    return true;
+}
+// Shape-type dispatcher for the closest-point query: forwards to the overload
+// matching shape.type and returns its result. bvh_nodes, max_radius and
+// path_info are only forwarded to the Path overload.
+DEVICE
+inline
+bool closest_point(const Shape &shape, const BVHNode *bvh_nodes, const Vector2f &pt, float max_radius,
+                   ClosestPointPathInfo *path_info,
+                   Vector2f *result) {
+    if (shape.type == ShapeType::Circle) {
+        const auto &circle = *(const Circle *)shape.ptr;
+        return closest_point(circle, pt, result);
+    }
+    if (shape.type == ShapeType::Path) {
+        const auto &path = *(const Path *)shape.ptr;
+        return closest_point(path, bvh_nodes, pt, max_radius, path_info, result);
+    }
+    if (shape.type == ShapeType::Rect) {
+        const auto &rect = *(const Rect *)shape.ptr;
+        return closest_point(rect, pt, result);
+    }
+    // Ellipses are not implemented, see
+    // https://www.geometrictools.com/Documentation/DistancePointEllipseEllipsoid.pdf
+    // Any other shape type is unexpected as well.
+    assert(false);
+    return false;
+}
+// Computes the distance from pt (canvas space) to the nearest shape of one
+// shape group by traversing the group's BVH.
+//   scene          - scene data (shape groups, shapes, per-group and per-path BVHs)
+//   shape_group_id - index of the shape group to query
+//   pt             - query point in canvas space
+//   max_radius     - search radius used for BVH pruning and as the initial distance
+//   min_shape_id   - optional out: id of the closest shape
+//   closest_pt_    - optional out: closest point in canvas space
+//   path_info      - optional out: path segment information of the closest point
+//   result         - out: the distance found, or max_radius if nothing was found
+// Returns true if a closest point was found.
+DEVICE
+inline
+bool compute_distance(const SceneData &scene,
+                      int shape_group_id,
+                      const Vector2f &pt,
+                      float max_radius,
+                      int *min_shape_id,
+                      Vector2f *closest_pt_,
+                      ClosestPointPathInfo *path_info,
+                      float *result) {
+    const ShapeGroup &shape_group = scene.shape_groups[shape_group_id];
+    if (shape_group.num_shapes <= 0) {
+        // An empty group would make the root index below negative.
+        // Report "nothing found" exactly like an unsuccessful traversal.
+        *result = max_radius;
+        return false;
+    }
+    // pt is in canvas space, transform it to shape's local space
+    auto local_pt = xform_pt(shape_group.canvas_to_shape, pt);
+    constexpr auto max_bvh_stack_size = 64;
+    int bvh_stack[max_bvh_stack_size];
+    auto stack_size = 0;
+    // The root is stored at index 2 * num_shapes - 2.
+    bvh_stack[stack_size++] = 2 * shape_group.num_shapes - 2;
+    const auto &bvh_nodes = scene.shape_groups_bvh_nodes[shape_group_id];
+    auto min_dist = max_radius;
+    auto found = false;
+    while (stack_size > 0) {
+        const BVHNode &node = bvh_nodes[bvh_stack[--stack_size]];
+        if (node.child1 < 0) {
+            // leaf: child0 holds the shape id
+            auto shape_id = node.child0;
+            const auto &shape = scene.shapes[shape_id];
+            ClosestPointPathInfo local_path_info{-1, -1};
+            auto local_closest_pt = Vector2f{0, 0};
+            if (closest_point(shape, scene.path_bvhs[shape_id], local_pt, max_radius, &local_path_info, &local_closest_pt)) {
+                // Distances are compared in canvas space.
+                auto closest_pt = xform_pt(shape_group.shape_to_canvas, local_closest_pt);
+                auto dist = distance(closest_pt, pt);
+                if (!found || dist < min_dist) {
+                    found = true;
+                    min_dist = dist;
+                    if (min_shape_id != nullptr) {
+                        *min_shape_id = shape_id;
+                    }
+                    if (closest_pt_ != nullptr) {
+                        *closest_pt_ = closest_pt;
+                    }
+                    if (path_info != nullptr) {
+                        *path_info = local_path_info;
+                    }
+                }
+            }
+        } else {
+            assert(node.child0 >= 0 && node.child1 >= 0);
+            // Check the capacity before each push so that an overflow is
+            // caught before the out-of-bounds write, not after it.
+            const AABB &b0 = bvh_nodes[node.child0].box;
+            if (inside(b0, local_pt, max_radius)) {
+                assert(stack_size < max_bvh_stack_size);
+                bvh_stack[stack_size++] = node.child0;
+            }
+            const AABB &b1 = bvh_nodes[node.child1].box;
+            if (inside(b1, local_pt, max_radius)) {
+                assert(stack_size < max_bvh_stack_size);
+                bvh_stack[stack_size++] = node.child1;
+            }
+        }
+    }
+    *result = min_dist;
+    return found;
+}
+// Backward pass of the circle closest-point query, whose forward form is
+//   closest_pt = circle.center + circle.radius * normalize(pt - circle.center)
+// Accumulates into d_circle.center and d_circle.radius with atomic_add.
+// d_pt is not written by this overload.
+// NOTE(review): the center gradient is formed as an element-wise product
+// d_closest_pt * (1 + d_normalize(...)); if d_normalize returns the gradient
+// with respect to its first argument, the chain rule would suggest
+// d_closest_pt - d_normalize(...) instead (and a matching d_pt term) -
+// confirm against the definition of d_normalize.
+DEVICE
+inline
+void d_closest_point(const Circle &circle,
+                     const Vector2f &pt,
+                     const Vector2f &d_closest_pt,
+                     Circle &d_circle,
+                     Vector2f &d_pt) {
+    auto scaled_grad = circle.radius * d_closest_pt;
+    auto d_direction = d_normalize(pt - circle.center, scaled_grad);
+    auto d_center = d_closest_pt * (1 + d_direction);
+    atomic_add(&d_circle.center.x, d_center);
+    // d closest_pt / d radius is the unit direction from the center to pt.
+    auto radial_dir = normalize(pt - circle.center);
+    atomic_add(&d_circle.radius, dot(d_closest_pt, radial_dir));
+}
+// Backward pass of the path closest-point query.
+//   path         - the path that was queried
+//   pt           - query point (same space as the path points)
+//   d_closest_pt - gradient with respect to the returned closest point
+//   path_info    - segment found by the forward pass: base_point_id indexes
+//                  path.num_control_points, point_id is the index of the
+//                  segment's first point, t_root is the curve parameter
+//   d_path       - gradients are accumulated into d_path.points (atomic_add)
+//   d_pt         - gradient with respect to pt; written only by the curved
+//                  branches, through the implicit dependence of t on pt
+// The segment type is selected by the number of control points:
+// 0 = straight line, 1 = quadratic Bezier, 2 = cubic Bezier.
+DEVICE
+inline
+void d_closest_point(const Path &path,
+                     const Vector2f &pt,
+                     const Vector2f &d_closest_pt,
+                     const ClosestPointPathInfo &path_info,
+                     Path &d_path,
+                     Vector2f &d_pt) {
+    auto base_point_id = path_info.base_point_id;
+    auto point_id = path_info.point_id;
+    auto min_t_root = path_info.t_root;
+    if (path.num_control_points[base_point_id] == 0) {
+        // Straight line
+        auto i0 = point_id;
+        auto i1 = (point_id + 1) % path.num_points;
+        auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+        auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+        // project pt to line
+        auto t = dot(pt - p0, p1 - p0) / dot(p1 - p0, p1 - p0);
+        auto d_p0 = Vector2f{0, 0};
+        auto d_p1 = Vector2f{0, 0};
+        if (t < 0) {
+            d_p0 += d_closest_pt;
+        } else if (t > 1) {
+            d_p1 += d_closest_pt;
+        } else {
+            auto d_p = d_closest_pt;
+            // p = p0 + t * (p1 - p0)
+            d_p0 += d_p * (1 - t);
+            d_p1 += d_p * t;
+        }
+        atomic_add(d_path.points + 2 * i0, d_p0);
+        atomic_add(d_path.points + 2 * i1, d_p1);
+    } else if (path.num_control_points[base_point_id] == 1) {
+        // Quadratic Bezier curve
+        auto i0 = point_id;
+        auto i1 = point_id + 1;
+        auto i2 = (point_id + 2) % path.num_points;
+        auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+        auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+        auto p2 = Vector2f{path.points[2 * i2], path.points[2 * i2 + 1]};
+        auto d_p0 = Vector2f{0, 0};
+        auto d_p1 = Vector2f{0, 0};
+        auto d_p2 = Vector2f{0, 0};
+        auto t = min_t_root;
+        if (t == 0) {
+            // closest_pt = p0
+            d_p0 += d_closest_pt;
+        } else if (t == 1) {
+            // closest_pt = p2
+            d_p2 += d_closest_pt;
+        } else {
+            // The curve is (1-t)^2p0 + 2(1-t)tp1 + t^2p2
+            // = (p0-2p1+p2)t^2+(-2p0+2p1)t+p0 = q
+            // Want to solve (q - pt) dot q' = 0
+            // q' = (p0-2p1+p2)t + (-p0+p1)
+            // Expanding (p0-2p1+p2)^2 t^3 +
+            //           3(p0-2p1+p2)(-p0+p1) t^2 +
+            //           (2(-p0+p1)^2+(p0-2p1+p2)(p0-pt))t +
+            //           (-p0+p1)(p0-pt) = 0
+            auto A = sum((p0-2*p1+p2)*(p0-2*p1+p2));
+            auto B = sum(3*(p0-2*p1+p2)*(-p0+p1));
+            auto C = sum(2*(-p0+p1)*(-p0+p1)+(p0-2*p1+p2)*(p0-pt));
+            // D = sum((-p0+p1)*(p0-pt)) is not needed as a value here.
+            auto d_p = d_closest_pt;
+            // p = eval(t)
+            auto tt = 1 - t;
+            // (tt*tt)*p0 + (2*tt*t)*p1 + (t*t)*p2
+            auto d_tt = 2 * tt * dot(d_p, p0) + 2 * t * dot(d_p, p1);
+            auto d_t = -d_tt + 2 * tt * dot(d_p, p1) + 2 * t * dot(d_p, p2);
+            // Accumulate into the outer d_p0/d_p1/d_p2. Declaring new
+            // variables here would shadow them and the gradients would never
+            // reach the atomic_add calls below.
+            d_p0 += d_p * tt * tt;
+            d_p1 += 2 * d_p * tt * t;
+            d_p2 += d_p * t * t;
+            // implicit function theorem: dt/dA = -1/(p'(t)) * dp/dA
+            auto poly_deriv_t = 3 * A * t * t + 2 * B * t + C;
+            if (fabs(poly_deriv_t) > 1e-6f) {
+                auto d_A = - (d_t / poly_deriv_t) * t * t * t;
+                auto d_B = - (d_t / poly_deriv_t) * t * t;
+                auto d_C = - (d_t / poly_deriv_t) * t;
+                auto d_D = - (d_t / poly_deriv_t);
+                // A = sum((p0-2*p1+p2)*(p0-2*p1+p2))
+                // B = sum(3*(p0-2*p1+p2)*(-p0+p1))
+                // C = sum(2*(-p0+p1)*(-p0+p1)+(p0-2*p1+p2)*(p0-pt))
+                // D = sum((-p0+p1)*(p0-pt))
+                // dD/dp0 = -(p0-pt) + (-p0+p1), i.e. the D term has coefficient 1.
+                d_p0 += 2*d_A*(p0-2*p1+p2)+
+                        3*d_B*((-p0+p1)-(p0-2*p1+p2))+
+                        2*d_C*(-2*(-p0+p1))+
+                          d_C*((p0-pt)+(p0-2*p1+p2))+
+                        d_D*(-(p0-pt)+(-p0+p1));
+                d_p1 += (-2)*2*d_A*(p0-2*p1+p2)+
+                        3*d_B*(-2*(-p0+p1)+(p0-2*p1+p2))+
+                        2*d_C*(2*(-p0+p1))+
+                          d_C*((-2)*(p0-pt))+
+                        d_D*(p0-pt);
+                d_p2 += 2*d_A*(p0-2*p1+p2)+
+                        3*d_B*(-p0+p1)+
+                        d_C*(p0-pt);
+                d_pt += d_C*(-(p0-2*p1+p2))+
+                        d_D*(-(-p0+p1));
+            }
+        }
+        atomic_add(d_path.points + 2 * i0, d_p0);
+        atomic_add(d_path.points + 2 * i1, d_p1);
+        atomic_add(d_path.points + 2 * i2, d_p2);
+    } else if (path.num_control_points[base_point_id] == 2) {
+        // Cubic Bezier curve
+        auto i0 = point_id;
+        auto i1 = point_id + 1;
+        auto i2 = point_id + 2;
+        auto i3 = (point_id + 3) % path.num_points;
+        auto p0 = Vector2f{path.points[2 * i0], path.points[2 * i0 + 1]};
+        auto p1 = Vector2f{path.points[2 * i1], path.points[2 * i1 + 1]};
+        auto p2 = Vector2f{path.points[2 * i2], path.points[2 * i2 + 1]};
+        auto p3 = Vector2f{path.points[2 * i3], path.points[2 * i3 + 1]};
+        auto d_p0 = Vector2f{0, 0};
+        auto d_p1 = Vector2f{0, 0};
+        auto d_p2 = Vector2f{0, 0};
+        auto d_p3 = Vector2f{0, 0};
+        auto t = min_t_root;
+        if (t == 0) {
+            // closest_pt = p0
+            d_p0 += d_closest_pt;
+        } else if (t == 1) {
+            // closest_pt = p3
+            d_p3 += d_closest_pt;
+        } else {
+            // The curve is (1 - t)^3 p0 + 3 * (1 - t)^2 t p1 + 3 * (1 - t) t^2 p2 + t^3 p3
+            // = (-p0+3p1-3p2+p3) t^3 + (3p0-6p1+3p2) t^2 + (-3p0+3p1) t + p0
+            // Want to solve (q - pt) dot q' = 0
+            // q' = 3*(-p0+3p1-3p2+p3)t^2 + 2*(3p0-6p1+3p2)t + (-3p0+3p1)
+            // Expanding
+            // 3*(-p0+3p1-3p2+p3)^2 t^5
+            // 5*(-p0+3p1-3p2+p3)(3p0-6p1+3p2) t^4
+            // 4*(-p0+3p1-3p2+p3)(-3p0+3p1) + 2*(3p0-6p1+3p2)^2 t^3
+            // 3*(3p0-6p1+3p2)(-3p0+3p1) + 3*(-p0+3p1-3p2+p3)(p0-pt) t^2
+            // (-3p0+3p1)^2+2(p0-pt)(3p0-6p1+3p2) t
+            // (p0-pt)(-3p0+3p1)
+            double A = 3*sum((-p0+3*p1-3*p2+p3)*(-p0+3*p1-3*p2+p3));
+            double B = 5*sum((-p0+3*p1-3*p2+p3)*(3*p0-6*p1+3*p2));
+            double C = 4*sum((-p0+3*p1-3*p2+p3)*(-3*p0+3*p1)) + 2*sum((3*p0-6*p1+3*p2)*(3*p0-6*p1+3*p2));
+            double D = 3*(sum((3*p0-6*p1+3*p2)*(-3*p0+3*p1)) + sum((-p0+3*p1-3*p2+p3)*(p0-pt)));
+            double E = sum((-3*p0+3*p1)*(-3*p0+3*p1)) + 2*sum((p0-pt)*(3*p0-6*p1+3*p2));
+            double F = sum((p0-pt)*(-3*p0+3*p1));
+            // Normalize to a monic polynomial t^5 + B t^4 + C t^3 + D t^2 + E t + F.
+            B /= A;
+            C /= A;
+            D /= A;
+            E /= A;
+            F /= A;
+            auto eval_polynomial_deriv = [&] (double t) {
+                return 5*t*t*t*t+
+                       4*B*t*t*t+
+                       3*C*t*t+
+                       2*D*t+
+                       E;
+            };
+            // auto p = eval(t);
+            auto d_p = d_closest_pt;
+            // (tt*tt*tt)*p0 + (3*tt*tt*t)*p1 + (3*tt*t*t)*p2 + (t*t*t)*p3
+            auto tt = 1 - t;
+            auto d_tt = 3 * tt * tt * dot(d_p, p0) +
+                        6 * tt * t * dot(d_p, p1) +
+                        3 * t * t * dot(d_p, p2);
+            auto d_t = -d_tt +
+                       3 * tt * tt * dot(d_p, p1) +
+                       6 * tt * t * dot(d_p, p2) +
+                       3 * t * t * dot(d_p, p3);
+            d_p0 += d_p * (tt * tt * tt);
+            d_p1 += d_p * (3 * tt * tt * t);
+            d_p2 += d_p * (3 * tt * t * t);
+            d_p3 += d_p * (t * t * t);
+            // implicit function theorem: dt/dA = -1/(p'(t)) * dp/dA
+            auto poly_deriv_t = eval_polynomial_deriv(t);
+            if (fabs(poly_deriv_t) > 1e-10f) {
+                auto d_B = -(d_t / poly_deriv_t) * t * t * t * t;
+                auto d_C = -(d_t / poly_deriv_t) * t * t * t;
+                auto d_D = -(d_t / poly_deriv_t) * t * t;
+                auto d_E = -(d_t / poly_deriv_t) * t;
+                auto d_F = -(d_t / poly_deriv_t);
+                // B = B' / A
+                // C = C' / A
+                // D = D' / A
+                // E = E' / A
+                // F = F' / A
+                auto d_A = -d_B * B / A
+                           -d_C * C / A
+                           -d_D * D / A
+                           -d_E * E / A
+                           -d_F * F / A;
+                d_B /= A;
+                d_C /= A;
+                d_D /= A;
+                d_E /= A;
+                d_F /= A;
+                // A = 3*sum((-p0+3*p1-3*p2+p3)*(-p0+3*p1-3*p2+p3))
+                d_p0 += d_A * 3 * (-1) * 2 * (-p0+3*p1-3*p2+p3);
+                d_p1 += d_A * 3 *   3  * 2 * (-p0+3*p1-3*p2+p3);
+                d_p2 += d_A * 3 * (-3) * 2 * (-p0+3*p1-3*p2+p3);
+                d_p3 += d_A * 3 *   1  * 2 * (-p0+3*p1-3*p2+p3);
+                // B = 5*sum((-p0+3*p1-3*p2+p3)*(3*p0-6*p1+3*p2))
+                d_p0 += d_B * 5 * ((-1) * (3*p0-6*p1+3*p2) + 3 * (-p0+3*p1-3*p2+p3));
+                d_p1 += d_B * 5 * (3 * (3*p0-6*p1+3*p2) + (-6) * (-p0+3*p1-3*p2+p3));
+                d_p2 += d_B * 5 * ((-3) * (3*p0-6*p1+3*p2) + 3 * (-p0+3*p1-3*p2+p3));
+                d_p3 += d_B * 5 * (3*p0-6*p1+3*p2);
+                // C = 4*sum((-p0+3*p1-3*p2+p3)*(-3*p0+3*p1)) + 2*sum((3*p0-6*p1+3*p2)*(3*p0-6*p1+3*p2))
+                d_p0 += d_C * 4 * ((-1) * (-3*p0+3*p1) + (-3) * (-p0+3*p1-3*p2+p3)) +
+                        d_C * 2 * (3 * 2 * (3*p0-6*p1+3*p2));
+                d_p1 += d_C * 4 * (3 * (-3*p0+3*p1) + 3 * (-p0+3*p1-3*p2+p3)) +
+                        d_C * 2 * ((-6) * 2 * (3*p0-6*p1+3*p2));
+                d_p2 += d_C * 4 * ((-3) * (-3*p0+3*p1)) +
+                        d_C * 2 * (3 * 2 * (3*p0-6*p1+3*p2));
+                d_p3 += d_C * 4 * (-3*p0+3*p1);
+                // D = 3*(sum((3*p0-6*p1+3*p2)*(-3*p0+3*p1)) + sum((-p0+3*p1-3*p2+p3)*(p0-pt)))
+                d_p0 += d_D * 3 * (3 * (-3*p0+3*p1) + (-3) * (3*p0-6*p1+3*p2)) +
+                        d_D * 3 * ((-1) * (p0-pt) + 1 * (-p0+3*p1-3*p2+p3));
+                d_p1 += d_D * 3 * ((-6) * (-3*p0+3*p1) + (3) * (3*p0-6*p1+3*p2)) +
+                        d_D * 3 * (3 * (p0-pt));
+                d_p2 += d_D * 3 * (3 * (-3*p0+3*p1)) +
+                        d_D * 3 * ((-3) * (p0-pt));
+                // p3 enters D through (-p0+3*p1-3*p2+p3) with coefficient 1.
+                d_p3 += d_D * 3 * (p0-pt);
+                d_pt += d_D * 3 * ((-1) * (-p0+3*p1-3*p2+p3));
+                // E = sum((-3*p0+3*p1)*(-3*p0+3*p1)) + 2*sum((p0-pt)*(3*p0-6*p1+3*p2))
+                d_p0 += d_E * ((-3) * 2 * (-3*p0+3*p1)) +
+                        d_E * 2 * (1 * (3*p0-6*p1+3*p2) + 3 * (p0-pt));
+                d_p1 += d_E * (  3  * 2 * (-3*p0+3*p1)) +
+                        d_E * 2 * ((-6) * (p0-pt));
+                d_p2 += d_E * 2 * (  3  * (p0-pt));
+                d_pt += d_E * 2 * ((-1) * (3*p0-6*p1+3*p2));
+                // F = sum((p0-pt)*(-3*p0+3*p1))
+                d_p0 += d_F * (1 * (-3*p0+3*p1)) +
+                        d_F * ((-3) * (p0-pt));
+                d_p1 += d_F * (3 * (p0-pt));
+                d_pt += d_F * ((-1) * (-3*p0+3*p1));
+            }
+        }
+        atomic_add(d_path.points + 2 * i0, d_p0);
+        atomic_add(d_path.points + 2 * i1, d_p1);
+        atomic_add(d_path.points + 2 * i2, d_p2);
+        atomic_add(d_path.points + 2 * i3, d_p3);
+    } else {
+        assert(false);
+    }
+}
+// Backward pass of the rectangle closest-point query. Re-determines which of
+// the four edges is nearest to pt, back-propagates d_closest_pt through the
+// projection onto that edge, and accumulates the corner gradients into
+// d_rect.p_min / d_rect.p_max with atomic_add. d_pt receives the gradient
+// that flows through the projection parameter.
+DEVICE
+inline
+void d_closest_point(const Rect &rect,
+                     const Vector2f &pt,
+                     const Vector2f &d_closest_pt,
+                     Rect &d_rect,
+                     Vector2f &d_pt) {
+    // Distance from pt to the segment [a, b].
+    auto segment_distance = [&](const Vector2f &a, const Vector2f &b) -> float {
+        // project pt to line
+        auto s = dot(pt - a, b - a) / dot(b - a, b - a);
+        if (s < 0) {
+            return distance(a, pt);
+        }
+        if (s > 1) {
+            return distance(b, pt);
+        }
+        return distance(a + s * (b - a), pt);
+    };
+    auto corner_lt = rect.p_min;
+    auto corner_rt = Vector2f{rect.p_max.x, rect.p_min.y};
+    auto corner_lb = Vector2f{rect.p_min.x, rect.p_max.y};
+    auto corner_rb = rect.p_max;
+    // Edge ids: 0 = left, 1 = top, 2 = right, 3 = bottom.
+    const float edge_dist[4] = {
+        segment_distance(corner_lt, corner_lb),
+        segment_distance(corner_lt, corner_rt),
+        segment_distance(corner_rt, corner_rb),
+        segment_distance(corner_lb, corner_rb)
+    };
+    // Strict comparison: on ties the edge with the lowest id wins.
+    int nearest_edge = 0;
+    for (int e = 1; e < 4; e++) {
+        if (edge_dist[e] < edge_dist[nearest_edge]) {
+            nearest_edge = e;
+        }
+    }
+    // Back-propagates d_out through the closest point on segment [a, b].
+    auto backprop_segment = [&](const Vector2f &a, const Vector2f &b,
+                                const Vector2f &d_out,
+                                Vector2f &d_a, Vector2f &d_b) {
+        // project pt to line
+        auto s = dot(pt - a, b - a) / dot(b - a, b - a);
+        if (s < 0) {
+            // closest point is the endpoint a
+            d_a += d_out;
+            return;
+        }
+        if (s > 1) {
+            // closest point is the endpoint b
+            d_b += d_out;
+            return;
+        }
+        // closest point is a + s * (b - a)
+        d_a += d_out * (1 - s);
+        d_b += d_out * s;
+        auto d_s = sum(d_out * (b - a));
+        // s = numerator / denominator
+        auto d_num = d_s / dot(b - a, b - a);
+        auto d_den = d_s * (-s) / dot(b - a, b - a);
+        // numerator = dot(pt - a, b - a)
+        d_pt += (b - a) * d_num;
+        d_b += (pt - a) * d_num;
+        d_a += ((a - b) + (a - pt)) * d_num;
+        // denominator = dot(b - a, b - a)
+        d_b += 2 * (b - a) * d_den;
+        d_a += 2 * (a - b) * d_den;
+    };
+    auto d_lt = Vector2f{0, 0};
+    auto d_rt = Vector2f{0, 0};
+    auto d_lb = Vector2f{0, 0};
+    auto d_rb = Vector2f{0, 0};
+    switch (nearest_edge) {
+        case 0:
+            backprop_segment(corner_lt, corner_lb, d_closest_pt, d_lt, d_lb);
+            break;
+        case 1:
+            backprop_segment(corner_lt, corner_rt, d_closest_pt, d_lt, d_rt);
+            break;
+        case 2:
+            backprop_segment(corner_rt, corner_rb, d_closest_pt, d_rt, d_rb);
+            break;
+        default:
+            assert(nearest_edge == 3);
+            backprop_segment(corner_lb, corner_rb, d_closest_pt, d_lb, d_rb);
+            break;
+    }
+    // Map the corner gradients back onto p_min / p_max:
+    //   left_top     = p_min
+    //   right_top    = (p_max.x, p_min.y)
+    //   left_bottom  = (p_min.x, p_max.y)
+    //   right_bottom = p_max
+    auto grad_min = Vector2f{0, 0};
+    auto grad_max = Vector2f{0, 0};
+    grad_min += d_lt;
+    grad_max.x += d_rt.x;
+    grad_min.y += d_rt.y;
+    grad_min.x += d_lb.x;
+    grad_max.y += d_lb.y;
+    grad_max += d_rb;
+    atomic_add(d_rect.p_min, grad_min);
+    atomic_add(d_rect.p_max, grad_max);
+}
+// Shape-type dispatcher for the closest-point backward pass: casts shape.ptr
+// and d_shape.ptr to the concrete type selected by shape.type and forwards to
+// the matching overload. path_info is only used by the Path overload.
+DEVICE
+inline
+void d_closest_point(const Shape &shape,
+                     const Vector2f &pt,
+                     const Vector2f &d_closest_pt,
+                     const ClosestPointPathInfo &path_info,
+                     Shape &d_shape,
+                     Vector2f &d_pt) {
+    if (shape.type == ShapeType::Circle) {
+        const auto &circle = *(const Circle *)shape.ptr;
+        auto &d_circle = *(Circle *)d_shape.ptr;
+        d_closest_point(circle, pt, d_closest_pt, d_circle, d_pt);
+    } else if (shape.type == ShapeType::Path) {
+        const auto &path = *(const Path *)shape.ptr;
+        auto &d_path = *(Path *)d_shape.ptr;
+        d_closest_point(path, pt, d_closest_pt, path_info, d_path, d_pt);
+    } else if (shape.type == ShapeType::Rect) {
+        const auto &rect = *(const Rect *)shape.ptr;
+        auto &d_rect = *(Rect *)d_shape.ptr;
+        d_closest_point(rect, pt, d_closest_pt, d_rect, d_pt);
+    } else if (shape.type == ShapeType::Ellipse) {
+        // Not implemented, see
+        // https://www.geometrictools.com/Documentation/DistancePointEllipseEllipsoid.pdf
+        assert(false);
+    }
+}
+// Backward pass of the distance between pt and closest_pt (both in canvas
+// space) for one shape. Propagates d_dist to the shape parameters (d_shape),
+// to the shape-to-canvas transform (d_shape_to_canvas) and, when
+// d_translation is not null, to the translation as the negated gradient of pt.
+// Does nothing when the two points (almost) coincide.
+DEVICE
+inline
+void d_compute_distance(const Matrix3x3f &canvas_to_shape,
+                        const Matrix3x3f &shape_to_canvas,
+                        const Shape &shape,
+                        const Vector2f &pt,
+                        const Vector2f &closest_pt,
+                        const ClosestPointPathInfo &path_info,
+                        float d_dist,
+                        Matrix3x3f &d_shape_to_canvas,
+                        Shape &d_shape,
+                        float *d_translation) {
+    // The derivative at distance=0 is undefined
+    if (distance_squared(pt, closest_pt) < 1e-10f) {
+        return;
+    }
+    assert(isfinite(d_dist));
+    // Both points are given in canvas space; bring them into the shape's
+    // local space.
+    auto pt_local = xform_pt(canvas_to_shape, pt);
+    auto closest_local = xform_pt(canvas_to_shape, closest_pt);
+    // Forward pass being differentiated:
+    //   local_closest_pt = closest_point(shape, local_pt)
+    //   closest_pt = xform_pt(shape_to_canvas, local_closest_pt)
+    //   dist = distance(closest_pt, pt)
+    // Step 1: dist = distance(closest_pt, pt)
+    auto grad_pt = Vector2f{0, 0};
+    auto grad_closest = Vector2f{0, 0};
+    d_distance(closest_pt, pt, d_dist, grad_closest, grad_pt);
+    assert(isfinite(grad_pt));
+    assert(isfinite(grad_closest));
+    // Step 2: closest_pt = xform_pt(shape_to_canvas, local_closest_pt)
+    auto grad_closest_local = Vector2f{0, 0};
+    auto grad_s2c = Matrix3x3f();
+    d_xform_pt(shape_to_canvas, closest_local, grad_closest,
+               grad_s2c, grad_closest_local);
+    assert(isfinite(grad_closest_local));
+    // Step 3: local_closest_pt = closest_point(shape, local_pt)
+    auto grad_pt_local = Vector2f{0, 0};
+    d_closest_point(shape, pt_local, grad_closest_local, path_info, d_shape, grad_pt_local);
+    assert(isfinite(grad_pt_local));
+    // Step 4: local_pt = xform_pt(canvas_to_shape, pt)
+    auto grad_c2s = Matrix3x3f();
+    d_xform_pt(canvas_to_shape,
+               pt,
+               grad_pt_local,
+               grad_c2s,
+               grad_pt);
+    // Fold the canvas_to_shape gradient into the shape_to_canvas gradient, see
+    // http://jack.valmadre.net/notes/2016/09/04/back-prop-differentials/#back-propagation-using-differentials
+    auto c2s_transposed = transpose(canvas_to_shape);
+    grad_s2c += -c2s_transposed * grad_c2s * c2s_transposed;
+    atomic_add(&d_shape_to_canvas(0, 0), grad_s2c);
+    if (d_translation != nullptr) {
+        atomic_add(d_translation, -grad_pt);
+    }
+}

DiffVG/cuda_utils.h ADDED Viewed

	@@ -0,0 +1,53 @@

+#pragma once
+#ifdef __CUDACC__
+    #include <cuda.h>
+    #include <cuda_runtime.h>
+#endif
+#include <cstdio>
+#include <cassert>
+#include <limits>
+#ifdef __CUDACC__
+// Checks the result of a CUDA runtime call. On failure prints the error
+// string together with the file and line of the call site and terminates the
+// process with exit code 1.
+// The argument is evaluated exactly once: its result is stored in a local, so
+// a failing call is not executed a second time when the message is built.
+#define checkCuda(x) do { \
+    cudaError_t checkCuda_err_ = (x); \
+    if(checkCuda_err_!=cudaSuccess) { \
+    printf("CUDA Runtime Error: %s at %s:%d\n",\
+    cudaGetErrorString(checkCuda_err_),__FILE__,__LINE__);\
+    exit(1);}} while(0)
+#endif
+// Generic positive infinity for type T.
+// Outside of CUDA device compilation this is std::numeric_limits<T>::infinity().
+// Under __CUDA_ARCH__ the IEEE-754 double infinity bit pattern is
+// reinterpreted as a double and converted to T by the return statement.
+template <typename T>
+DEVICE
+inline T infinity() {
+#ifndef __CUDA_ARCH__
+    return std::numeric_limits<T>::infinity();
+#else
+    const unsigned long long double_inf_bits = 0x7ff0000000000000;
+    return __longlong_as_double(double_inf_bits);
+#endif
+}
+// Specialization for double: std::numeric_limits on the host path, the
+// IEEE-754 double infinity bit pattern under __CUDA_ARCH__.
+template <>
+DEVICE
+inline double infinity() {
+#ifndef __CUDA_ARCH__
+    return std::numeric_limits<double>::infinity();
+#else
+    return __longlong_as_double(0x7ff0000000000000ULL);
+#endif
+}
+// Specialization for float: std::numeric_limits on the host path, the
+// IEEE-754 single-precision infinity bit pattern under __CUDA_ARCH__.
+template <>
+DEVICE
+inline float infinity() {
+#ifndef __CUDA_ARCH__
+    return std::numeric_limits<float>::infinity();
+#else
+    return __int_as_float(0x7f800000);
+#endif
+}
+// Calls cudaDeviceSynchronize() through checkCuda when compiled with a CUDA
+// compiler (__CUDACC__ defined); compiles to an empty function otherwise.
+inline void cuda_synchronize() {
+#if defined(__CUDACC__)
+    checkCuda(cudaDeviceSynchronize());
+#endif
+}

DiffVG/diffvg.cpp ADDED Viewed

	@@ -0,0 +1,1792 @@

+#include "diffvg.h"
+#include "aabb.h"
+#include "shape.h"
+#include "sample_boundary.h"
+#include "atomic.h"
+#include "cdf.h"
+#include "compute_distance.h"
+#include "cuda_utils.h"
+#include "edge_query.h"
+#include "filter.h"
+#include "matrix.h"
+#include "parallel.h"
+#include "pcg.h"
+#include "ptr.h"
+#include "scene.h"
+#include "vector.h"
+#include "winding_number.h"
+#include "within_distance.h"
+#include <cassert>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <thrust/execution_policy.h>
+#include <thrust/sort.h>
+namespace py = pybind11;
+struct Command { // Triple of ids naming a shape group, a shape and a path point. NOTE(review): usage is outside this view - confirm
+    int shape_group_id; // presumably an index into the scene's shape groups - confirm
+    int shape_id; // presumably an index into the scene's shapes - confirm
+    int point_id; // Only used by path
+};
+// Tests whether the canvas-space point `pt` is covered by the fill of shape
+// group `shape_group_id`.
+// The point is moved into the group's local space, the group's BVH is walked
+// with an explicit stack, and the winding numbers of all leaf shapes whose
+// boxes contain the point are summed. The sum is interpreted with the
+// even-odd rule or the non-zero rule depending on the group's setting.
+// When `edge_query` is non-null and names this group and one of the visited
+// shapes, its `hit` flag is set if that single shape covers the point.
+DEVICE
+bool is_inside(const SceneData &scene_data,
+               int shape_group_id,
+               const Vector2f &pt,
+               EdgeQuery *edge_query) {
+    const ShapeGroup &group = scene_data.shape_groups[shape_group_id];
+    const auto &nodes = scene_data.shape_groups_bvh_nodes[shape_group_id];
+    // The root of the group's BVH is stored at index 2 * num_shapes - 2.
+    const auto root_index = 2 * group.num_shapes - 2;
+    // pt arrives in canvas space; all tests below run in shape-local space.
+    auto local_pt = xform_pt(group.canvas_to_shape, pt);
+    if (!inside(nodes[root_index].box, local_pt)) {
+        return false;
+    }
+    // Explicit traversal stack holding node indices still to be visited.
+    constexpr auto max_bvh_stack_size = 64;
+    int pending[max_bvh_stack_size];
+    int num_pending = 0;
+    pending[num_pending++] = root_index;
+    int total_winding = 0;
+    while (num_pending > 0) {
+        const BVHNode &node = nodes[pending[--num_pending]];
+        if (node.child1 >= 0) {
+            // Interior node: descend into every child whose box holds the point.
+            assert(node.child0 >= 0 && node.child1 >= 0);
+            if (inside(nodes[node.child0].box, local_pt)) {
+                pending[num_pending++] = node.child0;
+            }
+            if (inside(nodes[node.child1].box, local_pt)) {
+                pending[num_pending++] = node.child1;
+            }
+            assert(num_pending <= max_bvh_stack_size);
+            continue;
+        }
+        // Leaf node: child0 is the shape id.
+        const auto shape_id = node.child0;
+        const auto w = compute_winding_number(
+            scene_data.shapes[shape_id], scene_data.path_bvhs[shape_id], local_pt);
+        total_winding += w;
+        if (edge_query == nullptr) {
+            continue;
+        }
+        const bool is_queried_shape =
+            edge_query->shape_group_id == shape_group_id &&
+            edge_query->shape_id == shape_id;
+        if (!is_queried_shape) {
+            continue;
+        }
+        // Apply the group's fill rule to this shape's own winding number.
+        const bool covered = group.use_even_odd_rule ? (abs(w) % 2 == 1)
+                                                     : (w != 0);
+        if (covered) {
+            edge_query->hit = true;
+        }
+    }
+    return group.use_even_odd_rule ? (abs(total_winding) % 2 == 1)
+                                   : (total_winding != 0);
+}
+// Accumulates the gradient contribution of one boundary sample into d_shape
+// and d_shape_to_canvas using atomic_add.
+//   shape / d_shape   : the shape the sample lies on and its gradient buffer;
+//                       d_shape.ptr is cast to the struct selected by shape.type.
+//   contrib           : scalar weight of this sample (asserted finite).
+//   t                 : curve parameter, read by the Ellipse case only; the
+//                       path branches shadow it with boundary_data.path.t.
+//   normal            : boundary normal at the sample (asserted finite).
+//   boundary_data     : is_stroke flag plus path.{base_point_id, point_id, t}.
+//   shape_to_canvas, local_boundary_pt : inputs forwarded to d_xform_pt.
+//   d_shape_to_canvas : receives the matrix gradient produced by d_xform_pt.
+DEVICE void accumulate_boundary_gradient(const Shape &shape,
+                                         float contrib,
+                                         float t,
+                                         const Vector2f &normal,
+                                         const BoundaryData &boundary_data,
+                                         Shape &d_shape,
+                                         const Matrix3x3f &shape_to_canvas,
+                                         const Vector2f &local_boundary_pt,
+                                         Matrix3x3f &d_shape_to_canvas) {
+    assert(isfinite(contrib));
+    assert(isfinite(normal));
+    // According to the Reynolds transport theorem,
+    // the Jacobian of the boundary integral is dot(velocity, normal),
+    // where the velocity depends on the variable being differentiated with.
+    if (boundary_data.is_stroke) {
+        // Per-point thickness exists only for paths with a non-null thickness array.
+        auto has_path_thickness = false;
+        if (shape.type == ShapeType::Path) {
+            const Path &path = *(const Path *)shape.ptr;
+            has_path_thickness = path.thickness != nullptr;
+        }
+        // differentiate stroke width: velocity is the same as normal
+        if (has_path_thickness) {
+            Path *d_p = (Path*)d_shape.ptr;
+            auto base_point_id = boundary_data.path.base_point_id;
+            auto point_id = boundary_data.path.point_id;
+            // Shadows the function parameter t with the path-local parameter.
+            auto t = boundary_data.path.t;
+            const Path &path = *(const Path *)shape.ptr;
+            if (path.num_control_points[base_point_id] == 0) {
+                // Straight line
+                auto i0 = point_id;
+                auto i1 = (point_id + 1) % path.num_points;
+                // r = r0 + t * (r1 - r0)
+                atomic_add(&d_p->thickness[i0], (1 - t) * contrib);
+                atomic_add(&d_p->thickness[i1], (    t) * contrib);
+            } else if (path.num_control_points[base_point_id] == 1) {
+                // Quadratic Bezier curve
+                auto i0 = point_id;
+                auto i1 = point_id + 1;
+                auto i2 = (point_id + 2) % path.num_points;
+                // r = (1-t)^2r0 + 2(1-t)t r1 + t^2 r2
+                atomic_add(&d_p->thickness[i0], square(1 - t) * contrib);
+                atomic_add(&d_p->thickness[i1], (2*(1-t)*t) * contrib);
+                atomic_add(&d_p->thickness[i2], (t*t) * contrib);
+            } else if (path.num_control_points[base_point_id] == 2) {
+                // Cubic Bezier curve
+                auto i0 = point_id;
+                auto i1 = point_id + 1;
+                auto i2 = point_id + 2;
+                auto i3 = (point_id + 3) % path.num_points;
+                // r = (1-t)^3r0 + 3*(1-t)^2tr1 + 3*(1-t)t^2r2 + t^3r3
+                atomic_add(&d_p->thickness[i0], cubic(1 - t) * contrib);
+                atomic_add(&d_p->thickness[i1], 3 * square(1 - t) * t * contrib);
+                atomic_add(&d_p->thickness[i2], 3 * (1 - t) * t * t * contrib);
+                atomic_add(&d_p->thickness[i3], t * t * t * contrib);
+            } else {
+                assert(false);
+            }
+        } else {
+            // No per-point thickness: the single stroke_width receives the contribution.
+            atomic_add(&d_shape.stroke_width, contrib);
+        }
+    }
+    // Gradient with respect to the shape's own geometric parameters.
+    switch (shape.type) {
+        case ShapeType::Circle: {
+            Circle *d_p = (Circle*)d_shape.ptr;
+            // velocity for the center is (1, 0) for x and (0, 1) for y
+            atomic_add(&d_p->center[0], normal * contrib);
+            // velocity for the radius is the same as the normal
+            atomic_add(&d_p->radius, contrib);
+            break;
+        } case ShapeType::Ellipse: {
+            Ellipse *d_p = (Ellipse*)d_shape.ptr;
+            // velocity for the center is (1, 0) for x and (0, 1) for y
+            atomic_add(&d_p->center[0], normal * contrib);
+            // velocity for the radius:
+            // x = center.x + r.x * cos(2pi * t)
+            // y = center.y + r.y * sin(2pi * t)
+            // for r.x: (cos(2pi * t), 0)
+            // for r.y: (0, sin(2pi * t))
+            atomic_add(&d_p->radius.x, cos(2 * float(M_PI) * t) * normal.x * contrib);
+            atomic_add(&d_p->radius.y, sin(2 * float(M_PI) * t) * normal.y * contrib);
+            break;
+        } case ShapeType::Path: {
+            Path *d_p = (Path*)d_shape.ptr;
+            auto base_point_id = boundary_data.path.base_point_id;
+            auto point_id = boundary_data.path.point_id;
+            // Shadows the function parameter t with the path-local parameter.
+            auto t = boundary_data.path.t;
+            const Path &path = *(const Path *)shape.ptr;
+            if (path.num_control_points[base_point_id] == 0) {
+                // Straight line
+                auto i0 = point_id;
+                auto i1 = (point_id + 1) % path.num_points;
+                // pt = p0 + t * (p1 - p0)
+                // velocity for p0.x: (1 - t,     0)
+                //              p0.y: (    0, 1 - t)
+                //              p1.x: (    t,     0)
+                //              p1.y: (    0,     t)
+                atomic_add(&d_p->points[2 * i0 + 0], (1 - t) * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i0 + 1], (1 - t) * normal.y * contrib);
+                atomic_add(&d_p->points[2 * i1 + 0], (    t) * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i1 + 1], (    t) * normal.y * contrib);
+            } else if (path.num_control_points[base_point_id] == 1) {
+                // Quadratic Bezier curve
+                auto i0 = point_id;
+                auto i1 = point_id + 1;
+                auto i2 = (point_id + 2) % path.num_points;
+                // pt = (1-t)^2p0 + 2(1-t)t p1 + t^2 p2
+                // velocity for p0.x: ((1-t)^2,       0)
+                //              p0.y: (      0, (1-t)^2)
+                //              p1.x: (2(1-t)t,       0)
+                //              p1.y: (      0, 2(1-t)t)
+                //              p2.x: (    t^2,       0)
+                //              p2.y: (      0,     t^2)
+                atomic_add(&d_p->points[2 * i0 + 0], square(1 - t) * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i0 + 1], square(1 - t) * normal.y * contrib);
+                atomic_add(&d_p->points[2 * i1 + 0], (2*(1-t)*t) * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i1 + 1], (2*(1-t)*t) * normal.y * contrib);
+                atomic_add(&d_p->points[2 * i2 + 0], (t*t) * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i2 + 1], (t*t) * normal.y * contrib);
+            } else if (path.num_control_points[base_point_id] == 2) {
+                // Cubic Bezier curve
+                auto i0 = point_id;
+                auto i1 = point_id + 1;
+                auto i2 = point_id + 2;
+                auto i3 = (point_id + 3) % path.num_points;
+                // pt = (1-t)^3p0 + 3*(1-t)^2tp1 + 3*(1-t)t^2p2 + t^3p3
+                // velocity for p0.x: (   (1-t)^3,          0)
+                //              p0.y: (         0,    (1-t)^3)
+                //              p1.x: (3*(1-t)^2t,          0)
+                //              p1.y: (         0, 3*(1-t)^2t)
+                //              p2.x: (3*(1-t)t^2,          0)
+                //              p2.y: (         0, 3*(1-t)t^2)
+                //              p3.x: (       t^3,          0)
+                //              p3.y: (         0,        t^3)
+                atomic_add(&d_p->points[2 * i0 + 0], cubic(1 - t) * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i0 + 1], cubic(1 - t) * normal.y * contrib);
+                atomic_add(&d_p->points[2 * i1 + 0], 3 * square(1 - t) * t * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i1 + 1], 3 * square(1 - t) * t * normal.y * contrib);
+                atomic_add(&d_p->points[2 * i2 + 0], 3 * (1 - t) * t * t * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i2 + 1], 3 * (1 - t) * t * t * normal.y * contrib);
+                atomic_add(&d_p->points[2 * i3 + 0], t * t * t * normal.x * contrib);
+                atomic_add(&d_p->points[2 * i3 + 1], t * t * t * normal.y * contrib);
+            } else {
+                assert(false);
+            }
+            break;
+        } case ShapeType::Rect: {
+            Rect *d_p = (Rect*)d_shape.ptr;
+            // The velocity depends on the position of the boundary
+            // The side is identified by exact comparison of the normal with the
+            // four axis-aligned unit vectors; any other normal trips the assert.
+            if (normal == Vector2f{-1, 0}) {
+                // left
+                // velocity for p_min is (1, 0) for x and (0, 0) for y
+                atomic_add(&d_p->p_min.x, -contrib);
+            } else if (normal == Vector2f{1, 0}) {
+                // right
+                // velocity for p_max is (1, 0) for x and (0, 0) for y
+                atomic_add(&d_p->p_max.x, contrib);
+            } else if (normal == Vector2f{0, -1}) {
+                // top
+                // velocity for p_min is (0, 0) for x and (0, 1) for y
+                atomic_add(&d_p->p_min.y, -contrib);
+            } else if (normal == Vector2f{0, 1}) {
+                // bottom
+                // velocity for p_max is (0, 0) for x and (0, 1) for y
+                atomic_add(&d_p->p_max.y, contrib);
+            } else {
+                // incorrect normal assignment?
+                assert(false);
+            }
+            break;
+        } default: {
+            assert(false);
+            break;
+        }
+    }
+    // for shape_to_canvas we have the following relationship:
+    // boundary_pt = xform_pt(shape_to_canvas, local_pt)
+    // the velocity is the derivative of boundary_pt with respect to shape_to_canvas
+    // we can use reverse-mode AD to compute the dot product of the velocity and the Jacobian
+    // by passing the normal in d_xform_pt
+    auto d_shape_to_canvas_ = Matrix3x3f();
+    // The gradient with respect to the local point is computed by d_xform_pt
+    // but not used afterwards in this function.
+    auto d_local_boundary_pt = Vector2f{0, 0};
+    d_xform_pt(shape_to_canvas,
+               local_boundary_pt,
+               normal * contrib,
+               d_shape_to_canvas_,
+               d_local_boundary_pt);
+    atomic_add(&d_shape_to_canvas(0, 0), d_shape_to_canvas_);
+}
+// Evaluates a color description at the point `pt`.
+//   color_type : selects how `color` is interpreted.
+//   color      : pointer to a Constant, LinearGradient or RadialGradient.
+//   pt         : sample position.
+// Constant colors are returned directly. Gradients compute a scalar t
+// (projection onto the begin/end axis for linear, length of the
+// radius-normalized offset from the center for radial) and interpolate
+// linearly between the two stops that bracket t; values of t outside every
+// stop interval yield the first or the last stop color.
+// Returns a default-constructed Vector4f for an unknown color type.
+DEVICE
+Vector4f sample_color(const ColorType &color_type,
+                      void *color,
+                      const Vector2f &pt) {
+    switch (color_type) {
+        case ColorType::Constant: {
+            auto constant = (const Constant*)color;
+            assert(isfinite(constant->color));
+            return constant->color;
+        }
+        case ColorType::LinearGradient: {
+            auto grad = (const LinearGradient*)color;
+            // Project pt onto the segment (grad->begin, grad->end)
+            auto beg = grad->begin;
+            auto end = grad->end;
+            auto t = dot(pt - beg, end - beg) / max(dot(end - beg, end - beg), 1e-3f);
+            // Before the first stop: clamp to the first stop color.
+            if (t < grad->stop_offsets[0]) {
+                const auto *first = &grad->stop_colors[0];
+                return Vector4f{first[0], first[1], first[2], first[3]};
+            }
+            const auto last_stop = grad->num_stops - 1;
+            // Find the pair of stops whose offsets bracket t.
+            for (int i = 0; i < last_stop; i++) {
+                auto lower = grad->stop_offsets[i];
+                auto upper = grad->stop_offsets[i + 1];
+                assert(upper > lower);
+                if (!(t >= lower && t < upper)) {
+                    continue;
+                }
+                const auto *curr = &grad->stop_colors[4 * i];
+                const auto *next = &grad->stop_colors[4 * (i + 1)];
+                auto color_curr = Vector4f{curr[0], curr[1], curr[2], curr[3]};
+                auto color_next = Vector4f{next[0], next[1], next[2], next[3]};
+                auto tt = (t - lower) / (upper - lower);
+                assert(isfinite(tt));
+                assert(isfinite(color_curr));
+                assert(isfinite(color_next));
+                return color_curr * (1 - tt) + color_next * tt;
+            }
+            // No interval matched: use the last stop color.
+            const auto *tail = &grad->stop_colors[4 * last_stop];
+            return Vector4f{tail[0], tail[1], tail[2], tail[3]};
+        }
+        case ColorType::RadialGradient: {
+            auto grad = (const RadialGradient*)color;
+            // Distance from pt to the center, measured in units of the radius
+            auto offset = pt - grad->center;
+            auto normalized_offset = offset / grad->radius;
+            auto t = length(normalized_offset);
+            // Before the first stop: clamp to the first stop color.
+            if (t < grad->stop_offsets[0]) {
+                const auto *first = &grad->stop_colors[0];
+                return Vector4f{first[0], first[1], first[2], first[3]};
+            }
+            const auto last_stop = grad->num_stops - 1;
+            // Find the pair of stops whose offsets bracket t.
+            for (int i = 0; i < last_stop; i++) {
+                auto lower = grad->stop_offsets[i];
+                auto upper = grad->stop_offsets[i + 1];
+                assert(upper > lower);
+                if (!(t >= lower && t < upper)) {
+                    continue;
+                }
+                const auto *curr = &grad->stop_colors[4 * i];
+                const auto *next = &grad->stop_colors[4 * (i + 1)];
+                auto color_curr = Vector4f{curr[0], curr[1], curr[2], curr[3]};
+                auto color_next = Vector4f{next[0], next[1], next[2], next[3]};
+                auto tt = (t - lower) / (upper - lower);
+                assert(isfinite(tt));
+                assert(isfinite(color_curr));
+                assert(isfinite(color_next));
+                return color_curr * (1 - tt) + color_next * tt;
+            }
+            // No interval matched: use the last stop color.
+            const auto *tail = &grad->stop_colors[4 * last_stop];
+            return Vector4f{tail[0], tail[1], tail[2], tail[3]};
+        }
+        default: {
+            assert(false);
+        }
+    }
+    return Vector4f{};
+}
+// Backward pass of sample_color(color_type, color, pt): distributes the
+// incoming color gradient `d_color` to the parameters of the color object.
+//   color_type    : selects how color_ptr / d_color_ptr are interpreted.
+//   color_ptr     : the color parameters used by the forward pass (read only).
+//   pt            : the sample position used by the forward pass.
+//   d_color       : gradient with respect to the sampled color.
+//   d_color_ptr   : gradient buffer of the same struct type as color_ptr;
+//                   every write goes through atomic_add.
+//   d_translation : optional; when non-null it receives d_beg + d_end (linear)
+//                   or d_center (radial).
+// Each branch mirrors the forward pass exactly: the gradient goes to the first
+// stop when t precedes it, to the two bracketing stops (plus offsets and
+// geometry) when t falls in an interval, and to the last stop otherwise.
+DEVICE
+void d_sample_color(const ColorType &color_type,
+                    void *color_ptr,
+                    const Vector2f &pt,
+                    const Vector4f &d_color,
+                    void *d_color_ptr,
+                    float *d_translation) {
+    switch (color_type) {
+        case ColorType::Constant: {
+            auto d_c = (Constant*)d_color_ptr;
+            atomic_add(&d_c->color[0], d_color);
+            return;
+        } case ColorType::LinearGradient: {
+            auto c = (const LinearGradient*)color_ptr;
+            auto d_c = (LinearGradient*)d_color_ptr;
+            // Project pt to (c->begin, c->end)
+            auto beg = c->begin;
+            auto end = c->end;
+            auto t = dot(pt - beg, end - beg) / max(dot(end - beg, end - beg), 1e-3f);
+            // Find the corresponding stop:
+            if (t < c->stop_offsets[0]) {
+                atomic_add(&d_c->stop_colors[0], d_color);
+                return;
+            }
+            for (int i = 0; i < c->num_stops - 1; i++) {
+                auto offset_curr = c->stop_offsets[i];
+                auto offset_next = c->stop_offsets[i + 1];
+                assert(offset_next > offset_curr);
+                if (t >= offset_curr && t < offset_next) {
+                    auto color_curr = Vector4f{
+                        c->stop_colors[4 * i + 0],
+                        c->stop_colors[4 * i + 1],
+                        c->stop_colors[4 * i + 2],
+                        c->stop_colors[4 * i + 3]};
+                    auto color_next = Vector4f{
+                        c->stop_colors[4 * (i + 1) + 0],
+                        c->stop_colors[4 * (i + 1) + 1],
+                        c->stop_colors[4 * (i + 1) + 2],
+                        c->stop_colors[4 * (i + 1) + 3]};
+                    auto tt = (t - offset_curr) / (offset_next - offset_curr);
+                    // forward: color_curr * (1 - tt) + color_next * tt
+                    auto d_color_curr = d_color * (1 - tt);
+                    auto d_color_next = d_color * tt;
+                    auto d_tt = sum(d_color * (color_next - color_curr));
+                    auto d_offset_next = -d_tt * tt / (offset_next - offset_curr);
+                    auto d_offset_curr = d_tt * ((tt - 1.f) / (offset_next - offset_curr));
+                    auto d_t = d_tt / (offset_next - offset_curr);
+                    assert(isfinite(d_tt));
+                    atomic_add(&d_c->stop_colors[4 * i], d_color_curr);
+                    atomic_add(&d_c->stop_colors[4 * (i + 1)], d_color_next);
+                    atomic_add(&d_c->stop_offsets[i], d_offset_curr);
+                    atomic_add(&d_c->stop_offsets[i + 1], d_offset_next);
+                    // forward:
+                    // l = max(dot(end - beg, end - beg), 1e-3f)
+                    // t = dot(pt - beg, end - beg) / l;
+                    auto l = max(dot(end - beg, end - beg), 1e-3f);
+                    auto d_beg = d_t * (-(pt - beg)-(end - beg)) / l;
+                    auto d_end = d_t * (pt - beg) / l;
+                    auto d_l = -d_t * t / l;
+                    // l depends on beg/end only when the max() is not clamped.
+                    if (dot(end - beg, end - beg) > 1e-3f) {
+                        d_beg += 2 * d_l * (beg - end);
+                        d_end += 2 * d_l * (end - beg);
+                    }
+                    atomic_add(&d_c->begin[0], d_beg);
+                    atomic_add(&d_c->end[0], d_end);
+                    if (d_translation != nullptr) {
+                        atomic_add(d_translation, (d_beg + d_end));
+                    }
+                    return;
+                }
+            }
+            atomic_add(&d_c->stop_colors[4 * (c->num_stops - 1)], d_color);
+            return;
+        } case ColorType::RadialGradient: {
+            auto c = (const RadialGradient*)color_ptr;
+            auto d_c = (RadialGradient*)d_color_ptr;
+            // Distance from pt to center
+            auto offset = pt - c->center;
+            auto normalized_offset = offset / c->radius;
+            auto t = length(normalized_offset);
+            // Find the corresponding stop:
+            if (t < c->stop_offsets[0]) {
+                atomic_add(&d_c->stop_colors[0], d_color);
+                return;
+            }
+            for (int i = 0; i < c->num_stops - 1; i++) {
+                auto offset_curr = c->stop_offsets[i];
+                auto offset_next = c->stop_offsets[i + 1];
+                assert(offset_next > offset_curr);
+                if (t >= offset_curr && t < offset_next) {
+                    auto color_curr = Vector4f{
+                        c->stop_colors[4 * i + 0],
+                        c->stop_colors[4 * i + 1],
+                        c->stop_colors[4 * i + 2],
+                        c->stop_colors[4 * i + 3]};
+                    auto color_next = Vector4f{
+                        c->stop_colors[4 * (i + 1) + 0],
+                        c->stop_colors[4 * (i + 1) + 1],
+                        c->stop_colors[4 * (i + 1) + 2],
+                        c->stop_colors[4 * (i + 1) + 3]};
+                    auto tt = (t - offset_curr) / (offset_next - offset_curr);
+                    assert(isfinite(tt));
+                    // forward: color_curr * (1 - tt) + color_next * tt
+                    auto d_color_curr = d_color * (1 - tt);
+                    auto d_color_next = d_color * tt;
+                    auto d_tt = sum(d_color * (color_next - color_curr));
+                    auto d_offset_next = -d_tt * tt / (offset_next - offset_curr);
+                    auto d_offset_curr = d_tt * ((tt - 1.f) / (offset_next - offset_curr));
+                    auto d_t = d_tt / (offset_next - offset_curr);
+                    assert(isfinite(d_t));
+                    atomic_add(&d_c->stop_colors[4 * i], d_color_curr);
+                    atomic_add(&d_c->stop_colors[4 * (i + 1)], d_color_next);
+                    atomic_add(&d_c->stop_offsets[i], d_offset_curr);
+                    atomic_add(&d_c->stop_offsets[i + 1], d_offset_next);
+                    // offset = pt - c->center
+                    // normalized_offset = offset / c->radius
+                    // t = length(normalized_offset)
+                    auto d_normalized_offset = d_length(normalized_offset, d_t);
+                    auto d_offset = d_normalized_offset / c->radius;
+                    auto d_radius = -d_normalized_offset * offset / (c->radius * c->radius);
+                    auto d_center = -d_offset;
+                    atomic_add(&d_c->center[0], d_center);
+                    atomic_add(&d_c->radius[0], d_radius);
+                    if (d_translation != nullptr) {
+                        atomic_add(d_translation, d_center);
+                    }
+                    // The forward pass returns from inside the matching
+                    // interval, so the backward pass must stop here as well;
+                    // falling through would additionally credit the whole
+                    // d_color to the last stop color.
+                    return;
+                }
+            }
+            atomic_add(&d_c->stop_colors[4 * (c->num_stops - 1)], d_color);
+            return;
+        } default: {
+            assert(false);
+        }
+    }
+}
+// One color layer hit while sampling the scene at a point.
+struct Fragment {
+    Vector3f color;  // rgb of the sampled stroke or fill color
+    float alpha;     // alpha of the sampled stroke or fill color
+    int group_id;    // shape group that produced the fragment; also the sort key when blending
+    bool is_stroke;  // true for a stroke hit, false for a fill hit
+};
+// Fragment variant that carries the same four leading fields as Fragment plus
+// closest-point information.
+// NOTE(review): the code that fills and consumes this struct is not visible in
+// this part of the file; the exact meaning of the extra fields should be
+// confirmed there.
+struct PrefilterFragment {
+    Vector3f color;
+    float alpha;
+    int group_id;
+    bool is_stroke;
+    int shape_id;
+    float distance;
+    Vector2f closest_pt;
+    ClosestPointPathInfo path_info;
+    bool within_distance;
+};
+// Samples the blended color of the whole scene at screen_pt and, when d_color
+// is given, back-propagates d_color to the color parameters of every hit
+// fragment.
+//   scene              : scene data (BVH, shape groups, gradient buffers).
+//   background_color   : optional; used as the layer beneath all fragments and
+//                        returned unchanged when nothing is hit.
+//   screen_pt          : sample position; scaled by canvas_width/height below.
+//   d_color            : optional; non-null enables the backward pass.
+//   edge_query         : optional; its hit flag is cleared on entry, passed to
+//                        within_distance / is_inside, and updated while blending.
+//   d_background_color : optional gradient output for the background color.
+//   d_translation      : forwarded unchanged to d_sample_color.
+// Returns (r, g, b, alpha); rgb is divided by alpha when alpha > 1e-6,
+// or (0, 0, 0, 0) when nothing is hit and there is no background.
+DEVICE
+Vector4f sample_color(const SceneData &scene,
+                      const Vector4f *background_color,
+                      const Vector2f &screen_pt,
+                      const Vector4f *d_color = nullptr,
+                      EdgeQuery *edge_query = nullptr,
+                      Vector4f *d_background_color = nullptr,
+                      float *d_translation = nullptr) {
+    if (edge_query != nullptr) {
+        edge_query->hit = false;
+    }
+    // screen_pt is in screen space ([0, 1), [0, 1)),
+    // need to transform to canvas space
+    auto pt = screen_pt;
+    pt.x *= scene.canvas_width;
+    pt.y *= scene.canvas_height;
+    // Fixed-capacity buffers: collected fragments and the BVH traversal stack.
+    constexpr auto max_hit_shapes = 256;
+    constexpr auto max_bvh_stack_size = 64;
+    Fragment fragments[max_hit_shapes];
+    int bvh_stack[max_bvh_stack_size];
+    auto stack_size = 0;
+    auto num_fragments = 0;
+    // The root of the scene BVH is stored at index 2 * num_shape_groups - 2.
+    bvh_stack[stack_size++] = 2 * scene.num_shape_groups - 2;
+    while (stack_size > 0) {
+        const BVHNode &node = scene.bvh_nodes[bvh_stack[--stack_size]];
+        if (node.child1 < 0) {
+            // leaf
+            auto group_id = node.child0;
+            const ShapeGroup &shape_group = scene.shape_groups[group_id];
+            // A group may contribute up to two fragments: stroke first, then fill.
+            if (shape_group.stroke_color != nullptr) {
+                if (within_distance(scene, group_id, pt, edge_query)) {
+                    auto color_alpha = sample_color(shape_group.stroke_color_type,
+                                                    shape_group.stroke_color,
+                                                    pt);
+                    Fragment f;
+                    f.color = Vector3f{color_alpha[0], color_alpha[1], color_alpha[2]};
+                    f.alpha = color_alpha[3];
+                    f.group_id = group_id;
+                    f.is_stroke = true;
+                    assert(num_fragments < max_hit_shapes);
+                    fragments[num_fragments++] = f;
+                }
+            }
+            if (shape_group.fill_color != nullptr) {
+                if (is_inside(scene, group_id, pt, edge_query)) {
+                    auto color_alpha = sample_color(shape_group.fill_color_type,
+                                                    shape_group.fill_color,
+                                                    pt);
+                    Fragment f;
+                    f.color = Vector3f{color_alpha[0], color_alpha[1], color_alpha[2]};
+                    f.alpha = color_alpha[3];
+                    f.group_id = group_id;
+                    f.is_stroke = false;
+                    assert(num_fragments < max_hit_shapes);
+                    fragments[num_fragments++] = f;
+                }
+            }
+        } else {
+            assert(node.child0 >= 0 && node.child1 >= 0);
+            // Descend into each child whose box, tested together with the
+            // child's max_radius, contains pt.
+            const AABB &b0 = scene.bvh_nodes[node.child0].box;
+            if (inside(b0, pt, scene.bvh_nodes[node.child0].max_radius)) {
+                bvh_stack[stack_size++] = node.child0;
+            }
+            const AABB &b1 = scene.bvh_nodes[node.child1].box;
+            if (inside(b1, pt, scene.bvh_nodes[node.child1].max_radius)) {
+                bvh_stack[stack_size++] = node.child1;
+            }
+            // NOTE(review): this capacity check runs after the pushes above,
+            // so an overflow would already have written past bvh_stack.
+            assert(stack_size <= max_bvh_stack_size);
+        }
+    }
+    if (num_fragments <= 0) {
+        // Nothing hit: the result is the background, or transparent black.
+        if (background_color != nullptr) {
+            if (d_background_color != nullptr) {
+                // NOTE(review): d_color is dereferenced without a null check,
+                // and this path assigns while the path after the backward loop
+                // below accumulates with += — confirm callers expect both.
+                *d_background_color = *d_color;
+            }
+            return *background_color;
+        }
+        return Vector4f{0, 0, 0, 0};
+    }
+    // Sort the fragments from back to front (i.e. increasing order of group id)
+    // https://github.com/frigaut/yorick-imutil/blob/master/insort.c#L37
+    for (int i = 1; i < num_fragments; i++) {
+        auto j = i;
+        auto temp = fragments[j];
+        while (j > 0 && fragments[j - 1].group_id > temp.group_id) {
+            fragments[j] = fragments[j - 1];
+            j--;
+        }
+        fragments[j] = temp;
+    }
+    // Blend the color
+    // accum_color[i] / accum_alpha[i] hold the result after blending fragments
+    // 0..i; they are kept per fragment because the backward pass reads them.
+    Vector3f accum_color[max_hit_shapes];
+    float accum_alpha[max_hit_shapes];
+    // auto hit_opaque = false;
+    // Layer beneath the first fragment: the background if given, else zero.
+    auto first_alpha = 0.f;
+    auto first_color = Vector3f{0, 0, 0};
+    if (background_color != nullptr) {
+        first_alpha = background_color->w;
+        first_color = Vector3f{background_color->x,
+                               background_color->y,
+                               background_color->z};
+    }
+    for (int i = 0; i < num_fragments; i++) {
+        const Fragment &fragment = fragments[i];
+        auto new_color = fragment.color;
+        auto new_alpha = fragment.alpha;
+        auto prev_alpha = i > 0 ? accum_alpha[i - 1] : first_alpha;
+        auto prev_color = i > 0 ? accum_color[i - 1] : first_color;
+        if (edge_query != nullptr) {
+            // Do we hit the target shape?
+            if (new_alpha >= 1.f && edge_query->hit) {
+                // A fully opaque shape in front of the target occludes it
+                edge_query->hit = false;
+            }
+            if (edge_query->shape_group_id == fragment.group_id) {
+                edge_query->hit = true;
+            }
+        }
+        // prev_color is alpha premultiplied, don't need to multiply with
+        // prev_alpha
+        accum_color[i] = prev_color * (1 - new_alpha) + new_alpha * new_color;
+        accum_alpha[i] = prev_alpha * (1 - new_alpha) + new_alpha;
+    }
+    auto final_color = accum_color[num_fragments - 1];
+    auto final_alpha = accum_alpha[num_fragments - 1];
+    // Divide the accumulated color by alpha unless alpha is (nearly) zero.
+    if (final_alpha > 1e-6f) {
+        final_color /= final_alpha;
+    }
+    assert(isfinite(final_color));
+    assert(isfinite(final_alpha));
+    if (d_color != nullptr) {
+        // Backward pass
+        auto d_final_color = Vector3f{(*d_color)[0], (*d_color)[1], (*d_color)[2]};
+        auto d_final_alpha = (*d_color)[3];
+        auto d_curr_color = d_final_color;
+        auto d_curr_alpha = d_final_alpha;
+        if (final_alpha > 1e-6f) {
+            // final_color = curr_color / final_alpha
+            d_curr_color = d_final_color / final_alpha;
+            d_curr_alpha -= sum(d_final_color * final_color) / final_alpha;
+        }
+        assert(isfinite(*d_color));
+        assert(isfinite(d_curr_color));
+        assert(isfinite(d_curr_alpha));
+        // Walk the fragments front to back, undoing one blend step per iteration.
+        for (int i = num_fragments - 1; i >= 0; i--) {
+            // color[n] = prev_color * (1 - new_alpha) + new_alpha * new_color;
+            // alpha[n] = prev_alpha * (1 - new_alpha) + new_alpha;
+            auto prev_alpha = i > 0 ? accum_alpha[i - 1] : first_alpha;
+            auto prev_color = i > 0 ? accum_color[i - 1] : first_color;
+            auto d_prev_alpha = d_curr_alpha * (1.f - fragments[i].alpha);
+            auto d_alpha_i = d_curr_alpha * (1.f - prev_alpha);
+            d_alpha_i += sum(d_curr_color * (fragments[i].color - prev_color));
+            auto d_prev_color = d_curr_color * (1 - fragments[i].alpha);
+            auto d_color_i = d_curr_color * fragments[i].alpha;
+            auto group_id = fragments[i].group_id;
+            // Route the fragment's gradient to the stroke or fill color of its group.
+            if (fragments[i].is_stroke) {
+                d_sample_color(scene.shape_groups[group_id].stroke_color_type,
+                               scene.shape_groups[group_id].stroke_color,
+                               pt,
+                               Vector4f{d_color_i[0], d_color_i[1], d_color_i[2], d_alpha_i},
+                               scene.d_shape_groups[group_id].stroke_color,
+                               d_translation);
+            } else {
+                d_sample_color(scene.shape_groups[group_id].fill_color_type,
+                               scene.shape_groups[group_id].fill_color,
+                               pt,
+                               Vector4f{d_color_i[0], d_color_i[1], d_color_i[2], d_alpha_i},
+                               scene.d_shape_groups[group_id].fill_color,
+                               d_translation);
+            }
+            d_curr_color = d_prev_color;
+            d_curr_alpha = d_prev_alpha;
+        }
+        // Whatever gradient remains after the last fragment belongs to the background.
+        if (d_background_color != nullptr) {
+            d_background_color->x += d_curr_color.x;
+            d_background_color->y += d_curr_color.y;
+            d_background_color->z += d_curr_color.z;
+            d_background_color->w += d_curr_alpha;
+        }
+    }
+    return Vector4f{final_color[0], final_color[1], final_color[2], final_alpha};
+}
+// Signed distance from a screen-space point to the closest shape in the scene.
+//
+// Every shape group is queried with compute_distance() and the nearest hit is
+// kept. The distance is scaled by `weight` and negated when the closest group
+// has a fill color and is_inside() reports the point as inside it. Returns 0
+// when no group reports a distance.
+//
+// scene         - scene to query; gradients are written through
+//                 scene.d_shapes / scene.d_shape_groups.
+// screen_pt     - query point in screen space ([0, 1), [0, 1)).
+// weight        - scale applied to the returned distance.
+// d_dist        - optional gradient w.r.t. the returned value; when non-null
+//                 it is backpropagated through d_compute_distance().
+// d_translation - optional buffer forwarded to d_compute_distance().
+DEVICE
+float sample_distance(const SceneData &scene,
+                      const Vector2f &screen_pt,
+                      float weight,
+                      const float *d_dist = nullptr,
+                      float *d_translation = nullptr) {
+    // screen_pt is in screen space ([0, 1), [0, 1)),
+    // need to transform to canvas space
+    auto pt = screen_pt;
+    pt.x *= scene.canvas_width;
+    pt.y *= scene.canvas_height;
+    // Linear scan over all shape groups, keeping the closest hit.
+    auto min_group_id = -1;
+    auto min_distance = 0.f;
+    auto min_shape_id = -1;
+    auto closest_pt = Vector2f{0, 0};
+    auto min_path_info = ClosestPointPathInfo{-1, -1, 0};
+    for (int group_id = scene.num_shape_groups - 1; group_id >= 0; group_id--) {
+        auto s = -1;
+        auto p = Vector2f{0, 0};
+        ClosestPointPathInfo local_path_info;
+        auto d = infinity<float>();
+        if (compute_distance(scene, group_id, pt, infinity<float>(), &s, &p, &local_path_info, &d)) {
+            if (min_group_id == -1 || d < min_distance) {
+                min_distance = d;
+                min_group_id = group_id;
+                min_shape_id = s;
+                closest_pt = p;
+                min_path_info = local_path_info;
+            }
+        }
+    }
+    if (min_group_id == -1) {
+        // Nothing was hit; min_distance still holds its initial value 0.
+        return min_distance;
+    }
+    min_distance *= weight;
+    auto inside = false;
+    const ShapeGroup &shape_group = scene.shape_groups[min_group_id];
+    if (shape_group.fill_color != nullptr) {
+        inside = is_inside(scene,
+                           min_group_id,
+                           pt,
+                           nullptr);
+        if (inside) {
+            min_distance = -min_distance;
+        }
+    }
+    assert((min_group_id >= 0 && min_shape_id >= 0) || scene.num_shape_groups == 0);
+    if (d_dist != nullptr) {
+        // The returned value is (+/-) weight * distance, so the gradient
+        // w.r.t. the unsigned, unweighted distance carries both the sign and
+        // the weight factor.
+        auto d_abs_dist = weight * (inside ? -(*d_dist) : (*d_dist));
+        const Shape &shape = scene.shapes[min_shape_id];
+        ShapeGroup &d_shape_group = scene.d_shape_groups[min_group_id];
+        Shape &d_shape = scene.d_shapes[min_shape_id];
+        d_compute_distance(shape_group.canvas_to_shape,
+                           shape_group.shape_to_canvas,
+                           shape,
+                           pt,
+                           closest_pt,
+                           min_path_info,
+                           d_abs_dist,
+                           d_shape_group.shape_to_canvas,
+                           d_shape,
+                           d_translation);
+    }
+    return min_distance;
+}
+// Collect the gradient that belongs to a sample located at pt: every pixel of
+// d_color_image under the filter footprint contributes its RGBA gradient,
+// scaled by the filter weight at that pixel center and divided by the pixel's
+// entry in weight_image. Pixels outside the image or with a non-positive
+// weight sum are skipped.
+DEVICE
+Vector4f gather_d_color(const Filter &filter,
+                        const float *d_color_image,
+                        const float *weight_image,
+                        int width,
+                        int height,
+                        const Vector2f &pt) {
+    auto radius = filter.radius;
+    assert(radius > 0);
+    const auto extent = (int)ceil(radius);
+    const auto center_x = int(pt.x);
+    const auto center_y = int(pt.y);
+    auto gathered = Vector4f{0, 0, 0, 0};
+    for (int dy = -extent; dy <= extent; dy++) {
+        const auto yy = center_y + dy;
+        for (int dx = -extent; dx <= extent; dx++) {
+            const auto xx = center_x + dx;
+            const bool in_image = xx >= 0 && xx < width && yy >= 0 && yy < height;
+            if (!in_image) {
+                continue;
+            }
+            // Offset from the sample to the center of pixel (xx, yy).
+            const auto offset_x = (xx + 0.5f) - pt.x;
+            const auto offset_y = (yy + 0.5f) - pt.y;
+            auto filter_weight = compute_filter_weight(filter, offset_x, offset_y);
+            // pixel = \sum weight * color / \sum weight
+            const auto pixel = yy * width + xx;
+            auto weight_sum = weight_image[pixel];
+            if (weight_sum > 0) {
+                const float *rgba = &d_color_image[4 * pixel];
+                gathered += (filter_weight / weight_sum) * Vector4f{
+                    rgba[0],
+                    rgba[1],
+                    rgba[2],
+                    rgba[3],
+                };
+            }
+        }
+    }
+    return gathered;
+}
+// Cubic smoothstep over d in [-1, 1]: d is remapped to t = (d + 1) / 2,
+// clamped to [0, 1], and 3 t^2 - 2 t^3 is returned.
+DEVICE
+float smoothstep(float d) {
+    const auto t = clamp((d + 1.f) / 2.f, 0.f, 1.f);
+    const auto t_squared = t * t;
+    return t_squared * (3 - 2 * t);
+}
+// Backward pass of smoothstep(): given d_ret, the gradient w.r.t. the
+// smoothstep output, return the gradient w.r.t. its input d. The result is 0
+// for d outside [-1, 1].
+DEVICE
+float d_smoothstep(float d, float d_ret) {
+    const bool outside_ramp = d < -1.f || d > 1.f;
+    if (outside_ramp) {
+        return 0.f;
+    }
+    const auto t = (d + 1.f) / 2.f;
+    // ret = 3 t^2 - 2 t^3  =>  d(ret)/d(t) = 6 t - 6 t^2
+    const auto d_t = d_ret * (6 * t - 6 * t * t);
+    // t = (d + 1) / 2  =>  d(t)/d(d) = 1/2
+    return d_t / 2.f;
+}
+// Evaluate the prefiltered color at a screen-space point and, when d_color is
+// given, backpropagate through the blend.
+//
+// Forward pass: the scene BVH is traversed with an explicit stack. Each leaf
+// (shape group) whose padded box contains the point can emit a stroke
+// fragment and a fill fragment; a fragment's alpha is the sampled alpha times
+// a smoothstep() coverage weight derived from the distance to the boundary.
+// Fragments are sorted by increasing group id and blended over the optional
+// background. The returned RGB is divided by the final alpha when that alpha
+// exceeds 1e-6.
+//
+// Backward pass (d_color != nullptr): walks the fragments in reverse blend
+// order and propagates to the color parameters (d_sample_color), the shape
+// geometry (d_compute_distance), the stroke width and the background.
+//
+// scene              - scene to render; gradients are accumulated into
+//                      scene.d_shape_groups / scene.d_shapes.
+// background_color   - optional RGBA placed behind all fragments.
+// screen_pt          - sample position in screen space ([0, 1), [0, 1)).
+// d_color            - optional gradient w.r.t. the returned color.
+// d_background_color - optional accumulator for the gradient w.r.t.
+//                      *background_color; only touched when d_color is given.
+// d_translation      - optional buffer forwarded to d_sample_color() and
+//                      d_compute_distance().
+//
+// NOTE(review): updates of *d_background_color are plain (non-atomic)
+// additions; confirm that no two concurrent samples share the same target.
+DEVICE
+Vector4f sample_color_prefiltered(const SceneData &scene,
+                                  const Vector4f *background_color,
+                                  const Vector2f &screen_pt,
+                                  const Vector4f *d_color = nullptr,
+                                  Vector4f *d_background_color = nullptr,
+                                  float *d_translation = nullptr) {
+    // screen_pt is in screen space ([0, 1), [0, 1)),
+    // need to transform to canvas space
+    auto pt = screen_pt;
+    pt.x *= scene.canvas_width;
+    pt.y *= scene.canvas_height;
+    constexpr auto max_hit_shapes = 64;
+    constexpr auto max_bvh_stack_size = 64;
+    PrefilterFragment fragments[max_hit_shapes];
+    int bvh_stack[max_bvh_stack_size];
+    auto stack_size = 0;
+    auto num_fragments = 0;
+    // The root node index is 2 * num_shape_groups - 2, which is negative for
+    // an empty scene. In that case leave the stack empty and fall through to
+    // the background-only path below.
+    if (scene.num_shape_groups > 0) {
+        bvh_stack[stack_size++] = 2 * scene.num_shape_groups - 2;
+    }
+    while (stack_size > 0) {
+        const BVHNode &node = scene.bvh_nodes[bvh_stack[--stack_size]];
+        if (node.child1 < 0) {
+            // leaf
+            auto group_id = node.child0;
+            const ShapeGroup &shape_group = scene.shape_groups[group_id];
+            if (shape_group.stroke_color != nullptr) {
+                auto min_shape_id = -1;
+                auto closest_pt = Vector2f{0, 0};
+                auto local_path_info = ClosestPointPathInfo{-1, -1, 0};
+                auto d = infinity<float>();
+                compute_distance(scene, group_id, pt, infinity<float>(),
+                                 &min_shape_id, &closest_pt, &local_path_info, &d);
+                assert(min_shape_id != -1);
+                const auto &shape = scene.shapes[min_shape_id];
+                // Stroke coverage: difference of two smoothsteps around the
+                // band |d| <= stroke_width.
+                auto w = smoothstep(fabs(d) + shape.stroke_width) -
+                         smoothstep(fabs(d) - shape.stroke_width);
+                if (w > 0) {
+                    auto color_alpha = sample_color(shape_group.stroke_color_type,
+                                                    shape_group.stroke_color,
+                                                    pt);
+                    color_alpha[3] *= w;
+                    PrefilterFragment f;
+                    f.color = Vector3f{color_alpha[0], color_alpha[1], color_alpha[2]};
+                    f.alpha = color_alpha[3];
+                    f.group_id = group_id;
+                    f.shape_id = min_shape_id;
+                    f.distance = d;
+                    f.closest_pt = closest_pt;
+                    f.is_stroke = true;
+                    f.path_info = local_path_info;
+                    f.within_distance = true;
+                    assert(num_fragments < max_hit_shapes);
+                    fragments[num_fragments++] = f;
+                }
+            }
+            if (shape_group.fill_color != nullptr) {
+                auto min_shape_id = -1;
+                auto closest_pt = Vector2f{0, 0};
+                auto local_path_info = ClosestPointPathInfo{-1, -1, 0};
+                auto d = infinity<float>();
+                // Only boundaries within distance 1 matter for the fill:
+                // smoothstep() saturates outside [-1, 1].
+                auto found = compute_distance(scene,
+                                              group_id,
+                                              pt,
+                                              1.f,
+                                              &min_shape_id,
+                                              &closest_pt,
+                                              &local_path_info,
+                                              &d);
+                auto inside = is_inside(scene, group_id, pt, nullptr);
+                if (found || inside) {
+                    // Signed distance: positive inside, negative outside.
+                    if (!inside) {
+                        d = -d;
+                    }
+                    auto w = smoothstep(d);
+                    if (w > 0) {
+                        auto color_alpha = sample_color(shape_group.fill_color_type,
+                                                        shape_group.fill_color,
+                                                        pt);
+                        color_alpha[3] *= w;
+                        PrefilterFragment f;
+                        f.color = Vector3f{color_alpha[0], color_alpha[1], color_alpha[2]};
+                        f.alpha = color_alpha[3];
+                        f.group_id = group_id;
+                        f.shape_id = min_shape_id;
+                        f.distance = d;
+                        f.closest_pt = closest_pt;
+                        f.is_stroke = false;
+                        f.path_info = local_path_info;
+                        f.within_distance = found;
+                        assert(num_fragments < max_hit_shapes);
+                        fragments[num_fragments++] = f;
+                    }
+                }
+            }
+        } else {
+            assert(node.child0 >= 0 && node.child1 >= 0);
+            const AABB &b0 = scene.bvh_nodes[node.child0].box;
+            if (inside(b0, pt, scene.bvh_nodes[node.child0].max_radius)) {
+                bvh_stack[stack_size++] = node.child0;
+            }
+            const AABB &b1 = scene.bvh_nodes[node.child1].box;
+            if (inside(b1, pt, scene.bvh_nodes[node.child1].max_radius)) {
+                bvh_stack[stack_size++] = node.child1;
+            }
+            assert(stack_size <= max_bvh_stack_size);
+        }
+    }
+    if (num_fragments <= 0) {
+        if (background_color != nullptr) {
+            if (d_color != nullptr && d_background_color != nullptr) {
+                // The result is exactly *background_color, so the incoming
+                // gradient passes straight through. Accumulate it, matching
+                // the fragment path at the end of this function, instead of
+                // overwriting what was already stored for this target.
+                d_background_color->x += d_color->x;
+                d_background_color->y += d_color->y;
+                d_background_color->z += d_color->z;
+                d_background_color->w += d_color->w;
+            }
+            return *background_color;
+        }
+        return Vector4f{0, 0, 0, 0};
+    }
+    // Sort the fragments from back to front (i.e. increasing order of group id)
+    // https://github.com/frigaut/yorick-imutil/blob/master/insort.c#L37
+    for (int i = 1; i < num_fragments; i++) {
+        auto j = i;
+        auto temp = fragments[j];
+        while (j > 0 && fragments[j - 1].group_id > temp.group_id) {
+            fragments[j] = fragments[j - 1];
+            j--;
+        }
+        fragments[j] = temp;
+    }
+    // Blend the color
+    Vector3f accum_color[max_hit_shapes];
+    float accum_alpha[max_hit_shapes];
+    auto first_alpha = 0.f;
+    auto first_color = Vector3f{0, 0, 0};
+    if (background_color != nullptr) {
+        first_alpha = background_color->w;
+        first_color = Vector3f{background_color->x,
+                               background_color->y,
+                               background_color->z};
+    }
+    for (int i = 0; i < num_fragments; i++) {
+        const PrefilterFragment &fragment = fragments[i];
+        auto new_color = fragment.color;
+        auto new_alpha = fragment.alpha;
+        auto prev_alpha = i > 0 ? accum_alpha[i - 1] : first_alpha;
+        auto prev_color = i > 0 ? accum_color[i - 1] : first_color;
+        // prev_color is alpha premultiplied, don't need to multiply with
+        // prev_alpha
+        accum_color[i] = prev_color * (1 - new_alpha) + new_alpha * new_color;
+        accum_alpha[i] = prev_alpha * (1 - new_alpha) + new_alpha;
+    }
+    auto final_color = accum_color[num_fragments - 1];
+    auto final_alpha = accum_alpha[num_fragments - 1];
+    if (final_alpha > 1e-6f) {
+        final_color /= final_alpha;
+    }
+    assert(isfinite(final_color));
+    assert(isfinite(final_alpha));
+    if (d_color != nullptr) {
+        // Backward pass
+        auto d_final_color = Vector3f{(*d_color)[0], (*d_color)[1], (*d_color)[2]};
+        auto d_final_alpha = (*d_color)[3];
+        auto d_curr_color = d_final_color;
+        auto d_curr_alpha = d_final_alpha;
+        if (final_alpha > 1e-6f) {
+            // final_color = curr_color / final_alpha
+            d_curr_color = d_final_color / final_alpha;
+            d_curr_alpha -= sum(d_final_color * final_color) / final_alpha;
+        }
+        assert(isfinite(*d_color));
+        assert(isfinite(d_curr_color));
+        assert(isfinite(d_curr_alpha));
+        for (int i = num_fragments - 1; i >= 0; i--) {
+            // color[n] = prev_color * (1 - new_alpha) + new_alpha * new_color;
+            // alpha[n] = prev_alpha * (1 - new_alpha) + new_alpha;
+            auto prev_alpha = i > 0 ? accum_alpha[i - 1] : first_alpha;
+            auto prev_color = i > 0 ? accum_color[i - 1] : first_color;
+            auto d_prev_alpha = d_curr_alpha * (1.f - fragments[i].alpha);
+            auto d_alpha_i = d_curr_alpha * (1.f - prev_alpha);
+            d_alpha_i += sum(d_curr_color * (fragments[i].color - prev_color));
+            auto d_prev_color = d_curr_color * (1 - fragments[i].alpha);
+            auto d_color_i = d_curr_color * fragments[i].alpha;
+            auto group_id = fragments[i].group_id;
+            if (fragments[i].is_stroke) {
+                const auto &shape = scene.shapes[fragments[i].shape_id];
+                auto d = fragments[i].distance;
+                auto abs_d_plus_width = fabs(d) + shape.stroke_width;
+                auto abs_d_minus_width = fabs(d) - shape.stroke_width;
+                auto w = smoothstep(abs_d_plus_width) -
+                         smoothstep(abs_d_minus_width);
+                if (w != 0) {
+                    // fragment.alpha = color_alpha[3] * w, so
+                    // color_alpha[3] = fragment.alpha / w.
+                    auto d_w = w > 0 ? (fragments[i].alpha / w) * d_alpha_i : 0.f;
+                    d_alpha_i *= w;
+                    // Backprop to color
+                    d_sample_color(scene.shape_groups[group_id].stroke_color_type,
+                                   scene.shape_groups[group_id].stroke_color,
+                                   pt,
+                                   Vector4f{d_color_i[0], d_color_i[1], d_color_i[2], d_alpha_i},
+                                   scene.d_shape_groups[group_id].stroke_color,
+                                   d_translation);
+                    auto d_abs_d_plus_width = d_smoothstep(abs_d_plus_width, d_w);
+                    auto d_abs_d_minus_width = -d_smoothstep(abs_d_minus_width, d_w);
+                    // Gradient w.r.t. |d|; flip the sign to get it w.r.t. d
+                    // when d is negative.
+                    auto d_d = d_abs_d_plus_width + d_abs_d_minus_width;
+                    if (d < 0) {
+                        d_d = -d_d;
+                    }
+                    auto d_stroke_width = d_abs_d_plus_width - d_abs_d_minus_width;
+                    const auto &shape_group = scene.shape_groups[group_id];
+                    ShapeGroup &d_shape_group = scene.d_shape_groups[group_id];
+                    Shape &d_shape = scene.d_shapes[fragments[i].shape_id];
+                    if (fabs(d_d) > 1e-10f) {
+                        d_compute_distance(shape_group.canvas_to_shape,
+                                           shape_group.shape_to_canvas,
+                                           shape,
+                                           pt,
+                                           fragments[i].closest_pt,
+                                           fragments[i].path_info,
+                                           d_d,
+                                           d_shape_group.shape_to_canvas,
+                                           d_shape,
+                                           d_translation);
+                    }
+                    atomic_add(&d_shape.stroke_width, d_stroke_width);
+                }
+            } else {
+                auto d = fragments[i].distance;
+                auto w = smoothstep(d);
+                if (w != 0) {
+                    // color_alpha[3] = color_alpha[3] * w;
+                    auto d_w = w > 0 ? (fragments[i].alpha / w) * d_alpha_i : 0.f;
+                    d_alpha_i *= w;
+                    d_sample_color(scene.shape_groups[group_id].fill_color_type,
+                                   scene.shape_groups[group_id].fill_color,
+                                   pt,
+                                   Vector4f{d_color_i[0], d_color_i[1], d_color_i[2], d_alpha_i},
+                                   scene.d_shape_groups[group_id].fill_color,
+                                   d_translation);
+                    // w = smoothstep(d)
+                    auto d_d = d_smoothstep(d, d_w);
+                    // d was negated for outside points in the forward pass;
+                    // undo that to get the gradient w.r.t. the unsigned
+                    // distance.
+                    if (d < 0) {
+                        d_d = -d_d;
+                    }
+                    if (fabs(d_d) > 1e-10f && fragments[i].within_distance) {
+                        // shape_id may still be -1 when no boundary was found
+                        // (within_distance == false), so the shape arrays are
+                        // only indexed inside this guard.
+                        const auto &shape = scene.shapes[fragments[i].shape_id];
+                        const auto &shape_group = scene.shape_groups[group_id];
+                        ShapeGroup &d_shape_group = scene.d_shape_groups[group_id];
+                        Shape &d_shape = scene.d_shapes[fragments[i].shape_id];
+                        d_compute_distance(shape_group.canvas_to_shape,
+                                           shape_group.shape_to_canvas,
+                                           shape,
+                                           pt,
+                                           fragments[i].closest_pt,
+                                           fragments[i].path_info,
+                                           d_d,
+                                           d_shape_group.shape_to_canvas,
+                                           d_shape,
+                                           d_translation);
+                    }
+                }
+            }
+            d_curr_color = d_prev_color;
+            d_curr_alpha = d_prev_alpha;
+        }
+        // What is left after unwinding every fragment is the gradient w.r.t.
+        // the background (first_color / first_alpha).
+        if (d_background_color != nullptr) {
+            d_background_color->x += d_curr_color.x;
+            d_background_color->y += d_curr_color.y;
+            d_background_color->z += d_curr_color.z;
+            d_background_color->w += d_curr_alpha;
+        }
+    }
+    return Vector4f{final_color[0], final_color[1], final_color[2], final_alpha};
+}
+// Per-sample functor that adds the sample's filter weight to every pixel of
+// weight_image covered by the filter footprint. The result is the per-pixel
+// sum of weights used to normalize splatted colors.
+struct weight_kernel {
+    DEVICE void operator()(int idx) {
+        // idx enumerates height * width * num_samples_y * num_samples_x
+        const auto samples_per_pixel = num_samples_x * num_samples_y;
+        const auto sx = idx % num_samples_x;
+        const auto sy = (idx / num_samples_x) % num_samples_y;
+        const auto x = (idx / samples_per_pixel) % width;
+        const auto y = idx / (samples_per_pixel * width);
+        assert(y < height);
+        // Two random numbers are always drawn; prefiltering then replaces
+        // them with the sub-pixel cell center.
+        auto rng_state = init_pcg32(idx, seed);
+        auto rx = next_pcg32_float(&rng_state);
+        auto ry = next_pcg32_float(&rng_state);
+        if (use_prefiltering) {
+            rx = 0.5f;
+            ry = 0.5f;
+        }
+        const auto pt = Vector2f{x + ((float)sx + rx) / num_samples_x,
+                                 y + ((float)sy + ry) / num_samples_y};
+        auto radius = scene.filter->radius;
+        assert(radius >= 0);
+        const auto extent = (int)ceil(radius);
+        for (int dy = -extent; dy <= extent; dy++) {
+            const auto yy = y + dy;
+            for (int dx = -extent; dx <= extent; dx++) {
+                const auto xx = x + dx;
+                const bool in_image = xx >= 0 && xx < width && yy >= 0 && yy < height;
+                if (!in_image) {
+                    continue;
+                }
+                // Offset from the sample to the center of pixel (xx, yy).
+                const auto offset_x = (xx + 0.5f) - pt.x;
+                const auto offset_y = (yy + 0.5f) - pt.y;
+                auto filter_weight = compute_filter_weight(*scene.filter,
+                                                           offset_x,
+                                                           offset_y);
+                atomic_add(weight_image[yy * width + xx], filter_weight);
+            }
+        }
+    }
+    SceneData scene;
+    float *weight_image;
+    int width;
+    int height;
+    int num_samples_x;
+    int num_samples_y;
+    uint64_t seed;
+    bool use_prefiltering;
+};
+// We use a "mega kernel" for rendering: one invocation handles one sample and
+// performs whichever passes the non-null buffers ask for -- color forward
+// (render_image), color backward (d_render_image), signed distance forward
+// (sdf_image) and signed distance backward (d_sdf_image).
+struct render_kernel {
+    DEVICE void operator()(int idx) {
+        auto pt = Vector2f{0, 0};
+        auto x = 0;
+        auto y = 0;
+        const bool on_pixel_grid = (eval_positions == nullptr);
+        if (on_pixel_grid) {
+            // idx enumerates height * width * num_samples_y * num_samples_x
+            auto rng_state = init_pcg32(idx, seed);
+            const auto samples_per_pixel = num_samples_x * num_samples_y;
+            const auto sx = idx % num_samples_x;
+            const auto sy = (idx / num_samples_x) % num_samples_y;
+            x = (idx / samples_per_pixel) % width;
+            y = idx / (samples_per_pixel * width);
+            assert(x < width && y < height);
+            // Two random numbers are always drawn; prefiltering then replaces
+            // them with the sub-pixel cell center.
+            auto rx = next_pcg32_float(&rng_state);
+            auto ry = next_pcg32_float(&rng_state);
+            if (use_prefiltering) {
+                rx = 0.5f;
+                ry = 0.5f;
+            }
+            pt = Vector2f{x + ((float)sx + rx) / num_samples_x,
+                          y + ((float)sy + ry) / num_samples_y};
+        } else {
+            // Explicit evaluation positions: two floats per sample; the
+            // owning pixel is the truncated position.
+            pt = Vector2f{eval_positions[2 * idx],
+                          eval_positions[2 * idx + 1]};
+            x = int(pt.x);
+            y = int(pt.y);
+        }
+        // normalize pt to [0, 1]
+        auto npt = pt;
+        npt.x /= width;
+        npt.y /= height;
+        auto num_samples = num_samples_x * num_samples_y;
+        const bool has_d_render = (d_render_image != nullptr);
+        if (render_image != nullptr || has_d_render) {
+            // Per-pixel views into the optional buffers for pixel (x, y).
+            const auto pixel = (y * width) + x;
+            const Vector4f *background = background_image != nullptr ?
+                (const Vector4f*)&background_image[4 * pixel] : nullptr;
+            Vector4f *d_background = d_background_image != nullptr ?
+                (Vector4f*)&d_background_image[4 * pixel] : nullptr;
+            float *d_pixel_translation = d_translation != nullptr ?
+                &d_translation[2 * pixel] : nullptr;
+            Vector4f d_color = Vector4f{0, 0, 0, 0};
+            if (has_d_render) {
+                // Gather d_color from d_render_image inside the filter kernel
+                // normalize using weight_image
+                d_color = gather_d_color(*scene.filter,
+                                         d_render_image,
+                                         weight_image,
+                                         width,
+                                         height,
+                                         pt);
+            }
+            const Vector4f *d_color_arg = has_d_render ? &d_color : nullptr;
+            auto color = Vector4f{0, 0, 0, 0};
+            if (use_prefiltering) {
+                color = sample_color_prefiltered(scene,
+                                                 background,
+                                                 npt,
+                                                 d_color_arg,
+                                                 d_background,
+                                                 d_pixel_translation);
+            } else {
+                color = sample_color(scene,
+                                     background,
+                                     npt,
+                                     d_color_arg,
+                                     nullptr,
+                                     d_background,
+                                     d_pixel_translation);
+            }
+            assert(isfinite(color));
+            // Splat color onto render_image
+            auto radius = scene.filter->radius;
+            assert(radius >= 0);
+            const auto extent = (int)ceil(radius);
+            for (int dy = -extent; dy <= extent; dy++) {
+                const auto yy = y + dy;
+                for (int dx = -extent; dx <= extent; dx++) {
+                    const auto xx = x + dx;
+                    const bool in_image = xx >= 0 && xx < width && yy >= 0 && yy < height;
+                    if (!in_image) {
+                        continue;
+                    }
+                    const auto target = yy * width + xx;
+                    // Pixels that received no filter weight are skipped.
+                    if (!(weight_image[target] > 0)) {
+                        continue;
+                    }
+                    auto weight_sum = weight_image[target];
+                    auto xc = xx + 0.5f;
+                    auto yc = yy + 0.5f;
+                    auto filter_weight = compute_filter_weight(*scene.filter,
+                                                               xc - pt.x,
+                                                               yc - pt.y);
+                    auto weighted_color = filter_weight * color / weight_sum;
+                    if (render_image != nullptr) {
+                        for (int channel = 0; channel < 4; channel++) {
+                            atomic_add(render_image[4 * target + channel],
+                                       weighted_color[channel]);
+                        }
+                    }
+                    if (has_d_render) {
+                        // Backprop to filter_weight
+                        // pixel = \sum weight * color / \sum weight
+                        auto d_pixel = Vector4f{
+                            d_render_image[4 * target + 0],
+                            d_render_image[4 * target + 1],
+                            d_render_image[4 * target + 2],
+                            d_render_image[4 * target + 3],
+                        };
+                        const auto d_pixel_dot_color = dot(d_pixel, color);
+                        auto d_weight =
+                            (d_pixel_dot_color * weight_sum -
+                             filter_weight * d_pixel_dot_color * (weight_sum - filter_weight)) /
+                            square(weight_sum);
+                        d_compute_filter_weight(*scene.filter,
+                                                xc - pt.x,
+                                                yc - pt.y,
+                                                d_weight,
+                                                scene.d_filter);
+                    }
+                }
+            }
+        }
+        const bool has_d_sdf = (d_sdf_image != nullptr);
+        if (sdf_image != nullptr || has_d_sdf) {
+            // SDF buffers hold one value per pixel on the pixel grid and one
+            // value per sample when explicit positions are used.
+            float d_dist = 0.f;
+            if (has_d_sdf) {
+                d_dist = on_pixel_grid ? d_sdf_image[y * width + x]
+                                       : d_sdf_image[idx];
+            }
+            // On the pixel grid each sample contributes 1 / num_samples of
+            // the pixel's distance value.
+            auto weight = on_pixel_grid ? 1.f / num_samples : 1.f;
+            auto dist = sample_distance(scene, npt, weight,
+                has_d_sdf ? &d_dist : nullptr,
+                d_translation != nullptr ? &d_translation[2 * (y * width + x)] : nullptr);
+            if (sdf_image != nullptr) {
+                if (on_pixel_grid) {
+                    atomic_add(sdf_image[y * width + x], dist);
+                } else {
+                    atomic_add(sdf_image[idx], dist);
+                }
+            }
+        }
+    }
+    SceneData scene;
+    float *background_image;
+    float *render_image;
+    float *weight_image;
+    float *sdf_image;
+    float *d_background_image;
+    float *d_render_image;
+    float *d_sdf_image;
+    float *d_translation;
+    int width;
+    int height;
+    int num_samples_x;
+    int num_samples_y;
+    uint64_t seed;
+    bool use_prefiltering;
+    float *eval_positions;
+};
+struct BoundarySample { // One sampled boundary point: written by sample_boundary_kernel, read by render_edge_kernel.
+    Vector2f pt; // Boundary point in canvas space after division by the canvas width/height.
+    Vector2f local_pt; // The point as returned by sample_boundary(), in the shape's local space.
+    Vector2f normal; // Boundary normal after xform_normal() with the group's canvas_to_shape matrix.
+    int shape_group_id; // Index of the sampled group in scene.shape_groups.
+    int shape_id; // Index of the sampled shape in scene.shapes; -1 marks a rejected sample.
+    float t; // Random number passed to sample_boundary() for this sample.
+    BoundaryData data; // Additional per-sample data filled in by sample_boundary().
+    float pdf; // Shape selection probability times the boundary pdf reported by sample_boundary().
+};
+// Per-sample functor that draws one point on a shape boundary.
+//
+// A (group, shape) entry is picked from scene.sample_shapes_cdf, then
+// sample_boundary() picks a point on that shape. The point is stored in
+// boundary_samples[idx] together with a Morton code of its normalized
+// position in morton_codes[idx]. Rejected samples keep shape_id == -1 and
+// Morton code 0.
+struct sample_boundary_kernel {
+    DEVICE void operator()(int idx) {
+        // Start from an "invalid sample" state so every early return below
+        // leaves a well-defined entry behind.
+        boundary_samples[idx].pt = Vector2f{0, 0};
+        boundary_samples[idx].shape_id = -1;
+        boundary_ids[idx] = idx;
+        morton_codes[idx] = 0;
+        auto rng_state = init_pcg32(idx, seed);
+        auto u = next_pcg32_float(&rng_state);
+        // Sample a shape
+        auto sample_id = sample(scene.sample_shapes_cdf,
+                                scene.num_total_shapes,
+                                u);
+        assert(sample_id >= 0 && sample_id < scene.num_total_shapes);
+        auto shape_id = scene.sample_shape_id[sample_id];
+        assert(shape_id >= 0 && shape_id < scene.num_shapes);
+        auto shape_group_id = scene.sample_group_id[sample_id];
+        assert(shape_group_id >= 0 && shape_group_id < scene.num_shape_groups);
+        // The probability table belongs to the same sampling distribution as
+        // the CDF, so it is looked up with the sampled entry's index rather
+        // than with the shape id that entry refers to.
+        auto shape_pmf = scene.sample_shapes_pmf[sample_id];
+        if (shape_pmf <= 0) {
+            return;
+        }
+        // Sample a point on the boundary of the shape
+        auto boundary_pdf = 0.f;
+        auto normal = Vector2f{0, 0};
+        auto t = next_pcg32_float(&rng_state);
+        BoundaryData boundary_data;
+        const ShapeGroup &shape_group = scene.shape_groups[shape_group_id];
+        auto local_boundary_pt = sample_boundary(
+            scene, shape_group_id, shape_id,
+            t, normal, boundary_pdf, boundary_data);
+        if (boundary_pdf <= 0) {
+            return;
+        }
+        // local_boundary_pt & normal are in shape's local space,
+        // transform them to canvas space
+        auto boundary_pt = xform_pt(shape_group.shape_to_canvas, local_boundary_pt);
+        normal = xform_normal(shape_group.canvas_to_shape, normal);
+        // Normalize boundary_pt to [0, 1)
+        boundary_pt.x /= scene.canvas_width;
+        boundary_pt.y /= scene.canvas_height;
+        boundary_samples[idx].pt = boundary_pt;
+        boundary_samples[idx].local_pt = local_boundary_pt;
+        boundary_samples[idx].normal = normal;
+        boundary_samples[idx].shape_group_id = shape_group_id;
+        boundary_samples[idx].shape_id = shape_id;
+        boundary_samples[idx].t = t;
+        boundary_samples[idx].data = boundary_data;
+        boundary_samples[idx].pdf = shape_pmf * boundary_pdf;
+        // Quantize to a 10-bit grid for the Morton code. Boundary points can
+        // fall outside the canvas, and converting a negative or oversized
+        // float to an unsigned integer is undefined, so clamp first.
+        TVector2<uint32_t> p_i{clamp(boundary_pt.x, 0.f, 1.f) * 1023,
+                               clamp(boundary_pt.y, 0.f, 1.f) * 1023};
+        morton_codes[idx] = (expand_bits(p_i.x) << 1u) |
+                            (expand_bits(p_i.y) << 0u);
+    }
+    SceneData scene;
+    uint64_t seed;
+    BoundarySample *boundary_samples;
+    int *boundary_ids;
+    uint32_t *morton_codes;
+};
+// Kernel functor for the boundary (edge) gradient pass. Each invocation
+// handles one entry of boundary_ids: it evaluates the scene color on both
+// sides of the stored boundary point and accumulates the resulting gradient
+// contribution into scene.d_shapes, the shape group's d shape_to_canvas and,
+// when d_translation is non-null, the per-pixel translation gradient.
+struct render_edge_kernel {
+    DEVICE void operator()(int idx) {
+        // boundary_ids maps the launch index to an index into boundary_samples.
+        auto bid = boundary_ids[idx];
+        // Samples whose shape_id is -1 are skipped (presumably they mark
+        // samples for which no boundary was found -- confirm against
+        // sample_boundary_kernel).
+        if (boundary_samples[bid].shape_id == -1) {
+            return;
+        }
+        auto boundary_pt = boundary_samples[bid].pt;
+        auto local_boundary_pt = boundary_samples[bid].local_pt;
+        auto normal = boundary_samples[bid].normal;
+        auto shape_group_id = boundary_samples[bid].shape_group_id;
+        auto shape_id = boundary_samples[bid].shape_id;
+        auto t = boundary_samples[bid].t;
+        auto boundary_data = boundary_samples[bid].data;
+        auto pdf = boundary_samples[bid].pdf;
+        const ShapeGroup &shape_group = scene.shape_groups[shape_group_id];
+        // Pixel that contains the boundary point; boundary_pt is scaled by
+        // the image size here, and points outside the image are ignored.
+        auto bx = int(boundary_pt.x * width);
+        auto by = int(boundary_pt.y * height);
+        if (bx < 0 || bx >= width || by < 0 || by >= height) {
+            return;
+        }
+        // Sample the two sides of the boundary
+        // Both queries target the same shape and start with hit == false;
+        // the hit flags are read back after the sample_color calls below.
+        auto inside_query = EdgeQuery{shape_group_id, shape_id, false};
+        auto outside_query = EdgeQuery{shape_group_id, shape_id, false};
+        // The two evaluation points are offset by 1e-4 along -normal / +normal.
+        auto color_inside = sample_color(scene,
+            background_image != nullptr ? (const Vector4f *)&background_image[4 * ((by * width) + bx)] : nullptr,
+            boundary_pt - 1e-4f * normal,
+            nullptr, &inside_query);
+        auto color_outside = sample_color(scene,
+            background_image != nullptr ? (const Vector4f *)&background_image[4 * ((by * width) + bx)] : nullptr,
+            boundary_pt + 1e-4f * normal,
+            nullptr, &outside_query);
+        if (!inside_query.hit && !outside_query.hit) {
+            // occluded
+            return;
+        }
+        // If only the "outside" query hit, flip the normal and swap the two
+        // sides so that "inside" always refers to the side that hit.
+        if (!inside_query.hit) {
+            normal = -normal;
+            swap_(inside_query, outside_query);
+            swap_(color_inside, color_outside);
+        }
+        // Boundary point in screen space
+        auto sboundary_pt = boundary_pt;
+        sboundary_pt.x *= width;
+        sboundary_pt.y *= height;
+        auto d_color = gather_d_color(*scene.filter,
+                                      d_render_image,
+                                      weight_image,
+                                      width,
+                                      height,
+                                      sboundary_pt);
+        // Normalization factor
+        d_color /= float(scene.canvas_width * scene.canvas_height);
+        assert(isfinite(d_color));
+        assert(isfinite(pdf) && pdf > 0);
+        // Scalar contribution of this sample: color difference across the
+        // boundary projected onto d_color, divided by the sampling pdf.
+        auto contrib = dot(color_inside - color_outside, d_color) / pdf;
+        ShapeGroup &d_shape_group = scene.d_shape_groups[shape_group_id];
+        accumulate_boundary_gradient(scene.shapes[shape_id],
+            contrib, t, normal, boundary_data, scene.d_shapes[shape_id],
+            shape_group.shape_to_canvas, local_boundary_pt, d_shape_group.shape_to_canvas);
+        // Don't need to backprop to filter weights:
+        // \int f'(x) g(x) dx doesn't contain discontinuities
+        // if f is continuous, even if g is discontinuous
+        if (d_translation != nullptr) {
+            // According to Reynold transport theorem,
+            // the Jacobian of the boundary integral is dot(velocity, normal)
+            // The velocity of the object translating x is (1, 0)
+            // The velocity of the object translating y is (0, 1)
+            // d_translation stores two floats (x, y) per pixel.
+            atomic_add(&d_translation[2 * (by * width + bx) + 0], normal.x * contrib);
+            atomic_add(&d_translation[2 * (by * width + bx) + 1], normal.y * contrib);
+        }
+    }
+    SceneData scene;
+    // Optional background, indexed as 4 floats per pixel; may be null.
+    const float *background_image;
+    const BoundarySample *boundary_samples;
+    const int *boundary_ids;
+    float *weight_image;
+    float *d_render_image;
+    // Optional per-pixel translation gradient output; may be null.
+    float *d_translation;
+    int width;
+    int height;
+    // num_samples_x / num_samples_y are stored but not read by operator().
+    int num_samples_x;
+    int num_samples_y;
+};
+// Renders a scene and/or backpropagates through the rendering.
+//
+// Steps, in order:
+//   1. (NVCC builds) remember the current CUDA device and switch to
+//      scene->gpu_index when it is not -1.
+//   2. Allocate and zero a width*height weight image, unless eval_positions
+//      is provided (in which case weight_image stays null).
+//   3. If any of render_image, d_render_image, render_sdf or d_render_sdf is
+//      non-null, run weight_kernel (only when a weight image exists) and then
+//      render_kernel over either all pixel samples or num_eval_positions.
+//   4. If prefiltering is off and d_render_image is non-null, sample
+//      boundaries, sort them by Morton code on the GPU path, and run
+//      render_edge_kernel to accumulate boundary gradients.
+//   5. Free temporaries, synchronize the GPU if used, and restore the
+//      previously active CUDA device.
+//
+// Null ptr<float> arguments disable the corresponding output or gradient.
+void render(std::shared_ptr<Scene> scene,
+            ptr<float> background_image,
+            ptr<float> render_image,
+            ptr<float> render_sdf,
+            int width,
+            int height,
+            int num_samples_x,
+            int num_samples_y,
+            uint64_t seed,
+            ptr<float> d_background_image,
+            ptr<float> d_render_image,
+            ptr<float> d_render_sdf,
+            ptr<float> d_translation,
+            bool use_prefiltering,
+            ptr<float> eval_positions,
+            int num_eval_positions) {
+#ifdef __NVCC__
+    // -1 means "no device was saved", so nothing is restored at the end.
+    int old_device_id = -1;
+    if (scene->use_gpu) {
+        checkCuda(cudaGetDevice(&old_device_id));
+        if (scene->gpu_index != -1) {
+            checkCuda(cudaSetDevice(scene->gpu_index));
+        }
+    }
+#endif
+    parallel_init();
+    float *weight_image = nullptr;
+    // Allocate and zero the weight image
+    if (scene->use_gpu) {
+#ifdef __CUDACC__
+        if (eval_positions.get() == nullptr) {
+            checkCuda(cudaMallocManaged(&weight_image, width * height * sizeof(float)));
+            // Check the memset result like every other CUDA call here, so a
+            // failure is reported instead of leaving the weights uninitialized.
+            checkCuda(cudaMemset(weight_image, 0, width * height * sizeof(float)));
+        }
+#else
+        assert(false);
+#endif
+    } else {
+        if (eval_positions.get() == nullptr) {
+            weight_image = (float*)malloc(width * height * sizeof(float));
+            memset(weight_image, 0, width * height * sizeof(float));
+        }
+    }
+    if (render_image.get() != nullptr || d_render_image.get() != nullptr ||
+        render_sdf.get() != nullptr || d_render_sdf.get() != nullptr) {
+        if (weight_image != nullptr) {
+            parallel_for(weight_kernel{
+                get_scene_data(*scene.get()),
+                weight_image,
+                width,
+                height,
+                num_samples_x,
+                num_samples_y,
+                seed
+            }, width * height * num_samples_x * num_samples_y, scene->use_gpu);
+        }
+        // With explicit evaluation positions, one work item per position;
+        // otherwise one per pixel sub-sample.
+        auto num_samples = eval_positions.get() == nullptr ?
+            width * height * num_samples_x * num_samples_y : num_eval_positions;
+        parallel_for(render_kernel{
+            get_scene_data(*scene.get()),
+            background_image.get(),
+            render_image.get(),
+            weight_image,
+            render_sdf.get(),
+            d_background_image.get(),
+            d_render_image.get(),
+            d_render_sdf.get(),
+            d_translation.get(),
+            width,
+            height,
+            num_samples_x,
+            num_samples_y,
+            seed,
+            use_prefiltering,
+            eval_positions.get()
+        }, num_samples, scene->use_gpu);
+    }
+    // Boundary sampling
+    if (!use_prefiltering && d_render_image.get() != nullptr) {
+        auto num_samples = width * height * num_samples_x * num_samples_y;
+        BoundarySample *boundary_samples = nullptr;
+        int *boundary_ids = nullptr; // for sorting
+        uint32_t *morton_codes = nullptr; // for sorting
+        // Allocate boundary samples
+        if (scene->use_gpu) {
+#ifdef __CUDACC__
+            checkCuda(cudaMallocManaged(&boundary_samples,
+                num_samples * sizeof(BoundarySample)));
+            checkCuda(cudaMallocManaged(&boundary_ids,
+                num_samples * sizeof(int)));
+            checkCuda(cudaMallocManaged(&morton_codes,
+                num_samples * sizeof(uint32_t)));
+#else
+            assert(false);
+#endif
+        } else {
+            boundary_samples = (BoundarySample*)malloc(
+                num_samples * sizeof(BoundarySample));
+            boundary_ids = (int*)malloc(
+                num_samples * sizeof(int));
+            morton_codes = (uint32_t*)malloc(
+                num_samples * sizeof(uint32_t));
+        }
+        // Edge sampling
+        // We sort the boundary samples for better thread coherency
+        parallel_for(sample_boundary_kernel{
+            get_scene_data(*scene.get()),
+            seed,
+            boundary_samples,
+            boundary_ids,
+            morton_codes
+        }, num_samples, scene->use_gpu);
+        if (scene->use_gpu) {
+            thrust::sort_by_key(thrust::device, morton_codes, morton_codes + num_samples, boundary_ids);
+        } else {
+            // Don't need to sort for CPU, we are not using SIMD hardware anyway.
+            // thrust::sort_by_key(thrust::host, morton_codes, morton_codes + num_samples, boundary_ids);
+        }
+        parallel_for(render_edge_kernel{
+            get_scene_data(*scene.get()),
+            background_image.get(),
+            boundary_samples,
+            boundary_ids,
+            weight_image,
+            d_render_image.get(),
+            d_translation.get(),
+            width,
+            height,
+            num_samples_x,
+            num_samples_y
+        }, num_samples, scene->use_gpu);
+        if (scene->use_gpu) {
+#ifdef __CUDACC__
+            checkCuda(cudaFree(boundary_samples));
+            checkCuda(cudaFree(boundary_ids));
+            checkCuda(cudaFree(morton_codes));
+#else
+            assert(false);
+#endif
+        } else {
+            free(boundary_samples);
+            free(boundary_ids);
+            free(morton_codes);
+        }
+    }
+    // Clean up weight image
+    // (weight_image may still be null here; both free functions accept that.)
+    if (scene->use_gpu) {
+#ifdef __CUDACC__
+        checkCuda(cudaFree(weight_image));
+#else
+        assert(false);
+#endif
+    } else {
+        free(weight_image);
+    }
+    if (scene->use_gpu) {
+        cuda_synchronize();
+    }
+    parallel_cleanup();
+#ifdef __NVCC__
+    if (old_device_id != -1) {
+        checkCuda(cudaSetDevice(old_device_id));
+    }
+#endif
+}
+// Python bindings for the diffvg module: raw pointer wrappers, small vector
+// types, shape and color descriptions, filters, the Scene class and the
+// render() entry point.
+PYBIND11_MODULE(diffvg, m) {
+    m.doc() = "Differential Vector Graphics";
+    // Pointer wrappers are constructed from an integer address on the Python side.
+    py::class_<ptr<void>>(m, "void_ptr")
+        .def(py::init<std::size_t>())
+        .def("as_size_t", &ptr<void>::as_size_t);
+    py::class_<ptr<float>>(m, "float_ptr")
+        .def(py::init<std::size_t>());
+    py::class_<ptr<int>>(m, "int_ptr")
+        .def(py::init<std::size_t>());
+    py::class_<Vector2f>(m, "Vector2f")
+        .def(py::init<float, float>())
+        .def_readwrite("x", &Vector2f::x)
+        .def_readwrite("y", &Vector2f::y);
+    py::class_<Vector3f>(m, "Vector3f")
+        .def(py::init<float, float, float>())
+        .def_readwrite("x", &Vector3f::x)
+        .def_readwrite("y", &Vector3f::y)
+        .def_readwrite("z", &Vector3f::z);
+    py::class_<Vector4f>(m, "Vector4f")
+        .def(py::init<float, float, float, float>())
+        .def_readwrite("x", &Vector4f::x)
+        .def_readwrite("y", &Vector4f::y)
+        .def_readwrite("z", &Vector4f::z)
+        .def_readwrite("w", &Vector4f::w);
+    // Shape types and their parameter structs.
+    py::enum_<ShapeType>(m, "ShapeType")
+        .value("circle", ShapeType::Circle)
+        .value("ellipse", ShapeType::Ellipse)
+        .value("path", ShapeType::Path)
+        .value("rect", ShapeType::Rect);
+    py::class_<Circle>(m, "Circle")
+        .def(py::init<float, Vector2f>())
+        .def("get_ptr", &Circle::get_ptr)
+        .def_readonly("radius", &Circle::radius)
+        .def_readonly("center", &Circle::center);
+    py::class_<Ellipse>(m, "Ellipse")
+        .def(py::init<Vector2f, Vector2f>())
+        .def("get_ptr", &Ellipse::get_ptr)
+        .def_readonly("radius", &Ellipse::radius)
+        .def_readonly("center", &Ellipse::center);
+    py::class_<Path>(m, "Path")
+        .def(py::init<ptr<int>, ptr<float>, ptr<float>, int, int, bool, bool>())
+        .def("get_ptr", &Path::get_ptr)
+        .def("has_thickness", &Path::has_thickness)
+        .def("copy_to", &Path::copy_to)
+        .def_readonly("num_points", &Path::num_points);
+    py::class_<Rect>(m, "Rect")
+        .def(py::init<Vector2f, Vector2f>())
+        .def("get_ptr", &Rect::get_ptr)
+        .def_readonly("p_min", &Rect::p_min)
+        .def_readonly("p_max", &Rect::p_max);
+    // Color types and their parameter structs.
+    py::enum_<ColorType>(m, "ColorType")
+        .value("constant", ColorType::Constant)
+        .value("linear_gradient", ColorType::LinearGradient)
+        .value("radial_gradient", ColorType::RadialGradient);
+    py::class_<Constant>(m, "Constant")
+        .def(py::init<Vector4f>())
+        .def("get_ptr", &Constant::get_ptr)
+        .def_readonly("color", &Constant::color);
+    py::class_<LinearGradient>(m, "LinearGradient")
+        .def(py::init<Vector2f, Vector2f, int, ptr<float>, ptr<float>>())
+        .def("get_ptr", &LinearGradient::get_ptr)
+        .def("copy_to", &LinearGradient::copy_to)
+        .def_readonly("begin", &LinearGradient::begin)
+        .def_readonly("end", &LinearGradient::end)
+        .def_readonly("num_stops", &LinearGradient::num_stops);
+    py::class_<RadialGradient>(m, "RadialGradient")
+        .def(py::init<Vector2f, Vector2f, int, ptr<float>, ptr<float>>())
+        .def("get_ptr", &RadialGradient::get_ptr)
+        .def("copy_to", &RadialGradient::copy_to)
+        .def_readonly("center", &RadialGradient::center)
+        .def_readonly("radius", &RadialGradient::radius)
+        .def_readonly("num_stops", &RadialGradient::num_stops);
+    py::class_<Shape>(m, "Shape")
+        .def(py::init<ShapeType, ptr<void>, float>())
+        .def("as_circle", &Shape::as_circle)
+        .def("as_ellipse", &Shape::as_ellipse)
+        .def("as_path", &Shape::as_path)
+        .def("as_rect", &Shape::as_rect)
+        .def_readonly("type", &Shape::type)
+        .def_readonly("stroke_width", &Shape::stroke_width);
+    py::class_<ShapeGroup>(m, "ShapeGroup")
+        .def(py::init<ptr<int>,
+                      int,
+                      ColorType,
+                      ptr<void>,
+                      ColorType,
+                      ptr<void>,
+                      bool,
+                      ptr<float>>())
+        .def("fill_color_as_constant", &ShapeGroup::fill_color_as_constant)
+        .def("fill_color_as_linear_gradient", &ShapeGroup::fill_color_as_linear_gradient)
+        .def("fill_color_as_radial_gradient", &ShapeGroup::fill_color_as_radial_gradient)
+        .def("stroke_color_as_constant", &ShapeGroup::stroke_color_as_constant)
+        .def("stroke_color_as_linear_gradient", &ShapeGroup::stroke_color_as_linear_gradient)
+        // Bind the stroke accessor, not the fill accessor, so Python callers
+        // read the stroke gradient rather than the fill gradient.
+        .def("stroke_color_as_radial_gradient", &ShapeGroup::stroke_color_as_radial_gradient)
+        .def("has_fill_color", &ShapeGroup::has_fill_color)
+        .def("has_stroke_color", &ShapeGroup::has_stroke_color)
+        .def("copy_to", &ShapeGroup::copy_to)
+        .def_readonly("fill_color_type", &ShapeGroup::fill_color_type)
+        .def_readonly("stroke_color_type", &ShapeGroup::stroke_color_type);
+    py::enum_<FilterType>(m, "FilterType")
+        .value("box", FilterType::Box)
+        .value("tent", FilterType::Tent)
+        .value("parabolic", FilterType::RadialParabolic)
+        .value("hann", FilterType::Hann);
+    py::class_<Filter>(m, "Filter")
+        .def(py::init<FilterType,
+                      float>());
+    // Scene is held by std::shared_ptr, matching the first parameter of render().
+    py::class_<Scene, std::shared_ptr<Scene>>(m, "Scene")
+        .def(py::init<int,
+                      int,
+                      const std::vector<const Shape*> &,
+                      const std::vector<const ShapeGroup*> &,
+                      const Filter &,
+                      bool,
+                      int>())
+        .def("get_d_shape", &Scene::get_d_shape)
+        .def("get_d_shape_group", &Scene::get_d_shape_group)
+        .def("get_d_filter_radius", &Scene::get_d_filter_radius)
+        .def_readonly("num_shapes", &Scene::num_shapes)
+        .def_readonly("num_shape_groups", &Scene::num_shape_groups);
+    m.def("render", &render, "");
+}

DiffVG/diffvg.h ADDED Viewed

	@@ -0,0 +1,156 @@

+#pragma once
+// DEVICE decorates functions so that, when compiling with NVCC, they are
+// built for both host and device; for any other compiler it expands to
+// nothing.
+#ifdef __NVCC__
+    #define DEVICE __device__ __host__
+#else
+    #define DEVICE
+#endif
+#ifndef __NVCC__
+    #include <cmath>
+    // Without NVCC, provide fmodf/fmod wrappers that forward to std::fmod
+    // and make std::isfinite callable without qualification.
+    namespace {
+        inline float fmodf(float a, float b) {
+            return std::fmod(a, b);
+        }
+        inline double fmod(double a, double b) {
+            return std::fmod(a, b);
+        }
+    }
+    using std::isfinite;
+#endif
+#ifndef M_PI
+#define M_PI 3.14159265358979323846
+#endif
+#include <cstdint>
+#include <atomic>
+// We use Real for most of the internal computation.
+// However, for PyTorch interfaces, Optix Prime and Embree queries
+// we use float
+using Real = float;
+/// Returns x multiplied by itself.
+template <typename T>
+DEVICE inline T square(const T &x) {
+    return x * x;
+}
+/// Returns the third power of x.
+template <typename T>
+DEVICE inline T cubic(const T &x) {
+    return x * x * x;
+}
+/// Restricts v to the closed interval [lo, hi]: values below lo yield lo,
+/// values above hi yield hi, everything else is returned unchanged.
+template <typename T>
+DEVICE
+inline T clamp(const T &v, const T &lo, const T &hi) {
+    return (v < lo) ? lo : ((v > hi) ? hi : v);
+}
+/// Integer remainder of a / b, shifted by b when the C++ remainder is negative.
+DEVICE
+inline int modulo(int a, int b) {
+    const int rem = a % b;
+    if (rem < 0) {
+        return rem + b;
+    }
+    return rem;
+}
+/// Single-precision remainder of a / b via ::fmodf, shifted by b when the
+/// result is negative.
+DEVICE
+inline float modulo(float a, float b) {
+    const float rem = ::fmodf(a, b);
+    if (rem < 0.0f) {
+        return rem + b;
+    }
+    return rem;
+}
+/// Double-precision remainder of a / b via ::fmod, shifted by b when the
+/// result is negative.
+DEVICE
+inline double modulo(double a, double b) {
+    const double rem = ::fmod(a, b);
+    if (rem < 0.0) {
+        return rem + b;
+    }
+    return rem;
+}
+/// Returns a when a > b, otherwise b.
+template <typename T>
+DEVICE
+inline T max(const T &a, const T &b) {
+    if (a > b) {
+        return a;
+    }
+    return b;
+}
+/// Returns a when a < b, otherwise b.
+template <typename T>
+DEVICE
+inline T min(const T &a, const T &b) {
+    if (a < b) {
+        return a;
+    }
+    return b;
+}
+/// Integer division of x by y rounded up, computed as (x + y - 1) / y.
+inline int idiv_ceil(int x, int y) {
+    const int numerator = x + y - 1;
+    return numerator / y;
+}
+/// Exchanges the values of a and b through a temporary copy.
+template <typename T>
+DEVICE inline void swap_(T &a, T &b) {
+    T saved = a;
+    a = b;
+    b = saved;
+}
+/// Base-2 logarithm of x, computed as log(x) / log(2).
+/// Unlike most helpers in this header it is not marked DEVICE.
+// NOTE(review): the divisor is built from Real(2), i.e. a float; whether
+// log() is evaluated in float or double here depends on which overloads are
+// visible -- confirm the intended precision.
+inline double log2(double x) {
+    return log(x) / log(Real(2));
+}
+/// acos that tolerates arguments outside [-1, 1]: inputs >= 1 yield 0,
+/// inputs <= -1 yield pi, everything else goes through acos.
+template <typename T>
+DEVICE
+inline T safe_acos(const T &x) {
+    if (x >= 1) {
+        return T(0);
+    }
+    if (x <= -1) {
+        return T(M_PI);
+    }
+    return acos(x);
+}
+// For Morton code computation. This can be made faster.
+// Spreads the low 10 bits of x so that input bit k (k = 0 is the least
+// significant bit) lands at output bit 2k, leaving a zero between
+// neighbouring bits. Bits 10 and above of x are ignored.
+DEVICE
+inline uint32_t expand_bits(uint32_t x) {
+    uint32_t spread = 0;
+    for (uint32_t bit = 0; bit < 10u; ++bit) {
+        // Isolate bit `bit` and move it up by another `bit` positions.
+        spread |= (x & (uint32_t(1) << bit)) << bit;
+    }
+    return spread;
+}
+// DEVICE
+// inline int clz(uint64_t x) {
+// #ifdef __CUDA_ARCH__
+//     return __clzll(x);
+// #else
+//     // TODO: use _BitScanReverse in windows
+//     return x == 0 ? 64 : __builtin_clzll(x);
+// #endif
+// }
+// DEVICE
+// inline int ffs(uint8_t x) {
+// #ifdef __CUDA_ARCH__
+//     return __ffs(x);
+// #else
+//     // TODO: use _BitScanReverse in windows
+//     return __builtin_ffs(x);
+// #endif
+// }
+// DEVICE
+// inline int popc(uint8_t x) {
+// #ifdef __CUDA_ARCH__
+//     return __popc(x);
+// #else
+//     // TODO: use _popcnt in windows
+//     return __builtin_popcount(x);
+// #endif
+// }

DiffVG/edge_query.h ADDED Viewed

	@@ -0,0 +1,7 @@

+#pragma once
+// Identifies one shape inside one shape group and records whether a query
+// hit it. Field order matters: the struct is brace-initialized positionally.
+struct EdgeQuery {
+    int shape_group_id; // shape group being queried
+    int shape_id;       // shape being queried
+    bool hit;           // Do we hit the specified shape_group_id & shape_id?
+};

DiffVG/filter.h ADDED Viewed

	@@ -0,0 +1,106 @@

+#pragma once
+#include "diffvg.h"
+#include "atomic.h"
+// Reconstruction filter shapes understood by compute_filter_weight.
+enum class FilterType {
+    Box,
+    Tent,
+    RadialParabolic, // per axis: 4/3 * (1 - (d/r)^2); the x and y factors are multiplied
+    Hann // https://en.wikipedia.org/wiki/Window_function#Hann_and_Hamming_windows
+};
+// A filter shape together with its radius.
+struct Filter {
+    FilterType type;
+    // compute_filter_weight returns 0 when |dx| or |dy| exceeds this radius.
+    float radius;
+};
+// Gradient storage for a Filter: d_compute_filter_weight atomically
+// accumulates into `radius`.
+struct DFilter {
+    float radius;
+};
+// Evaluates the filter at offset (dx, dy) from its center.
+// Returns 0 when either |dx| or |dy| exceeds filter.radius; otherwise the
+// weight is the formula selected by filter.type.
+DEVICE
+inline
+float compute_filter_weight(const Filter &filter,
+                            float dx,
+                            float dy) {
+    const float r = filter.radius;
+    // Outside the square [-r, r] x [-r, r] the weight is zero.
+    if (fabs(dx) > r || fabs(dy) > r) {
+        return 0;
+    }
+    switch (filter.type) {
+        case FilterType::Box:
+            return 1.f / square(2 * r);
+        case FilterType::Tent:
+            return (r - fabs(dx)) * (r - fabs(dy)) /
+                   square(square(r));
+        case FilterType::RadialParabolic:
+            return (4.f / 3.f) * (1 - square(dx / r)) *
+                   (4.f / 3.f) * (1 - square(dy / r));
+        default: {
+            assert(filter.type == FilterType::Hann);
+            // normalize dx, dy to [0, 1]
+            auto ndx = (dx / (2*r)) + 0.5f;
+            auto ndy = (dy / (2*r)) + 0.5f;
+            // the normalization factor is R^2
+            return 0.5f * (1.f - cos(float(2 * M_PI) * ndx)) *
+                   0.5f * (1.f - cos(float(2 * M_PI) * ndy)) /
+                   square(r);
+        }
+    }
+}
+// Backward pass of compute_filter_weight with respect to the filter radius.
+// d_return is the gradient flowing into the returned weight; this function
+// atomically adds d_return * d(weight)/d(radius) to d_filter->radius.
+// dx and dy are treated as constants.
+DEVICE
+inline
+void d_compute_filter_weight(const Filter &filter,
+                             float dx,
+                             float dy,
+                             float d_return,
+                             DFilter *d_filter) {
+    // compute_filter_weight returns a constant 0 outside the filter support,
+    // so the derivative with respect to the radius is zero there.
+    if (fabs(dx) > filter.radius || fabs(dy) > filter.radius) {
+        return;
+    }
+    if (filter.type == FilterType::Box) {
+        // w = 1 / (2r)^2  =>  dw/dr = -2 * 2 / (2r)^3
+        atomic_add(d_filter->radius,
+            d_return * (-2) * 2 / cubic(2 * filter.radius));
+    } else if (filter.type == FilterType::Tent) {
+        // w = fx * fy * norm with fx = r - |dx|, fy = r - |dy|, norm = 1 / r^4
+        auto fx = filter.radius - fabs(dx);
+        auto fy = filter.radius - fabs(dy);
+        // Must match the forward normalization 1 / square(square(r)).
+        auto norm = 1 / square(square(filter.radius));
+        auto d_fx = d_return * fy * norm;
+        auto d_fy = d_return * fx * norm;
+        auto d_norm = d_return * fx * fy;
+        // dfx/dr = dfy/dr = 1 and dnorm/dr = -4 / r^5
+        atomic_add(d_filter->radius,
+            d_fx + d_fy + (-4) * d_norm / pow(filter.radius, 5));
+    } else if (filter.type == FilterType::RadialParabolic) {
+        // w = ax * ay with ax = (4/3) * (1 - (dx/r)^2), ay likewise for dy
+        auto ax = (4.f / 3.f) * (1 - square(dx / filter.radius));
+        auto ay = (4.f / 3.f) * (1 - square(dy / filter.radius));
+        auto r3 = cubic(filter.radius);
+        auto d_ax = d_return * ay;
+        auto d_ay = d_return * ax;
+        // dax/dr = (4/3) * 2 * dx^2 / r^3, and the same for ay with dy
+        atomic_add(d_filter->radius,
+            d_ax * (8.f / 3.f) * square(dx) / r3 +
+            d_ay * (8.f / 3.f) * square(dy) / r3);
+    } else {
+        assert(filter.type == FilterType::Hann);
+        // w = fx * fy * norm with
+        //   ndx = dx / (2r) + 0.5,  fx = 0.5 * (1 - cos(2 pi ndx)),
+        //   ndy, fy likewise, and norm = 1 / r^2
+        // normalize dx, dy to [0, 1]
+        auto ndx = (dx / (2*filter.radius)) + 0.5f;
+        auto ndy = (dy / (2*filter.radius)) + 0.5f;
+        auto fx = 0.5f * (1.f - cos(float(2*M_PI) * ndx));
+        auto fy = 0.5f * (1.f - cos(float(2*M_PI) * ndy));
+        auto norm = 1 / square(filter.radius);
+        auto d_fx = d_return * fy * norm;
+        auto d_fy = d_return * fx * norm;
+        auto d_norm = d_return * fx * fy;
+        auto d_ndx = d_fx * 0.5f * sin(float(2*M_PI) * ndx) * float(2*M_PI);
+        auto d_ndy = d_fy * 0.5f * sin(float(2*M_PI) * ndy) * float(2*M_PI);
+        // dndx/dr = -2 dx / (2r)^2, dndy/dr = -2 dy / (2r)^2, dnorm/dr = -2 / r^3
+        atomic_add(d_filter->radius,
+            d_ndx * (-2*dx / square(2*filter.radius)) +
+            d_ndy * (-2*dy / square(2*filter.radius)) +
+            (-2) * d_norm / cubic(filter.radius));
+    }
+}

DiffVG/matrix.h ADDED Viewed

	@@ -0,0 +1,544 @@

+#pragma once
+#include "diffvg.h"
+#include "vector.h"
+#include <iostream>
+// Row-major 3x3 matrix with elements of type T, stored in data[row][column].
+template <typename T>
+struct TMatrix3x3 {
+    // Zero matrix.
+    DEVICE
+    TMatrix3x3() {
+        for (int row = 0; row < 3; row++) {
+            for (int col = 0; col < 3; col++) {
+                data[row][col] = T(0);
+            }
+        }
+    }
+    // Reads nine consecutive values from arr in row-major order.
+    template <typename T2>
+    DEVICE
+    TMatrix3x3(T2 *arr) {
+        for (int row = 0; row < 3; row++) {
+            for (int col = 0; col < 3; col++) {
+                data[row][col] = arr[3 * row + col];
+            }
+        }
+    }
+    // Element-wise constructor; vRC is the entry at row R, column C.
+    DEVICE
+    TMatrix3x3(T v00, T v01, T v02,
+               T v10, T v11, T v12,
+               T v20, T v21, T v22) {
+        data[0][0] = v00; data[0][1] = v01; data[0][2] = v02;
+        data[1][0] = v10; data[1][1] = v11; data[1][2] = v12;
+        data[2][0] = v20; data[2][1] = v21; data[2][2] = v22;
+    }
+    // Read-only access to the entry at row i, column j (no bounds check).
+    DEVICE
+    const T& operator()(int i, int j) const {
+        return data[i][j];
+    }
+    // Mutable access to the entry at row i, column j (no bounds check).
+    DEVICE
+    T& operator()(int i, int j) {
+        return data[i][j];
+    }
+    // Identity matrix.
+    DEVICE
+    static TMatrix3x3<T> identity() {
+        return TMatrix3x3<T>(1, 0, 0,
+                             0, 1, 0,
+                             0, 0, 1);
+    }
+    T data[3][3];
+};
+// Convenience aliases: Matrix3x3 follows the Real typedef, Matrix3x3f is always float.
+using Matrix3x3 = TMatrix3x3<Real>;
+using Matrix3x3f = TMatrix3x3<float>;
+// Row-major 4x4 matrix with elements of type T, stored in data[row][column].
+template <typename T>
+struct TMatrix4x4 {
+    // Zero matrix.
+    DEVICE TMatrix4x4() {
+        for (int row = 0; row < 4; row++) {
+            for (int col = 0; col < 4; col++) {
+                data[row][col] = T(0);
+            }
+        }
+    }
+    // Reads sixteen consecutive values from arr in row-major order,
+    // casting each to T.
+    template <typename T2>
+    DEVICE TMatrix4x4(const T2 *arr) {
+        for (int k = 0; k < 16; k++) {
+            data[k / 4][k % 4] = (T)arr[k];
+        }
+    }
+    // Converting copy from a matrix with a different element type.
+    template <typename T2>
+    DEVICE TMatrix4x4(const TMatrix4x4<T2> &m) {
+        for (int row = 0; row < 4; row++) {
+            for (int col = 0; col < 4; col++) {
+                data[row][col] = T(m.data[row][col]);
+            }
+        }
+    }
+    // Element-wise constructor; vRC is the entry at row R, column C.
+    template <typename T2>
+    DEVICE TMatrix4x4(T2 v00, T2 v01, T2 v02, T2 v03,
+                      T2 v10, T2 v11, T2 v12, T2 v13,
+                      T2 v20, T2 v21, T2 v22, T2 v23,
+                      T2 v30, T2 v31, T2 v32, T2 v33) {
+        data[0][0] = (T)v00; data[0][1] = (T)v01; data[0][2] = (T)v02; data[0][3] = (T)v03;
+        data[1][0] = (T)v10; data[1][1] = (T)v11; data[1][2] = (T)v12; data[1][3] = (T)v13;
+        data[2][0] = (T)v20; data[2][1] = (T)v21; data[2][2] = (T)v22; data[2][3] = (T)v23;
+        data[3][0] = (T)v30; data[3][1] = (T)v31; data[3][2] = (T)v32; data[3][3] = (T)v33;
+    }
+    // Read-only access to the entry at row i, column j (no bounds check).
+    DEVICE
+    const T& operator()(int i, int j) const {
+        return data[i][j];
+    }
+    // Mutable access to the entry at row i, column j (no bounds check).
+    DEVICE
+    T& operator()(int i, int j) {
+        return data[i][j];
+    }
+    // Identity matrix.
+    DEVICE
+    static TMatrix4x4<T> identity() {
+        return TMatrix4x4<T>(1, 0, 0, 0,
+                             0, 1, 0, 0,
+                             0, 0, 1, 0,
+                             0, 0, 0, 1);
+    }
+    T data[4][4];
+};
+// Convenience aliases: Matrix4x4 follows the Real typedef, Matrix4x4f is always float.
+using Matrix4x4 = TMatrix4x4<Real>;
+using Matrix4x4f = TMatrix4x4<float>;
+// Element-wise sum of two 3x3 matrices; the element type of the result is
+// the type of T0 + T1.
+template <typename T0, typename T1>
+DEVICE
+inline auto operator+(const TMatrix3x3<T0> &m0, const TMatrix3x3<T1> &m1) -> TMatrix3x3<decltype(m0(0, 0) + m1(0, 0))> {
+    TMatrix3x3<decltype(m0(0, 0) + m1(0, 0))> sum;
+    for (int row = 0; row < 3; row++) {
+        for (int col = 0; col < 3; col++) {
+            sum(row, col) = m0(row, col) + m1(row, col);
+        }
+    }
+    return sum;
+}
+// Element-wise difference of two 3x3 matrices; the element type of the
+// result is the type of T0 - T1.
+template <typename T0, typename T1>
+DEVICE
+inline auto operator-(const TMatrix3x3<T0> &m0, const TMatrix3x3<T1> &m1) -> TMatrix3x3<decltype(m0(0, 0) - m1(0, 0))> {
+    TMatrix3x3<decltype(m0(0, 0) - m1(0, 0))> diff;
+    for (int row = 0; row < 3; row++) {
+        for (int col = 0; col < 3; col++) {
+            diff(row, col) = m0(row, col) - m1(row, col);
+        }
+    }
+    return diff;
+}
+// Matrix product m0 * m1 of two 3x3 matrices.
+template <typename T>
+DEVICE
+inline auto operator*(const TMatrix3x3<T> &m0, const TMatrix3x3<T> &m1) -> TMatrix3x3<T> {
+    TMatrix3x3<T> product;
+    for (int row = 0; row < 3; row++) {
+        for (int col = 0; col < 3; col++) {
+            // Dot product of row `row` of m0 with column `col` of m1.
+            product(row, col) = T(0);
+            for (int inner = 0; inner < 3; inner++) {
+                product(row, col) += m0(row, inner) * m1(inner, col);
+            }
+        }
+    }
+    return product;
+}
+// Row-vector times matrix: result[i] = sum over j of v[j] * m(j, i).
+template <typename T>
+DEVICE
+inline auto operator*(const TVector3<T> &v, const TMatrix3x3<T> &m) -> TVector3<T> {
+    TVector3<T> out;
+    for (int col = 0; col < 3; col++) {
+        out[col] = T(0);
+        for (int row = 0; row < 3; row++) {
+            out[col] += v[row] * m(row, col);
+        }
+    }
+    return out;
+}
+// Matrix times column-vector: result[i] = sum over j of m(i, j) * v[j].
+template <typename T>
+DEVICE
+inline auto operator*(const TMatrix3x3<T> &m, const TVector3<T> &v) -> TVector3<T> {
+    TVector3<T> out;
+    for (int row = 0; row < 3; row++) {
+        out[row] = 0.f;
+        for (int col = 0; col < 3; col++) {
+            out[row] += m(row, col) * v[col];
+        }
+    }
+    return out;
+}
+// Inverse of a 3x3 matrix via the cofactor (adjugate) formula: every entry
+// is a 2x2 cofactor of m multiplied by 1 / det(m).
+// The determinant is not checked; a singular m leads to a division by zero.
+template <typename T>
+DEVICE
+inline auto inverse(const TMatrix3x3<T> &m) -> TMatrix3x3<T> {
+    // computes the inverse of a matrix m
+    // Determinant by cofactor expansion along the first row.
+    auto det = m(0, 0) * (m(1, 1) * m(2, 2) - m(2, 1) * m(1, 2)) -
+               m(0, 1) * (m(1, 0) * m(2, 2) - m(1, 2) * m(2, 0)) +
+               m(0, 2) * (m(1, 0) * m(2, 1) - m(1, 1) * m(2, 0));
+    auto invdet = 1 / det;
+    auto m_inv = TMatrix3x3<T>{};
+    m_inv(0, 0) = (m(1, 1) * m(2, 2) - m(2, 1) * m(1, 2)) * invdet;
+    m_inv(0, 1) = (m(0, 2) * m(2, 1) - m(0, 1) * m(2, 2)) * invdet;
+    m_inv(0, 2) = (m(0, 1) * m(1, 2) - m(0, 2) * m(1, 1)) * invdet;
+    m_inv(1, 0) = (m(1, 2) * m(2, 0) - m(1, 0) * m(2, 2)) * invdet;
+    m_inv(1, 1) = (m(0, 0) * m(2, 2) - m(0, 2) * m(2, 0)) * invdet;
+    m_inv(1, 2) = (m(1, 0) * m(0, 2) - m(0, 0) * m(1, 2)) * invdet;
+    m_inv(2, 0) = (m(1, 0) * m(2, 1) - m(2, 0) * m(1, 1)) * invdet;
+    m_inv(2, 1) = (m(2, 0) * m(0, 1) - m(0, 0) * m(2, 1)) * invdet;
+    m_inv(2, 2) = (m(0, 0) * m(1, 1) - m(1, 0) * m(0, 1)) * invdet;
+    return m_inv;
+}
+// Element-wise sum of two 4x4 matrices; the element type of the result is
+// the type of T0 + T1.
+template <typename T0, typename T1>
+DEVICE
+inline auto operator+(const TMatrix4x4<T0> &m0, const TMatrix4x4<T1> &m1) -> TMatrix4x4<decltype(m0(0, 0) + m1(0, 0))> {
+    TMatrix4x4<decltype(m0(0, 0) + m1(0, 0))> sum;
+    for (int row = 0; row < 4; row++) {
+        for (int col = 0; col < 4; col++) {
+            sum(row, col) = m0(row, col) + m1(row, col);
+        }
+    }
+    return sum;
+}
+// Transpose of a 3x3 matrix: result(i, j) == m(j, i).
+template <typename T>
+DEVICE
+TMatrix3x3<T> transpose(const TMatrix3x3<T> &m) {
+    TMatrix3x3<T> flipped;
+    for (int row = 0; row < 3; row++) {
+        for (int col = 0; col < 3; col++) {
+            flipped(row, col) = m(col, row);
+        }
+    }
+    return flipped;
+}
+// Transpose of a 4x4 matrix: result(i, j) == m(j, i).
+template <typename T>
+DEVICE
+TMatrix4x4<T> transpose(const TMatrix4x4<T> &m) {
+    TMatrix4x4<T> flipped;
+    for (int row = 0; row < 4; row++) {
+        for (int col = 0; col < 4; col++) {
+            flipped(row, col) = m(col, row);
+        }
+    }
+    return flipped;
+}
+// Unary minus: negates every entry of a 3x3 matrix.
+template <typename T>
+DEVICE
+inline TMatrix3x3<T> operator-(const TMatrix3x3<T> &m0) {
+    TMatrix3x3<T> negated;
+    for (int row = 0; row < 3; row++) {
+        for (int col = 0; col < 3; col++) {
+            negated(row, col) = -m0(row, col);
+        }
+    }
+    return negated;
+}
+// Unary minus: negates every entry of a 4x4 matrix.
+template <typename T>
+DEVICE
+inline TMatrix4x4<T> operator-(const TMatrix4x4<T> &m0) {
+    TMatrix4x4<T> negated;
+    for (int row = 0; row < 4; row++) {
+        for (int col = 0; col < 4; col++) {
+            negated(row, col) = -m0(row, col);
+        }
+    }
+    return negated;
+}
+// Element-wise difference m0 - m1 of two 4x4 matrices.
+template <typename T>
+DEVICE
+inline TMatrix4x4<T> operator-(const TMatrix4x4<T> &m0, const TMatrix4x4<T> &m1) {
+    TMatrix4x4<T> diff;
+    for (int row = 0; row < 4; row++) {
+        for (int col = 0; col < 4; col++) {
+            diff(row, col) = m0(row, col) - m1(row, col);
+        }
+    }
+    return diff;
+}
+// In-place element-wise addition of m1 into m0; returns m0.
+template <typename T>
+DEVICE
+inline TMatrix3x3<T>& operator+=(TMatrix3x3<T> &m0, const TMatrix3x3<T> &m1) {
+    for (int row = 0; row < 3; row++) {
+        for (int col = 0; col < 3; col++) {
+            m0(row, col) += m1(row, col);
+        }
+    }
+    return m0;
+}
+// In-place element-wise addition of m1 into m0; returns m0.
+template <typename T>
+DEVICE
+inline TMatrix4x4<T>& operator+=(TMatrix4x4<T> &m0, const TMatrix4x4<T> &m1) {
+    for (int row = 0; row < 4; row++) {
+        for (int col = 0; col < 4; col++) {
+            m0(row, col) += m1(row, col);
+        }
+    }
+    return m0;
+}
+// In-place element-wise subtraction of m1 from m0; returns m0.
+template <typename T>
+DEVICE
+inline TMatrix4x4<T>& operator-=(TMatrix4x4<T> &m0, const TMatrix4x4<T> &m1) {
+    for (int row = 0; row < 4; row++) {
+        for (int col = 0; col < 4; col++) {
+            m0(row, col) -= m1(row, col);
+        }
+    }
+    return m0;
+}
+// Matrix product m0 * m1 of two 4x4 matrices.
+template <typename T>
+DEVICE
+inline TMatrix4x4<T> operator*(const TMatrix4x4<T> &m0, const TMatrix4x4<T> &m1) {
+    // The default constructor zero-fills the matrix, so the entries can be
+    // accumulated into directly.
+    TMatrix4x4<T> product;
+    for (int row = 0; row < 4; row++) {
+        for (int col = 0; col < 4; col++) {
+            for (int inner = 0; inner < 4; inner++) {
+                product(row, col) += m0(row, inner) * m1(inner, col);
+            }
+        }
+    }
+    return product;
+}
+// Inverse of a 4x4 matrix via explicit cofactor expressions (see the linked
+// Stack Overflow answer).
+//
+// Each inv(i, j) is first filled with a signed sum of triple products of
+// entries of `m`; the determinant is then formed from the first row of `m`
+// and the first column of those values, and every entry is scaled by 1 / det.
+//
+// Returns a default-constructed TMatrix4x4<T> when the determinant compares
+// exactly equal to zero (no tolerance is applied, so nearly singular inputs
+// are still inverted).
+template <typename T>
+DEVICE
+TMatrix4x4<T> inverse(const TMatrix4x4<T> &m) {
+    // https://stackoverflow.com/questions/1148309/inverting-a-4x4-matrix
+    TMatrix4x4<T> inv;
+    // First column of the unscaled inverse; reused below for the determinant.
+    inv(0, 0) = m(1, 1) * m(2, 2) * m(3, 3) -
+                m(1, 1) * m(2, 3) * m(3, 2) -
+                m(2, 1) * m(1, 2) * m(3, 3) +
+                m(2, 1) * m(1, 3) * m(3, 2) +
+                m(3, 1) * m(1, 2) * m(2, 3) -
+                m(3, 1) * m(1, 3) * m(2, 2);
+    inv(1, 0) = -m(1, 0) * m(2, 2) * m(3, 3) +
+                 m(1, 0) * m(2, 3) * m(3, 2) +
+                 m(2, 0) * m(1, 2) * m(3, 3) -
+                 m(2, 0) * m(1, 3) * m(3, 2) -
+                 m(3, 0) * m(1, 2) * m(2, 3) +
+                 m(3, 0) * m(1, 3) * m(2, 2);
+    inv(2, 0) = m(1, 0) * m(2, 1) * m(3, 3) -
+                m(1, 0) * m(2, 3) * m(3, 1) -
+                m(2, 0) * m(1, 1) * m(3, 3) +
+                m(2, 0) * m(1, 3) * m(3, 1) +
+                m(3, 0) * m(1, 1) * m(2, 3) -
+                m(3, 0) * m(1, 3) * m(2, 1);
+    inv(3, 0) = -m(1, 0) * m(2, 1) * m(3, 2) +
+                 m(1, 0) * m(2, 2) * m(3, 1) +
+                 m(2, 0) * m(1, 1) * m(3, 2) -
+                 m(2, 0) * m(1, 2) * m(3, 1) -
+                 m(3, 0) * m(1, 1) * m(2, 2) +
+                 m(3, 0) * m(1, 2) * m(2, 1);
+    // Second column.
+    inv(0, 1) = -m(0, 1) * m(2, 2) * m(3, 3) +
+                 m(0, 1) * m(2, 3) * m(3, 2) +
+                 m(2, 1) * m(0, 2) * m(3, 3) -
+                 m(2, 1) * m(0, 3) * m(3, 2) -
+                 m(3, 1) * m(0, 2) * m(2, 3) +
+                 m(3, 1) * m(0, 3) * m(2, 2);
+    inv(1, 1) = m(0, 0) * m(2, 2) * m(3, 3) -
+                m(0, 0) * m(2, 3) * m(3, 2) -
+                m(2, 0) * m(0, 2) * m(3, 3) +
+                m(2, 0) * m(0, 3) * m(3, 2) +
+                m(3, 0) * m(0, 2) * m(2, 3) -
+                m(3, 0) * m(0, 3) * m(2, 2);
+    inv(2, 1) = -m(0, 0) * m(2, 1) * m(3, 3) +
+                 m(0, 0) * m(2, 3) * m(3, 1) +
+                 m(2, 0) * m(0, 1) * m(3, 3) -
+                 m(2, 0) * m(0, 3) * m(3, 1) -
+                 m(3, 0) * m(0, 1) * m(2, 3) +
+                 m(3, 0) * m(0, 3) * m(2, 1);
+    inv(3, 1) = m(0, 0) * m(2, 1) * m(3, 2) -
+                m(0, 0) * m(2, 2) * m(3, 1) -
+                m(2, 0) * m(0, 1) * m(3, 2) +
+                m(2, 0) * m(0, 2) * m(3, 1) +
+                m(3, 0) * m(0, 1) * m(2, 2) -
+                m(3, 0) * m(0, 2) * m(2, 1);
+    // Third column.
+    inv(0, 2) = m(0, 1) * m(1, 2) * m(3, 3) -
+                m(0, 1) * m(1, 3) * m(3, 2) -
+                m(1, 1) * m(0, 2) * m(3, 3) +
+                m(1, 1) * m(0, 3) * m(3, 2) +
+                m(3, 1) * m(0, 2) * m(1, 3) -
+                m(3, 1) * m(0, 3) * m(1, 2);
+    inv(1, 2) = -m(0, 0) * m(1, 2) * m(3, 3) +
+                 m(0, 0) * m(1, 3) * m(3, 2) +
+                 m(1, 0) * m(0, 2) * m(3, 3) -
+                 m(1, 0) * m(0, 3) * m(3, 2) -
+                 m(3, 0) * m(0, 2) * m(1, 3) +
+                 m(3, 0) * m(0, 3) * m(1, 2);
+    inv(2, 2) = m(0, 0) * m(1, 1) * m(3, 3) -
+                m(0, 0) * m(1, 3) * m(3, 1) -
+                m(1, 0) * m(0, 1) * m(3, 3) +
+                m(1, 0) * m(0, 3) * m(3, 1) +
+                m(3, 0) * m(0, 1) * m(1, 3) -
+                m(3, 0) * m(0, 3) * m(1, 1);
+    inv(3, 2) = -m(0, 0) * m(1, 1) * m(3, 2) +
+                 m(0, 0) * m(1, 2) * m(3, 1) +
+                 m(1, 0) * m(0, 1) * m(3, 2) -
+                 m(1, 0) * m(0, 2) * m(3, 1) -
+                 m(3, 0) * m(0, 1) * m(1, 2) +
+                 m(3, 0) * m(0, 2) * m(1, 1);
+    // Fourth column.
+    inv(0, 3) = -m(0, 1) * m(1, 2) * m(2, 3) +
+                 m(0, 1) * m(1, 3) * m(2, 2) +
+                 m(1, 1) * m(0, 2) * m(2, 3) -
+                 m(1, 1) * m(0, 3) * m(2, 2) -
+                 m(2, 1) * m(0, 2) * m(1, 3) +
+                 m(2, 1) * m(0, 3) * m(1, 2);
+    inv(1, 3) = m(0, 0) * m(1, 2) * m(2, 3) -
+                m(0, 0) * m(1, 3) * m(2, 2) -
+                m(1, 0) * m(0, 2) * m(2, 3) +
+                m(1, 0) * m(0, 3) * m(2, 2) +
+                m(2, 0) * m(0, 2) * m(1, 3) -
+                m(2, 0) * m(0, 3) * m(1, 2);
+    inv(2, 3) = -m(0, 0) * m(1, 1) * m(2, 3) +
+                 m(0, 0) * m(1, 3) * m(2, 1) +
+                 m(1, 0) * m(0, 1) * m(2, 3) -
+                 m(1, 0) * m(0, 3) * m(2, 1) -
+                 m(2, 0) * m(0, 1) * m(1, 3) +
+                 m(2, 0) * m(0, 3) * m(1, 1);
+    inv(3, 3) = m(0, 0) * m(1, 1) * m(2, 2) -
+                m(0, 0) * m(1, 2) * m(2, 1) -
+                m(1, 0) * m(0, 1) * m(2, 2) +
+                m(1, 0) * m(0, 2) * m(2, 1) +
+                m(2, 0) * m(0, 1) * m(1, 2) -
+                m(2, 0) * m(0, 2) * m(1, 1);
+    // Determinant: first row of m dotted with the first column computed above.
+    auto det = m(0, 0) * inv(0, 0) +
+               m(0, 1) * inv(1, 0) +
+               m(0, 2) * inv(2, 0) +
+               m(0, 3) * inv(3, 0);
+    // Singular input: hand back a default-constructed matrix instead of dividing by zero.
+    if (det == 0) {
+        return TMatrix4x4<T>{};
+    }
+    // NOTE(review): the literal 1.0 is a double, so for T = float the scale
+    // factor (and the multiply below) is evaluated in double precision.
+    auto inv_det = 1.0 / det;
+    for (int i = 0; i < 4; i++) {
+        for (int j = 0; j < 4; j++) {
+            inv(i, j) *= inv_det;
+        }
+    }
+    return inv;
+}
+// Streams a 3x3 matrix as three lines; every entry is followed by a single
+// space and every row is terminated with std::endl (which flushes).
+template <typename T>
+inline std::ostream& operator<<(std::ostream &os, const TMatrix3x3<T> &m) {
+    for (int row = 0; row < 3; ++row) {
+        os << m(row, 0) << " " << m(row, 1) << " " << m(row, 2) << " ";
+        os << std::endl;
+    }
+    return os;
+}
+// Streams a 4x4 matrix as four lines; every entry is followed by a single
+// space and every row is terminated with std::endl (which flushes).
+template <typename T>
+inline std::ostream& operator<<(std::ostream &os, const TMatrix4x4<T> &m) {
+    for (int cell = 0; cell < 16; ++cell) {
+        const int row = cell / 4;
+        const int col = cell % 4;
+        os << m(row, col) << " ";
+        // Last column of the row: finish the line.
+        if (col == 3) {
+            os << std::endl;
+        }
+    }
+    return os;
+}
+// Applies the 3x3 transform `m` to the 2D point `pt` in homogeneous form:
+// (pt[0], pt[1], 1) is multiplied by `m` and the first two components of the
+// product are divided by the third. No check is made for a zero divisor.
+template <typename T>
+DEVICE
+TVector2<T> xform_pt(const TMatrix3x3<T> &m, const TVector2<T> &pt) {
+    const T hx = m(0, 0) * pt[0] + m(0, 1) * pt[1] + m(0, 2);
+    const T hy = m(1, 0) * pt[0] + m(1, 1) * pt[1] + m(1, 2);
+    const T hw = m(2, 0) * pt[0] + m(2, 1) * pt[1] + m(2, 2);
+    return TVector2<T>{hx / hw, hy / hw};
+}
+// Reverse-mode derivative of xform_pt.
+//
+// Recomputes the homogeneous product (hx, hy, hw) and the projected point,
+// then, given `d_out` (the gradient with respect to the projected point),
+// ACCUMULATES into `d_m` and `d_pt` (both are added to, never overwritten).
+template <typename T>
+DEVICE
+void d_xform_pt(const TMatrix3x3<T> &m, const TVector2<T> &pt,
+                const TVector2<T> &d_out,
+                TMatrix3x3<T> &d_m,
+                TVector2<T> &d_pt) {
+    // Forward pass, identical to xform_pt.
+    const T hx = m(0, 0) * pt[0] + m(0, 1) * pt[1] + m(0, 2);
+    const T hy = m(1, 0) * pt[0] + m(1, 1) * pt[1] + m(1, 2);
+    const T hw = m(2, 0) * pt[0] + m(2, 1) * pt[1] + m(2, 2);
+    const T out_x = hx / hw;
+    const T out_y = hy / hw;
+    // Gradient with respect to the homogeneous product (quotient rule for the
+    // division by hw).
+    const T d_h[3] = {d_out[0] / hw,
+                      d_out[1] / hw,
+                      -(d_out[0] * out_x + d_out[1] * out_y) / hw};
+    // Row r of m only feeds component r of the product, with inputs
+    // (pt[0], pt[1], 1).
+    for (int row = 0; row < 3; ++row) {
+        d_m(row, 0) += d_h[row] * pt[0];
+        d_m(row, 1) += d_h[row] * pt[1];
+        d_m(row, 2) += d_h[row];
+    }
+    // Column c of m carries pt[c] into every component of the product.
+    d_pt[0] += d_h[0] * m(0, 0) + d_h[1] * m(1, 0) + d_h[2] * m(2, 0);
+    d_pt[1] += d_h[0] * m(0, 1) + d_h[1] * m(1, 1) + d_h[2] * m(2, 1);
+}
+// Transforms the 2D normal `n` by the transpose of the upper-left 2x2 block
+// of `m_inv` and normalizes the result. The third row and column of `m_inv`
+// are not read.
+template <typename T>
+DEVICE
+TVector2<T> xform_normal(const TMatrix3x3<T> &m_inv, const TVector2<T> &n) {
+    const T nx = m_inv(0, 0) * n[0] + m_inv(1, 0) * n[1];
+    const T ny = m_inv(0, 1) * n[0] + m_inv(1, 1) * n[1];
+    return normalize(TVector2<T>{nx, ny});
+}

DiffVG/painterly_rendering.py ADDED Viewed

	@@ -0,0 +1,223 @@

+"""
+Scream: python painterly_rendering.py imgs/scream.jpg --num_paths 2048 --max_width 4.0
+Fallingwater: python painterly_rendering.py imgs/fallingwater.jpg --num_paths 2048 --max_width 4.0
+Fallingwater: python painterly_rendering.py imgs/fallingwater.jpg --num_paths 2048 --max_width 4.0 --use_lpips_loss
+Baboon: python painterly_rendering.py imgs/baboon.png --num_paths 1024 --max_width 4.0 --num_iter 250
+Baboon Lpips: python painterly_rendering.py imgs/baboon.png --num_paths 1024 --max_width 4.0 --num_iter 500 --use_lpips_loss
+smile: python painterly_rendering.py ../LIVE/figures/smile.png --num_paths 5 --use_blob --num_iter 500
+"""
+import pydiffvg
+import torch
+import skimage
+import skimage.io
+import random
+import ttools.modules
+import argparse
+import math
+# Turn on pydiffvg's timing output (presumably printed per render/backward
+# call -- confirm in pydiffvg).
+pydiffvg.set_print_timing(True)
+# Exponent applied to the target image (target.pow(gamma)) and forwarded to
+# pydiffvg.imwrite; 1.0 leaves the target values unchanged.
+gamma = 1.0
+def main(args):
+    # Use GPU if available
+    pydiffvg.set_use_gpu(torch.cuda.is_available())
+    perception_loss = ttools.modules.LPIPS().to(pydiffvg.get_device())
+    #target = torch.from_numpy(skimage.io.imread('imgs/lena.png')).to(torch.float32) / 255.0
+    target = torch.from_numpy(skimage.io.imread(args.target)).to(torch.float32) / 255.0
+    target = target.pow(gamma)
+    target = target.to(pydiffvg.get_device())
+    target = target.unsqueeze(0)
+    target = target.permute(0, 3, 1, 2) # NHWC -> NCHW
+    #target = torch.nn.functional.interpolate(target, size = [256, 256], mode = 'area')
+    canvas_width, canvas_height = target.shape[3], target.shape[2]
+    num_paths = args.num_paths
+    max_width = args.max_width
+    random.seed(1234)
+    torch.manual_seed(1234)
+    shapes = []
+    shape_groups = []
+    if args.use_blob:
+        for i in range(num_paths):
+            num_segments = random.randint(3, 5)
+            num_control_points = torch.zeros(num_segments, dtype = torch.int32) + 2
+            points = []
+            p0 = (random.random(), random.random())
+            points.append(p0)
+            for j in range(num_segments):
+                radius = 0.05
+                p1 = (p0[0] + radius * (random.random() - 0.5), p0[1] + radius * (random.random() - 0.5))
+                p2 = (p1[0] + radius * (random.random() - 0.5), p1[1] + radius * (random.random() - 0.5))
+                p3 = (p2[0] + radius * (random.random() - 0.5), p2[1] + radius * (random.random() - 0.5))
+                points.append(p1)
+                points.append(p2)
+                if j < num_segments - 1:
+                    points.append(p3)
+                    p0 = p3
+            points = torch.tensor(points)
+            points[:, 0] *= canvas_width
+            points[:, 1] *= canvas_height
+            path = pydiffvg.Path(num_control_points = num_control_points,
+                                 points = points,
+                                 stroke_width = torch.tensor(1.0),
+                                 is_closed = True)
+            shapes.append(path)
+            path_group = pydiffvg.ShapeGroup(shape_ids = torch.tensor([len(shapes) - 1]),
+                                             fill_color = torch.tensor([random.random(),
+                                                                        random.random(),
+                                                                        random.random(),
+                                                                        random.random()]))
+            shape_groups.append(path_group)
+    else:
+        for i in range(num_paths):
+            num_segments = random.randint(1, 3)
+            num_control_points = torch.zeros(num_segments, dtype = torch.int32) + 2
+            points = []
+            p0 = (random.random(), random.random())
+            points.append(p0)
+            for j in range(num_segments):
+                radius = 0.05
+                p1 = (p0[0] + radius * (random.random() - 0.5), p0[1] + radius * (random.random() - 0.5))
+                p2 = (p1[0] + radius * (random.random() - 0.5), p1[1] + radius * (random.random() - 0.5))
+                p3 = (p2[0] + radius * (random.random() - 0.5), p2[1] + radius * (random.random() - 0.5))
+                points.append(p1)
+                points.append(p2)
+                points.append(p3)
+                p0 = p3
+            points = torch.tensor(points)
+            points[:, 0] *= canvas_width
+            points[:, 1] *= canvas_height
+            #points = torch.rand(3 * num_segments + 1, 2) * min(canvas_width, canvas_height)
+            path = pydiffvg.Path(num_control_points = num_control_points,
+                                 points = points,
+                                 stroke_width = torch.tensor(1.0),
+                                 is_closed = False)
+            shapes.append(path)
+            path_group = pydiffvg.ShapeGroup(shape_ids = torch.tensor([len(shapes) - 1]),
+                                             fill_color = None,
+                                             stroke_color = torch.tensor([random.random(),
+                                                                          random.random(),
+                                                                          random.random(),
+                                                                          random.random()]))
+            shape_groups.append(path_group)
+    scene_args = pydiffvg.RenderFunction.serialize_scene(\
+        canvas_width, canvas_height, shapes, shape_groups)
+    render = pydiffvg.RenderFunction.apply
+    img = render(canvas_width, # width
+                 canvas_height, # height
+                 2,   # num_samples_x
+                 2,   # num_samples_y
+                 0,   # seed
+                 None,
+                 *scene_args)
+    pydiffvg.imwrite(img.cpu(), 'results/painterly_rendering/init.png', gamma=gamma)
+    points_vars = []
+    stroke_width_vars = []
+    color_vars = []
+    for path in shapes:
+        path.points.requires_grad = True
+        points_vars.append(path.points)
+    if not args.use_blob:
+        for path in shapes:
+            path.stroke_width.requires_grad = True
+            stroke_width_vars.append(path.stroke_width)
+    if args.use_blob:
+        for group in shape_groups:
+            group.fill_color.requires_grad = True
+            color_vars.append(group.fill_color)
+    else:
+        for group in shape_groups:
+            group.stroke_color.requires_grad = True
+            color_vars.append(group.stroke_color)
+    # Optimize
+    points_optim = torch.optim.Adam(points_vars, lr=1.0)
+    if len(stroke_width_vars) > 0:
+        width_optim = torch.optim.Adam(stroke_width_vars, lr=0.1)
+    color_optim = torch.optim.Adam(color_vars, lr=0.01)
+    # Adam iterations.
+    for t in range(args.num_iter):
+        print('iteration:', t)
+        points_optim.zero_grad()
+        if len(stroke_width_vars) > 0:
+            width_optim.zero_grad()
+        color_optim.zero_grad()
+        # Forward pass: render the image.
+        scene_args = pydiffvg.RenderFunction.serialize_scene(\
+            canvas_width, canvas_height, shapes, shape_groups)
+        img = render(canvas_width, # width
+                     canvas_height, # height
+                     2,   # num_samples_x
+                     2,   # num_samples_y
+                     t,   # seed
+                     None,
+                     *scene_args)
+        # Compose img with white background
+        img = img[:, :, 3:4] * img[:, :, :3] + torch.ones(img.shape[0], img.shape[1], 3, device = pydiffvg.get_device()) * (1 - img[:, :, 3:4])
+        # Save the intermediate render.
+        pydiffvg.imwrite(img.cpu(), 'results/painterly_rendering/iter_{}.png'.format(t), gamma=gamma)
+        img = img[:, :, :3]
+        # Convert img from HWC to NCHW
+        img = img.unsqueeze(0)
+        img = img.permute(0, 3, 1, 2) # NHWC -> NCHW
+        if args.use_lpips_loss:
+            loss = perception_loss(img, target) + (img.mean() - target.mean()).pow(2)
+        else:
+            loss = (img - target).pow(2).mean()
+        print('render loss:', loss.item())
+        # Backpropagate the gradients.
+        loss.backward()
+        # Take a gradient descent step.
+        points_optim.step()
+        if len(stroke_width_vars) > 0:
+            width_optim.step()
+        color_optim.step()
+        if len(stroke_width_vars) > 0:
+            for path in shapes:
+                path.stroke_width.data.clamp_(1.0, max_width)
+        if args.use_blob:
+            for group in shape_groups:
+                group.fill_color.data.clamp_(0.0, 1.0)
+        else:
+            for group in shape_groups:
+                group.stroke_color.data.clamp_(0.0, 1.0)
+        if t % 10 == 0 or t == args.num_iter - 1:
+            pydiffvg.save_svg('results/painterly_rendering/iter_{}.svg'.format(t),
+                              canvas_width, canvas_height, shapes, shape_groups)
+    # Render the final result.
+    img = render(target.shape[1], # width
+                 target.shape[0], # height
+                 2,   # num_samples_x
+                 2,   # num_samples_y
+                 0,   # seed
+                 None,
+                 *scene_args)
+    # Save the intermediate render.
+    pydiffvg.imwrite(img.cpu(), 'results/painterly_rendering/final.png'.format(t), gamma=gamma)
+    # Convert the intermediate renderings to a video.
+    from subprocess import call
+    call(["ffmpeg", "-framerate", "24", "-i",
+        "results/painterly_rendering/iter_%d.png", "-vb", "20M",
+        "results/painterly_rendering/out.mp4"])
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("target", help="target image path")
+    parser.add_argument("--num_paths", type=int, default=512)
+    parser.add_argument("--max_width", type=float, default=2.0)
+    parser.add_argument("--use_lpips_loss", dest='use_lpips_loss', action='store_true')
+    parser.add_argument("--num_iter", type=int, default=500)
+    parser.add_argument("--use_blob", dest='use_blob', action='store_true')
+    args = parser.parse_args()
+    main(args)

DiffVG/parallel.cpp ADDED Viewed

	@@ -0,0 +1,273 @@

+#include "parallel.h"
+#include <list>
+#include <thread>
+#include <condition_variable>
+#include <vector>
+#include <cassert>
+// From https://github.com/mmp/pbrt-v3/blob/master/src/core/parallel.cpp
+// Worker threads created by parallel_init() and joined by parallel_cleanup().
+static std::vector<std::thread> threads;
+// Set to true (under workListMutex) by parallel_cleanup() so the workers
+// leave their processing loop.
+static bool shutdownThreads = false;
+struct ParallelForLoop;
+// Head of the singly-linked list (through ParallelForLoop::next) of loops that
+// still have iterations to hand out; nullptr when there is no pending work.
+static ParallelForLoop *workList = nullptr;
+// Held while reading or writing workList, shutdownThreads and the progress
+// counters (nextIndex, activeWorkers) of an enqueued loop.
+static std::mutex workListMutex;
+// Book-keeping for one in-flight parallel loop. Instances live on the stack of
+// the thread that called parallel_for_host() and are linked into workList
+// while they still have iterations to hand out.
+struct ParallelForLoop {
+    // 1D loop: func1D(i) is invoked for every i in [0, maxIndex), handed out
+    // chunkSize indices at a time.
+    ParallelForLoop(std::function<void(int64_t)> func1D, int64_t maxIndex, int chunkSize)
+        : func1D(std::move(func1D)), maxIndex(maxIndex), chunkSize(chunkSize) {
+    }
+    // 2D loop over count[0] x count[1] cells, handed out one cell at a time.
+    // The total is computed in 64-bit arithmetic so large domains do not
+    // overflow `int` before being stored in maxIndex.
+    ParallelForLoop(const std::function<void(Vector2i)> &f, const Vector2i count)
+        : func2D(f),
+          maxIndex(static_cast<int64_t>(count[0]) * static_cast<int64_t>(count[1])),
+          chunkSize(1),
+          nX(count[0]) {
+    }
+    // Exactly one of these is set, depending on which constructor was used.
+    std::function<void(int64_t)> func1D;
+    std::function<void(Vector2i)> func2D;
+    // One past the last linear index to run.
+    const int64_t maxIndex;
+    // Number of consecutive indices a thread claims per grab.
+    const int chunkSize;
+    // First index that has not been claimed yet.
+    int64_t nextIndex = 0;
+    // Number of threads currently executing a claimed chunk.
+    int activeWorkers = 0;
+    // Next loop in the workList chain.
+    ParallelForLoop *next = nullptr;
+    // Row length used to turn a linear index into (x, y); -1 for 1D loops.
+    int nX = -1;
+    // True once every index has been claimed and every claimed chunk has
+    // finished running.
+    bool Finished() const {
+        return nextIndex >= maxIndex && activeWorkers == 0;
+    }
+};
+// Blocks the caller until `count` threads in total have called Wait().
+// The thread that brings the counter to zero wakes everyone else.
+void Barrier::Wait() {
+    std::unique_lock<std::mutex> guard(mutex);
+    assert(count > 0);
+    count -= 1;
+    if (count != 0) {
+        // Some threads have not arrived yet: release the lock and sleep
+        // until the counter reaches zero.
+        cv.wait(guard, [this] { return count == 0; });
+        return;
+    }
+    // Last thread to arrive: release all the waiters.
+    cv.notify_all();
+}
+// Notified when a loop is enqueued, when a worker observes that a loop has
+// finished, and at shutdown; workers sleep on it while workList is empty.
+static std::condition_variable workListCondition;
+// Body of every worker thread started by parallel_init().
+//
+// tIndex  - value stored in this thread's thread_local ThreadIndex.
+// barrier - start-up barrier shared with parallel_init() and the other workers.
+//
+// After passing the barrier the thread repeatedly claims a chunk of the loop
+// at the head of workList, runs it with workListMutex released, and sleeps on
+// workListCondition when the list is empty. It returns once shutdownThreads
+// is observed to be true.
+static void worker_thread_func(const int tIndex, std::shared_ptr<Barrier> barrier) {
+    ThreadIndex = tIndex;
+    // Rendezvous with parallel_init(), which waits on the same barrier, so
+    // that it only returns after every worker has started running.
+    barrier->Wait();
+    // Release our reference to the Barrier so that it's freed once all of
+    // the threads have cleared it.
+    barrier.reset();
+    // The lock is held for all list/loop book-keeping and dropped only while
+    // user callbacks run.
+    std::unique_lock<std::mutex> lock(workListMutex);
+    while (!shutdownThreads) {
+        if (!workList) {
+            // Sleep until there are more tasks to run. Spurious wake-ups are
+            // harmless because the enclosing loop re-checks both conditions.
+            workListCondition.wait(lock);
+        } else {
+            // Get work from _workList_ and run loop iterations
+            ParallelForLoop &loop = *workList;
+            // Run a chunk of loop iterations for _loop_
+            // Find the set of loop iterations to run next
+            int64_t indexStart = loop.nextIndex;
+            int64_t indexEnd = std::min(indexStart + loop.chunkSize, loop.maxIndex);
+            // Update _loop_ to reflect iterations this thread will run
+            loop.nextIndex = indexEnd;
+            // Every index is now claimed: unlink the loop so other threads
+            // move on to the next one.
+            if (loop.nextIndex == loop.maxIndex)
+                workList = loop.next;
+            loop.activeWorkers++;
+            // Run loop indices in _[indexStart, indexEnd)_
+            lock.unlock();
+            for (int64_t index = indexStart; index < indexEnd; ++index) {
+                if (loop.func1D) {
+                    loop.func1D(index);
+                }
+                // Handle other types of loops
+                else {
+                    assert(loop.func2D != nullptr);
+                    // Linear index -> (x, y) with x varying fastest.
+                    loop.func2D(Vector2i{int(index % loop.nX),
+                                         int(index / loop.nX)});
+                }
+            }
+            lock.lock();
+            // Update _loop_ to reflect completion of iterations
+            loop.activeWorkers--;
+            if (loop.Finished()) {
+                workListCondition.notify_all();
+            }
+        }
+    }
+}
+// Runs func(i) for every i in [0, count) on the host.
+//
+// func      - callback invoked once per index; may run on any worker thread
+//             or on the calling thread.
+// count     - number of indices; nothing happens when count <= 0.
+// chunkSize - number of consecutive indices a thread claims at a time;
+//             values below 1 are treated as 1.
+//
+// When no worker threads exist (parallel_init() has not been called) or the
+// whole range fits in a single chunk, the loop runs serially on the calling
+// thread. Otherwise the loop is enqueued on workList and the calling thread
+// helps execute chunks until every index has completed.
+void parallel_for_host(const std::function<void(int64_t)> &func,
+                       int64_t count,
+                       int chunkSize) {
+    // Nothing to do; also keeps an empty loop from ever being enqueued.
+    if (count <= 0) {
+        return;
+    }
+    // A non-positive chunk size would never advance nextIndex below.
+    chunkSize = std::max(chunkSize, 1);
+    // Run iterations immediately if not using threads or if _count_ is small
+    if (threads.empty() || count < chunkSize) {
+        for (int64_t i = 0; i < count; ++i) {
+            func(i);
+        }
+        return;
+    }
+    // Create and enqueue _ParallelForLoop_ for this loop
+    ParallelForLoop loop(func, count, chunkSize);
+    {
+        std::lock_guard<std::mutex> enqueueLock(workListMutex);
+        loop.next = workList;
+        workList = &loop;
+    }
+    // Notify worker threads of work to be done
+    std::unique_lock<std::mutex> lock(workListMutex);
+    workListCondition.notify_all();
+    // Help out with parallel loop iterations in the current thread
+    while (!loop.Finished()) {
+        // Run a chunk of loop iterations for _loop_
+        // Find the set of loop iterations to run next
+        int64_t indexStart = loop.nextIndex;
+        int64_t indexEnd = std::min(indexStart + loop.chunkSize, loop.maxIndex);
+        // Update _loop_ to reflect iterations this thread will run
+        loop.nextIndex = indexEnd;
+        // Every index is now claimed: unlink the loop from the work list.
+        if (loop.nextIndex == loop.maxIndex) {
+            workList = loop.next;
+        }
+        loop.activeWorkers++;
+        // Run loop indices in _[indexStart, indexEnd)_
+        lock.unlock();
+        for (int64_t index = indexStart; index < indexEnd; ++index) {
+            if (loop.func1D) {
+                loop.func1D(index);
+            }
+            // Handle other types of loops
+            else {
+                assert(loop.func2D != nullptr);
+                loop.func2D(Vector2i{int(index % loop.nX),
+                                     int(index / loop.nX)});
+            }
+        }
+        lock.lock();
+        // Update _loop_ to reflect completion of iterations
+        loop.activeWorkers--;
+    }
+}
+// Per-thread index: parallel_init() sets it to 0 on the calling thread and
+// each worker stores the index it was started with.
+thread_local int ThreadIndex;
+// Runs func(Vector2i{x, y}) for every cell of a count.x by count.y grid on
+// the host.
+//
+// With no worker threads, or with at most one cell, the grid is traversed
+// serially on the calling thread (x varying fastest). Otherwise the loop is
+// enqueued on workList with a chunk size of one cell and the calling thread
+// helps execute cells until all of them have completed.
+void parallel_for_host(
+        std::function<void(Vector2i)> func, const Vector2i count) {
+    // Run serially when there are no worker threads or the grid is trivial.
+    if (threads.empty() || count.x * count.y <= 1) {
+        for (int y = 0; y < count.y; ++y) {
+            for (int x = 0; x < count.x; ++x) {
+                func(Vector2i{x, y});
+            }
+        }
+        return;
+    }
+    // NOTE(review): the 2D ParallelForLoop constructor takes a const
+    // reference, so this std::move still results in a copy.
+    ParallelForLoop loop(std::move(func), count);
+    {
+        // Push the loop at the head of the work list.
+        std::lock_guard<std::mutex> lock(workListMutex);
+        loop.next = workList;
+        workList = &loop;
+    }
+    // Wake the workers; the lock stays held for the book-keeping below.
+    std::unique_lock<std::mutex> lock(workListMutex);
+    workListCondition.notify_all();
+    // Help out with parallel loop iterations in the current thread
+    while (!loop.Finished()) {
+        // Run a chunk of loop iterations for _loop_
+        // Find the set of loop iterations to run next
+        int64_t indexStart = loop.nextIndex;
+        int64_t indexEnd = std::min(indexStart + loop.chunkSize, loop.maxIndex);
+        // Update _loop_ to reflect iterations this thread will run
+        loop.nextIndex = indexEnd;
+        // Every cell is now claimed: unlink the loop from the work list.
+        if (loop.nextIndex == loop.maxIndex) {
+            workList = loop.next;
+        }
+        loop.activeWorkers++;
+        // Run loop indices in _[indexStart, indexEnd)_
+        lock.unlock();
+        for (int64_t index = indexStart; index < indexEnd; ++index) {
+            if (loop.func1D) {
+                loop.func1D(index);
+            }
+            // Handle other types of loops
+            else {
+                assert(loop.func2D != nullptr);
+                // Linear index -> (x, y) with x varying fastest.
+                loop.func2D(Vector2i{int(index % loop.nX),
+                                     int(index / loop.nX)});
+            }
+        }
+        lock.lock();
+        // Update _loop_ to reflect completion of iterations
+        loop.activeWorkers--;
+    }
+}
+// Number of hardware threads reported by the standard library, or 16 when
+// the standard library reports 0.
+int num_system_cores() {
+    const int detected = std::thread::hardware_concurrency();
+    return detected == 0 ? 16 : detected;
+}
+// Starts the worker pool. Must only be called while no workers exist.
+// The calling thread becomes thread 0 and num_system_cores() - 1 workers are
+// launched with indices 1, 2, ...; the function returns only after every
+// worker has reached the shared start-up barrier.
+void parallel_init() {
+    assert(threads.size() == 0);
+    const int totalThreads = num_system_cores();
+    ThreadIndex = 0;
+    // The barrier counts the calling thread as well as the workers.
+    auto startBarrier = std::make_shared<Barrier>(totalThreads);
+    // One fewer worker than the total, since the calling thread also runs
+    // loop iterations.
+    for (int workerIndex = 1; workerIndex < totalThreads; ++workerIndex) {
+        threads.emplace_back(worker_thread_func, workerIndex, startBarrier);
+    }
+    startBarrier->Wait();
+}
+// Stops and joins every worker thread, then resets the shutdown flag so that
+// parallel_init() can be called again. Does nothing when no workers exist.
+void parallel_cleanup() {
+    if (!threads.empty()) {
+        {
+            // Raise the flag and wake sleeping workers while holding the lock
+            // that the workers use to check it.
+            std::lock_guard<std::mutex> guard(workListMutex);
+            shutdownThreads = true;
+            workListCondition.notify_all();
+        }
+        for (auto &worker : threads) {
+            worker.join();
+        }
+        threads.clear();
+        shutdownThreads = false;
+    }
+}

DiffVG/parallel.h ADDED Viewed

	@@ -0,0 +1,91 @@

+#pragma once
+#include "vector.h"
+#include <mutex>
+#include <condition_variable>
+#include <functional>
+#include <atomic>
+#include <cstdint>
+#include <cassert>
+#include <algorithm>
+// From https://github.com/mmp/pbrt-v3/blob/master/src/core/parallel.h
+// Single-use rendezvous point for a fixed number of threads: Wait() blocks
+// until it has been called `count` times in total.
+class Barrier {
+  public:
+    // `count` is the number of Wait() calls required and must be positive.
+    Barrier(int count)
+        : count(count) {
+        assert(count > 0);
+    }
+    // Every expected thread must have arrived before the barrier is destroyed.
+    ~Barrier() {
+        assert(count == 0);
+    }
+    void Wait();
+  private:
+    std::mutex mutex;
+    std::condition_variable cv;
+    // Number of Wait() calls still outstanding.
+    int count;
+};
+// Runs func(i) for i in [0, count) on the host thread pool, claiming
+// chunkSize consecutive indices at a time.
+void parallel_for_host(const std::function<void(int64_t)> &func,
+                       int64_t count,
+                       int chunkSize = 1);
+// Index of the current thread within the pool (defined in parallel.cpp).
+extern thread_local int ThreadIndex;
+// Runs func(Vector2i{x, y}) for every cell of a count.x by count.y grid.
+void parallel_for_host(
+    std::function<void(Vector2i)> func, const Vector2i count);
+// Number of hardware threads to use for the pool.
+int num_system_cores();
+// Starts the worker threads used by parallel_for_host.
+void parallel_init();
+// Stops and joins the worker threads started by parallel_init.
+void parallel_cleanup();
+#ifdef __CUDACC__
+// CUDA kernel that calls functor(idx) once for every global thread index
+// below `count`; threads past the end of the range do nothing.
+template <typename T>
+__global__ void parallel_for_device_kernel(T functor, int count) {
+    auto global_idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (global_idx < count) {
+        functor(global_idx);
+    }
+}
+// Launches parallel_for_device_kernel over `count` indices.
+// `work_per_thread` is used as the number of threads per block; the number of
+// blocks is count / work_per_thread rounded up. Does nothing when count <= 0.
+template <typename T>
+inline void parallel_for_device(T functor,
+                                int count,
+                                int work_per_thread = 256) {
+    if (count > 0) {
+        const auto threads_per_block = work_per_thread;
+        const auto num_blocks = idiv_ceil(count, threads_per_block);
+        parallel_for_device_kernel<T><<<num_blocks, threads_per_block>>>(functor, count);
+    }
+}
+#endif
+// Calls functor(idx) for every idx in [0, count), on the GPU or on the host
+// thread pool.
+//
+// functor         - callable taking an int index.
+// count           - number of indices; nothing happens when count <= 0.
+// use_gpu         - run as a CUDA kernel (requires compilation with nvcc;
+//                   otherwise std::runtime_error is thrown).
+// work_per_thread - on the GPU, the number of threads per block; on the host,
+//                   the number of consecutive indices handled per task.
+//                   -1 selects 64 for the GPU and 256 for the host.
+template <typename T>
+inline void parallel_for(T functor,
+                         int count,
+                         bool use_gpu,
+                         int work_per_thread = -1) {
+    if (work_per_thread == -1) {
+        work_per_thread = use_gpu ? 64 : 256;
+    }
+    if (count <= 0) {
+        return;
+    }
+    if (use_gpu) {
+#ifdef __CUDACC__
+        // Same launch configuration as the dedicated device helper.
+        parallel_for_device(functor, count, work_per_thread);
+#else
+        throw std::runtime_error("diffvg not compiled with GPU");
+#endif
+    } else {
+        // Split [0, count) into tasks of work_per_thread consecutive indices.
+        auto num_tasks = idiv_ceil(count, work_per_thread);
+        parallel_for_host([&](int64_t task_index) {
+            // 64-bit arithmetic so that the end of the last task cannot
+            // overflow `int` when count is close to INT_MAX.
+            const int64_t first = static_cast<int64_t>(work_per_thread) * task_index;
+            const int64_t last = std::min(first + work_per_thread,
+                                          static_cast<int64_t>(count));
+            for (int64_t work_id = first; work_id < last; work_id++) {
+                functor(static_cast<int>(work_id));
+            }
+        }, num_tasks);
+    }
+}

DiffVG/pcg.h ADDED Viewed

	@@ -0,0 +1,40 @@

+#pragma once
+#include "diffvg.h"
+// http://www.pcg-random.org/download.html
+// State of one PCG32 generator.
+struct pcg32_state {
+    // Current 64-bit generator state; advanced by every next_pcg32() call.
+    uint64_t state;
+    // Increment added on every step (next_pcg32 ORs it with 1 before use);
+    // init_pcg32 derives it from the stream index.
+    uint64_t inc;
+};
+// Advances `rng` by one step and returns 32 pseudo-random bits derived from
+// the state as it was before the step.
+DEVICE inline uint32_t next_pcg32(pcg32_state *rng) {
+    const uint64_t prev = rng->state;
+    // Linear congruential step; the increment is forced to be odd.
+    rng->state = prev * 6364136223846793005ULL + (rng->inc | 1);
+    // Output permutation: xorshift of the old state, then a rotate right by
+    // the old state's top five bits.
+    const uint32_t mixed = ((prev >> 18u) ^ prev) >> 27u;
+    const uint32_t shift = prev >> 59u;
+    const uint32_t low_part = mixed >> shift;
+    const uint32_t high_part = mixed << ((-shift) & 31);
+    return low_part | high_part;
+}
+// https://github.com/wjakob/pcg32/blob/master/pcg32.h
+// Draws one value from `rng` and converts it to a float in [0, 1): the top 23
+// random bits become the mantissa of a float whose remaining bits are those of
+// 1.0f (0x3f800000 in IEEE-754 single precision), and 1 is subtracted.
+DEVICE inline float next_pcg32_float(pcg32_state *rng) {
+    union {
+        uint32_t u;
+        float f;
+    } bits;
+    const uint32_t mantissa = next_pcg32(rng) >> 9;
+    bits.u = mantissa | 0x3f800000u;
+    return bits.f - 1.0f;
+}
+// Builds a PCG32 generator whose increment is derived from `idx`, so that
+// different indices (e.g. pixels) use different streams, and whose starting
+// state is offset by `seed`.
+DEVICE inline pcg32_state init_pcg32(int idx, uint64_t seed) {
+    pcg32_state rng;
+    rng.state = 0U;
+    // (idx + 1) shifted left by one with the low bit set: always odd.
+    rng.inc = (((uint64_t)idx + 1) << 1u) | 1u;
+    // Step once, mix in the seed, then step again; the order matters.
+    next_pcg32(&rng);
+    rng.state += (0x853c49e6748fea9bULL + seed);
+    next_pcg32(&rng);
+    return rng;
+}

DiffVG/poetry.lock ADDED Viewed

The diff for this file is too large to render. See raw diff

DiffVG/ptr.h ADDED Viewed

	@@ -0,0 +1,23 @@

+#pragma once
+#include <cstddef>
+/**
+ * Python doesn't have a pointer type, therefore we create a pointer wrapper
+ * see https://stackoverflow.com/questions/48982143/returning-and-passing-around-raw-pod-pointers-arrays-with-python-c-and-pyb?rq=1
+ *
+ * The wrapper is non-owning: copying it copies the raw address, and the
+ * pointee is only freed when destroy() is called explicitly. destroy() does
+ * not reset the stored address, so neither this wrapper nor any copy of it
+ * may be dereferenced or destroyed afterwards.
+ */
+template <typename T>
+class ptr {
+public:
+    /// Null pointer.
+    ptr() : p(nullptr) {}
+    /// Wraps an existing raw pointer.
+    ptr(T* p) : p(p) {}
+    /// Reinterprets an integer address as a T*.
+    ptr(std::size_t p) : p(reinterpret_cast<T*>(p)) {}
+    /// Copies the address held by `other`.
+    ptr(const ptr& other) : ptr(other.p) {}
+    T* operator->() const { return p; }
+    /// Returns the wrapped raw pointer.
+    T* get() const { return p; }
+    /// Deletes the pointee; the stored address is left unchanged.
+    void destroy() { delete p; }
+    bool is_null() const { return p == nullptr; }
+    /// Returns the wrapped address as an integer.
+    std::size_t as_size_t() const { return reinterpret_cast<std::size_t>(p); }
+private:
+    T* p;
+};

DiffVG/pybind11/.appveyor.yml ADDED Viewed

	@@ -0,0 +1,37 @@

+# AppVeyor CI configuration: configures the project with the Visual Studio
+# 2015 generator and builds the `pytest` and `cpptest` CMake targets.
+version: 1.0.{build}
+image:
+- Visual Studio 2015
+# AppVeyor's separate test phase is switched off; the test targets are built
+# from build_script below.
+test: off
+skip_branch_with_pr: true
+build:
+  parallel: true
+platform:
+- x86
+# Job matrix: one Debug job per listed Python version.
+environment:
+  matrix:
+  - PYTHON: 36
+    CONFIG: Debug
+  - PYTHON: 27
+    CONFIG: Debug
+# Step 1 puts the selected Python on PATH and installs pip, wheel, pytest and
+# numpy; step 2 downloads Eigen 3.3.7 and adds it to CMAKE_INCLUDE_PATH.
+# NOTE(review): the x64 branch of step 1 cannot trigger with the platform list
+# above, which only contains x86.
+install:
+- ps: |
+    $env:CMAKE_GENERATOR = "Visual Studio 14 2015"
+    if ($env:PLATFORM -eq "x64") { $env:PYTHON = "$env:PYTHON-x64" }
+    $env:PATH = "C:\Python$env:PYTHON\;C:\Python$env:PYTHON\Scripts\;$env:PATH"
+    python -W ignore -m pip install --upgrade pip wheel
+    python -W ignore -m pip install pytest numpy --no-warn-script-location
+- ps: |
+    Start-FileDownload 'https://gitlab.com/libeigen/eigen/-/archive/3.3.7/eigen-3.3.7.zip'
+    7z x eigen-3.3.7.zip -y > $null
+    $env:CMAKE_INCLUDE_PATH = "eigen-3.3.7;$env:CMAKE_INCLUDE_PATH"
+# Configure with C++14 and warnings as errors, then build the two test targets.
+# NOTE(review): CMAKE_ARCH is passed to `cmake -A` but is not set anywhere in
+# this file -- confirm where it is expected to come from.
+build_script:
+- cmake -G "%CMAKE_GENERATOR%" -A "%CMAKE_ARCH%"
+    -DCMAKE_CXX_STANDARD=14
+    -DPYBIND11_WERROR=ON
+    -DDOWNLOAD_CATCH=ON
+    -DCMAKE_SUPPRESS_REGENERATION=1
+    .
+- set MSBuildLogger="C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
+- cmake --build . --config %CONFIG% --target pytest -- /m /v:m /logger:%MSBuildLogger%
+- cmake --build . --config %CONFIG% --target cpptest -- /m /v:m /logger:%MSBuildLogger%
+# On failure, print any logs found under tests\test_cmake_build.
+on_failure: if exist "tests\test_cmake_build" type tests\test_cmake_build\*.log*

DiffVG/pybind11/.cmake-format.yaml ADDED Viewed

	@@ -0,0 +1,73 @@

+parse:
+  additional_commands:
+    pybind11_add_module:
+      flags:
+        - THIN_LTO
+        - MODULE
+        - SHARED
+        - NO_EXTRAS
+        - EXCLUDE_FROM_ALL
+        - SYSTEM
+format:
+  line_width: 99
+  tab_size: 2
+  # If an argument group contains more than this many sub-groups
+  # (parg or kwarg groups) then force it to a vertical layout.
+  max_subgroups_hwrap: 2
+  # If a positional argument group contains more than this many
+  # arguments, then force it to a vertical layout.
+  max_pargs_hwrap: 6
+  # If a cmdline positional group consumes more than this many
+  # lines without nesting, then invalidate the layout (and nest)
+  max_rows_cmdline: 2
+  separate_ctrl_name_with_space: false
+  separate_fn_name_with_space: false
+  dangle_parens: false
+  # If the trailing parenthesis must be 'dangled' on its own
+  # line, then align it to this reference: `prefix`: the start
+  # of the statement, `prefix-indent`: the start of the
+  # statement, plus one indentation level, `child`: align to
+  # the column of the arguments
+  dangle_align: prefix
+  # If the statement spelling length (including space and
+  # parenthesis) is smaller than this amount, then force reject
+  # nested layouts.
+  min_prefix_chars: 4
+  # If the statement spelling length (including space and
+  # parenthesis) is larger than the tab width by more than this
+  # amount, then force reject un-nested layouts.
+  max_prefix_chars: 10
+  # If a candidate layout is wrapped horizontally but it exceeds
+  # this many lines, then reject the layout.
+  max_lines_hwrap: 2
+  line_ending: unix
+  # Format command names consistently as 'lower' or 'upper' case
+  command_case: canonical
+  # Format keywords consistently as 'lower' or 'upper' case
+  # unchanged is valid too
+  keyword_case: 'upper'
+  # A list of command names which should always be wrapped
+  always_wrap: []
+  # If true, the argument lists which are known to be sortable
+  # will be sorted lexicographically
+  enable_sort: true
+  # If true, the parsers may infer whether or not an argument
+  # list is sortable (without annotation).
+  autosort: false
+# Causes a few issues - can be solved later, possibly.
+markup:
+  enable_markup: false

DiffVG/pybind11/.github/CONTRIBUTING.md ADDED Viewed

	@@ -0,0 +1,171 @@

+Thank you for your interest in this project! Please refer to the following
+sections on how to contribute code and bug reports.
+### Reporting bugs
+Before submitting a question or bug report, please take a moment of your time
+and ensure that your issue isn't already discussed in the project documentation
+provided at [pybind11.readthedocs.org][] or in the [issue tracker][]. You can
+also check [gitter][] to see if it came up before.
+Assuming that you have identified a previously unknown problem or an important
+question, it's essential that you submit a self-contained and minimal piece of
+code that reproduces the problem. In other words: no external dependencies,
+isolate the function(s) that cause breakage, submit matched and complete C++
+and Python snippets that can be easily compiled and run in isolation; or
+ideally make a small PR with a failing test case that can be used as a starting
+point.
+## Pull requests
+Contributions are submitted, reviewed, and accepted using GitHub pull requests.
+Please refer to [this article][using pull requests] for details and adhere to
+the following rules to make the process as smooth as possible:
+* Make a new branch for every feature you're working on.
+* Make small and clean pull requests that are easy to review but make sure they
+  do add value by themselves.
+* Add tests for any new functionality and run the test suite (`cmake --build
+  build --target pytest`) to ensure that no existing features break.
+* Please run [`pre-commit`][pre-commit] to check your code matches the
+  project style. (Note that `gawk` is required.) Use `pre-commit run
+  --all-files` before committing (or use installed-mode, check pre-commit docs)
+  to verify your code passes before pushing to save time.
+* This project has a strong focus on providing general solutions using a
+  minimal amount of code, thus small pull requests are greatly preferred.
+### Licensing of contributions
+pybind11 is provided under a BSD-style license that can be found in the
+``LICENSE`` file. By using, distributing, or contributing to this project, you
+agree to the terms and conditions of this license.
+You are under no obligation whatsoever to provide any bug fixes, patches, or
+upgrades to the features, functionality or performance of the source code
+("Enhancements") to anyone; however, if you choose to make your Enhancements
+available either publicly, or directly to the author of this software, without
+imposing a separate written license agreement for such Enhancements, then you
+hereby grant the following license: a non-exclusive, royalty-free perpetual
+license to install, use, modify, prepare derivative works, incorporate into
+other computer software, distribute, and sublicense such enhancements or
+derivative works thereof, in binary and source code form.
+## Development of pybind11
+To set up an ideal development environment, run the following commands on a
+system with CMake 3.14+:
+```bash
+python3 -m venv venv
+source venv/bin/activate
+pip install -r tests/requirements.txt
+cmake -S . -B build -DDOWNLOAD_CATCH=ON -DDOWNLOAD_EIGEN=ON
+cmake --build build -j4
+```
+Tips:
+* You can use `virtualenv` (from PyPI) instead of `venv` (which is Python 3
+  only).
+* You can select any name for your environment folder; if it contains "env" it
+  will be ignored by git.
+* If you don’t have CMake 3.14+, just add “cmake” to the pip install command.
+* You can use `-DPYBIND11_FINDPYTHON=ON` to use FindPython on CMake 3.12+
+* In classic mode, you may need to set `-DPYTHON_EXECUTABLE=/path/to/python`.
+  FindPython uses `-DPython_ROOT_DIR=/path/to` or
+  `-DPython_EXECUTABLE=/path/to/python`.
+### Configuration options
+In CMake, configuration options are given with “-D”. Options are stored in the
+build directory, in the `CMakeCache.txt` file, so they are remembered for each
+build directory. Two selections are special - the generator, given with `-G`,
+and the compiler, which is selected based on environment variables `CXX` and
+similar, or `-DCMAKE_CXX_COMPILER=`. Unlike the others, these cannot be changed
+after the initial run.
+The valid options are:
+* `-DCMAKE_BUILD_TYPE`: Release, Debug, MinSizeRel, RelWithDebInfo
+* `-DPYBIND11_FINDPYTHON=ON`: Use CMake 3.12+’s FindPython instead of the
+  classic, deprecated, custom FindPythonLibs
+* `-DPYBIND11_NOPYTHON=ON`: Disable all Python searching (disables tests)
+* `-DBUILD_TESTING=ON`: Enable the tests
+* `-DDOWNLOAD_CATCH=ON`: Download catch to build the C++ tests
+* `-DDOWNLOAD_EIGEN=ON`: Download Eigen for the NumPy tests
+* `-DPYBIND11_INSTALL=ON/OFF`: Enable the install target (on by default for the
+  master project)
+* `-DUSE_PYTHON_INCLUDE_DIR=ON`: Try to install into the python dir
+<details><summary>A few standard CMake tricks: (click to expand)</summary><p>
+* Use `cmake --build build -v` to see the commands used to build the files.
+* Use `cmake build -LH` to list the CMake options with help.
+* Use `ccmake` if available to see a curses (terminal) gui, or `cmake-gui` for
+  a completely graphical interface (not present in the PyPI package).
+* Use `cmake --build build -j12` to build with 12 cores (for example).
+* Use `-G` and the name of a generator to use something different. `cmake
+  --help` lists the generators available.
+      - On Unix, setting `CMAKE_GENERATOR=Ninja` in your environment will give
+        you automatic multithreading on all your CMake projects!
+* Open the `CMakeLists.txt` with QtCreator to generate for that IDE.
+* You can use `-DCMAKE_EXPORT_COMPILE_COMMANDS=ON` to generate the `.json` file
+  that some tools expect.
+</p></details>
+To run the tests, you can "build" the check target:
+```bash
+cmake --build build --target check
+```
+`--target` can be spelled `-t` in CMake 3.15+. You can also run individual
+tests with these targets:
+* `pytest`: Python tests only
+* `cpptest`: C++ tests only
+* `test_cmake_build`: Install / subdirectory tests
+If you want to build just a subset of tests, use
+`-DPYBIND11_TEST_OVERRIDE="test_callbacks.cpp;test_pickling.cpp"`. If this is
+empty, all tests will be built.
+### Formatting
+All formatting is handled by pre-commit.
+Install with brew (macOS) or pip (any OS):
+```bash
+# Any OS
+python3 -m pip install pre-commit
+# OR macOS with homebrew:
+brew install pre-commit
+```
+Then, you can run it on the items you've added to your staging area, or all
+files:
+```bash
+pre-commit run
+# OR
+pre-commit run --all-files
+```
+And, if you want to always use it, you can install it as a git hook (hence the
+name, pre-commit):
+```bash
+pre-commit install
+```
+[pre-commit]: https://pre-commit.com
+[pybind11.readthedocs.org]: http://pybind11.readthedocs.org/en/latest
+[issue tracker]: https://github.com/pybind/pybind11/issues
+[gitter]: https://gitter.im/pybind/Lobby
+[using pull requests]: https://help.github.com/articles/using-pull-requests

DiffVG/pybind11/.github/ISSUE_TEMPLATE/bug-report.md ADDED Viewed

	@@ -0,0 +1,28 @@

+---
+name: Bug Report
+about: File an issue about a bug
+title: "[BUG] "
+---
+Make sure you've completed the following steps before submitting your issue -- thank you!
+1. Make sure you've read the [documentation][]. Your issue may be addressed there.
+2. Search the [issue tracker][] to verify that this hasn't already been reported. +1 or comment there if it has.
+3. Consider asking first in the [Gitter chat room][].
+4. Include a self-contained and minimal piece of code that reproduces the problem. If that's not possible, try to make the description as clear as possible.
+    a. If possible, make a PR with a new, failing test to give us a starting point to work on!
+[documentation]: https://pybind11.readthedocs.io
+[issue tracker]: https://github.com/pybind/pybind11/issues
+[Gitter chat room]: https://gitter.im/pybind/Lobby
+*After reading, remove this checklist and the template text in parentheses below.*
+## Issue description
+(Provide a short description, state the expected behavior and what actually happens.)
+## Reproducible example code
+(The code should be minimal, have no external dependencies, isolate the function(s) that cause breakage. Submit matched and complete C++ and Python snippets that can be easily compiled and run to diagnose the issue.)

DiffVG/pybind11/.github/ISSUE_TEMPLATE/config.yml ADDED Viewed

	@@ -0,0 +1,5 @@

+blank_issues_enabled: false
+contact_links:
+  - name: Gitter room
+    url: https://gitter.im/pybind/Lobby
+    about: A room for discussing pybind11 with an active community

DiffVG/pybind11/.github/ISSUE_TEMPLATE/feature-request.md ADDED Viewed

	@@ -0,0 +1,16 @@

+---
+name: Feature Request
+about: File an issue about adding a feature
+title: "[FEAT] "
+---
+Make sure you've completed the following steps before submitting your issue -- thank you!
+1. Check if your feature has already been mentioned / rejected / planned in other issues.
+2. If those resources didn't help, consider asking in the [Gitter chat room][] to see if this is interesting / useful to a larger audience and possible to implement reasonably.
+3. If you have a useful feature that passes the previous items (or not suitable for chat), please fill in the details below.
+[Gitter chat room]: https://gitter.im/pybind/Lobby
+*After reading, remove this checklist.*

DiffVG/pybind11/.github/ISSUE_TEMPLATE/question.md ADDED Viewed

	@@ -0,0 +1,21 @@

+---
+name: Question
+about: File an issue about unexplained behavior
+title: "[QUESTION] "
+---
+If you have a question, please check the following first:
+1. Check if your question has already been answered in the [FAQ][] section.
+2. Make sure you've read the [documentation][]. Your issue may be addressed there.
+3. If those resources didn't help and you only have a short question (not a bug report), consider asking in the [Gitter chat room][]
+4. Search the [issue tracker][], including the closed issues, to see if your question has already been asked/answered. +1 or comment if it has been asked but has no answer.
+5. If you have a more complex question which is not answered in the previous items (or not suitable for chat), please fill in the details below.
+6. Include a self-contained and minimal piece of code that illustrates your question. If that's not possible, try to make the description as clear as possible.
+[FAQ]: http://pybind11.readthedocs.io/en/latest/faq.html
+[documentation]: https://pybind11.readthedocs.io
+[issue tracker]: https://github.com/pybind/pybind11/issues
+[Gitter chat room]: https://gitter.im/pybind/Lobby
+*After reading, remove this checklist.*

DiffVG/pybind11/.github/workflows/ci.yml ADDED Viewed

	@@ -0,0 +1,359 @@

+name: CI
+on:
+  workflow_dispatch:
+  pull_request:
+  push:
+    branches:
+      - master
+      - stable
+      - v*
+jobs:
+  standard:
+    strategy:
+      matrix:
+        runs-on: [ubuntu-latest, windows-latest, macos-latest]
+        arch: [x64]
+        python:
+        - 2.7
+        - 3.5
+        - 3.8
+        - pypy2
+        - pypy3
+        include:
+          - runs-on: ubuntu-latest
+            python: 3.6
+            arch: x64
+            args: >
+              -DPYBIND11_FINDPYTHON=ON
+          - runs-on: windows-2016
+            python: 3.7
+            arch: x86
+            args2: >
+              -DCMAKE_CXX_FLAGS="/permissive- /EHsc /GR"
+          - runs-on: windows-latest
+            python: 3.6
+            arch: x64
+            args: >
+              -DPYBIND11_FINDPYTHON=ON
+          - runs-on: windows-latest
+            python: 3.7
+            arch: x64
+          - runs-on: ubuntu-latest
+            python: 3.9-dev
+            arch: x64
+          - runs-on: macos-latest
+            python: 3.9-dev
+            arch: x64
+            args: >
+              -DPYBIND11_FINDPYTHON=ON
+        exclude:
+            # Currently 32bit only, and we build 64bit
+          - runs-on: windows-latest
+            python: pypy2
+            arch: x64
+          - runs-on: windows-latest
+            python: pypy3
+            arch: x64
+            # Currently broken on embed_test
+          - runs-on: windows-latest
+            python: 3.8
+            arch: x64
+          - runs-on: windows-latest
+            python: 3.9-dev
+            arch: x64
+    name: "🐍 ${{ matrix.python }} • ${{ matrix.runs-on }} • ${{ matrix.arch }} ${{ matrix.args }}"
+    runs-on: ${{ matrix.runs-on }}
+    continue-on-error: ${{ endsWith(matrix.python, 'dev') }}
+    steps:
+    - uses: actions/checkout@v2
+    - name: Setup Python ${{ matrix.python }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python }}
+        architecture: ${{ matrix.arch }}
+    - name: Setup Boost (Windows / Linux latest)
+      run: echo "::set-env name=BOOST_ROOT::$BOOST_ROOT_1_72_0"
+    - name: Update CMake
+      uses: jwlawson/actions-setup-cmake@v1.3
+    - name: Cache wheels
+      if: runner.os == 'macOS'
+      uses: actions/cache@v2
+      with:
+        # This path is specific to macOS - we really only need it for PyPy NumPy wheels
+        # See https://github.com/actions/cache/blob/master/examples.md#python---pip
+        # for ways to do this more generally
+        path: ~/Library/Caches/pip
+        # Look to see if there is a cache hit for the corresponding requirements file
+        key: ${{ runner.os }}-pip-${{ matrix.python }}-${{ matrix.arch }}-${{ hashFiles('tests/requirements.txt') }}
+    - name: Prepare env
+      run: python -m pip install -r tests/requirements.txt --prefer-binary
+    - name: Setup annotations
+      run: python -m pip install pytest-github-actions-annotate-failures
+    - name: Configure C++11 ${{ matrix.args }}
+      run: >
+        cmake -S . -B .
+        -DPYBIND11_WERROR=ON
+        -DDOWNLOAD_CATCH=ON
+        -DDOWNLOAD_EIGEN=ON
+        -DCMAKE_CXX_STANDARD=11
+        ${{ matrix.args }}
+    - name: Build C++11
+      run: cmake --build . -j 2
+    - name: Python tests C++11
+      run: cmake --build . --target pytest -j 2
+    - name: C++11 tests
+      run: cmake --build .  --target cpptest -j 2
+    - name: Interface test C++11
+      run: cmake --build . --target test_cmake_build
+    - name: Clean directory
+      run: git clean -fdx
+    - name: Configure ${{ matrix.args2 }}
+      run: >
+        cmake -S . -B build2
+        -DPYBIND11_WERROR=ON
+        -DDOWNLOAD_CATCH=ON
+        -DDOWNLOAD_EIGEN=ON
+        -DCMAKE_CXX_STANDARD=17
+        ${{ matrix.args }}
+        ${{ matrix.args2 }}
+    - name: Build
+      run: cmake --build build2 -j 2
+    - name: Python tests
+      run: cmake --build build2 --target pytest
+    - name: C++ tests
+      run: cmake --build build2 --target cpptest
+    - name: Interface test
+      run: cmake --build build2 --target test_cmake_build
+  clang:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        clang:
+          - 3.6
+          - 3.7
+          - 3.9
+          - 5
+          - 7
+          - 9
+          - dev
+    name: "🐍 3 • Clang ${{ matrix.clang }} • x64"
+    container: "silkeh/clang:${{ matrix.clang }}"
+    steps:
+    - uses: actions/checkout@v2
+    - name: Add wget and python3
+      run: apt-get update && apt-get install -y python3-dev python3-numpy python3-pytest libeigen3-dev
+    - name: Configure
+      shell: bash
+      run: >
+        cmake -S . -B build
+        -DPYBIND11_WERROR=ON
+        -DDOWNLOAD_CATCH=ON
+        -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
+    - name: Build
+      run: cmake --build build -j 2
+    - name: Python tests
+      run: cmake --build build --target pytest
+    - name: C++ tests
+      run: cmake --build build --target cpptest
+    - name: Interface test
+      run: cmake --build build --target test_cmake_build
+  gcc:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        gcc:
+          - 7
+          - latest
+    name: "🐍 3 • GCC ${{ matrix.gcc }} • x64"
+    container: "gcc:${{ matrix.gcc }}"
+    steps:
+    - uses: actions/checkout@v1
+    - name: Add Python 3
+      run: apt-get update; apt-get install -y python3-dev python3-numpy python3-pytest python3-pip libeigen3-dev
+    - name: Update pip
+      run: python3 -m pip install --upgrade pip
+    - name: Setup CMake 3.18
+      uses: jwlawson/actions-setup-cmake@v1.3
+      with:
+        cmake-version: 3.18
+    - name: Configure
+      shell: bash
+      run: >
+        cmake -S . -B build
+        -DPYBIND11_WERROR=ON
+        -DDOWNLOAD_CATCH=ON
+        -DCMAKE_CXX_STANDARD=11
+        -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
+    - name: Build
+      run: cmake --build build -j 2
+    - name: Python tests
+      run: cmake --build build --target pytest
+    - name: C++ tests
+      run: cmake --build build --target cpptest
+    - name: Interface test
+      run: cmake --build build --target test_cmake_build
+  centos:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        centos:
+          - 7  # GCC 4.8
+          - 8
+    name: "🐍 3 • CentOS ${{ matrix.centos }} • x64"
+    container: "centos:${{ matrix.centos }}"
+    steps:
+    - uses: actions/checkout@v2
+    - name: Add Python 3
+      run: yum update -y && yum install -y python3-devel gcc-c++ make git
+    - name: Update pip
+      run: python3 -m pip install --upgrade pip
+    - name: Install dependencies
+      run: python3 -m pip install cmake -r tests/requirements.txt --prefer-binary
+    - name: Configure
+      shell: bash
+      run: >
+        cmake -S . -B build
+        -DPYBIND11_WERROR=ON
+        -DDOWNLOAD_CATCH=ON
+        -DDOWNLOAD_EIGEN=ON
+        -DCMAKE_CXX_STANDARD=11
+        -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
+    - name: Build
+      run: cmake --build build -j 2
+    - name: Python tests
+      run: cmake --build build --target pytest
+    - name: C++ tests
+      run: cmake --build build --target cpptest
+    - name: Interface test
+      run: cmake --build build --target test_cmake_build
+  install-classic:
+    name: "🐍 3.5 • Debian • x86 •  Install"
+    runs-on: ubuntu-latest
+    container: i386/debian:stretch
+    steps:
+    - uses: actions/checkout@v1
+    - name: Install requirements
+      run: |
+        apt-get update
+        apt-get install -y git make cmake g++ libeigen3-dev python3-dev python3-pip
+        pip3 install "pytest==3.1.*"
+    - name: Configure for install
+      run: >
+        cmake .
+        -DPYBIND11_INSTALL=1 -DPYBIND11_TEST=0
+        -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
+    - name: Make and install
+      run: make install
+    - name: Copy tests to new directory
+      run: cp -a tests /pybind11-tests
+    - name: Make a new test directory
+      run: mkdir /build-tests
+    - name: Configure tests
+      run: >
+        cmake ../pybind11-tests
+        -DDOWNLOAD_CATCH=ON
+        -DPYBIND11_WERROR=ON
+        -DPYTHON_EXECUTABLE=$(python3 -c "import sys; print(sys.executable)")
+      working-directory: /build-tests
+    - name: Run tests
+      run: make pytest -j 2
+      working-directory: /build-tests
+  doxygen:
+    name: "Documentation build test"
+    runs-on: ubuntu-latest
+    container: alpine:3.12
+    steps:
+    - uses: actions/checkout@v2
+    - name: Install system requirements
+      run: apk add doxygen python3-dev
+    - name: Ensure pip
+      run: python3 -m ensurepip
+    - name: Install docs & setup requirements
+      run: python3 -m pip install -r docs/requirements.txt pytest setuptools
+    - name: Build docs
+      run: python3 -m sphinx -W -b html docs docs/.build
+    - name: Make SDist
+      run: python3 setup.py sdist
+    - name: Compare Dists (headers only)
+      run: |
+        python3 -m pip install --user -U ./dist/*
+        installed=$(python3 -c "import pybind11; print(pybind11.get_include(True) + '/pybind11')")
+        diff -rq $installed ./include/pybind11

DiffVG/pybind11/.github/workflows/configure.yml ADDED Viewed

	@@ -0,0 +1,78 @@

+name: Config
+on:
+  workflow_dispatch:
+  pull_request:
+  push:
+    branches:
+      - master
+      - stable
+      - v*
+jobs:
+  cmake:
+    strategy:
+      fail-fast: false
+      matrix:
+        runs-on: [ubuntu-latest, macos-latest, windows-latest]
+        arch: [x64]
+        cmake: [3.18]
+        include:
+        - runs-on: ubuntu-latest
+          arch: x64
+          cmake: 3.4
+        - runs-on: macos-latest
+          arch: x64
+          cmake: 3.7
+        - runs-on: windows-2016
+          arch: x86
+          cmake: 3.8
+        - runs-on: windows-2016
+          arch: x86
+          cmake: 3.18
+    name: 🐍 3.7 • CMake ${{ matrix.cmake }} • ${{ matrix.runs-on }}
+    runs-on: ${{ matrix.runs-on }}
+    steps:
+    - uses: actions/checkout@v2
+    - name: Setup Python 3.7
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.7
+        architecture: ${{ matrix.arch }}
+    - name: Prepare env
+      run: python -m pip install -r tests/requirements.txt
+    - name: Setup CMake ${{ matrix.cmake }}
+      uses: jwlawson/actions-setup-cmake@v1.3
+      with:
+        cmake-version: ${{ matrix.cmake }}
+    - name: Make build directories
+      run: mkdir "build dir"
+    - name: Configure
+      working-directory: build dir
+      shell: bash
+      run: >
+        cmake ..
+        -DPYBIND11_WERROR=ON
+        -DDOWNLOAD_CATCH=ON
+        -DPYTHON_EXECUTABLE=$(python -c "import sys; print(sys.executable)")
+    - name: Build
+      working-directory: build dir
+      if: github.event_name == 'workflow_dispatch'
+      run: cmake --build . --config Release
+    - name: Test
+      working-directory: build dir
+      if: github.event_name == 'workflow_dispatch'
+      run: cmake --build . --config Release --target check

DiffVG/pybind11/.github/workflows/format.yml ADDED Viewed

	@@ -0,0 +1,19 @@

+name: Format
+on:
+  workflow_dispatch:
+  pull_request:
+  push:
+    branches:
+    - master
+    - stable
+    - "v*"
+jobs:
+  pre-commit:
+    name: Format
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - uses: actions/setup-python@v2
+    - uses: pre-commit/action@v2.0.0

DiffVG/pybind11/.gitignore ADDED Viewed

	@@ -0,0 +1,41 @@

+CMakeCache.txt
+CMakeFiles
+Makefile
+cmake_install.cmake
+cmake_uninstall.cmake
+.DS_Store
+*.so
+*.pyd
+*.dll
+*.sln
+*.sdf
+*.opensdf
+*.vcxproj
+*.vcxproj.user
+*.filters
+example.dir
+Win32
+x64
+Release
+Debug
+.vs
+CTestTestfile.cmake
+Testing
+autogen
+MANIFEST
+/.ninja_*
+/*.ninja
+/docs/.build
+*.py[co]
+*.egg-info
+*~
+.*.swp
+.DS_Store
+/dist
+/*build*
+.cache/
+sosize-*.txt
+pybind11Config*.cmake
+pybind11Targets.cmake
+/*env*
+/.vscode

DiffVG/pybind11/.gitmodules ADDED Viewed

	@@ -0,0 +1,3 @@

+[submodule "tools/clang"]
+	path = tools/clang
+	url = ../../wjakob/clang-cindex-python3.git

DiffVG/pybind11/.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,44 @@

+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v3.1.0
+  hooks:
+  - id: check-added-large-files
+  - id: check-case-conflict
+  - id: check-merge-conflict
+  - id: check-symlinks
+  - id: check-yaml
+  - id: debug-statements
+  - id: end-of-file-fixer
+  - id: mixed-line-ending
+  - id: requirements-txt-fixer
+  - id: trailing-whitespace
+  - id: fix-encoding-pragma
+- repo: https://github.com/Lucas-C/pre-commit-hooks
+  rev: v1.1.9
+  hooks:
+  - id: remove-tabs
+- repo: https://gitlab.com/pycqa/flake8
+  rev: 3.8.3
+  hooks:
+  - id: flake8
+    additional_dependencies: [flake8-bugbear, pep8-naming]
+    exclude: ^(docs/.*|tools/.*)$
+- repo: https://github.com/cheshirekow/cmake-format-precommit
+  rev: v0.6.11
+  hooks:
+  - id: cmake-format
+    additional_dependencies: [pyyaml]
+    types: [file]
+    files: (\.cmake|CMakeLists.txt)(.in)?$
+- repo: local
+  hooks:
+  - id: check-style
+    name: Classic check-style
+    language: system
+    types:
+    - c++
+    entry: ./tools/check-style.sh

DiffVG/pybind11/.readthedocs.yml ADDED Viewed

	@@ -0,0 +1,3 @@

+python:
+  version: 3
+requirements_file: docs/requirements.txt

DiffVG/pybind11/CMakeLists.txt ADDED Viewed

	@@ -0,0 +1,271 @@

+# CMakeLists.txt -- Build system for the pybind11 modules
+#
+# Copyright (c) 2015 Wenzel Jakob <wenzel@inf.ethz.ch>
+#
+# All rights reserved. Use of this source code is governed by a
+# BSD-style license that can be found in the LICENSE file.
+cmake_minimum_required(VERSION 3.4)
+# The range form `cmake_minimum_required(VERSION 3.4...3.18)` does not work
+# with some versions of VS that ship a patched CMake 3.11, so the policy
+# version is capped by hand instead: 3.18 when the running CMake is at least
+# that new, otherwise the running CMake's own major.minor version.
+if(NOT CMAKE_VERSION VERSION_LESS 3.18)
+  cmake_policy(VERSION 3.18)
+else()
+  cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION})
+endif()
+# Derive the project version from the PYBIND11_VERSION_{MAJOR,MINOR,PATCH}
+# macros in the common header, so the number is maintained in one place only.
+file(
+  STRINGS "${CMAKE_CURRENT_SOURCE_DIR}/include/pybind11/detail/common.h"
+  pybind11_version_defines
+  REGEX "#define PYBIND11_VERSION_(MAJOR|MINOR|PATCH) ")
+# Each matching line yields one PYBIND11_VERSION_<component> variable.
+foreach(version_define ${pybind11_version_defines})
+  if(version_define MATCHES [[#define PYBIND11_VERSION_(MAJOR|MINOR|PATCH) +([^ ]+)$]])
+    set(PYBIND11_VERSION_${CMAKE_MATCH_1} "${CMAKE_MATCH_2}")
+  endif()
+endforeach()
+# A run of letters inside the patch component is remembered as the version type ...
+if(PYBIND11_VERSION_PATCH MATCHES [[([a-zA-Z]+)]])
+  set(pybind11_VERSION_TYPE "${CMAKE_MATCH_1}")
+endif()
+# ... and only the first run of digits is kept, because project(VERSION) is numeric.
+string(REGEX MATCH "[0-9]+" PYBIND11_VERSION_PATCH "${PYBIND11_VERSION_PATCH}")
+project(
+  pybind11
+  VERSION "${PYBIND11_VERSION_MAJOR}.${PYBIND11_VERSION_MINOR}.${PYBIND11_VERSION_PATCH}"
+  LANGUAGES CXX)
+# Stock CMake modules used below (the three includes are independent of each other)
+include(CMakeDependentOption)
+include(CMakePackageConfigHelpers)
+include(GNUInstallDirs)
+# Report the detected version unless the including project asked for silence
+if(NOT pybind11_FIND_QUIETLY)
+  message(STATUS "pybind11 v${pybind11_VERSION} ${pybind11_VERSION_TYPE}")
+endif()
+# Check if pybind11 is being used directly or via add_subdirectory.
+# Sets PYBIND11_MASTER_PROJECT (ON when this is the top-level project) and, in
+# subdirectory mode, pybind11_system so the headers are added as SYSTEM includes.
+if(CMAKE_SOURCE_DIR STREQUAL PROJECT_SOURCE_DIR)
+  ### Warn if not an out-of-source build
+  if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR)
+    set(lines
+        "You are building in-place. If that is not what you intended to "
+        "do, you can clean the source directory with:\n"
+        "rm -r CMakeCache.txt CMakeFiles/ cmake_uninstall.cmake pybind11Config.cmake "
+        "pybind11ConfigVersion.cmake tests/CMakeFiles/\n")
+    message(AUTHOR_WARNING ${lines})
+  endif()
+  set(PYBIND11_MASTER_PROJECT ON)
+  # APPLE and WIN32 are the platform variables CMake defines; the previous
+  # OSX / WINDOWS names are never set, so these warnings could not trigger.
+  if(APPLE AND CMAKE_VERSION VERSION_LESS 3.7)
+    # Bug in macOS CMake < 3.7 is unable to download catch
+    message(WARNING "CMAKE 3.7+ needed on macOS to download catch, and newer HIGHLY recommended")
+  elseif(WIN32 AND CMAKE_VERSION VERSION_LESS 3.8)
+    # Only tested with 3.8+ in CI.
+    message(WARNING "CMAKE 3.8+ tested on Windows, previous versions untested")
+  endif()
+  message(STATUS "CMake ${CMAKE_VERSION}")
+  # When the user picked a C++ standard, require exactly that standard and
+  # disable compiler-specific extensions.
+  if(CMAKE_CXX_STANDARD)
+    set(CMAKE_CXX_EXTENSIONS OFF)
+    set(CMAKE_CXX_STANDARD_REQUIRED ON)
+  endif()
+else()
+  set(PYBIND11_MASTER_PROJECT OFF)
+  set(pybind11_system SYSTEM)
+endif()
+# User-facing switches. Installing and testing default to ON only when
+# pybind11 is the top-level project.
+option(PYBIND11_NOPYTHON "Disable search for Python" OFF)
+option(PYBIND11_INSTALL "Install pybind11 header files?" ${PYBIND11_MASTER_PROJECT})
+option(PYBIND11_TEST "Build pybind11 test suite?" ${PYBIND11_MASTER_PROJECT})
+# Only offered when PYBIND11_INSTALL is on; forced OFF otherwise.
+cmake_dependent_option(
+  USE_PYTHON_INCLUDE_DIR
+  "Install pybind11 headers in Python include directory instead of default installation prefix"
+  OFF
+  "PYBIND11_INSTALL"
+  OFF)
+# Only offered on CMake 3.12 or newer; forced OFF otherwise.
+cmake_dependent_option(
+  PYBIND11_FINDPYTHON
+  "Force new FindPython"
+  OFF
+  "NOT CMAKE_VERSION VERSION_LESS 3.12"
+  OFF)
+# NB: when adding a header don't forget to also add it to setup.py
+# Headers under detail/ are listed first ...
+set(PYBIND11_HEADERS
+    include/pybind11/detail/class.h
+    include/pybind11/detail/common.h
+    include/pybind11/detail/descr.h
+    include/pybind11/detail/init.h
+    include/pybind11/detail/internals.h
+    include/pybind11/detail/typeid.h)
+# ... followed by the top-level headers (list order is kept as-is).
+list(
+  APPEND
+  PYBIND11_HEADERS
+  include/pybind11/attr.h
+  include/pybind11/buffer_info.h
+  include/pybind11/cast.h
+  include/pybind11/chrono.h
+  include/pybind11/common.h
+  include/pybind11/complex.h
+  include/pybind11/options.h
+  include/pybind11/eigen.h
+  include/pybind11/embed.h
+  include/pybind11/eval.h
+  include/pybind11/iostream.h
+  include/pybind11/functional.h
+  include/pybind11/numpy.h
+  include/pybind11/operators.h
+  include/pybind11/pybind11.h
+  include/pybind11/pytypes.h
+  include/pybind11/stl.h
+  include/pybind11/stl_bind.h)
+# Compare PYBIND11_HEADERS with the headers found on disk (via a glob) and
+# warn about any mismatch. Only done for the top-level project and on
+# CMake 3.12+, which is required for CONFIGURE_DEPENDS.
+if(PYBIND11_MASTER_PROJECT AND NOT CMAKE_VERSION VERSION_LESS 3.12)
+  file(
+    GLOB_RECURSE _pybind11_header_check
+    LIST_DIRECTORIES false
+    RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}"
+    CONFIGURE_DEPENDS "include/pybind11/*.h")
+  # Set difference in both directions: listed-but-missing and present-but-unlisted
+  set(_pybind11_here_only ${PYBIND11_HEADERS})
+  set(_pybind11_disk_only ${_pybind11_header_check})
+  list(REMOVE_ITEM _pybind11_here_only ${_pybind11_header_check})
+  list(REMOVE_ITEM _pybind11_disk_only ${PYBIND11_HEADERS})
+  # message() joins separate arguments with no separator, so the list is
+  # expanded inside the quoted string to keep the file names readable.
+  if(_pybind11_here_only)
+    message(AUTHOR_WARNING "PYBIND11_HEADERS has extra files: ${_pybind11_here_only}")
+  endif()
+  if(_pybind11_disk_only)
+    message(AUTHOR_WARNING "PYBIND11_HEADERS is missing files: ${_pybind11_disk_only}")
+  endif()
+endif()
+# Cache the include directory so pybind11_add_module can be used in parent projects
+set(PYBIND11_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}/include" CACHE INTERNAL "")
+# Turn the header paths into absolute ones by prefixing the source directory.
+# list(TRANSFORM <list> PREPEND ...) would do this, but it needs CMake 3.12
+# and cannot be used here yet.
+string(
+  REPLACE "include/"
+          "${CMAKE_CURRENT_SOURCE_DIR}/include/"
+          PYBIND11_HEADERS
+          "${PYBIND11_HEADERS}")
+# Note: target definitions cannot depend on configure-time if statements -
+# use generator expressions instead, since those end up in the target file.
+# Ifs may also be placed *in* the Config.in, but not here.
+# This section creates targets and does *not* touch Python.
+# Headers-only target (no Python included); the long name avoids clashes
+# when pybind11 is used in subdirectory mode.
+add_library(pybind11_headers INTERFACE)
+# Short alias that is easier to use/remember
+add_library(pybind11::headers ALIAS pybind11_headers)
+# Alias matching the name of the exported target
+add_library(pybind11::pybind11_headers ALIAS pybind11_headers)
+include("${CMAKE_CURRENT_SOURCE_DIR}/tools/pybind11Common.cmake")
+# In subdirectory mode, announce the version unless quiet mode was requested
+if(NOT (PYBIND11_MASTER_PROJECT OR pybind11_FIND_QUIETLY))
+  message(STATUS "Using pybind11: (version \"${pybind11_VERSION}\" ${pybind11_VERSION_TYPE})")
+endif()
+# Relative directory setting: with USE_PYTHON_INCLUDE_DIR enabled, point
+# CMAKE_INSTALL_INCLUDEDIR at the Python include directory, expressed relative
+# to CMAKE_INSTALL_PREFIX. Python_INCLUDE_DIRS is preferred; PYTHON_INCLUDE_DIR
+# is the fallback.
+if(USE_PYTHON_INCLUDE_DIR AND DEFINED Python_INCLUDE_DIRS)
+  file(RELATIVE_PATH CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX} ${Python_INCLUDE_DIRS})
+elseif(USE_PYTHON_INCLUDE_DIR AND DEFINED PYTHON_INCLUDE_DIR)
+  # Expand the variable that was actually tested above. Reading
+  # PYTHON_INCLUDE_DIRS here would drop the argument (and make file() fail)
+  # whenever only PYTHON_INCLUDE_DIR is defined.
+  file(RELATIVE_PATH CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_PREFIX} ${PYTHON_INCLUDE_DIR})
+endif()
+# Usage requirements of the headers target: the include directory (source tree
+# while building, install-relative once installed) and the C++11 features
+# consumers must have enabled.
+# NOTE(review): pybind11_system is set outside this view - presumably it holds
+# SYSTEM or is empty; confirm.
+target_include_directories(
+  pybind11_headers
+  ${pybind11_system}
+  INTERFACE
+    $<BUILD_INTERFACE:${PYBIND11_INCLUDE_DIR}>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+target_compile_features(
+  pybind11_headers
+  INTERFACE
+    cxx_inheriting_constructors
+    cxx_user_literals
+    cxx_right_angle_brackets)
+# Installation rules. Everything in this block is skipped unless
+# PYBIND11_INSTALL is enabled.
+if(PYBIND11_INSTALL)
+  # Install the header tree.
+  install(DIRECTORY ${PYBIND11_INCLUDE_DIR}/pybind11 DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+  # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
+  set(PYBIND11_CMAKECONFIG_INSTALL_DIR
+      "share/cmake/${PROJECT_NAME}"
+      CACHE STRING "install path for pybind11Config.cmake")
+  # Generate <project>Config.cmake from its template in tools/.
+  configure_package_config_file(
+    tools/${PROJECT_NAME}Config.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
+    INSTALL_DESTINATION ${PYBIND11_CMAKECONFIG_INSTALL_DIR})
+  # Generate <project>ConfigVersion.cmake.
+  if(CMAKE_VERSION VERSION_LESS 3.14)
+    # Remove CMAKE_SIZEOF_VOID_P from ConfigVersion.cmake since the library does
+    # not depend on architecture specific settings or libraries.
+    # The value is saved first and restored right after the file is written.
+    set(_PYBIND11_CMAKE_SIZEOF_VOID_P ${CMAKE_SIZEOF_VOID_P})
+    unset(CMAKE_SIZEOF_VOID_P)
+    write_basic_package_version_file(
+      ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake
+      VERSION ${PROJECT_VERSION}
+      COMPATIBILITY AnyNewerVersion)
+    set(CMAKE_SIZEOF_VOID_P ${_PYBIND11_CMAKE_SIZEOF_VOID_P})
+  else()
+    # CMake 3.14+ natively supports header-only libraries
+    write_basic_package_version_file(
+      ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake
+      VERSION ${PROJECT_VERSION}
+      COMPATIBILITY AnyNewerVersion ARCH_INDEPENDENT)
+  endif()
+  # Install the generated package files together with the helper modules.
+  install(
+    FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake
+          ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake
+          tools/FindPythonLibsNew.cmake
+          tools/pybind11Common.cmake
+          tools/pybind11Tools.cmake
+          tools/pybind11NewTools.cmake
+    DESTINATION ${PYBIND11_CMAKECONFIG_INSTALL_DIR})
+  # The export set name can be overridden; default is "<project>Targets".
+  if(NOT PYBIND11_EXPORT_NAME)
+    set(PYBIND11_EXPORT_NAME "${PROJECT_NAME}Targets")
+  endif()
+  # Export the headers target under the pybind11:: namespace.
+  install(TARGETS pybind11_headers EXPORT "${PYBIND11_EXPORT_NAME}")
+  install(
+    EXPORT "${PYBIND11_EXPORT_NAME}"
+    NAMESPACE "pybind11::"
+    DESTINATION ${PYBIND11_CMAKECONFIG_INSTALL_DIR})
+  # Uninstall target (master project only, so it cannot clash with a parent's)
+  if(PYBIND11_MASTER_PROJECT)
+    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/tools/cmake_uninstall.cmake.in"
+                   "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" IMMEDIATE @ONLY)
+    # VERBATIM keeps argument escaping platform-independent (e.g. a build
+    # directory containing spaces).
+    add_custom_target(
+      uninstall
+      COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake"
+      VERBATIM)
+  endif()
+endif()
+# Test activation. In the master project an explicitly defined BUILD_TESTING
+# takes priority; in every other case PYBIND11_TEST decides. Requesting tests
+# while in NOPYTHON mode is a fatal error.
+if(PYBIND11_MASTER_PROJECT AND DEFINED BUILD_TESTING)
+  if(BUILD_TESTING AND _pybind11_nopython)
+    message(FATAL_ERROR "Cannot activate tests in NOPYTHON mode")
+  elseif(BUILD_TESTING)
+    add_subdirectory(tests)
+  endif()
+elseif(PYBIND11_TEST AND _pybind11_nopython)
+  message(FATAL_ERROR "Cannot activate tests in NOPYTHON mode")
+elseif(PYBIND11_TEST)
+  add_subdirectory(tests)
+endif()
+# When pulled in as a subproject, publish the same result variables that
+# find_package(pybind11 CONFIG) would provide, for better symmetry.
+if(NOT PYBIND11_MASTER_PROJECT)
+  set(pybind11_INCLUDE_DIR "${PYBIND11_INCLUDE_DIR}"
+      CACHE INTERNAL "Directory where pybind11 headers are located")
+  set(pybind11_FOUND TRUE
+      CACHE INTERNAL "true if pybind11 and all required components found on the system")
+endif()

DiffVG/pybind11/LICENSE ADDED Viewed

	@@ -0,0 +1,29 @@

+Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>, All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+3. Neither the name of the copyright holder nor the names of its contributors
+   may be used to endorse or promote products derived from this software
+   without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+Please also refer to the file .github/CONTRIBUTING.md, which clarifies licensing of
+external contributions to this project including patches, pull requests, etc.

DiffVG/pybind11/MANIFEST.in ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ recursive-include include/pybind11 *.h
2	+ include LICENSE README.md .github/CONTRIBUTING.md

DiffVG/pybind11/README.md ADDED Viewed

	@@ -0,0 +1,143 @@

+![pybind11 logo](https://github.com/pybind/pybind11/raw/master/docs/pybind11-logo.png)
+# pybind11 — Seamless operability between C++11 and Python
+[![Documentation Status](https://readthedocs.org/projects/pybind11/badge/?version=master)](http://pybind11.readthedocs.org/en/master/?badge=master)
+[![Documentation Status](https://readthedocs.org/projects/pybind11/badge/?version=stable)](http://pybind11.readthedocs.org/en/stable/?badge=stable)
+[![Gitter chat](https://img.shields.io/gitter/room/gitterHQ/gitter.svg)](https://gitter.im/pybind/Lobby)
+[![CI](https://github.com/pybind/pybind11/workflows/CI/badge.svg)](https://github.com/pybind/pybind11/actions)
+[![Build status](https://ci.appveyor.com/api/projects/status/riaj54pn4h08xy40?svg=true)](https://ci.appveyor.com/project/wjakob/pybind11)
+**pybind11** is a lightweight header-only library that exposes C++ types in
+Python and vice versa, mainly to create Python bindings of existing C++ code.
+Its goals and syntax are similar to the excellent [Boost.Python][] library by
+David Abrahams: to minimize boilerplate code in traditional extension modules
+by inferring type information using compile-time introspection.
+The main issue with Boost.Python—and the reason for creating such a similar
+project—is Boost. Boost is an enormously large and complex suite of utility
+libraries that works with almost every C++ compiler in existence. This
+compatibility has its cost: arcane template tricks and workarounds are
+necessary to support the oldest and buggiest of compiler specimens. Now that
+C++11-compatible compilers are widely available, this heavy machinery has
+become an excessively large and unnecessary dependency.
+Think of this library as a tiny self-contained version of Boost.Python with
+everything stripped away that isn't relevant for binding generation. Without
+comments, the core header files only require ~4K lines of code and depend on
+Python (2.7 or 3.5+, or PyPy) and the C++ standard library. This compact
+implementation was possible thanks to some of the new C++11 language features
+(specifically: tuples, lambda functions and variadic templates). Since its
+creation, this library has grown beyond Boost.Python in many ways, leading to
+dramatically simpler binding code in many common situations.
+Tutorial and reference documentation is provided at
+[pybind11.readthedocs.org][].  A PDF version of the manual is available
+[here][docs-pdf].
+## Core features
+pybind11 can map the following core C++ features to Python:
+- Functions accepting and returning custom data structures per value, reference, or pointer
+- Instance methods and static methods
+- Overloaded functions
+- Instance attributes and static attributes
+- Arbitrary exception types
+- Enumerations
+- Callbacks
+- Iterators and ranges
+- Custom operators
+- Single and multiple inheritance
+- STL data structures
+- Smart pointers with reference counting like `std::shared_ptr`
+- Internal references with correct reference counting
+- C++ classes with virtual (and pure virtual) methods can be extended in Python
+## Goodies
+In addition to the core functionality, pybind11 provides some extra goodies:
+- Python 2.7, 3.5+, and PyPy (tested on 7.3) are supported with an implementation-agnostic
+  interface.
+- It is possible to bind C++11 lambda functions with captured variables. The
+  lambda capture data is stored inside the resulting Python function object.
+- pybind11 uses C++11 move constructors and move assignment operators whenever
+  possible to efficiently transfer custom data types.
+- It's easy to expose the internal storage of custom data types through
+  Python's buffer protocols. This is handy e.g. for fast conversion between
+  C++ matrix classes like Eigen and NumPy without expensive copy operations.
+- pybind11 can automatically vectorize functions so that they are transparently
+  applied to all entries of one or more NumPy array arguments.
+- Python's slice-based access and assignment operations can be supported with
+  just a few lines of code.
+- Everything is contained in just a few header files; there is no need to link
+  against any additional libraries.
+- Binaries are generally smaller by a factor of at least 2 compared to
+  equivalent bindings generated by Boost.Python. A recent pybind11 conversion
+  of PyRosetta, an enormous Boost.Python binding project,
+  [reported][pyrosetta-report] a binary size reduction of **5.4x** and compile
+  time reduction by **5.8x**.
+- Function signatures are precomputed at compile time (using `constexpr`),
+  leading to smaller binaries.
+- With little extra effort, C++ types can be pickled and unpickled similar to
+  regular Python objects.
+## Supported compilers
+1. Clang/LLVM 3.3 or newer (for Apple Xcode's clang, this is 5.0.0 or newer)
+2. GCC 4.8 or newer
+3. Microsoft Visual Studio 2015 Update 3 or newer
+4. Intel C++ compiler 17 or newer (16 with pybind11 v2.0 and 15 with pybind11
+   v2.0 and a [workaround][intel-15-workaround])
+5. Cygwin/GCC (tested on 2.5.1)
+## About
+This project was created by [Wenzel Jakob](http://rgl.epfl.ch/people/wjakob).
+Significant features and/or improvements to the code were contributed by
+Jonas Adler,
+Lori A. Burns,
+Sylvain Corlay,
+Trent Houliston,
+Axel Huebl,
+@hulucc,
+Sergey Lyskov,
+Johan Mabille,
+Tomasz Miąsko,
+Dean Moldovan,
+Ben Pritchard,
+Jason Rhinelander,
+Boris Schäling,
+Pim Schellart,
+Henry Schreiner,
+Ivan Smirnov, and
+Patrick Stewart.
+### Contributing
+See the [contributing guide][] for information on building and contributing to
+pybind11.
+### License
+pybind11 is provided under a BSD-style license that can be found in the
+[`LICENSE`][] file. By using, distributing, or contributing to this project,
+you agree to the terms and conditions of this license.
+[pybind11.readthedocs.org]: http://pybind11.readthedocs.org/en/master
+[docs-pdf]: https://media.readthedocs.org/pdf/pybind11/master/pybind11.pdf
+[Boost.Python]: http://www.boost.org/doc/libs/1_58_0/libs/python/doc/
+[pyrosetta-report]: http://graylab.jhu.edu/RosettaCon2016/PyRosetta-4.pdf
+[contributing guide]:  https://github.com/pybind/pybind11/blob/master/.github/CONTRIBUTING.md
+[`LICENSE`]: https://github.com/pybind/pybind11/blob/master/LICENSE
+[intel-15-workaround]: https://github.com/pybind/pybind11/issues/276

DiffVG/pybind11/docs/Doxyfile ADDED Viewed

	@@ -0,0 +1,22 @@

+PROJECT_NAME           = pybind11
+INPUT                  = ../include/pybind11/
+RECURSIVE              = YES
+GENERATE_HTML          = NO
+GENERATE_LATEX         = NO
+GENERATE_XML           = YES
+XML_OUTPUT             = .build/doxygenxml
+XML_PROGRAMLISTING     = YES
+MACRO_EXPANSION        = YES
+EXPAND_ONLY_PREDEF     = YES
+EXPAND_AS_DEFINED      = PYBIND11_RUNTIME_EXCEPTION
+ALIASES                = "rst=\verbatim embed:rst"
+ALIASES               += "endrst=\endverbatim"
+QUIET                  = YES
+WARNINGS               = YES
+WARN_IF_UNDOCUMENTED   = NO
+PREDEFINED             = DOXYGEN_SHOULD_SKIP_THIS \
+                         PY_MAJOR_VERSION=3

DiffVG/pybind11/docs/_static/theme_overrides.css ADDED Viewed

	@@ -0,0 +1,11 @@

+.wy-table-responsive table td,
+.wy-table-responsive table th {
+    white-space: initial !important;
+}
+.rst-content table.docutils td {
+    vertical-align: top !important;
+}
+div[class^='highlight'] pre {
+    white-space: pre;
+    white-space: pre-wrap;
+}

DiffVG/pybind11/docs/advanced/cast/chrono.rst ADDED Viewed

	@@ -0,0 +1,81 @@

+Chrono
+======
+When including the additional header file :file:`pybind11/chrono.h` conversions
+from C++11 chrono datatypes to python datetime objects are automatically enabled.
+This header also enables conversions of python floats (often from sources such
+as ``time.monotonic()``, ``time.perf_counter()`` and ``time.process_time()``)
+into durations.
+An overview of clocks in C++11
+------------------------------
+A point of confusion when using these conversions is the differences between
+clocks provided in C++11. There are three clock types defined by the C++11
+standard and users can define their own if needed. Each of these clocks have
+different properties and when converting to and from python will give different
+results.
+The first clock defined by the standard is ``std::chrono::system_clock``. This
+clock measures the current date and time. However, this clock changes with
+updates to the operating system time. For example, if your time is synchronised
+with a time server this clock will change. This makes this clock a poor choice
+for timing purposes but good for measuring the wall time.
+The second clock defined in the standard is ``std::chrono::steady_clock``.
+This clock ticks at a steady rate and is never adjusted. This makes it excellent
+for timing purposes, however the value in this clock does not correspond to the
+current date and time. Often this clock will be the amount of time your system
+has been on, although it does not have to be. This clock will never be the same
+clock as the system clock as the system clock can change but steady clocks
+cannot.
+The third clock defined in the standard is ``std::chrono::high_resolution_clock``.
+This clock is the clock that has the highest resolution out of the clocks in the
+system. It is normally a typedef to either the system clock or the steady clock
+but can be its own independent clock. This is important when using these
+conversions, as the types you get in python for this clock might be different
+depending on the system.
+If it is a typedef of the system clock, python will get datetime objects, but if
+it is a different clock they will be timedelta objects.
+Provided conversions
+--------------------
+.. rubric:: C++ to Python
+- ``std::chrono::system_clock::time_point`` → ``datetime.datetime``
+    System clock times are converted to python datetime instances. They are
+    in the local timezone, but do not have any timezone information attached
+    to them (they are naive datetime objects).
+- ``std::chrono::duration`` → ``datetime.timedelta``
+    Durations are converted to timedeltas, any precision in the duration
+    greater than microseconds is lost by rounding towards zero.
+- ``std::chrono::[other_clocks]::time_point`` → ``datetime.timedelta``
+    Any clock time that is not the system clock is converted to a time delta.
+    This timedelta measures the time from the clock's epoch to now.
+.. rubric:: Python to C++
+- ``datetime.datetime`` or ``datetime.date`` or ``datetime.time`` → ``std::chrono::system_clock::time_point``
+    Date/time objects are converted into system clock timepoints. Any
+    timezone information is ignored and the type is treated as a naive
+    object.
+- ``datetime.timedelta`` → ``std::chrono::duration``
+    Time deltas are converted into durations with microsecond precision.
+- ``datetime.timedelta`` → ``std::chrono::[other_clocks]::time_point``
+    Time deltas that are converted into clock timepoints are treated as
+    the amount of time from the start of the clock's epoch.
+- ``float`` → ``std::chrono::duration``
+    Floats that are passed to C++ as durations will be interpreted as a number of
+    seconds. These will be converted to the duration using ``duration_cast``
+    from the float.
+- ``float`` → ``std::chrono::[other_clocks]::time_point``
+    Floats that are passed to C++ as time points will be interpreted as the
+    number of seconds from the start of the clock's epoch.

DiffVG/pybind11/docs/advanced/cast/custom.rst ADDED Viewed

	@@ -0,0 +1,91 @@

+Custom type casters
+===================
+In very rare cases, applications may require custom type casters that cannot be
+expressed using the abstractions provided by pybind11, thus requiring raw
+Python C API calls. This is fairly advanced usage and should only be pursued by
+experts who are familiar with the intricacies of Python reference counting.
+The following snippets demonstrate how this works for a very simple ``inty``
+type that should be convertible from Python types that provide a
+``__int__(self)`` method.
+.. code-block:: cpp
+    struct inty { long long_value; };
+    void print(inty s) {
+        std::cout << s.long_value << std::endl;
+    }
+The following Python snippet demonstrates the intended usage from the Python side:
+.. code-block:: python
+    class A:
+        def __int__(self):
+            return 123
+    from example import print
+    print(A())
+To register the necessary conversion routines, it is necessary to add
+a partial overload to the ``pybind11::detail::type_caster<T>`` template.
+Although this is an implementation detail, adding partial overloads to this
+type is explicitly allowed.
+.. code-block:: cpp
+    namespace pybind11 { namespace detail {
+        /* Custom caster that lets 'inty' cross the Python <-> C++ boundary */
+        template <> struct type_caster<inty> {
+        public:
+            /**
+             * This macro establishes the name 'inty' in
+             * function signatures and declares a local variable
+             * 'value' of type inty
+             */
+            PYBIND11_TYPE_CASTER(inty, _("inty"));
+            /**
+             * Conversion part 1 (Python->C++): convert a PyObject into a inty
+             * instance or return false upon failure. The second argument
+             * indicates whether implicit conversions should be applied.
+             */
+            bool load(handle src, bool) {
+                /* Extract PyObject from handle */
+                PyObject *source = src.ptr();
+                /* Try converting into a Python integer value */
+                PyObject *tmp = PyNumber_Long(source);
+                if (!tmp) {
+                    /* Not convertible: discard the pending Python error so that
+                       returning false does not leave an exception set */
+                    PyErr_Clear();
+                    return false;
+                }
+                /* Now try to convert into a C++ int */
+                value.long_value = PyLong_AsLong(tmp);
+                Py_DECREF(tmp);
+                /* PyLong_AsLong reports failure (out-of-range errors etc) by
+                   returning -1 *and* setting a Python error; a -1 without an
+                   error is a perfectly valid value and must be accepted */
+                if (value.long_value == -1 && PyErr_Occurred()) {
+                    PyErr_Clear();
+                    return false;
+                }
+                return true;
+            }
+            /**
+             * Conversion part 2 (C++ -> Python): convert an inty instance into
+             * a Python object. The second and third arguments are used to
+             * indicate the return value policy and parent object (for
+             * ``return_value_policy::reference_internal``) and are generally
+             * ignored by implicit casters.
+             */
+            static handle cast(inty src, return_value_policy /* policy */, handle /* parent */) {
+                return PyLong_FromLong(src.long_value);
+            }
+        };
+    }} // namespace pybind11::detail
+.. note::
+    A ``type_caster<T>`` defined with ``PYBIND11_TYPE_CASTER(T, ...)`` requires
+    that ``T`` is default-constructible (``value`` is first default constructed
+    and then ``load()`` assigns to it).
+.. warning::
+    When using custom type casters, it's important to declare them consistently
+    in every compilation unit of the Python extension module. Otherwise,
+    undefined behavior can ensue.

DiffVG/pybind11/docs/advanced/cast/eigen.rst ADDED Viewed

	@@ -0,0 +1,310 @@

+Eigen
+#####
+`Eigen <http://eigen.tuxfamily.org>`_ is a C++ header-based library for dense and
+sparse linear algebra. Due to its popularity and widespread adoption, pybind11
+provides transparent conversion and limited mapping support between Eigen and
+Scientific Python linear algebra data types.
+To enable the built-in Eigen support you must include the optional header file
+:file:`pybind11/eigen.h`.
+Pass-by-value
+=============
+When binding a function with ordinary Eigen dense object arguments (for
+example, ``Eigen::MatrixXd``), pybind11 will accept any input value that is
+already (or convertible to) a ``numpy.ndarray`` with dimensions compatible with
+the Eigen type, copy its values into a temporary Eigen variable of the
+appropriate type, then call the function with this temporary variable.
+Sparse matrices are similarly copied to or from
+``scipy.sparse.csr_matrix``/``scipy.sparse.csc_matrix`` objects.
+Pass-by-reference
+=================
+One major limitation of the above is that every data conversion implicitly
+involves a copy, which can be both expensive (for large matrices) and disallows
+binding functions that change their (Matrix) arguments.  Pybind11 allows you to
+work around this by using Eigen's ``Eigen::Ref<MatrixType>`` class much as you
+would when writing a function taking a generic type in Eigen itself (subject to
+some limitations discussed below).
+When calling a bound function accepting a ``Eigen::Ref<const MatrixType>``
+type, pybind11 will attempt to avoid copying by using an ``Eigen::Map`` object
+that maps into the source ``numpy.ndarray`` data: this requires both that the
+data types are the same (e.g. ``dtype='float64'`` and ``MatrixType::Scalar`` is
+``double``); and that the storage is layout compatible.  The latter limitation
+is discussed in detail in the section below, and requires careful
+consideration: by default, numpy matrices and Eigen matrices are *not* storage
+compatible.
+If the numpy matrix cannot be used as is (either because its types differ, e.g.
+passing an array of integers to an Eigen parameter requiring doubles, or
+because the storage is incompatible), pybind11 makes a temporary copy and
+passes the copy instead.
+When a bound function parameter is instead ``Eigen::Ref<MatrixType>`` (note the
+lack of ``const``), pybind11 will only allow the function to be called if it
+can be mapped *and* if the numpy array is writeable (that is
+``a.flags.writeable`` is true).  Any access (including modification) made to
+the passed variable will be transparently carried out directly on the
+``numpy.ndarray``.
+This means you can write code such as the following and have it work as
+expected:
+.. code-block:: cpp
+    void scale_by_2(Eigen::Ref<Eigen::VectorXd> v) {
+        v *= 2;
+    }
+Note, however, that you will likely run into limitations due to numpy and
+Eigen's different default storage order for data; see the below section on
+:ref:`storage_orders` for details on how to bind code that won't run into such
+limitations.
+.. note::
+    Passing by reference is not supported for sparse types.
+Returning values to Python
+==========================
+When returning an ordinary dense Eigen matrix type to numpy (e.g.
+``Eigen::MatrixXd`` or ``Eigen::RowVectorXf``) pybind11 keeps the matrix and
+returns a numpy array that directly references the Eigen matrix: no copy of the
+data is performed.  The numpy array will have ``array.flags.owndata`` set to
+``False`` to indicate that it does not own the data, and the lifetime of the
+stored Eigen matrix will be tied to the returned ``array``.
+If you bind a function with a non-reference, ``const`` return type (e.g.
+``const Eigen::MatrixXd``), the same thing happens except that pybind11 also
+sets the numpy array's ``writeable`` flag to false.
+If you return an lvalue reference or pointer, the usual pybind11 rules apply,
+as dictated by the binding function's return value policy (see the
+documentation on :ref:`return_value_policies` for full details).  That means,
+without an explicit return value policy, lvalue references will be copied and
+pointers will be managed by pybind11.  In order to avoid copying, you should
+explicitly specify an appropriate return value policy, as in the following
+example:
+.. code-block:: cpp
+    class MyClass {
+        Eigen::MatrixXd big_mat = Eigen::MatrixXd::Zero(10000, 10000);
+    public:
+        Eigen::MatrixXd &getMatrix() { return big_mat; }
+        const Eigen::MatrixXd &viewMatrix() { return big_mat; }
+    };
+    // Later, in binding code:
+    py::class_<MyClass>(m, "MyClass")
+        .def(py::init<>())
+        .def("copy_matrix", &MyClass::getMatrix) // Makes a copy!
+        .def("get_matrix", &MyClass::getMatrix, py::return_value_policy::reference_internal)
+        .def("view_matrix", &MyClass::viewMatrix, py::return_value_policy::reference_internal)
+        ;
+.. code-block:: python
+    a = MyClass()
+    m = a.get_matrix()   # flags.writeable = True,  flags.owndata = False
+    v = a.view_matrix()  # flags.writeable = False, flags.owndata = False
+    c = a.copy_matrix()  # flags.writeable = True,  flags.owndata = True
+    # m[5,6] and v[5,6] refer to the same element, c[5,6] does not.
+Note in this example that ``py::return_value_policy::reference_internal`` is
+used to tie the life of the MyClass object to the life of the returned arrays.
+You may also return an ``Eigen::Ref``, ``Eigen::Map`` or other map-like Eigen
+object (for example, the return value of ``matrix.block()`` and related
+methods) that map into a dense Eigen type.  When doing so, the default
+behaviour of pybind11 is to simply reference the returned data: you must take
+care to ensure that this data remains valid!  You may ask pybind11 to
+explicitly *copy* such a return value by using the
+``py::return_value_policy::copy`` policy when binding the function.  You may
+also use ``py::return_value_policy::reference_internal`` or a
+``py::keep_alive`` to ensure the data stays valid as long as the returned numpy
+array does.
+When returning such a reference or map, pybind11 additionally respects the
+readonly-status of the returned value, marking the numpy array as non-writeable
+if the reference or map was itself read-only.
+.. note::
+    Sparse types are always copied when returned.
+.. _storage_orders:
+Storage orders
+==============
+Passing arguments via ``Eigen::Ref`` has some limitations that you must be
+aware of in order to effectively pass matrices by reference.  First and
+foremost is that the default ``Eigen::Ref<MatrixType>`` class requires
+contiguous storage along columns (for column-major types, the default in Eigen)
+or rows if ``MatrixType`` is specifically an ``Eigen::RowMajor`` storage type.
+The former, Eigen's default, is incompatible with ``numpy``'s default row-major
+storage, and so you will not be able to pass numpy arrays to Eigen by reference
+without making one of two changes.
+(Note that this does not apply to vectors (or column or row matrices): for such
+types the "row-major" and "column-major" distinction is meaningless).
+The first approach is to change the use of ``Eigen::Ref<MatrixType>`` to the
+more general ``Eigen::Ref<MatrixType, 0, Eigen::Stride<Eigen::Dynamic,
+Eigen::Dynamic>>`` (or similar type with a fully dynamic stride type in the
+third template argument).  Since this is a rather cumbersome type, pybind11
+provides a ``py::EigenDRef<MatrixType>`` type alias for your convenience (along
+with EigenDMap for the equivalent Map, and EigenDStride for just the stride
+type).
+This type allows Eigen to map into any arbitrary storage order.  This is not
+the default in Eigen for performance reasons: contiguous storage allows
+vectorization that cannot be done when storage is not known to be contiguous at
+compile time.  The default ``Eigen::Ref`` stride type allows non-contiguous
+storage along the outer dimension (that is, the rows of a column-major matrix
+or columns of a row-major matrix), but not along the inner dimension.
+This type, however, has the added benefit of also being able to map numpy array
+slices.  For example, the following (contrived) example uses Eigen with a numpy
+slice to multiply by 2 all coefficients that are both on even rows (0, 2, 4,
+...) and in columns 2, 5, or 8:
+.. code-block:: cpp
+    m.def("scale", [](py::EigenDRef<Eigen::MatrixXd> m, double c) { m *= c; });
+.. code-block:: python
+    # a = np.array(...)
+    scale_by_2(myarray[0::2, 2:9:3])
+The second approach to avoid copying is more intrusive: rearranging the
+underlying data types to not run into the non-contiguous storage problem in the
+first place.  In particular, that means using matrices with ``Eigen::RowMajor``
+storage, where appropriate, such as:
+.. code-block:: cpp
+    using RowMatrixXd = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+    // Use RowMatrixXd instead of MatrixXd
+Now bound functions accepting ``Eigen::Ref<RowMatrixXd>`` arguments will be
+callable with numpy's (default) arrays without involving a copy.
+You can, alternatively, change the storage order that numpy arrays use by
+adding the ``order='F'`` option when creating an array:
+.. code-block:: python
+    myarray = np.array(source, order='F')
+Such an object will be passable to a bound function accepting an
+``Eigen::Ref<MatrixXd>`` (or similar column-major Eigen type).
+One major caveat with this approach, however, is that it is not entirely as
+easy as simply flipping all Eigen or numpy usage from one to the other: some
+operations may alter the storage order of a numpy array.  For example, ``a2 =
+array.transpose()`` results in ``a2`` being a view of ``array`` that references
+the same data, but in the opposite storage order!
+While this approach allows fully optimized vectorized calculations in Eigen, it
+cannot be used with array slices, unlike the first approach.
+When *returning* a matrix to Python (either a regular matrix, a reference via
+``Eigen::Ref<>``, or a map/block into a matrix), no special storage
+consideration is required: the created numpy array will have the required
+stride that allows numpy to properly interpret the array, whatever its storage
+order.
+Failing rather than copying
+===========================
+The default behaviour when binding ``Eigen::Ref<const MatrixType>`` Eigen
+references is to copy matrix values when passed a numpy array that does not
+conform to the element type of ``MatrixType`` or does not have a compatible
+stride layout.  If you want to explicitly avoid copying in such a case, you
+should bind arguments using the ``py::arg().noconvert()`` annotation (as
+described in the :ref:`nonconverting_arguments` documentation).
+The following example shows arguments that don't allow data copying to take
+place:
+.. code-block:: cpp
+    // The method and function to be bound:
+    class MyClass {
+        // ...
+        double some_method(const Eigen::Ref<const MatrixXd> &matrix) { /* ... */ }
+    };
+    float some_function(const Eigen::Ref<const MatrixXf> &big,
+                        const Eigen::Ref<const MatrixXf> &small) {
+        // ...
+    }
+    // The associated binding code:
+    using namespace pybind11::literals; // for "arg"_a
+    py::class_<MyClass>(m, "MyClass")
+        // ... other class definitions
+        .def("some_method", &MyClass::some_method, py::arg().noconvert());
+    m.def("some_function", &some_function,
+        "big"_a.noconvert(), // <- Don't allow copying for this arg
+        "small"_a            // <- This one can be copied if needed
+    );
+With the above binding code, attempting to call the ``some_method(m)``
+method on a ``MyClass`` object, or attempting to call ``some_function(m, m2)``
+will raise a ``RuntimeError`` rather than making a temporary copy of the array.
+It will, however, allow the ``m2`` argument to be copied into a temporary if
+necessary.
+Note that explicitly specifying ``.noconvert()`` is not required for *mutable*
+Eigen references (e.g. ``Eigen::Ref<MatrixXd>`` without ``const`` on the
+``MatrixXd``): mutable references will never be called with a temporary copy.
+Vectors versus column/row matrices
+==================================
+Eigen and numpy have fundamentally different notions of a vector.  In Eigen, a
+vector is simply a matrix with the number of columns or rows set to 1 at
+compile time (for a column vector or row vector, respectively).  Numpy, in
+contrast, has comparable 2-dimensional 1xN and Nx1 arrays, but *also* has
+1-dimensional arrays of size N.
+When passing a 2-dimensional 1xN or Nx1 array to Eigen, the Eigen type must
+have matching dimensions: That is, you cannot pass a 2-dimensional Nx1 numpy
+array to an Eigen value expecting a row vector, or a 1xN numpy array as a
+column vector argument.
+On the other hand, pybind11 allows you to pass 1-dimensional arrays of length N
+as Eigen parameters.  If the Eigen type can hold a column vector of length N it
+will be passed as such a column vector.  If not, but the Eigen type constraints
+will accept a row vector, it will be passed as a row vector.  (The column
+vector takes precedence when both are supported, for example, when passing a
+1D numpy array to a MatrixXd argument).  Note that the type need not be
+explicitly a vector: it is permitted to pass a 1D numpy array of size 5 to an
+Eigen ``Matrix<double, Dynamic, 5>``: you would end up with a 1x5 Eigen matrix.
+Passing the same to an ``Eigen::MatrixXd`` would result in a 5x1 Eigen matrix.
+When returning an Eigen vector to numpy, the conversion is ambiguous: a row
+vector of length 4 could be returned as either a 1D array of length 4, or as a
+2D array of size 1x4.  When encountering such a situation, pybind11 compromises
+by considering the returned Eigen type: if it is a compile-time vector--that
+is, the type has either the number of rows or columns set to 1 at compile
+time--pybind11 converts to a 1D numpy array when returning the value.  For
+instances that are a vector only at run-time (e.g. ``MatrixXd``,
+``Matrix<float, Dynamic, 4>``), pybind11 returns the vector as a 2D array to
+numpy.  If this isn't what you want, you can use ``array.reshape(...)`` to get
+a view of the same data in the desired dimensions.
+.. seealso::
+    The file :file:`tests/test_eigen.cpp` contains a complete example that
+    shows how to pass Eigen sparse and dense data types in more detail.

DiffVG/pybind11/docs/advanced/cast/functional.rst ADDED Viewed

	@@ -0,0 +1,109 @@

+Functional
+##########
+The following features must be enabled by including :file:`pybind11/functional.h`.
+Callbacks and passing anonymous functions
+=========================================
+The C++11 standard brought lambda functions and the generic polymorphic
+function wrapper ``std::function<>`` to the C++ programming language, which
+enable powerful new ways of working with functions. Lambda functions come in
+two flavors: stateless lambda functions resemble classic function pointers that
+link to an anonymous piece of code, while stateful lambda functions
+additionally depend on captured variables that are stored in an anonymous
+*lambda closure object*.
+Here is a simple example of a C++ function that takes an arbitrary function
+(stateful or stateless) with signature ``int -> int`` as an argument and runs
+it with the value 10.
+.. code-block:: cpp
+    int func_arg(const std::function<int(int)> &f) {
+        return f(10);
+    }
+The example below is more involved: it takes a function of signature ``int -> int``
+and returns another function of the same kind. The return value is a stateful
+lambda function, which stores the value ``f`` in the capture object and adds 1 to
+its return value upon execution.
+.. code-block:: cpp
+    std::function<int(int)> func_ret(const std::function<int(int)> &f) {
+        return [f](int i) {
+            return f(i) + 1;
+        };
+    }
+This example demonstrates using Python named parameters in C++ callbacks which
+requires using ``py::cpp_function`` as a wrapper. Usage is similar to defining
+methods of classes:
+.. code-block:: cpp
+    py::cpp_function func_cpp() {
+        return py::cpp_function([](int i) { return i+1; },
+           py::arg("number"));
+    }
+After including the extra header file :file:`pybind11/functional.h`, it is almost
+trivial to generate binding code for all of these functions.
+.. code-block:: cpp
+    #include <pybind11/functional.h>
+    PYBIND11_MODULE(example, m) {
+        m.def("func_arg", &func_arg);
+        m.def("func_ret", &func_ret);
+        m.def("func_cpp", &func_cpp);
+    }
+The following interactive session shows how to call them from Python.
+.. code-block:: pycon
+    $ python
+    >>> import example
+    >>> def square(i):
+    ...     return i * i
+    ...
+    >>> example.func_arg(square)
+    100L
+    >>> square_plus_1 = example.func_ret(square)
+    >>> square_plus_1(4)
+    17L
+    >>> plus_1 = example.func_cpp()
+    >>> plus_1(number=43)
+    44L
+.. warning::
+    Keep in mind that passing a function from C++ to Python (or vice versa)
+    will instantiate a piece of wrapper code that translates function
+    invocations between the two languages. Naturally, this translation
+    increases the computational cost of each function call somewhat. A
+    problematic situation can arise when a function is copied back and forth
+    between Python and C++ many times in a row, in which case the underlying
+    wrappers will accumulate correspondingly. The resulting long sequence of
+    C++ -> Python -> C++ -> ... roundtrips can significantly decrease
+    performance.
+    There is one exception: pybind11 detects the case where a stateless function
+    (i.e. a function pointer or a lambda function without captured variables)
+    is passed as an argument to another C++ function exposed in Python. In this
+    case, there is no overhead. Pybind11 will extract the underlying C++
+    function pointer from the wrapped function to sidestep a potential C++ ->
+    Python -> C++ roundtrip. This is demonstrated in :file:`tests/test_callbacks.cpp`.
+.. note::
+    This functionality is very useful when generating bindings for callbacks in
+    C++ libraries (e.g. GUI libraries, asynchronous networking libraries, etc.).
+    The file :file:`tests/test_callbacks.cpp` contains a complete example
+    that demonstrates how to work with callbacks and anonymous functions in
+    more detail.

DiffVG/pybind11/docs/advanced/cast/index.rst ADDED Viewed

	@@ -0,0 +1,41 @@

+Type conversions
+################
+Apart from enabling cross-language function calls, a fundamental problem
+that a binding tool like pybind11 must address is to provide access to
+native Python types in C++ and vice versa. There are three fundamentally
+different ways to do this—which approach is preferable for a particular type
+depends on the situation at hand.
+1. Use a native C++ type everywhere. In this case, the type must be wrapped
+   using pybind11-generated bindings so that Python can interact with it.
+2. Use a native Python type everywhere. It will need to be wrapped so that
+   C++ functions can interact with it.
+3. Use a native C++ type on the C++ side and a native Python type on the
+   Python side. pybind11 refers to this as a *type conversion*.
+   Type conversions are the most "natural" option in the sense that native
+   (non-wrapped) types are used everywhere. The main downside is that a copy
+   of the data must be made on every Python ↔ C++ transition: this is
+   needed since the C++ and Python versions of the same type generally won't
+   have the same memory layout.
+   pybind11 can perform many kinds of conversions automatically. An overview
+   is provided in the table ":ref:`conversion_table`".
+The following subsections discuss the differences between these options in more
+detail. The main focus in this section is on type conversions, which represent
+the last case of the above list.
+.. toctree::
+   :maxdepth: 1
+   overview
+   strings
+   stl
+   functional
+   chrono
+   eigen
+   custom

DiffVG/pybind11/docs/advanced/cast/overview.rst ADDED Viewed

	@@ -0,0 +1,165 @@

+Overview
+########
+.. rubric:: 1. Native type in C++, wrapper in Python
+Exposing a custom C++ type using :class:`py::class_` was covered in detail
+in the :doc:`/classes` section. There, the underlying data structure is
+always the original C++ class while the :class:`py::class_` wrapper provides
+a Python interface. Internally, when an object like this is sent from C++ to
+Python, pybind11 will just add the outer wrapper layer over the native C++
+object. Getting it back from Python is just a matter of peeling off the
+wrapper.
+.. rubric:: 2. Wrapper in C++, native type in Python
+This is the exact opposite situation. Now, we have a type which is native to
+Python, like a ``tuple`` or a ``list``. One way to get this data into C++ is
+with the :class:`py::object` family of wrappers. These are explained in more
+detail in the :doc:`/advanced/pycpp/object` section. We'll just give a quick
+example here:
+.. code-block:: cpp
+    void print_list(py::list my_list) {
+        for (auto item : my_list)
+            std::cout << item << " ";
+    }
+.. code-block:: pycon
+    >>> print_list([1, 2, 3])
+    1 2 3
+The Python ``list`` is not converted in any way -- it's just wrapped in a C++
+:class:`py::list` class. At its core it's still a Python object. Copying a
+:class:`py::list` will do the usual reference-counting like in Python.
+Returning the object to Python will just remove the thin wrapper.
+.. rubric:: 3. Converting between native C++ and Python types
+In the previous two cases we had a native type in one language and a wrapper in
+the other. Now, we have native types on both sides and we convert between them.
+.. code-block:: cpp
+    void print_vector(const std::vector<int> &v) {
+        for (auto item : v)
+            std::cout << item << "\n";
+    }
+.. code-block:: pycon
+    >>> print_vector([1, 2, 3])
+    1
+    2
+    3
+In this case, pybind11 will construct a new ``std::vector<int>`` and copy each
+element from the Python ``list``. The newly constructed object will be passed
+to ``print_vector``. The same thing happens in the other direction: a new
+``list`` is made to match the value returned from C++.
+Lots of these conversions are supported out of the box, as shown in the table
+below. They are very convenient, but keep in mind that these conversions are
+fundamentally based on copying data. This is perfectly fine for small immutable
+types but it may become quite expensive for large data structures. This can be
+avoided by overriding the automatic conversion with a custom wrapper (i.e. the
+above-mentioned approach 1). This requires some manual effort and more details
+are available in the :ref:`opaque` section.
+.. _conversion_table:
+List of all builtin conversions
+-------------------------------
+The following basic data types are supported out of the box (some may require
+an additional extension header to be included). To pass other data structures
+as arguments and return values, refer to the section on binding :ref:`classes`.
++------------------------------------+---------------------------+-------------------------------+
+|  Data type                         |  Description              | Header file                   |
++====================================+===========================+===============================+
+| ``int8_t``, ``uint8_t``            | 8-bit integers            | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``int16_t``, ``uint16_t``          | 16-bit integers           | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``int32_t``, ``uint32_t``          | 32-bit integers           | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``int64_t``, ``uint64_t``          | 64-bit integers           | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``ssize_t``, ``size_t``            | Platform-dependent size   | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``float``, ``double``              | Floating point types      | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``bool``                           | Two-state Boolean type    | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``char``                           | Character literal         | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``char16_t``                       | UTF-16 character literal  | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``char32_t``                       | UTF-32 character literal  | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``wchar_t``                        | Wide character literal    | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``const char *``                   | UTF-8 string literal      | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``const char16_t *``               | UTF-16 string literal     | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``const char32_t *``               | UTF-32 string literal     | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``const wchar_t *``                | Wide string literal       | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::string``                    | STL dynamic UTF-8 string  | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::u16string``                 | STL dynamic UTF-16 string | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::u32string``                 | STL dynamic UTF-32 string | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::wstring``                   | STL dynamic wide string   | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::string_view``,              | STL C++17 string views    | :file:`pybind11/pybind11.h`   |
+| ``std::u16string_view``, etc.      |                           |                               |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::pair<T1, T2>``              | Pair of two custom types  | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::tuple<...>``                | Arbitrary tuple of types  | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::reference_wrapper<...>``    | Reference type wrapper    | :file:`pybind11/pybind11.h`   |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::complex<T>``                | Complex numbers           | :file:`pybind11/complex.h`    |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::array<T, Size>``            | STL static array          | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::vector<T>``                 | STL dynamic array         | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::deque<T>``                  | STL double-ended queue    | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::valarray<T>``               | STL value array           | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::list<T>``                   | STL linked list           | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::map<T1, T2>``               | STL ordered map           | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::unordered_map<T1, T2>``     | STL unordered map         | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::set<T>``                    | STL ordered set           | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::unordered_set<T>``          | STL unordered set         | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::optional<T>``               | STL optional type (C++17) | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::experimental::optional<T>`` | STL optional type (exp.)  | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::variant<...>``              | Type-safe union (C++17)   | :file:`pybind11/stl.h`        |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::function<...>``             | STL polymorphic function  | :file:`pybind11/functional.h` |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::chrono::duration<...>``     | STL time duration         | :file:`pybind11/chrono.h`     |
++------------------------------------+---------------------------+-------------------------------+
+| ``std::chrono::time_point<...>``   | STL date/time             | :file:`pybind11/chrono.h`     |
++------------------------------------+---------------------------+-------------------------------+
+| ``Eigen::Matrix<...>``             | Eigen: dense matrix       | :file:`pybind11/eigen.h`      |
++------------------------------------+---------------------------+-------------------------------+
+| ``Eigen::Map<...>``                | Eigen: mapped memory      | :file:`pybind11/eigen.h`      |
++------------------------------------+---------------------------+-------------------------------+
+| ``Eigen::SparseMatrix<...>``       | Eigen: sparse matrix      | :file:`pybind11/eigen.h`      |
++------------------------------------+---------------------------+-------------------------------+

DiffVG/pybind11/docs/advanced/cast/stl.rst ADDED Viewed

	@@ -0,0 +1,240 @@

+STL containers
+##############
+Automatic conversion
+====================
+When including the additional header file :file:`pybind11/stl.h`, conversions
+between ``std::vector<>``/``std::deque<>``/``std::list<>``/``std::array<>``,
+``std::set<>``/``std::unordered_set<>``, and
+``std::map<>``/``std::unordered_map<>`` and the Python ``list``, ``set`` and
+``dict`` data structures are automatically enabled. The types ``std::pair<>``
+and ``std::tuple<>`` are already supported out of the box with just the core
+:file:`pybind11/pybind11.h` header.
+The major downside of these implicit conversions is that containers must be
+converted (i.e. copied) on every Python->C++ and C++->Python transition, which
+can have implications on the program semantics and performance. Please read the
+next sections for more details and alternative approaches that avoid this.
+.. note::
+    Arbitrary nesting of any of these types is possible.
+.. seealso::
+    The file :file:`tests/test_stl.cpp` contains a complete
+    example that demonstrates how to pass STL data types in more detail.
+.. _cpp17_container_casters:
+C++17 library containers
+========================
+The :file:`pybind11/stl.h` header also includes support for ``std::optional<>``
+and ``std::variant<>``. These require a C++17 compiler and standard library.
+In C++14 mode, ``std::experimental::optional<>`` is supported if available.
+Various versions of these containers also exist for C++11 (e.g. in Boost).
+pybind11 provides an easy way to specialize the ``type_caster`` for such
+types:
+.. code-block:: cpp
+    // `boost::optional` as an example -- can be any `std::optional`-like container
+    namespace pybind11 { namespace detail {
+        template <typename T>
+        struct type_caster<boost::optional<T>> : optional_caster<boost::optional<T>> {};
+    }}
+The above should be placed in a header file and included in all translation units
+where automatic conversion is needed. Similarly, a specialization can be provided
+for custom variant types:
+.. code-block:: cpp
+    // `boost::variant` as an example -- can be any `std::variant`-like container
+    namespace pybind11 { namespace detail {
+        template <typename... Ts>
+        struct type_caster<boost::variant<Ts...>> : variant_caster<boost::variant<Ts...>> {};
+        // Specifies the function used to visit the variant -- `apply_visitor` instead of `visit`
+        template <>
+        struct visit_helper<boost::variant> {
+            template <typename... Args>
+            static auto call(Args &&...args) -> decltype(boost::apply_visitor(args...)) {
+                return boost::apply_visitor(args...);
+            }
+        };
+    }} // namespace pybind11::detail
+The ``visit_helper`` specialization is not required if your ``name::variant`` provides
+a ``name::visit()`` function. For any other function name, the specialization must be
+included to tell pybind11 how to visit the variant.
+.. note::
+    pybind11 only supports the modern implementation of ``boost::variant``
+    which makes use of variadic templates. This requires Boost 1.56 or newer.
+    Additionally, on Windows, MSVC 2017 is required because ``boost::variant``
+    falls back to the old non-variadic implementation on MSVC 2015.
+.. _opaque:
+Making opaque types
+===================
+pybind11 heavily relies on a template matching mechanism to convert parameters
+and return values that are constructed from STL data types such as vectors,
+linked lists, hash tables, etc. This even works in a recursive manner, for
+instance to deal with lists of hash maps of pairs of elementary and custom
+types, etc.
+However, a fundamental limitation of this approach is that internal conversions
+between Python and C++ types involve a copy operation that prevents
+pass-by-reference semantics. What does this mean?
+Suppose we bind the following function
+.. code-block:: cpp
+    void append_1(std::vector<int> &v) {
+       v.push_back(1);
+    }
+and call it from Python, the following happens:
+.. code-block:: pycon
+   >>> v = [5, 6]
+   >>> append_1(v)
+   >>> print(v)
+   [5, 6]
+As you can see, when passing STL data structures by reference, modifications
+are not propagated back to the Python side. A similar situation arises when
+exposing STL data structures using the ``def_readwrite`` or ``def_readonly``
+functions:
+.. code-block:: cpp
+    /* ... definition ... */
+    class MyClass {
+        std::vector<int> contents;
+    };
+    /* ... binding code ... */
+    py::class_<MyClass>(m, "MyClass")
+        .def(py::init<>())
+        .def_readwrite("contents", &MyClass::contents);
+In this case, properties can be read and written in their entirety. However, an
+``append`` operation involving such a list type has no effect:
+.. code-block:: pycon
+   >>> m = MyClass()
+   >>> m.contents = [5, 6]
+   >>> print(m.contents)
+   [5, 6]
+   >>> m.contents.append(7)
+   >>> print(m.contents)
+   [5, 6]
+Finally, the involved copy operations can be costly when dealing with very
+large lists. To deal with all of the above situations, pybind11 provides a
+macro named ``PYBIND11_MAKE_OPAQUE(T)`` that disables the template-based
+conversion machinery of types, thus rendering them *opaque*. The contents of
+opaque objects are never inspected or extracted, hence they *can* be passed by
+reference. For instance, to turn ``std::vector<int>`` into an opaque type, add
+the declaration
+.. code-block:: cpp
+    PYBIND11_MAKE_OPAQUE(std::vector<int>);
+before any binding code (e.g. invocations to ``class_::def()``, etc.). This
+macro must be specified at the top level (and outside of any namespaces), since
+it instantiates a partial template overload. If your binding code consists of
+multiple compilation units, it must be present in every file (typically via a
+common header) preceding any usage of ``std::vector<int>``. Opaque types must
+also have a corresponding ``class_`` declaration to associate them with a name
+in Python, and to define a set of available operations, e.g.:
+.. code-block:: cpp
+    py::class_<std::vector<int>>(m, "IntVector")
+        .def(py::init<>())
+        .def("clear", &std::vector<int>::clear)
+        .def("pop_back", &std::vector<int>::pop_back)
+        .def("__len__", [](const std::vector<int> &v) { return v.size(); })
+        .def("__iter__", [](std::vector<int> &v) {
+           return py::make_iterator(v.begin(), v.end());
+        }, py::keep_alive<0, 1>()) /* Keep vector alive while iterator is used */
+        // ....
+.. seealso::
+    The file :file:`tests/test_opaque_types.cpp` contains a complete
+    example that demonstrates how to create and expose opaque types using
+    pybind11 in more detail.
+.. _stl_bind:
+Binding STL containers
+======================
+The ability to expose STL containers as native Python objects is a fairly
+common request, hence pybind11 also provides an optional header file named
+:file:`pybind11/stl_bind.h` that does exactly this. The mapped containers try
+to match the behavior of their native Python counterparts as much as possible.
+The following example showcases usage of :file:`pybind11/stl_bind.h`:
+.. code-block:: cpp
+    // Don't forget this
+    #include <pybind11/stl_bind.h>
+    PYBIND11_MAKE_OPAQUE(std::vector<int>);
+    PYBIND11_MAKE_OPAQUE(std::map<std::string, double>);
+    // ...
+    // later in binding code:
+    py::bind_vector<std::vector<int>>(m, "VectorInt");
+    py::bind_map<std::map<std::string, double>>(m, "MapStringDouble");
+When binding STL containers pybind11 considers the types of the container's
+elements to decide whether the container should be confined to the local module
+(via the :ref:`module_local` feature).  If the container element types are
+anything other than already-bound custom types bound without
+``py::module_local()`` the container binding will have ``py::module_local()``
+applied.  This includes converting types such as numeric types, strings, Eigen
+types; and types that have not yet been bound at the time of the STL container
+binding.  This module-local binding is designed to avoid potential conflicts
+between module bindings (for example, from two separate modules each attempting
+to bind ``std::vector<int>`` as a Python type).
+It is possible to override this behavior to force a definition to be either
+module-local or global.  To do so, you can pass the attributes
+``py::module_local()`` (to make the binding module-local) or
+``py::module_local(false)`` (to make the binding global) into the
+``py::bind_vector`` or ``py::bind_map`` arguments:
+.. code-block:: cpp
+    py::bind_vector<std::vector<int>>(m, "VectorInt", py::module_local(false));
+Note, however, that such a global binding would make it impossible to load this
+module at the same time as any other pybind11 module that also attempts to bind
+the same container type (``std::vector<int>`` in the above example).
+See :ref:`module_local` for more details on module-local bindings.
+.. seealso::
+    The file :file:`tests/test_stl_binders.cpp` shows how to use the
+    convenience STL container wrappers.

DiffVG/pybind11/docs/advanced/cast/strings.rst ADDED Viewed

	@@ -0,0 +1,305 @@

+Strings, bytes and Unicode conversions
+######################################
+.. note::
+    This section discusses string handling in terms of Python 3 strings. For
+    Python 2.7, replace all occurrences of ``str`` with ``unicode`` and
+    ``bytes`` with ``str``.  Python 2.7 users may find it best to use ``from
+    __future__ import unicode_literals`` to avoid unintentionally using ``str``
+    instead of ``unicode``.
+Passing Python strings to C++
+=============================
+When a Python ``str`` is passed from Python to a C++ function that accepts
+``std::string`` or ``char *`` as arguments, pybind11 will encode the Python
+string to UTF-8. All Python ``str`` can be encoded in UTF-8, so this operation
+does not fail.
+The C++ language is encoding agnostic. It is the responsibility of the
+programmer to track encodings. It's often easiest to simply `use UTF-8
+everywhere <http://utf8everywhere.org/>`_.
+.. code-block:: c++
+    m.def("utf8_test",
+        [](const std::string &s) {
+            cout << "utf-8 is icing on the cake.\n";
+            cout << s;
+        }
+    );
+    m.def("utf8_charptr",
+        [](const char *s) {
+            cout << "My favorite food is\n";
+            cout << s;
+        }
+    );
+.. code-block:: python
+    >>> utf8_test('🎂')
+    utf-8 is icing on the cake.
+    🎂
+    >>> utf8_charptr('🍕')
+    My favorite food is
+    🍕
+.. note::
+    Some terminal emulators do not support UTF-8 or emoji fonts and may not
+    display the example above correctly.
+The results are the same whether the C++ function accepts arguments by value or
+reference, and whether or not ``const`` is used.
+Passing bytes to C++
+--------------------
+A Python ``bytes`` object will be passed to C++ functions that accept
+``std::string`` or ``char*`` *without* conversion.  On Python 3, in order to
+make a function *only* accept ``bytes`` (and not ``str``), declare it as taking
+a ``py::bytes`` argument.
+Returning C++ strings to Python
+===============================
+When a C++ function returns a ``std::string`` or ``char*`` to a Python caller,
+**pybind11 will assume that the string is valid UTF-8** and will decode it to a
+native Python ``str``, using the same API as Python uses to perform
+``bytes.decode('utf-8')``. If this implicit conversion fails, pybind11 will
+raise a ``UnicodeDecodeError``.
+.. code-block:: c++
+    m.def("std_string_return",
+        []() {
+            return std::string("This string needs to be UTF-8 encoded");
+        }
+    );
+.. code-block:: python
+    >>> isinstance(example.std_string_return(), str)
+    True
+Because UTF-8 is inclusive of pure ASCII, there is never any issue with
+returning a pure ASCII string to Python. If there is any possibility that the
+string is not pure ASCII, it is necessary to ensure the encoding is valid
+UTF-8.
+.. warning::
+    Implicit conversion assumes that a returned ``char *`` is null-terminated.
+    If there is no null terminator a buffer overrun will occur.
+Explicit conversions
+--------------------
+If some C++ code constructs a ``std::string`` that is not a UTF-8 string, one
+can perform an explicit conversion and return a ``py::str`` object. Explicit
+conversion has the same overhead as implicit conversion.
+.. code-block:: c++
+    // This uses the Python C API to convert Latin-1 to Unicode
+    m.def("str_output",
+        []() {
+            std::string s = "Send your r\xe9sum\xe9 to Alice in HR"; // Latin-1
+            py::str py_s = PyUnicode_DecodeLatin1(s.data(), s.length());
+            return py_s;
+        }
+    );
+.. code-block:: python
+    >>> str_output()
+    'Send your résumé to Alice in HR'
+The `Python C API
+<https://docs.python.org/3/c-api/unicode.html#built-in-codecs>`_ provides
+several built-in codecs.
+One could also use a third party encoding library such as libiconv to transcode
+to UTF-8.
+Return C++ strings without conversion
+-------------------------------------
+If the data in a C++ ``std::string`` does not represent text and should be
+returned to Python as ``bytes``, then one can return the data as a
+``py::bytes`` object.
+.. code-block:: c++
+    m.def("return_bytes",
+        []() {
+            std::string s("\xba\xd0\xba\xd0");  // Not valid UTF-8
+            return py::bytes(s);  // Return the data without transcoding
+        }
+    );
+.. code-block:: python
+    >>> example.return_bytes()
+    b'\xba\xd0\xba\xd0'
+Note the asymmetry: pybind11 will convert ``bytes`` to ``std::string`` without
+encoding, but cannot convert ``std::string`` back to ``bytes`` implicitly.
+.. code-block:: c++
+    m.def("asymmetry",
+        [](std::string s) {  // Accepts str or bytes from Python
+            return s;  // Looks harmless, but implicitly converts to str
+        }
+    );
+.. code-block:: python
+    >>> isinstance(example.asymmetry(b"have some bytes"), str)
+    True
+    >>> example.asymmetry(b"\xba\xd0\xba\xd0")  # invalid utf-8 as bytes
+    UnicodeDecodeError: 'utf-8' codec can't decode byte 0xba in position 0: invalid start byte
+Wide character strings
+======================
+When a Python ``str`` is passed to a C++ function expecting ``std::wstring``,
+``wchar_t*``, ``std::u16string`` or ``std::u32string``, the ``str`` will be
+encoded to UTF-16 or UTF-32 depending on how the C++ compiler implements each
+type, in the platform's native endianness. When strings of these types are
+returned, they are assumed to contain valid UTF-16 or UTF-32, and will be
+decoded to Python ``str``.
+.. code-block:: c++
+    #define UNICODE
+    #include <windows.h>
+    m.def("set_window_text",
+        [](HWND hwnd, std::wstring s) {
+            // Call SetWindowText with null-terminated UTF-16 string
+            ::SetWindowText(hwnd, s.c_str());
+        }
+    );
+    m.def("get_window_text",
+        [](HWND hwnd) {
+            const int buffer_size = ::GetWindowTextLength(hwnd) + 1;
+            auto buffer = std::make_unique< wchar_t[] >(buffer_size);
+            ::GetWindowText(hwnd, buffer.get(), buffer_size);
+            std::wstring text(buffer.get());
+            // wstring will be converted to Python str
+            return text;
+        }
+    );
+.. warning::
+    Wide character strings may not work as described on Python 2.7 or Python
+    3.3 compiled with ``--enable-unicode=ucs2``.
+Strings in multibyte encodings such as Shift-JIS must be transcoded to
+UTF-8/16/32 before being returned to Python.
+Character literals
+==================
+C++ functions that accept character literals as input will receive the first
+character of a Python ``str`` as their input. If the string is longer than one
+Unicode character, trailing characters will be ignored.
+When a character literal is returned from C++ (such as a ``char`` or a
+``wchar_t``), it will be converted to a ``str`` that represents the single
+character.
+.. code-block:: c++
+    m.def("pass_char", [](char c) { return c; });
+    m.def("pass_wchar", [](wchar_t w) { return w; });
+.. code-block:: python
+    >>> example.pass_char('A')
+    'A'
+While C++ will cast integers to character types (``char c = 0x65;``), pybind11
+does not convert Python integers to characters implicitly. The Python function
+``chr()`` can be used to convert integers to characters.
+.. code-block:: python
+    >>> example.pass_char(0x65)
+    TypeError
+    >>> example.pass_char(chr(0x65))
+    'e'
+If the desire is to work with an 8-bit integer, use ``int8_t`` or ``uint8_t``
+as the argument type.
+Grapheme clusters
+-----------------
+A single grapheme may be represented by two or more Unicode characters. For
+example 'é' is usually represented as U+00E9 but can also be expressed as the
+combining character sequence U+0065 U+0301 (that is, the letter 'e' followed by
+a combining acute accent). The combining character will be lost if the
+two-character sequence is passed as an argument, even though it renders as a
+single grapheme.
+.. code-block:: python
+    >>> example.pass_wchar('é')
+    'é'
+    >>> combining_e_acute = 'e' + '\u0301'
+    >>> combining_e_acute
+    'é'
+    >>> combining_e_acute == 'é'
+    False
+    >>> example.pass_wchar(combining_e_acute)
+    'e'
+Normalizing combining characters before passing the character literal to C++
+may resolve *some* of these issues:
+.. code-block:: python
+    >>> example.pass_wchar(unicodedata.normalize('NFC', combining_e_acute))
+    'é'
+In some languages (Thai for example), there are `graphemes that cannot be
+expressed as a single Unicode code point
+<http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries>`_, so there is
+no way to capture them in a C++ character type.
+C++17 string views
+==================
+C++17 string views are automatically supported when compiling in C++17 mode.
+They follow the same rules for encoding and decoding as the corresponding STL
+string type (for example, a ``std::u16string_view`` argument will be passed
+UTF-16-encoded data, and a returned ``std::string_view`` will be decoded as
+UTF-8).
+References
+==========
+* `The Absolute Minimum Every Software Developer Absolutely, Positively Must Know About Unicode and Character Sets (No Excuses!) <https://www.joelonsoftware.com/2003/10/08/the-absolute-minimum-every-software-developer-absolutely-positively-must-know-about-unicode-and-character-sets-no-excuses/>`_
+* `C++ - Using STL Strings at Win32 API Boundaries <https://msdn.microsoft.com/en-ca/magazine/mt238407.aspx>`_