webrtc-yolov10n

Running on T4

App Files Files Community

freddyaboulton HF staff committed on Sep 25, 2024

Commit

8cfdd9d

•

1 Parent(s): c530e94

fix

Browse files

Files changed (7) hide show

MobileNetSSD_deploy.caffemodel +0 -3
MobileNetSSD_deploy.prototxt.txt +0 -1912
README.md +40 -9
app.py +27 -72
inference.py +146 -0
requirements.txt +3 -2
utils.py +237 -0

MobileNetSSD_deploy.caffemodel DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:761c86fbae3d8361dd454f7c740a964f62975ed32f4324b8b85994edec30f6af
-size 23147564

MobileNetSSD_deploy.prototxt.txt DELETED Viewed

@@ -1,1912 +0,0 @@
-name: "MobileNet-SSD"
-input: "data"
-input_shape {
-  dim: 1
-  dim: 3
-  dim: 300
-  dim: 300
-}
-layer {
-  name: "conv0"
-  type: "Convolution"
-  bottom: "data"
-  top: "conv0"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 32
-    pad: 1
-    kernel_size: 3
-    stride: 2
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv0/relu"
-  type: "ReLU"
-  bottom: "conv0"
-  top: "conv0"
-}
-layer {
-  name: "conv1/dw"
-  type: "Convolution"
-  bottom: "conv0"
-  top: "conv1/dw"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 32
-    pad: 1
-    kernel_size: 3
-    group: 32
-    engine: CAFFE
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv1/dw/relu"
-  type: "ReLU"
-  bottom: "conv1/dw"
-  top: "conv1/dw"
-}
-layer {
-  name: "conv1"
-  type: "Convolution"
-  bottom: "conv1/dw"
-  top: "conv1"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 64
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv1/relu"
-  type: "ReLU"
-  bottom: "conv1"
-  top: "conv1"
-}
-layer {
-  name: "conv2/dw"
-  type: "Convolution"
-  bottom: "conv1"
-  top: "conv2/dw"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 64
-    pad: 1
-    kernel_size: 3
-    stride: 2
-    group: 64
-    engine: CAFFE
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv2/dw/relu"
-  type: "ReLU"
-  bottom: "conv2/dw"
-  top: "conv2/dw"
-}
-layer {
-  name: "conv2"
-  type: "Convolution"
-  bottom: "conv2/dw"
-  top: "conv2"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 128
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv2/relu"
-  type: "ReLU"
-  bottom: "conv2"
-  top: "conv2"
-}
-layer {
-  name: "conv3/dw"
-  type: "Convolution"
-  bottom: "conv2"
-  top: "conv3/dw"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 128
-    pad: 1
-    kernel_size: 3
-    group: 128
-    engine: CAFFE
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv3/dw/relu"
-  type: "ReLU"
-  bottom: "conv3/dw"
-  top: "conv3/dw"
-}
-layer {
-  name: "conv3"
-  type: "Convolution"
-  bottom: "conv3/dw"
-  top: "conv3"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 128
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv3/relu"
-  type: "ReLU"
-  bottom: "conv3"
-  top: "conv3"
-}
-layer {
-  name: "conv4/dw"
-  type: "Convolution"
-  bottom: "conv3"
-  top: "conv4/dw"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 128
-    pad: 1
-    kernel_size: 3
-    stride: 2
-    group: 128
-    engine: CAFFE
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv4/dw/relu"
-  type: "ReLU"
-  bottom: "conv4/dw"
-  top: "conv4/dw"
-}
-layer {
-  name: "conv4"
-  type: "Convolution"
-  bottom: "conv4/dw"
-  top: "conv4"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 256
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv4/relu"
-  type: "ReLU"
-  bottom: "conv4"
-  top: "conv4"
-}
-layer {
-  name: "conv5/dw"
-  type: "Convolution"
-  bottom: "conv4"
-  top: "conv5/dw"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 1
-    kernel_size: 3
-    group: 256
-    engine: CAFFE
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv5/dw/relu"
-  type: "ReLU"
-  bottom: "conv5/dw"
-  top: "conv5/dw"
-}
-layer {
-  name: "conv5"
-  type: "Convolution"
-  bottom: "conv5/dw"
-  top: "conv5"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 256
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv5/relu"
-  type: "ReLU"
-  bottom: "conv5"
-  top: "conv5"
-}
-layer {
-  name: "conv6/dw"
-  type: "Convolution"
-  bottom: "conv5"
-  top: "conv6/dw"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 1
-    kernel_size: 3
-    stride: 2
-    group: 256
-    engine: CAFFE
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv6/dw/relu"
-  type: "ReLU"
-  bottom: "conv6/dw"
-  top: "conv6/dw"
-}
-layer {
-  name: "conv6"
-  type: "Convolution"
-  bottom: "conv6/dw"
-  top: "conv6"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 512
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv6/relu"
-  type: "ReLU"
-  bottom: "conv6"
-  top: "conv6"
-}
-layer {
-  name: "conv7/dw"
-  type: "Convolution"
-  bottom: "conv6"
-  top: "conv7/dw"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 512
-    pad: 1
-    kernel_size: 3
-    group: 512
-    engine: CAFFE
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv7/dw/relu"
-  type: "ReLU"
-  bottom: "conv7/dw"
-  top: "conv7/dw"
-}
-layer {
-  name: "conv7"
-  type: "Convolution"
-  bottom: "conv7/dw"
-  top: "conv7"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 512
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv7/relu"
-  type: "ReLU"
-  bottom: "conv7"
-  top: "conv7"
-}
-layer {
-  name: "conv8/dw"
-  type: "Convolution"
-  bottom: "conv7"
-  top: "conv8/dw"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 512
-    pad: 1
-    kernel_size: 3
-    group: 512
-    engine: CAFFE
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv8/dw/relu"
-  type: "ReLU"
-  bottom: "conv8/dw"
-  top: "conv8/dw"
-}
-layer {
-  name: "conv8"
-  type: "Convolution"
-  bottom: "conv8/dw"
-  top: "conv8"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 512
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv8/relu"
-  type: "ReLU"
-  bottom: "conv8"
-  top: "conv8"
-}
-layer {
-  name: "conv9/dw"
-  type: "Convolution"
-  bottom: "conv8"
-  top: "conv9/dw"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 512
-    pad: 1
-    kernel_size: 3
-    group: 512
-    engine: CAFFE
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv9/dw/relu"
-  type: "ReLU"
-  bottom: "conv9/dw"
-  top: "conv9/dw"
-}
-layer {
-  name: "conv9"
-  type: "Convolution"
-  bottom: "conv9/dw"
-  top: "conv9"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 512
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv9/relu"
-  type: "ReLU"
-  bottom: "conv9"
-  top: "conv9"
-}
-layer {
-  name: "conv10/dw"
-  type: "Convolution"
-  bottom: "conv9"
-  top: "conv10/dw"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 512
-    pad: 1
-    kernel_size: 3
-    group: 512
-    engine: CAFFE
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv10/dw/relu"
-  type: "ReLU"
-  bottom: "conv10/dw"
-  top: "conv10/dw"
-}
-layer {
-  name: "conv10"
-  type: "Convolution"
-  bottom: "conv10/dw"
-  top: "conv10"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 512
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv10/relu"
-  type: "ReLU"
-  bottom: "conv10"
-  top: "conv10"
-}
-layer {
-  name: "conv11/dw"
-  type: "Convolution"
-  bottom: "conv10"
-  top: "conv11/dw"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 512
-    pad: 1
-    kernel_size: 3
-    group: 512
-    engine: CAFFE
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv11/dw/relu"
-  type: "ReLU"
-  bottom: "conv11/dw"
-  top: "conv11/dw"
-}
-layer {
-  name: "conv11"
-  type: "Convolution"
-  bottom: "conv11/dw"
-  top: "conv11"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 512
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv11/relu"
-  type: "ReLU"
-  bottom: "conv11"
-  top: "conv11"
-}
-layer {
-  name: "conv12/dw"
-  type: "Convolution"
-  bottom: "conv11"
-  top: "conv12/dw"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 512
-    pad: 1
-    kernel_size: 3
-    stride: 2
-    group: 512
-    engine: CAFFE
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv12/dw/relu"
-  type: "ReLU"
-  bottom: "conv12/dw"
-  top: "conv12/dw"
-}
-layer {
-  name: "conv12"
-  type: "Convolution"
-  bottom: "conv12/dw"
-  top: "conv12"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 1024
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv12/relu"
-  type: "ReLU"
-  bottom: "conv12"
-  top: "conv12"
-}
-layer {
-  name: "conv13/dw"
-  type: "Convolution"
-  bottom: "conv12"
-  top: "conv13/dw"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 1024
-    pad: 1
-    kernel_size: 3
-    group: 1024
-    engine: CAFFE
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv13/dw/relu"
-  type: "ReLU"
-  bottom: "conv13/dw"
-  top: "conv13/dw"
-}
-layer {
-  name: "conv13"
-  type: "Convolution"
-  bottom: "conv13/dw"
-  top: "conv13"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 1024
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv13/relu"
-  type: "ReLU"
-  bottom: "conv13"
-  top: "conv13"
-}
-layer {
-  name: "conv14_1"
-  type: "Convolution"
-  bottom: "conv13"
-  top: "conv14_1"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 256
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv14_1/relu"
-  type: "ReLU"
-  bottom: "conv14_1"
-  top: "conv14_1"
-}
-layer {
-  name: "conv14_2"
-  type: "Convolution"
-  bottom: "conv14_1"
-  top: "conv14_2"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 512
-    pad: 1
-    kernel_size: 3
-    stride: 2
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv14_2/relu"
-  type: "ReLU"
-  bottom: "conv14_2"
-  top: "conv14_2"
-}
-layer {
-  name: "conv15_1"
-  type: "Convolution"
-  bottom: "conv14_2"
-  top: "conv15_1"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 128
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv15_1/relu"
-  type: "ReLU"
-  bottom: "conv15_1"
-  top: "conv15_1"
-}
-layer {
-  name: "conv15_2"
-  type: "Convolution"
-  bottom: "conv15_1"
-  top: "conv15_2"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 1
-    kernel_size: 3
-    stride: 2
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv15_2/relu"
-  type: "ReLU"
-  bottom: "conv15_2"
-  top: "conv15_2"
-}
-layer {
-  name: "conv16_1"
-  type: "Convolution"
-  bottom: "conv15_2"
-  top: "conv16_1"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 128
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv16_1/relu"
-  type: "ReLU"
-  bottom: "conv16_1"
-  top: "conv16_1"
-}
-layer {
-  name: "conv16_2"
-  type: "Convolution"
-  bottom: "conv16_1"
-  top: "conv16_2"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 256
-    pad: 1
-    kernel_size: 3
-    stride: 2
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv16_2/relu"
-  type: "ReLU"
-  bottom: "conv16_2"
-  top: "conv16_2"
-}
-layer {
-  name: "conv17_1"
-  type: "Convolution"
-  bottom: "conv16_2"
-  top: "conv17_1"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 64
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv17_1/relu"
-  type: "ReLU"
-  bottom: "conv17_1"
-  top: "conv17_1"
-}
-layer {
-  name: "conv17_2"
-  type: "Convolution"
-  bottom: "conv17_1"
-  top: "conv17_2"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 128
-    pad: 1
-    kernel_size: 3
-    stride: 2
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv17_2/relu"
-  type: "ReLU"
-  bottom: "conv17_2"
-  top: "conv17_2"
-}
-layer {
-  name: "conv11_mbox_loc"
-  type: "Convolution"
-  bottom: "conv11"
-  top: "conv11_mbox_loc"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 12
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv11_mbox_loc_perm"
-  type: "Permute"
-  bottom: "conv11_mbox_loc"
-  top: "conv11_mbox_loc_perm"
-  permute_param {
-    order: 0
-    order: 2
-    order: 3
-    order: 1
-  }
-}
-layer {
-  name: "conv11_mbox_loc_flat"
-  type: "Flatten"
-  bottom: "conv11_mbox_loc_perm"
-  top: "conv11_mbox_loc_flat"
-  flatten_param {
-    axis: 1
-  }
-}
-layer {
-  name: "conv11_mbox_conf"
-  type: "Convolution"
-  bottom: "conv11"
-  top: "conv11_mbox_conf"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 63
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv11_mbox_conf_perm"
-  type: "Permute"
-  bottom: "conv11_mbox_conf"
-  top: "conv11_mbox_conf_perm"
-  permute_param {
-    order: 0
-    order: 2
-    order: 3
-    order: 1
-  }
-}
-layer {
-  name: "conv11_mbox_conf_flat"
-  type: "Flatten"
-  bottom: "conv11_mbox_conf_perm"
-  top: "conv11_mbox_conf_flat"
-  flatten_param {
-    axis: 1
-  }
-}
-layer {
-  name: "conv11_mbox_priorbox"
-  type: "PriorBox"
-  bottom: "conv11"
-  bottom: "data"
-  top: "conv11_mbox_priorbox"
-  prior_box_param {
-    min_size: 60.0
-    aspect_ratio: 2.0
-    flip: true
-    clip: false
-    variance: 0.1
-    variance: 0.1
-    variance: 0.2
-    variance: 0.2
-    offset: 0.5
-  }
-}
-layer {
-  name: "conv13_mbox_loc"
-  type: "Convolution"
-  bottom: "conv13"
-  top: "conv13_mbox_loc"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 24
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv13_mbox_loc_perm"
-  type: "Permute"
-  bottom: "conv13_mbox_loc"
-  top: "conv13_mbox_loc_perm"
-  permute_param {
-    order: 0
-    order: 2
-    order: 3
-    order: 1
-  }
-}
-layer {
-  name: "conv13_mbox_loc_flat"
-  type: "Flatten"
-  bottom: "conv13_mbox_loc_perm"
-  top: "conv13_mbox_loc_flat"
-  flatten_param {
-    axis: 1
-  }
-}
-layer {
-  name: "conv13_mbox_conf"
-  type: "Convolution"
-  bottom: "conv13"
-  top: "conv13_mbox_conf"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 126
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv13_mbox_conf_perm"
-  type: "Permute"
-  bottom: "conv13_mbox_conf"
-  top: "conv13_mbox_conf_perm"
-  permute_param {
-    order: 0
-    order: 2
-    order: 3
-    order: 1
-  }
-}
-layer {
-  name: "conv13_mbox_conf_flat"
-  type: "Flatten"
-  bottom: "conv13_mbox_conf_perm"
-  top: "conv13_mbox_conf_flat"
-  flatten_param {
-    axis: 1
-  }
-}
-layer {
-  name: "conv13_mbox_priorbox"
-  type: "PriorBox"
-  bottom: "conv13"
-  bottom: "data"
-  top: "conv13_mbox_priorbox"
-  prior_box_param {
-    min_size: 105.0
-    max_size: 150.0
-    aspect_ratio: 2.0
-    aspect_ratio: 3.0
-    flip: true
-    clip: false
-    variance: 0.1
-    variance: 0.1
-    variance: 0.2
-    variance: 0.2
-    offset: 0.5
-  }
-}
-layer {
-  name: "conv14_2_mbox_loc"
-  type: "Convolution"
-  bottom: "conv14_2"
-  top: "conv14_2_mbox_loc"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 24
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv14_2_mbox_loc_perm"
-  type: "Permute"
-  bottom: "conv14_2_mbox_loc"
-  top: "conv14_2_mbox_loc_perm"
-  permute_param {
-    order: 0
-    order: 2
-    order: 3
-    order: 1
-  }
-}
-layer {
-  name: "conv14_2_mbox_loc_flat"
-  type: "Flatten"
-  bottom: "conv14_2_mbox_loc_perm"
-  top: "conv14_2_mbox_loc_flat"
-  flatten_param {
-    axis: 1
-  }
-}
-layer {
-  name: "conv14_2_mbox_conf"
-  type: "Convolution"
-  bottom: "conv14_2"
-  top: "conv14_2_mbox_conf"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 126
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv14_2_mbox_conf_perm"
-  type: "Permute"
-  bottom: "conv14_2_mbox_conf"
-  top: "conv14_2_mbox_conf_perm"
-  permute_param {
-    order: 0
-    order: 2
-    order: 3
-    order: 1
-  }
-}
-layer {
-  name: "conv14_2_mbox_conf_flat"
-  type: "Flatten"
-  bottom: "conv14_2_mbox_conf_perm"
-  top: "conv14_2_mbox_conf_flat"
-  flatten_param {
-    axis: 1
-  }
-}
-layer {
-  name: "conv14_2_mbox_priorbox"
-  type: "PriorBox"
-  bottom: "conv14_2"
-  bottom: "data"
-  top: "conv14_2_mbox_priorbox"
-  prior_box_param {
-    min_size: 150.0
-    max_size: 195.0
-    aspect_ratio: 2.0
-    aspect_ratio: 3.0
-    flip: true
-    clip: false
-    variance: 0.1
-    variance: 0.1
-    variance: 0.2
-    variance: 0.2
-    offset: 0.5
-  }
-}
-layer {
-  name: "conv15_2_mbox_loc"
-  type: "Convolution"
-  bottom: "conv15_2"
-  top: "conv15_2_mbox_loc"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 24
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv15_2_mbox_loc_perm"
-  type: "Permute"
-  bottom: "conv15_2_mbox_loc"
-  top: "conv15_2_mbox_loc_perm"
-  permute_param {
-    order: 0
-    order: 2
-    order: 3
-    order: 1
-  }
-}
-layer {
-  name: "conv15_2_mbox_loc_flat"
-  type: "Flatten"
-  bottom: "conv15_2_mbox_loc_perm"
-  top: "conv15_2_mbox_loc_flat"
-  flatten_param {
-    axis: 1
-  }
-}
-layer {
-  name: "conv15_2_mbox_conf"
-  type: "Convolution"
-  bottom: "conv15_2"
-  top: "conv15_2_mbox_conf"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 126
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv15_2_mbox_conf_perm"
-  type: "Permute"
-  bottom: "conv15_2_mbox_conf"
-  top: "conv15_2_mbox_conf_perm"
-  permute_param {
-    order: 0
-    order: 2
-    order: 3
-    order: 1
-  }
-}
-layer {
-  name: "conv15_2_mbox_conf_flat"
-  type: "Flatten"
-  bottom: "conv15_2_mbox_conf_perm"
-  top: "conv15_2_mbox_conf_flat"
-  flatten_param {
-    axis: 1
-  }
-}
-layer {
-  name: "conv15_2_mbox_priorbox"
-  type: "PriorBox"
-  bottom: "conv15_2"
-  bottom: "data"
-  top: "conv15_2_mbox_priorbox"
-  prior_box_param {
-    min_size: 195.0
-    max_size: 240.0
-    aspect_ratio: 2.0
-    aspect_ratio: 3.0
-    flip: true
-    clip: false
-    variance: 0.1
-    variance: 0.1
-    variance: 0.2
-    variance: 0.2
-    offset: 0.5
-  }
-}
-layer {
-  name: "conv16_2_mbox_loc"
-  type: "Convolution"
-  bottom: "conv16_2"
-  top: "conv16_2_mbox_loc"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 24
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv16_2_mbox_loc_perm"
-  type: "Permute"
-  bottom: "conv16_2_mbox_loc"
-  top: "conv16_2_mbox_loc_perm"
-  permute_param {
-    order: 0
-    order: 2
-    order: 3
-    order: 1
-  }
-}
-layer {
-  name: "conv16_2_mbox_loc_flat"
-  type: "Flatten"
-  bottom: "conv16_2_mbox_loc_perm"
-  top: "conv16_2_mbox_loc_flat"
-  flatten_param {
-    axis: 1
-  }
-}
-layer {
-  name: "conv16_2_mbox_conf"
-  type: "Convolution"
-  bottom: "conv16_2"
-  top: "conv16_2_mbox_conf"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 126
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv16_2_mbox_conf_perm"
-  type: "Permute"
-  bottom: "conv16_2_mbox_conf"
-  top: "conv16_2_mbox_conf_perm"
-  permute_param {
-    order: 0
-    order: 2
-    order: 3
-    order: 1
-  }
-}
-layer {
-  name: "conv16_2_mbox_conf_flat"
-  type: "Flatten"
-  bottom: "conv16_2_mbox_conf_perm"
-  top: "conv16_2_mbox_conf_flat"
-  flatten_param {
-    axis: 1
-  }
-}
-layer {
-  name: "conv16_2_mbox_priorbox"
-  type: "PriorBox"
-  bottom: "conv16_2"
-  bottom: "data"
-  top: "conv16_2_mbox_priorbox"
-  prior_box_param {
-    min_size: 240.0
-    max_size: 285.0
-    aspect_ratio: 2.0
-    aspect_ratio: 3.0
-    flip: true
-    clip: false
-    variance: 0.1
-    variance: 0.1
-    variance: 0.2
-    variance: 0.2
-    offset: 0.5
-  }
-}
-layer {
-  name: "conv17_2_mbox_loc"
-  type: "Convolution"
-  bottom: "conv17_2"
-  top: "conv17_2_mbox_loc"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 24
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv17_2_mbox_loc_perm"
-  type: "Permute"
-  bottom: "conv17_2_mbox_loc"
-  top: "conv17_2_mbox_loc_perm"
-  permute_param {
-    order: 0
-    order: 2
-    order: 3
-    order: 1
-  }
-}
-layer {
-  name: "conv17_2_mbox_loc_flat"
-  type: "Flatten"
-  bottom: "conv17_2_mbox_loc_perm"
-  top: "conv17_2_mbox_loc_flat"
-  flatten_param {
-    axis: 1
-  }
-}
-layer {
-  name: "conv17_2_mbox_conf"
-  type: "Convolution"
-  bottom: "conv17_2"
-  top: "conv17_2_mbox_conf"
-  param {
-    lr_mult: 1.0
-    decay_mult: 1.0
-  }
-  param {
-    lr_mult: 2.0
-    decay_mult: 0.0
-  }
-  convolution_param {
-    num_output: 126
-    kernel_size: 1
-    weight_filler {
-      type: "msra"
-    }
-    bias_filler {
-      type: "constant"
-      value: 0.0
-    }
-  }
-}
-layer {
-  name: "conv17_2_mbox_conf_perm"
-  type: "Permute"
-  bottom: "conv17_2_mbox_conf"
-  top: "conv17_2_mbox_conf_perm"
-  permute_param {
-    order: 0
-    order: 2
-    order: 3
-    order: 1
-  }
-}
-layer {
-  name: "conv17_2_mbox_conf_flat"
-  type: "Flatten"
-  bottom: "conv17_2_mbox_conf_perm"
-  top: "conv17_2_mbox_conf_flat"
-  flatten_param {
-    axis: 1
-  }
-}
-layer {
-  name: "conv17_2_mbox_priorbox"
-  type: "PriorBox"
-  bottom: "conv17_2"
-  bottom: "data"
-  top: "conv17_2_mbox_priorbox"
-  prior_box_param {
-    min_size: 285.0
-    max_size: 300.0
-    aspect_ratio: 2.0
-    aspect_ratio: 3.0
-    flip: true
-    clip: false
-    variance: 0.1
-    variance: 0.1
-    variance: 0.2
-    variance: 0.2
-    offset: 0.5
-  }
-}
-layer {
-  name: "mbox_loc"
-  type: "Concat"
-  bottom: "conv11_mbox_loc_flat"
-  bottom: "conv13_mbox_loc_flat"
-  bottom: "conv14_2_mbox_loc_flat"
-  bottom: "conv15_2_mbox_loc_flat"
-  bottom: "conv16_2_mbox_loc_flat"
-  bottom: "conv17_2_mbox_loc_flat"
-  top: "mbox_loc"
-  concat_param {
-    axis: 1
-  }
-}
-layer {
-  name: "mbox_conf"
-  type: "Concat"
-  bottom: "conv11_mbox_conf_flat"
-  bottom: "conv13_mbox_conf_flat"
-  bottom: "conv14_2_mbox_conf_flat"
-  bottom: "conv15_2_mbox_conf_flat"
-  bottom: "conv16_2_mbox_conf_flat"
-  bottom: "conv17_2_mbox_conf_flat"
-  top: "mbox_conf"
-  concat_param {
-    axis: 1
-  }
-}
-layer {
-  name: "mbox_priorbox"
-  type: "Concat"
-  bottom: "conv11_mbox_priorbox"
-  bottom: "conv13_mbox_priorbox"
-  bottom: "conv14_2_mbox_priorbox"
-  bottom: "conv15_2_mbox_priorbox"
-  bottom: "conv16_2_mbox_priorbox"
-  bottom: "conv17_2_mbox_priorbox"
-  top: "mbox_priorbox"
-  concat_param {
-    axis: 2
-  }
-}
-layer {
-  name: "mbox_conf_reshape"
-  type: "Reshape"
-  bottom: "mbox_conf"
-  top: "mbox_conf_reshape"
-  reshape_param {
-    shape {
-      dim: 0
-      dim: -1
-      dim: 21
-    }
-  }
-}
-layer {
-  name: "mbox_conf_softmax"
-  type: "Softmax"
-  bottom: "mbox_conf_reshape"
-  top: "mbox_conf_softmax"
-  softmax_param {
-    axis: 2
-  }
-}
-layer {
-  name: "mbox_conf_flatten"
-  type: "Flatten"
-  bottom: "mbox_conf_softmax"
-  top: "mbox_conf_flatten"
-  flatten_param {
-    axis: 1
-  }
-}
-layer {
-  name: "detection_out"
-  type: "DetectionOutput"
-  bottom: "mbox_loc"
-  bottom: "mbox_conf_flatten"
-  bottom: "mbox_priorbox"
-  top: "detection_out"
-  include {
-    phase: TEST
-  }
-  detection_output_param {
-    num_classes: 21
-    share_location: true
-    background_label_id: 0
-    nms_param {
-      nms_threshold: 0.45
-      top_k: 100
-    }
-    code_type: CENTER_SIZE
-    keep_top_k: 100
-    confidence_threshold: 0.25
-  }
-}

README.md CHANGED Viewed

@@ -1,13 +1,44 @@
 ---
-title: Webrtc
-emoji: 📈
-colorFrom: purple
-colorTo: gray
-sdk: gradio
-sdk_version: 5.0.0b3
-app_file: app.py
-pinned: false
 license: mit
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 license: mit
+tags:
+- object-detection
+- computer-vision
+- yolov10
+datasets:
+- detection-datasets/coco
+sdk: gradio
+sdk_version: 5.0.0b1
 ---
+### Model Description
+[YOLOv10: Real-Time End-to-End Object Detection](https://arxiv.org/abs/2405.14458v1)
+- arXiv: https://arxiv.org/abs/2405.14458v1
+- github: https://github.com/THU-MIG/yolov10
+### Installation
+```
+pip install supervision git+https://github.com/THU-MIG/yolov10.git
+```
+### Yolov10 Inference
+```python
+from ultralytics import YOLOv10
+import supervision as sv
+import cv2
+IMAGE_PATH = 'dog.jpeg'
+model = YOLOv10.from_pretrained('jameslahm/yolov10{n/s/m/b/l/x}')
+model.predict(IMAGE_PATH, show=True)
+```
+### BibTeX Entry and Citation Info
+ ```
+@article{wang2024yolov10,
+  title={YOLOv10: Real-Time End-to-End Object Detection},
+  author={Wang, Ao and Chen, Hui and Liu, Lihao and Chen, Kai and Lin, Zijia and Han, Jungong and Ding, Guiguang},
+  journal={arXiv preprint arXiv:2405.14458},
+  year={2024}
+}
+```

app.py CHANGED Viewed

@@ -1,10 +1,16 @@
 import gradio as gr
 import cv2
-import numpy as np
 from gradio_webrtc import WebRTC
-from pathlib import Path
 from twilio.rest import Client
 import os
 account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
 auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
@@ -17,72 +23,16 @@ rtc_configuration = {
     "iceTransportPolicy": "relay",
 }
-CLASSES = [
-    "background",
-    "aeroplane",
-    "bicycle",
-    "bird",
-    "boat",
-    "bottle",
-    "bus",
-    "car",
-    "cat",
-    "chair",
-    "cow",
-    "diningtable",
-    "dog",
-    "horse",
-    "motorbike",
-    "person",
-    "pottedplant",
-    "sheep",
-    "sofa",
-    "train",
-    "tvmonitor",
-]
-COLORS = np.random.uniform(0, 255, size=(len(CLASSES), 3))
-directory = Path(__file__).parent
-MODEL = str((directory / "MobileNetSSD_deploy.caffemodel").resolve())
-PROTOTXT = str((directory / "MobileNetSSD_deploy.prototxt.txt").resolve())
-net = cv2.dnn.readNetFromCaffe(PROTOTXT, MODEL)
 def detection(image, conf_threshold=0.3):
-    blob = cv2.dnn.blobFromImage(
-        cv2.resize(image, (300, 300)), 0.007843, (300, 300), 127.5
-    )
-    net.setInput(blob)
-    detections = net.forward()
-    image = cv2.resize(image, (500, 500))
-    (h, w) = image.shape[:2]
-    labels = []
-    for i in np.arange(0, detections.shape[2]):
-        confidence = detections[0, 0, i, 2]
-        if confidence > conf_threshold:
-            # extract the index of the class label from the `detections`,
-            # then compute the (x, y)-coordinates of the bounding box for
-            # the object
-            idx = int(detections[0, 0, i, 1])
-            box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
-            (startX, startY, endX, endY) = box.astype("int")
-            # display the prediction
-            label = f"{CLASSES[idx]}: {round(confidence * 100, 2)}%"
-            labels.append(label)
-            cv2.rectangle(image, (startX, startY), (endX, endY), COLORS[idx], 2)
-            y = startY - 15 if startY - 15 > 15 else startY + 15
-            cv2.putText(
-                image, label, (startX, y), cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLORS[idx], 2
-            )
-    return image
-css=""".my-group {max-width: 600px !important; max-height: 600 !important;}
                       .my-column {display: flex !important; justify-content: center !important; align-items: center !important};"""
@@ -90,12 +40,20 @@ with gr.Blocks(css=css) as demo:
     gr.HTML(
         """
     <h1 style='text-align: center'>
-    Image Detection from Webcam Stream (powered by WebRTC ⚡️)
     </h1>
-    """)
     with gr.Column(elem_classes=["my-column"]):
         with gr.Group(elem_classes=["my-group"]):
-            image = WebRTC(label="Strean", rtc_configuration=rtc_configuration)
             conf_threshold = gr.Slider(
                 label="Confidence Threshold",
                 minimum=0.0,
@@ -103,13 +61,10 @@ with gr.Blocks(css=css) as demo:
                 step=0.05,
                 value=0.30,
             )
         image.webrtc_stream(
-            fn=detection,
-            inputs=[image],
-            stream_every=0.05,
-            time_limit=30
         )
-if __name__ == '__main__':
     demo.launch()

 import gradio as gr
 import cv2
+from huggingface_hub import hf_hub_download
 from gradio_webrtc import WebRTC
 from twilio.rest import Client
 import os
+from inference import YOLOv10
+model_file = hf_hub_download(
+    repo_id="onnx-community/yolov10n", filename="onnx/model.onnx"
+)
+model = YOLOv10(model_file)
 account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
 auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
     "iceTransportPolicy": "relay",
 }
+rtc_configuration = None
 def detection(image, conf_threshold=0.3):
+    # Callback passed to `image.webrtc_stream`: resize the incoming frame to the
+    # model's input resolution, run the detector with the slider's confidence
+    # threshold, and return the annotated frame.
+    image = cv2.resize(image, (model.input_width, model.input_height))
+    new_image = model.detect_objects(image, conf_threshold)
+    return new_image
+css = """.my-group {max-width: 600px !important; max-height: 600 !important;}
                       .my-column {display: flex !important; justify-content: center !important; align-items: center !important};"""
     gr.HTML(
         """
     <h1 style='text-align: center'>
+    YOLOv10 Webcam Stream
     </h1>
+    """
+    )
+    gr.HTML(
+        """
+        <h3 style='text-align: center'>
+        <a href='https://arxiv.org/abs/2405.14458' target='_blank'>arXiv</a> | <a href='https://github.com/THU-MIG/yolov10' target='_blank'>github</a>
+        </h3>
+        """
+    )
     with gr.Column(elem_classes=["my-column"]):
         with gr.Group(elem_classes=["my-group"]):
+            image = WebRTC(label="Stream", rtc_configuration=rtc_configuration)
             conf_threshold = gr.Slider(
                 label="Confidence Threshold",
                 minimum=0.0,
                 step=0.05,
                 value=0.30,
             )
         image.webrtc_stream(
+            fn=detection, inputs=[image, conf_threshold], stream_every=0.05, time_limit=30
         )
+if __name__ == "__main__":
     demo.launch()

inference.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import time
+import cv2
+import numpy as np
+import onnxruntime
+from utils import draw_detections
+class YOLOv10:
+    """ONNX Runtime wrapper around an exported YOLOv10 detection model.
+
+    The first model output is read as one row per detection laid out as
+    ``[x1, y1, x2, y2, score, class_id]`` in model-input pixel coordinates.
+    """
+    def __init__(self, path):
+        """Load the ONNX model stored at ``path``."""
+        self.initialize_model(path)
+    def __call__(self, image):
+        """Shorthand for ``detect_objects(image)`` with the default threshold."""
+        return self.detect_objects(image)
+    def initialize_model(self, path):
+        """Create the inference session and cache input/output metadata."""
+        self.session = onnxruntime.InferenceSession(
+            path, providers=onnxruntime.get_available_providers()
+        )
+        self.get_input_details()
+        self.get_output_details()
+    def detect_objects(self, image, conf_threshold=0.3):
+        """Run the model on ``image`` and return the image with detections drawn.
+
+        Detections whose score is not above ``conf_threshold`` are discarded.
+        """
+        input_tensor = self.prepare_input(image)
+        return self.inference(image, input_tensor, conf_threshold)
+    def prepare_input(self, image):
+        """Convert ``image`` into the float32 NCHW tensor fed to the session.
+
+        The original frame size is remembered so boxes can be mapped back to it.
+        """
+        self.img_height, self.img_width = image.shape[:2]
+        # NOTE(review): this conversion assumes BGR channel order -- confirm
+        # what the caller actually supplies.
+        input_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        input_img = cv2.resize(input_img, (self.input_width, self.input_height))
+        # Scale pixel values to [0, 1] and reorder HWC -> CHW.
+        input_img = (input_img / 255.0).transpose(2, 0, 1)
+        # Add the leading batch axis.
+        return input_img[np.newaxis, :, :, :].astype(np.float32)
+    def inference(self, image, input_tensor, conf_threshold=0.3):
+        """Run the session on ``input_tensor`` and draw the result on ``image``."""
+        start = time.perf_counter()
+        outputs = self.session.run(
+            self.output_names, {self.input_names[0]: input_tensor}
+        )
+        print(f"Inference time: {(time.perf_counter() - start)*1000:.2f} ms")
+        boxes, scores, class_ids = self.process_output(outputs, conf_threshold)
+        return self.draw_detections(image, boxes, scores, class_ids)
+    def process_output(self, output, conf_threshold=0.3):
+        """Turn the raw model output into ``(boxes, scores, class_ids)``.
+
+        Returns three empty lists when no detection scores above
+        ``conf_threshold``.
+        """
+        predictions = np.asarray(output[0])
+        # Drop the batch axis but keep a 2-D (detections, values) layout even
+        # when there is only a single row.
+        predictions = predictions.reshape(-1, predictions.shape[-1])
+        scores = predictions[:, 4]
+        keep = scores > conf_threshold
+        if not keep.any():
+            return [], [], []
+        predictions = predictions[keep]
+        scores = scores[keep]
+        # Column 5 already holds the class index. Taking an argmax over
+        # columns 4: would compare the score with the class id and could only
+        # ever produce 0 or 1.
+        class_ids = predictions[:, 5].astype(int)
+        boxes = self.extract_boxes(predictions)
+        return boxes, scores, class_ids
+    def extract_boxes(self, predictions):
+        """Return the box columns rescaled to the original image size.
+
+        The boxes are used as corner coordinates (x1, y1, x2, y2) as-is; no
+        xywh conversion is applied.
+        """
+        return self.rescale_boxes(predictions[:, :4])
+    def rescale_boxes(self, boxes):
+        """Map ``boxes`` from model-input pixels to original-image pixels."""
+        input_shape = np.array(
+            [self.input_width, self.input_height, self.input_width, self.input_height]
+        )
+        boxes = np.divide(boxes, input_shape, dtype=np.float32)
+        boxes *= np.array(
+            [self.img_width, self.img_height, self.img_width, self.img_height]
+        )
+        return boxes
+    def draw_detections(self, image, boxes, scores, class_ids, draw_scores=True, mask_alpha=0.4):
+        """Delegate drawing to ``utils.draw_detections``.
+
+        ``draw_scores`` is accepted but not used by this method.
+        """
+        return draw_detections(
+            image, boxes, scores, class_ids, mask_alpha
+        )
+    def get_input_details(self):
+        """Cache the input names and the expected input height and width."""
+        model_inputs = self.session.get_inputs()
+        self.input_names = [model_input.name for model_input in model_inputs]
+        self.input_shape = model_inputs[0].shape
+        # The shape is indexed as (batch, channels, height, width).
+        self.input_height = self.input_shape[2]
+        self.input_width = self.input_shape[3]
+    def get_output_details(self):
+        """Cache the output names requested from the session."""
+        model_outputs = self.session.get_outputs()
+        self.output_names = [model_output.name for model_output in model_outputs]
+if __name__ == "__main__":
+    # Manual smoke test: download a sample photo, run the detector on it and
+    # show the annotated result in an OpenCV window.
+    import requests
+    from huggingface_hub import hf_hub_download
+    model_file = hf_hub_download(
+        repo_id="onnx-community/yolov10s", filename="onnx/model.onnx"
+    )
+    detector = YOLOv10(model_file)
+    response = requests.get(
+        "https://live.staticflickr.com/13/19041780_d6fd803de0_3k.jpg", timeout=30
+    )
+    # Fail here with a clear HTTP error rather than later on undecodable data.
+    response.raise_for_status()
+    # Decode straight from memory so no temporary file is left behind on disk.
+    img = cv2.imdecode(
+        np.frombuffer(response.content, dtype=np.uint8), cv2.IMREAD_COLOR
+    )
+    if img is None:
+        raise ValueError("Downloaded data could not be decoded as an image")
+    # Detect objects and get the annotated image back.
+    combined_image = detector.detect_objects(img)
+    cv2.namedWindow("Output", cv2.WINDOW_NORMAL)
+    cv2.imshow("Output", combined_image)
+    cv2.waitKey(0)

requirements.txt CHANGED Viewed

@@ -1,5 +1,6 @@
 safetensors==0.4.3
 opencv-python
 https://huggingface.co/datasets/freddyaboulton/bucket/resolve/main/gradio-5.0.0b3-py3-none-any.whl
-https://gradio-builds.s3.amazonaws.com/webrtc/03/gradio_webrtc-0.0.1-py3-none-any.whl
-twilio

 safetensors==0.4.3
 opencv-python
+twilio
 https://huggingface.co/datasets/freddyaboulton/bucket/resolve/main/gradio-5.0.0b3-py3-none-any.whl
+https://huggingface.co/datasets/freddyaboulton/bucket/resolve/main/gradio_webrtc-0.0.1-py3-none-any.whl
+onnxruntime

utils.py ADDED Viewed

	@@ -0,0 +1,237 @@

+import numpy as np
+import cv2
+class_names = [
+    "person",
+    "bicycle",
+    "car",
+    "motorcycle",
+    "airplane",
+    "bus",
+    "train",
+    "truck",
+    "boat",
+    "traffic light",
+    "fire hydrant",
+    "stop sign",
+    "parking meter",
+    "bench",
+    "bird",
+    "cat",
+    "dog",
+    "horse",
+    "sheep",
+    "cow",
+    "elephant",
+    "bear",
+    "zebra",
+    "giraffe",
+    "backpack",
+    "umbrella",
+    "handbag",
+    "tie",
+    "suitcase",
+    "frisbee",
+    "skis",
+    "snowboard",
+    "sports ball",
+    "kite",
+    "baseball bat",
+    "baseball glove",
+    "skateboard",
+    "surfboard",
+    "tennis racket",
+    "bottle",
+    "wine glass",
+    "cup",
+    "fork",
+    "knife",
+    "spoon",
+    "bowl",
+    "banana",
+    "apple",
+    "sandwich",
+    "orange",
+    "broccoli",
+    "carrot",
+    "hot dog",
+    "pizza",
+    "donut",
+    "cake",
+    "chair",
+    "couch",
+    "potted plant",
+    "bed",
+    "dining table",
+    "toilet",
+    "tv",
+    "laptop",
+    "mouse",
+    "remote",
+    "keyboard",
+    "cell phone",
+    "microwave",
+    "oven",
+    "toaster",
+    "sink",
+    "refrigerator",
+    "book",
+    "clock",
+    "vase",
+    "scissors",
+    "teddy bear",
+    "hair drier",
+    "toothbrush",
+]
+# One colour per class: an array of shape (len(class_names), 3) whose entries
+# are floats drawn uniformly from [0, 255). The fixed seed keeps the palette
+# identical across runs.
+rng = np.random.default_rng(3)
+colors = rng.uniform(0, 255, size=(len(class_names), 3))
+def nms(boxes, scores, iou_threshold):
+    """Greedy non-maximum suppression.
+
+    Repeatedly keeps the highest-scoring remaining box and drops every other
+    remaining box whose IoU with it is not below ``iou_threshold``. Returns
+    the kept indices (into ``boxes``) in descending score order.
+    """
+    # Candidate indices, best score first.
+    order = np.argsort(scores)[::-1]
+    selected = []
+    while order.size > 0:
+        # The head of the queue is the best remaining box.
+        best, rest = order[0], order[1:]
+        selected.append(best)
+        overlaps = compute_iou(boxes[best, :], boxes[rest, :])
+        # Only boxes that overlap the winner little enough stay in the queue.
+        order = rest[overlaps < iou_threshold]
+    return selected
+def multiclass_nms(boxes, scores, class_ids, iou_threshold):
+    """Run ``nms`` separately for each class and merge the kept indices.
+
+    The returned indices refer to the original ``boxes`` array and are grouped
+    by class in ascending class-id order.
+    """
+    kept = []
+    for current_class in np.unique(class_ids):
+        # Positions in the full arrays that belong to this class.
+        members = np.nonzero(class_ids == current_class)[0]
+        survivors = nms(boxes[members, :], scores[members], iou_threshold)
+        # Translate per-class positions back to indices into the full arrays.
+        kept.extend(members[survivors])
+    return kept
+def compute_iou(box, boxes):
+    """Return the IoU between one ``box`` and every row of ``boxes``.
+
+    Boxes are indexed as (x1, y1, x2, y2).
+    """
+    # Corners of the overlap rectangle between ``box`` and each candidate.
+    left = np.maximum(box[0], boxes[:, 0])
+    top = np.maximum(box[1], boxes[:, 1])
+    right = np.minimum(box[2], boxes[:, 2])
+    bottom = np.minimum(box[3], boxes[:, 3])
+    # Clamp at zero so disjoint boxes contribute no overlap.
+    overlap_w = np.maximum(0, right - left)
+    overlap_h = np.maximum(0, bottom - top)
+    intersection = overlap_w * overlap_h
+    single_area = (box[2] - box[0]) * (box[3] - box[1])
+    candidate_areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+    return intersection / (single_area + candidate_areas - intersection)
+def xywh2xyxy(x):
+    """Convert centre-format boxes (cx, cy, w, h) to corners (x1, y1, x2, y2).
+
+    Works on the last axis and returns a new array; ``x`` is left untouched.
+    """
+    corners = np.copy(x)
+    half_w = x[..., 2] / 2
+    half_h = x[..., 3] / 2
+    corners[..., 0] = x[..., 0] - half_w
+    corners[..., 1] = x[..., 1] - half_h
+    corners[..., 2] = x[..., 0] + half_w
+    corners[..., 3] = x[..., 1] + half_h
+    return corners
+def draw_detections(image, boxes, scores, class_ids, mask_alpha=0.3):
+    """Return a copy of ``image`` with a box and a caption for each detection.
+
+    ``mask_alpha`` is accepted but not used, because no mask overlay is drawn.
+    """
+    annotated = image.copy()
+    # Font scale and stroke width follow the shorter image side.
+    shortest_side = min(image.shape[:2])
+    font_size = shortest_side * 0.0006
+    text_thickness = int(shortest_side * 0.001)
+    for box, score, class_id in zip(boxes, scores, class_ids):
+        color = colors[class_id]
+        draw_box(annotated, box, color)
+        caption = f"{class_names[class_id]} {int(score * 100)}%"
+        draw_text(annotated, caption, box, color, font_size, text_thickness)
+    return annotated
+def draw_box(
+    image: np.ndarray,
+    box: np.ndarray,
+    color: tuple[int, int, int] = (0, 0, 255),
+    thickness: int = 2,
+) -> np.ndarray:
+    """Draw the outline of ``box`` (x1, y1, x2, y2) on ``image``.
+
+    Returns whatever ``cv2.rectangle`` returns for ``image``.
+    """
+    left, top, right, bottom = box.astype(int)
+    top_left = (left, top)
+    bottom_right = (right, bottom)
+    return cv2.rectangle(image, top_left, bottom_right, color, thickness)
+def draw_text(
+    image: np.ndarray,
+    text: str,
+    box: np.ndarray,
+    color: tuple[int, int, int] = (0, 0, 255),
+    font_size: float = 0.001,
+    text_thickness: int = 2,
+) -> np.ndarray:
+    """Write ``text`` in white on a filled ``color`` label at the box corner.
+
+    The label is anchored at the box's (x1, y1) corner and extends upwards.
+    """
+    font = cv2.FONT_HERSHEY_SIMPLEX
+    left, top, _right, _bottom = box.astype(int)
+    (text_w, text_h), _baseline = cv2.getTextSize(
+        text, font, font_size, text_thickness
+    )
+    # Make the background 20% taller than the measured text height.
+    label_h = int(text_h * 1.2)
+    cv2.rectangle(image, (left, top), (left + text_w, top - label_h), color, -1)
+    white = (255, 255, 255)
+    return cv2.putText(
+        image, text, (left, top), font, font_size, white, text_thickness, cv2.LINE_AA
+    )
+def draw_masks(
+    image: np.ndarray, boxes: np.ndarray, classes: np.ndarray, mask_alpha: float = 0.3
+) -> np.ndarray:
+    """Blend class-coloured filled rectangles for ``boxes`` over ``image``.
+
+    The filled overlay is weighted by ``mask_alpha`` and the untouched image
+    by ``1 - mask_alpha``.
+    """
+    overlay = image.copy()
+    for class_id, box in zip(classes, boxes):
+        fill = colors[class_id]
+        left, top, right, bottom = box.astype(int)
+        # A thickness of -1 fills the rectangle.
+        cv2.rectangle(overlay, (left, top), (right, bottom), fill, -1)
+    return cv2.addWeighted(overlay, mask_alpha, image, 1 - mask_alpha, 0)