Yolov5 + Deepsort + Slowfast for Real-Time Action Detection

Original GitHub project; this post is just a brief walkthrough of it.

The original author's description of the repo:

Here are some details about our modification:

  • we choose yolov5 as an object detector instead of Faster R-CNN; it is faster and more convenient
  • we use a tracker (deepsort) to allocate action labels to all objects (with the same ids) in different frames
  • our processing speed reached 24.2 FPS at an inference batch size of 30 (on a single RTX 2080Ti GPU)
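
    Putting those three pieces together, one second of video flows through the pipeline roughly like this (a pseudocode overview only; every helper name here is illustrative, and the real code is walked through below):

        # rough shape of the per-second loop (details in the sections below)
        for i in range(video_duration_in_seconds):
            frames = decode_one_second_clip(i)              # pytorchvideo clip -> frames
            detections = yolov5(frames)                     # person boxes per frame
            tracks = deepsort_update_all(detections)        # stable track ids across frames
            actions = slowfast(frames, middle_frame_boxes)  # one action label per track id
            write_annotated_frames(frames, tracks, actions)
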
    Let's start the analysis from __main__:

    import argparse

    if __name__ == "__main__":
        parser = argparse.ArgumentParser()
        # input
        parser.add_argument('--input', type=str, default="/home/wufan/images/video/vad.mp4",
                            help='test imgs folder or video or camera')
        # output path
        parser.add_argument('--output', type=str, default="output.mp4",
                            help='folder to save result imgs, can not use input folder')
        # object detect config
        parser.add_argument('--imsize', type=int, default=640, help='inference size (pixels)')
        parser.add_argument('--conf', type=float, default=0.4, help='object confidence threshold')
        parser.add_argument('--iou', type=float, default=0.4, help='IOU threshold for NMS')
        parser.add_argument('--device', default='cuda', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
        parser.add_argument('--classes', nargs='+', default=0, type=int,
                            help='filter by class: --class 0, or --class 0 2 3')
        config = parser.parse_args()
    
        print(config)
        main(config)
    

    __main__ sets up the input/output arguments and a handful of object-detection parameters (inference size, confidence and IoU thresholds, device, and the class filter).

    Now let's walk through the main function:

    The parsed config values are consumed here; the detector is yolov5l6, fetched via torch.hub with its weights downloaded locally:

        # load the Yolov5 model via torch.hub
        model = torch.hub.load('ultralytics/yolov5', 'yolov5l6')
        model.conf = config.conf
        model.iou = config.iou
        # maximum number of detections per image
        model.max_det = 200
        # class filter: class 0 (person) restricts detection to people only
        if config.classes:
            model.classes = config.classes
        device = config.device
        imsize = config.imsize
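
    If you just want to confirm the hub model loaded correctly, the standard yolov5 Detections API can be used for a quick check (the test image URL is Ultralytics' usual example, shown here only for illustration):

        results = model('https://ultralytics.com/images/zidane.jpg')
        results.print()  # summary of detections, inference speed and image size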
    

    Next, the Slowfast and Deepsort models are loaded. The Slowfast model was trained on AVA v2.2, and the id-to-action mapping is obtained through AvaLabeledVideoFramePaths.read_label_map():

        # load the Slowfast ResNet-50 detection model
        video_model = slowfast_r50_detection(True).eval().to(device)
        # load the pretrained DeepSORT weights
        deepsort_tracker = DeepSort("deep_sort/deep_sort/deep/checkpoint/ckpt.t7")
        # load the id-to-label mapping for the AVA v2.2 dataset on which the Torch Hub models were fine-tuned
        ava_labelnames, _ = AvaLabeledVideoFramePaths.read_label_map("selfutils/temp.pbtxt")
        # one random color per COCO class, used when drawing boxes
        coco_color_map = [[random.randint(0, 255) for _ in range(3)] for _ in range(80)]
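
    ava_labelnames is a plain dict mapping 1-based AVA class ids to action-name strings; a quick peek (the sample output assumes the standard AVA v2.2 label map, so treat it as illustrative):

        for k in sorted(ava_labelnames)[:3]:
            print(k, ava_labelnames[k])
        # e.g. "1 bend/bow (at the waist)" with the standard AVA v2.2 map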
    

    Then set up the output video writer:

        vide_save_path = config.output
        # read the input video's width and height and reuse them for the output video
        video = cv2.VideoCapture(config.input)
        width, height = int(video.get(3)), int(video.get(4))  # CAP_PROP_FRAME_WIDTH / CAP_PROP_FRAME_HEIGHT
        video.release()
        # MPEG-4 ('mp4v') encoding for an .mp4 file of the chosen frame size
        # written at 25 FPS
        outputvideo = cv2.VideoWriter(vide_save_path, cv2.VideoWriter_fourcc(*'mp4v'), 25, (width, height))
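
    A small robustness tweak (my suggestion, not part of the original script): read the real frame rate from the input instead of hardcoding 25, and use the named OpenCV property constants:

        video = cv2.VideoCapture(config.input)
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = video.get(cv2.CAP_PROP_FPS) or 25  # fall back to 25 if the container reports 0
        video.release()
        outputvideo = cv2.VideoWriter(vide_save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))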
    
    

    After that, the input video is loaded again (this time through pytorchvideo) and processed in a for loop running for math.ceil(video.duration) iterations, one second per iteration:
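
    The loading code and loop skeleton are not quoted in this post; based on the original repo they look roughly like this (a sketch assuming pytorchvideo's EncodedVideo):

        import math
        from pytorchvideo.data.encoded_video import EncodedVideo

        video = EncodedVideo.from_path(config.input)
        for i in range(0, math.ceil(video.duration), 1):
            # ... per-second processing described below ...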

    The first step is frame extraction: get_clip() samples the frames within one second of video, only the image data is kept, and each frame tensor is converted to a numpy array in BGR format:

        # sample the frames within one second of video (0.04 s is one frame at 25 FPS, so the clip covers [i, i+1))
        video_clips = video.get_clip(i, i + 1 - 0.04)
        # return {"video": clip_frames, "frame_indices": frame_indices, "audio": None}
        video_clips = video_clips['video']
        if video_clips is None:
            continue
        img_num = video_clips.shape[1]
        imgs = []
        for j in range(img_num):
            imgs.append(tensor_to_numpy(video_clips[:, j, :, :]))
            # "video": A tensor of the clip's RGB frames with shape: (channel, time, height, width).
            # convert each frame tensor to a numpy array (BGR format)
    
    def tensor_to_numpy(tensor):
        img = tensor.cpu().numpy().transpose((1, 2, 0))
        return img
    

    Yolo prediction. All frames of the one-second clip are passed to the model as a single batch (the "inference batch size" mentioned in the README):

        yolo_preds = model(imgs, size=imsize)
        # name frames with a globally increasing index, assuming 25 frames per second
        yolo_preds.files = [f"img_{i * 25 + k}.jpg" for k in range(img_num)]
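
    For reference, yolo_preds is yolov5's Detections object. The attributes the next section relies on are (in the yolov5 version this repo targets; newer releases renamed imgs to ims):

        # yolo_preds.pred  -> list of per-image tensors with columns (x1, y1, x2, y2, conf, cls)
        # yolo_preds.xywh  -> the same boxes as (x_center, y_center, w, h, conf, cls)
        # yolo_preds.imgs  -> the original numpy images
        # yolo_preds.names -> mapping from class id to class name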
    

    With the pretrained Deepsort weights, the yolo predictions are fed into the tracker and then replaced by its output; Deepsort's role here is to keep the same id on the same object across frames, so that an action label can be assigned per id. Each tracker output row has 8 columns, later unpacked as (*box, cls, trackid, vx, vy), which is why the empty placeholder is np.ones((0, 8)).

        # DeepSORT
        deepsort_outputs = []
        for j in range(len(yolo_preds.pred)):
            temp = deepsort_update(deepsort_tracker, yolo_preds.pred[j].cpu(), yolo_preds.xywh[j][:, 0:4].cpu(),
                                       yolo_preds.imgs[j])
            if len(temp) == 0:
                temp = np.ones((0, 8))
            deepsort_outputs.append(temp.astype(np.float32))
        # replace the yolo predictions with the DeepSORT results
        # we use a tracker(deepsort) to allocate action labels to all objects(with same ids) in different frames
        yolo_preds.pred = deepsort_outputs
        id_to_ava_labels = {}
    
    def deepsort_update(Tracker, pred, xywh, np_img):
        # update expects xywh boxes, confidences (column 4), class ids (column 5) and an RGB image
        outputs = Tracker.update(xywh, pred[:, 4:5], pred[:, 5].tolist(), cv2.cvtColor(np_img, cv2.COLOR_BGR2RGB))
        return outputs
    

    The clip and boxes are preprocessed by ava_inference_transform(), the Slowfast model is called on the result, and finally each track id is assigned an action class:

        id_to_ava_labels = {}
        if yolo_preds.pred[img_num // 2].shape[0]:
            # preprocess the clip and the middle frame's bboxes for the Slowfast model
            inputs, inp_boxes, _ = ava_inference_transform(video_clips, yolo_preds.pred[img_num // 2][:, 0:4], crop_size=imsize)
            # prepend a data-sample id column (all zeros: a single clip) to each bounding box
            inp_boxes = torch.cat([torch.zeros(inp_boxes.shape[0], 1), inp_boxes], dim=1)
            if isinstance(inputs, list):
                # slowfast inputs are a [slow_pathway, fast_pathway] list; add a batch dim to each
                inputs = [inp.unsqueeze(0).to(device) for inp in inputs]
            else:
                inputs = inputs.unsqueeze(0).to(device)
            with torch.no_grad():
                slowfaster_preds = video_model(inputs, inp_boxes.to(device))
                slowfaster_preds = slowfaster_preds.cpu()
            # assign an action class to each track id (AVA label ids are 1-based, hence the +1)
            for tid, avalabel in zip(yolo_preds.pred[img_num // 2][:, 5].tolist(), np.argmax(slowfaster_preds, axis=1).tolist()):
                id_to_ava_labels[tid] = ava_labelnames[avalabel + 1]
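
    ava_inference_transform() itself is not quoted in this post; in the original repo it follows pytorchvideo's AVA detection demo. A condensed sketch under that assumption (the defaults are those of the demo and may differ from the repo):

        import numpy as np
        import torch
        from pytorchvideo.transforms.functional import (
            clip_boxes_to_image,
            short_side_scale_with_boxes,
            uniform_temporal_subsample,
        )
        from torchvision.transforms._functional_video import normalize

        def ava_inference_transform(clip, boxes, num_frames=32, crop_size=640,
                                    data_mean=[0.45, 0.45, 0.45],
                                    data_std=[0.225, 0.225, 0.225],
                                    slow_fast_alpha=4):
            boxes = np.array(boxes)
            ori_boxes = boxes.copy()
            # sample num_frames frames evenly, scale pixel values from [0, 255] to [0, 1]
            clip = uniform_temporal_subsample(clip, num_frames)
            clip = clip.float() / 255.0
            height, width = clip.shape[2], clip.shape[3]
            boxes = clip_boxes_to_image(boxes, height, width)
            # resize the short side to crop_size, rescaling the boxes accordingly
            clip, boxes = short_side_scale_with_boxes(clip, size=crop_size, boxes=boxes)
            # normalize by mean/std, then clamp boxes to the resized image
            clip = normalize(clip, np.array(data_mean, dtype=np.float32), np.array(data_std, dtype=np.float32))
            boxes = clip_boxes_to_image(boxes, clip.shape[2], clip.shape[3])
            # slow pathway = temporally subsampled fast pathway (alpha = 4)
            slow_pathway = torch.index_select(
                clip, 1,
                torch.linspace(0, clip.shape[1] - 1, clip.shape[1] // slow_fast_alpha).long())
            return [slow_pathway, clip], torch.from_numpy(boxes), ori_boxes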
    

    Finally, each frame is annotated and written into the output video:

    def save_yolopreds_tovideo(yolo_preds, id_to_ava_labels, color_map, output_video):
        for i, (im, pred) in enumerate(zip(yolo_preds.imgs, yolo_preds.pred)):
            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
            if pred.shape[0]:
                for j, (*box, cls, trackid, vx, vy) in enumerate(pred):
                    if int(cls) != 0:
                        # only persons (class 0) carry an action label
                        ava_label = ''
                    elif trackid in id_to_ava_labels.keys():
                        # keep just the first word of the AVA action name
                        ava_label = id_to_ava_labels[trackid].split(' ')[0]
                    else:
                        ava_label = 'Unknown'
                    text = '{} {} {}'.format(int(trackid), yolo_preds.names[int(cls)], ava_label)
                    color = color_map[int(cls)]
                    im = plot_one_box(box, im, color, text)
            output_video.write(im.astype(np.uint8))
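
    In the main loop this function is called once per one-second clip, and the writer is released at the end. Roughly (a sketch based on the original repo's flow):

        # inside the per-second loop:
        save_yolopreds_tovideo(yolo_preds, id_to_ava_labels, coco_color_map, outputvideo)

        # after the loop over the whole video finishes:
        outputvideo.release()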
    