Yolov5 + Deepsort + Slowfast for Real-Time Action Detection

Original GitHub project; this post is just a brief walkthrough of it.

The original author's description of the repo:

Here are some details about our modification:

  • we choose yolov5 as an object detector instead of Faster R-CNN; it is faster and more convenient
  • we use a tracker (deepsort) to allocate action labels to all objects (with the same ids) in different frames
  • our processing speed reached 24.2 FPS at an inference batch size of 30 (on a single RTX 2080Ti GPU)
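
    Putting those three pieces together, one second of video flows through the pipeline roughly like this (a pseudocode overview only; every helper name here is illustrative, and the real code is walked through below):

        # rough shape of the per-second loop (details in the sections below)
        for i in range(video_duration_in_seconds):
            frames = decode_one_second_clip(i)              # pytorchvideo clip -> frames
            detections = yolov5(frames)                     # person boxes per frame
            tracks = deepsort_update_all(detections)        # stable track ids across frames
            actions = slowfast(frames, middle_frame_boxes)  # one action label per track id
            write_annotated_frames(frames, tracks, actions)
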
    Let's start the analysis from __main__:

    import argparse

    if __name__ == "__main__":
        parser = argparse.ArgumentParser()
        # input
        parser.add_argument('--input', type=str, default="/home/wufan/images/video/vad.mp4",
                            help='test imgs folder or video or camera')
        # output path
        parser.add_argument('--output', type=str, default="output.mp4",
                            help='folder to save result imgs, can not use input folder')
        # object detect config
        parser.add_argument('--imsize', type=int, default=640, help='inference size (pixels)')
        parser.add_argument('--conf', type=float, default=0.4, help='object confidence threshold')
        parser.add_argument('--iou', type=float, default=0.4, help='IOU threshold for NMS')
        parser.add_argument('--device', default='cuda', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
        parser.add_argument('--classes', nargs='+', default=0, type=int,
                            help='filter by class: --class 0, or --class 0 2 3')
        config = parser.parse_args()
    
        print(config)
        main(config)
    

    __main__ sets up the input/output arguments and a handful of object-detection parameters (inference size, confidence and IoU thresholds, device, and the class filter).

    Now let's walk through the main function:

    The parsed config values are consumed here; the detector is yolov5l6, fetched via torch.hub with its weights downloaded locally:

        # load the Yolov5 model via torch.hub
        model = torch.hub.load('ultralytics/yolov5', 'yolov5l6')
        model.conf = config.conf
        model.iou = config.iou
        # maximum number of detections per image
        model.max_det = 200
        # class filter: class 0 (person) restricts detection to people only
        if config.classes:
            model.classes = config.classes
        device = config.device
        imsize = config.imsize
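
    If you just want to confirm the hub model loaded correctly, the standard yolov5 Detections API can be used for a quick check (the test image URL is Ultralytics' usual example, shown here only for illustration):

        results = model('https://ultralytics.com/images/zidane.jpg')
        results.print()  # summary of detections, inference speed and image size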
    

    Next, the Slowfast and Deepsort models are loaded. The Slowfast model was trained on AVA v2.2, and the id-to-action mapping is obtained through AvaLabeledVideoFramePaths.read_label_map():

        # load the Slowfast ResNet-50 detection model
        video_model = slowfast_r50_detection(True).eval().to(device)
        # load the pretrained DeepSORT weights
        deepsort_tracker = DeepSort("deep_sort/deep_sort/deep/checkpoint/ckpt.t7")
        # load the id-to-label mapping for the AVA v2.2 dataset on which the Torch Hub models were fine-tuned
        ava_labelnames, _ = AvaLabeledVideoFramePaths.read_label_map("selfutils/temp.pbtxt")
        # one random color per COCO class, used when drawing boxes
        coco_color_map = [[random.randint(0, 255) for _ in range(3)] for _ in range(80)]
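
    ava_labelnames is a plain dict mapping 1-based AVA class ids to action-name strings; a quick peek (the sample output assumes the standard AVA v2.2 label map, so treat it as illustrative):

        for k in sorted(ava_labelnames)[:3]:
            print(k, ava_labelnames[k])
        # e.g. "1 bend/bow (at the waist)" with the standard AVA v2.2 map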
    

    Then set up the output video writer:

        vide_save_path = config.output
        # read the input video's width and height and reuse them for the output video
        video = cv2.VideoCapture(config.input)
        width, height = int(video.get(3)), int(video.get(4))  # CAP_PROP_FRAME_WIDTH / CAP_PROP_FRAME_HEIGHT
        video.release()
        # MPEG-4 ('mp4v') encoding for an .mp4 file of the chosen frame size
        # written at 25 FPS
        outputvideo = cv2.VideoWriter(vide_save_path, cv2.VideoWriter_fourcc(*'mp4v'), 25, (width, height))
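
    A small robustness tweak (my suggestion, not part of the original script): read the real frame rate from the input instead of hardcoding 25, and use the named OpenCV property constants:

        video = cv2.VideoCapture(config.input)
        width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = video.get(cv2.CAP_PROP_FPS) or 25  # fall back to 25 if the container reports 0
        video.release()
        outputvideo = cv2.VideoWriter(vide_save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))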
    
    

    After that, the input video is loaded again (this time through pytorchvideo) and processed in a for loop running for math.ceil(video.duration) iterations, one second per iteration:
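
    The loading code and loop skeleton are not quoted in this post; based on the original repo they look roughly like this (a sketch assuming pytorchvideo's EncodedVideo):

        import math
        from pytorchvideo.data.encoded_video import EncodedVideo

        video = EncodedVideo.from_path(config.input)
        for i in range(0, math.ceil(video.duration), 1):
            # ... per-second processing described below ...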

    The first step is frame extraction: get_clip() samples the frames within one second of video, only the image data is kept, and each frame tensor is converted to a numpy array in BGR format:

        # sample the frames within one second of video (0.04 s is one frame at 25 FPS, so the clip covers [i, i+1))
        video_clips = video.get_clip(i, i + 1 - 0.04)
        # return {"video": clip_frames, "frame_indices": frame_indices, "audio": None}
        video_clips = video_clips['video']
        if video_clips is None:
            continue
        img_num = video_clips.shape[1]
        imgs = []
        for j in range(img_num):
            imgs.append(tensor_to_numpy(video_clips[:, j, :, :]))
            # "video": A tensor of the clip's RGB frames with shape: (channel, time, height, width).
            # convert each frame tensor to a numpy array (BGR format)
    
    def tensor_to_numpy(tensor):
        img = tensor.cpu().numpy().transpose((1, 2, 0))
        return img
    

    Yolo prediction. All frames of the one-second clip are passed to the model as a single batch (the "inference batch size" mentioned in the README):

        yolo_preds = model(imgs, size=imsize)
        # name frames with a globally increasing index, assuming 25 frames per second
        yolo_preds.files = [f"img_{i * 25 + k}.jpg" for k in range(img_num)]
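
    For reference, yolo_preds is yolov5's Detections object. The attributes the next section relies on are (in the yolov5 version this repo targets; newer releases renamed imgs to ims):

        # yolo_preds.pred  -> list of per-image tensors with columns (x1, y1, x2, y2, conf, cls)
        # yolo_preds.xywh  -> the same boxes as (x_center, y_center, w, h, conf, cls)
        # yolo_preds.imgs  -> the original numpy images
        # yolo_preds.names -> mapping from class id to class name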
    

    With the pretrained Deepsort weights, the yolo predictions are fed into the tracker and then replaced by its output; Deepsort's role here is to keep the same id on the same object across frames, so that an action label can be assigned per id. Each tracker output row has 8 columns, later unpacked as (*box, cls, trackid, vx, vy), which is why the empty placeholder is np.ones((0, 8)).

        # DeepSORT
        deepsort_outputs = []
        for j in range(len(yolo_preds.pred)):
            temp = deepsort_update(deepsort_tracker, yolo_preds.pred[j].cpu(), yolo_preds.xywh[j][:, 0:4].cpu(),
                                       yolo_preds.imgs[j])
            if len(temp) == 0:
                temp = np.ones((0, 8))
            deepsort_outputs.append(temp.astype(np.float32))
        # replace the yolo predictions with the DeepSORT results
        # we use a tracker(deepsort) to allocate action labels to all objects(with same ids) in different frames
        yolo_preds.pred = deepsort_outputs
        id_to_ava_labels = {}
    
    def deepsort_update(Tracker, pred, xywh, np_img):
        # update expects xywh boxes, confidences (column 4), class ids (column 5) and an RGB image
        outputs = Tracker.update(xywh, pred[:, 4:5], pred[:, 5].tolist(), cv2.cvtColor(np_img, cv2.COLOR_BGR2RGB))
        return outputs
    

    The clip and boxes are preprocessed by ava_inference_transform(), the Slowfast model is called on the result, and finally each track id is assigned an action class:

        id_to_ava_labels = {}
        if yolo_preds.pred[img_num // 2].shape[0]:
            # preprocess the clip and the middle frame's bboxes for the Slowfast model
            inputs, inp_boxes, _ = ava_inference_transform(video_clips, yolo_preds.pred[img_num // 2][:, 0:4], crop_size=imsize)
            # prepend a data-sample id column (all zeros: a single clip) to each bounding box
            inp_boxes = torch.cat([torch.zeros(inp_boxes.shape[0], 1), inp_boxes], dim=1)
            if isinstance(inputs, list):
                # slowfast inputs are a [slow_pathway, fast_pathway] list; add a batch dim to each
                inputs = [inp.unsqueeze(0).to(device) for inp in inputs]
            else:
                inputs = inputs.unsqueeze(0).to(device)
            with torch.no_grad():
                slowfaster_preds = video_model(inputs, inp_boxes.to(device))
                slowfaster_preds = slowfaster_preds.cpu()
            # assign an action class to each track id (AVA label ids are 1-based, hence the +1)
            for tid, avalabel in zip(yolo_preds.pred[img_num // 2][:, 5].tolist(), np.argmax(slowfaster_preds, axis=1).tolist()):
                id_to_ava_labels[tid] = ava_labelnames[avalabel + 1]
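
    ava_inference_transform() itself is not quoted in this post; in the original repo it follows pytorchvideo's AVA detection demo. A condensed sketch under that assumption (the defaults are those of the demo and may differ from the repo):

        import numpy as np
        import torch
        from pytorchvideo.transforms.functional import (
            clip_boxes_to_image,
            short_side_scale_with_boxes,
            uniform_temporal_subsample,
        )
        from torchvision.transforms._functional_video import normalize

        def ava_inference_transform(clip, boxes, num_frames=32, crop_size=640,
                                    data_mean=[0.45, 0.45, 0.45],
                                    data_std=[0.225, 0.225, 0.225],
                                    slow_fast_alpha=4):
            boxes = np.array(boxes)
            ori_boxes = boxes.copy()
            # sample num_frames frames evenly, scale pixel values from [0, 255] to [0, 1]
            clip = uniform_temporal_subsample(clip, num_frames)
            clip = clip.float() / 255.0
            height, width = clip.shape[2], clip.shape[3]
            boxes = clip_boxes_to_image(boxes, height, width)
            # resize the short side to crop_size, rescaling the boxes accordingly
            clip, boxes = short_side_scale_with_boxes(clip, size=crop_size, boxes=boxes)
            # normalize by mean/std, then clamp boxes to the resized image
            clip = normalize(clip, np.array(data_mean, dtype=np.float32), np.array(data_std, dtype=np.float32))
            boxes = clip_boxes_to_image(boxes, clip.shape[2], clip.shape[3])
            # slow pathway = temporally subsampled fast pathway (alpha = 4)
            slow_pathway = torch.index_select(
                clip, 1,
                torch.linspace(0, clip.shape[1] - 1, clip.shape[1] // slow_fast_alpha).long())
            return [slow_pathway, clip], torch.from_numpy(boxes), ori_boxes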
    

    Finally, each frame is annotated and written into the output video:

    def save_yolopreds_tovideo(yolo_preds, id_to_ava_labels, color_map, output_video):
        for i, (im, pred) in enumerate(zip(yolo_preds.imgs, yolo_preds.pred)):
            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)
            if pred.shape[0]:
                for j, (*box, cls, trackid, vx, vy) in enumerate(pred):
                    if int(cls) != 0:
                        # only persons (class 0) carry an action label
                        ava_label = ''
                    elif trackid in id_to_ava_labels.keys():
                        # keep just the first word of the AVA action name
                        ava_label = id_to_ava_labels[trackid].split(' ')[0]
                    else:
                        ava_label = 'Unknown'
                    text = '{} {} {}'.format(int(trackid), yolo_preds.names[int(cls)], ava_label)
                    color = color_map[int(cls)]
                    im = plot_one_box(box, im, color, text)
            output_video.write(im.astype(np.uint8))
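
    In the main loop this function is called once per one-second clip, and the writer is released at the end. Roughly (a sketch based on the original repo's flow):

        # inside the per-second loop:
        save_yolopreds_tovideo(yolo_preds, id_to_ava_labels, coco_color_map, outputvideo)

        # after the loop over the whole video finishes:
        outputvideo.release()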
    